In [134]:
#import necessary libraries
import numpy as np
import pandas as pd

import nltk
import time
#Tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

In [64]:
from tensorflow.python.client import device_lib
#print(device_lib.list_local_devices())

## Retrieving Data

In [5]:
#Read csv
train = pd.read_csv (r'train.csv')
test = pd.read_csv (r'test.csv')

In [48]:
df_train = pd.DataFrame(train, columns= ['id','raw_address','POI/street'])
print(df_train)

            id                                        raw_address  \
0            0  jl kapuk timur delta sili iii lippo cika 11 a ...   
1            1                                 aye, jati sampurna   
2            2               setu siung 119 rt 5 1 13880 cipayung   
3            3                               toko dita, kertosono   
4            4                                      jl. orde baru   
...        ...                                                ...   
299995  299995               jend ahmad yani 331 kertasari ciamis   
299996  299996                 raya cila kko, cilandak timur kel.   
299997  299997                     tanjung gusta jl. yaya 2 no 17   
299998  299998  jalan cipadu jaya taman asri gang bijaksana 3 ...   
299999  299999          jaya maspion permata blok beryl b2  no.58   

                                       POI/street  
0       /jl kapuk timur delta sili iii lippo cika  
1                                               /  
2              

In [49]:
df_test = pd.DataFrame(test, columns= ['id','raw_address'])
print(df_test)

          id                                        raw_address
0          0              s. par 53 sidanegara 4 cilacap tengah
1          1              angg per, baloi indah kel. lubuk baja
2          2                              asma laun, mand imog,
3          3      ud agung rej, raya nga sri wedari karanganyar
4          4                         cut mutia, 35 baiturrahman
...      ...                                                ...
49995  49995                    toko mbak farid semboro semboro
49996  49996     vie - tk. ridho kids, vete 3 cari, 16720 ciawi
49997  49997                mart dan roti bakar malabar, nasio,
49998  49998  graha indah pamulang jl. mujair raya bambu apu...
49999  49999                                               adi,

[50000 rows x 2 columns]


In [7]:
#Call all the raw addresses
df_train.raw_address

0         jl kapuk timur delta sili iii lippo cika 11 a ...
1                                        aye, jati sampurna
2                      setu siung 119 rt 5 1 13880 cipayung
3                                      toko dita, kertosono
4                                             jl. orde baru
                                ...                        
299995                 jend ahmad yani 331 kertasari ciamis
299996                   raya cila kko, cilandak timur kel.
299997                       tanjung gusta jl. yaya 2 no 17
299998    jalan cipadu jaya taman asri gang bijaksana 3 ...
299999            jaya maspion permata blok beryl b2  no.58
Name: raw_address, Length: 300000, dtype: object

In [8]:
#Get a rows of addresses
df_train.loc[2:6]

Unnamed: 0,id,raw_address
2,2,setu siung 119 rt 5 1 13880 cipayung
3,3,"toko dita, kertosono"
4,4,jl. orde baru
5,5,"raya samb gede, 299 toko bb kids"
6,6,"kem mel raya, no 4 bojong rawalumbu rt 1 36 ra..."


In [9]:
df_train.raw_address.shape

(300000,)

## Tokenization

In [26]:
def tokenize(contents):
  allAddresses = contents.raw_address
  tokens = []
  for address in allAddresses:
    #print(address.split(", "))
    for text in address.split(", "):
      tokens.append(text)
  return tokens

In [29]:
#Get Tokens
train_tokens = tokenize(df_train)

In [31]:
#Number of tokens found
len(train_tokens)

474418

In [45]:
# number of UNIQUE tokens found
vocabulary = set(train_tokens)
len(vocabulary)

377844

In [47]:
frequency_dist = nltk.FreqDist(train_tokens)
#Show top 50 tokens
sorted(frequency_dist,key=frequency_dist.__getitem__, reverse=True)[0:50]

['jend sudi',
 'ahmad yani',
 'jend ahmad yani',
 'pem',
 'gajah mada',
 'soek hatta',
 'vete',
 'yos suda',
 'imam bon',
 'dip',
 'gatot subr',
 'depok',
 'pahl',
 'teuku umar',
 'man',
 'no 1',
 'hayam wuruk',
 'pram',
 'brig kata',
 'jend sudi,',
 'merd',
 'kuta',
 'rw 1',
 'denpasar barat',
 'cengkareng',
 'setia budi',
 'ir. h. jua',
 'pang sudi',
 'kebon jeruk',
 'rw 4',
 'nasio',
 'ban',
 'jawa barat',
 'ahmad yani,',
 'mawar',
 'mas',
 'jati',
 'gar',
 'raya ser',
 'toko kelon',
 'taman sari',
 'peri kem',
 'denpasar selatan',
 'siliw',
 'urip sumoh',
 'waru',
 'tangerang',
 'cipayung',
 'bas rah',
 'balikpapan selatan']

## Set Up the Model

In [182]:
raws = df_train['raw_address'].to_numpy()
results = df_train['POI/street'].to_numpy()

In [184]:
# TOKENIZE TEXT SEQUENCE
tokenizer = Tokenizer()
tokenizer.fit_on_texts(raws)
sequences = tokenizer.texts_to_sequences(raws)

# convert to numpy array
sequences = np.array(sequences)
x = sequences

In [203]:
# TOKENIZE TEXT SEQUENCE
sequences = tokenizer.texts_to_sequences(results)

# convert to numpy array
sequences = np.array(sequences)
y = sequences

In [204]:
x

array([list([8, 282, 12, 876, 7664, 39, 2147, 656, 31, 45, 6855, 107, 291]),
       list([9761, 48, 494]),
       list([401, 16058, 1538, 2, 13, 4, 9368, 174]), ...,
       list([42, 3560, 8, 1277, 5, 1, 61]),
       list([90, 1941, 16, 29, 129, 58, 15684, 7, 1728, 210, 1, 383]),
       list([16, 7004, 600, 28, 13091, 1708, 1, 426])], dtype=object)

In [186]:
print("X: ",x.shape)
print("Y: ",y.shape)

X:  (300000,)
Y:  (300000,)


In [187]:
word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1
print(word_index)
print(vocab_size)

ingsir': 80624, '336a': 80625, 'mampan': 80626, 'penjarin': 80627, 'gimbal': 80628, 'qoryah': 80629, '43tg9': 80630, 'dwina': 80631, 'dispenda': 80632, 'halteu': 80633, 'baswey': 80634, '144ab': 80635, 'nurfau': 80636, 'chasanah': 80637, '46138': 80638, 'gadri': 80639, '93122': 80640, 'leang': 80641, '1166': 80642, 'pilli': 80643, 'emwe': 80644, 'religious': 80645, 'abshor': 80646, 'irsad': 80647, 'samiran': 80648, 'kimiko': 80649, 'carano': 80650, 'coms': 80651, '674': 80652, 'darmika': 80653, '12kel': 80654, 'jans': 80655, 'permono': 80656, 'wayd': 80657, 'atut': 80658, 'widyaningsih': 80659, 'indoba': 80660, 'samira': 80661, '37m': 80662, 'langa': 80663, 'allstate': 80664, 'benks': 80665, 'kalikuning': 80666, 'lumer': 80667, 'rezekimu': 80668, 'maskot': 80669, 'romza': 80670, 'clotes': 80671, 'ngemp': 80672, 'pundhi': 80673, 'semprul': 80674, 'pohea': 80675, 'pandanwang': 80676, 'kedhai': 80677, 'pertumbukan': 80678, '61373': 80679, 'gbdi': 80680, 'kany': 80681, 'oucha': 80682, 'dan

In [188]:
print("Raw address: ",raws[5])
print(x[5])
print("POI/Street: ",results[5])
print(y[5])

Raw address:  raya samb gede, 299 toko bb kids
[3, 3221, 128, 7032, 23, 3222, 2181]
POI/Street:  toko bb kids/raya samb gede
[23, 3222, 2181, 3, 3221, 128]


In [189]:
#Variables
RANDOM_STATE = 50
EPOCHS = 150
BATCH_SIZE = 2048
TRAINING_LENGTH = 50
TRAIN_FRACTION = 0.7
LSTM_CELLS = 64
VERBOSE = 0
SAVE_MODEL = True
# OUTPUT DIMENSION OF THE EMBEDDING LAYER
EM_OUTPUT_LENGTH = 50

In [190]:
# GET THE SEQUENCE LENGTH
seq_length = x.shape[0]
seq_length

300000

In [191]:
# Fit Model function
def fit_model(batch_size, X, y, model_name, epochs):
    model = Sequential()
    model.add(Embedding(vocab_size, EM_OUTPUT_LENGTH, input_length=seq_length))
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(128))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))

    optimizer = Adam(learning_rate=0.001)

    model.compile(loss='categorical_crossentropy', optimizer = optimizer, metrics = ['accuracy'])

    # save the weights for different configuration
    filepath = f"{model_name}-{epochs}_epoch-{batch_size}_batch_size"

    csv_logger = CSVLogger(filepath+".csv",append=False,separator=',')
    checkpoint = ModelCheckpoint(filepath+".h5", monitor='loss', verbose=1, save_best_only=True, mode='min')
    reduce_lr = ReduceLROnPlateau(monitor='accuracy', factor=0.2, patience=1, min_lr=1e-8)
    early_stopping = EarlyStopping(monitor='loss',
                                    mode='min',
                                    verbose=1,
                                    patience=5)
    desired_callbacks = [checkpoint, early_stopping, csv_logger, reduce_lr]
    start = time.perf_counter()
    model.fit(X, y, 
              batch_size = batch_size, 
              epochs = epochs, 
              callbacks=desired_callbacks)
    time_taken = time.perf_counter() - start
    print(f"Total time taken for {epochs} epochs: {time_taken} seconds")
    print(f"Average time taken per epochs: {time_taken/epochs} seconds")
    print(f"Total time taken to train: {time_taken} seconds")

    return model

In [192]:
# LOAD PRE-TRAINED H5 FILES
# Write a function to load different variant of the pre-trained model
def load_pretrain_model(model_name,batch_size,epochs, optimizer_type):
    filepath = f"{WEIGHTS_DIR}{model_name}-{epochs}_epoch-{batch_size}_batch_size-{optimizer_type}.h5"
    # debug
    #print(filepath)

    # load the model
    model = load_model(filepath)
    #model.summary()

    return model

In [207]:
model = fit_model(batch_size=128, X=x, y=y, model_name="LSTM", epochs=1)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [198]:
x

array([list([8, 282, 12, 876, 7664, 39, 2147, 656, 31, 45, 6855, 107, 291]),
       list([9761, 48, 494]),
       list([401, 16058, 1538, 2, 13, 4, 9368, 174]), ...,
       list([42, 3560, 8, 1277, 5, 1, 61]),
       list([90, 1941, 16, 29, 129, 58, 15684, 7, 1728, 210, 1, 383]),
       list([16, 7004, 600, 28, 13091, 1708, 1, 426])], dtype=object)