### Notebook for the Kaggle competition "nlp-getting-started": https://www.kaggle.com/c/nlp-getting-started/submit

In [1]:
import pandas as pd

# Load the training and test sets; both paths are relative to the working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [2]:
# NOTE(review): self-assignment -- this statement has no effect; the cell can be removed.
df_train = df_train

In [3]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
import sidetable
# Per-column missing-value summary (the output lists missing count, total and percent).
# NOTE(review): the `stb` accessor presumably comes from the `sidetable` import above -- confirm.
df_train.stb.missing(style=True)

Unnamed: 0,missing,total,percent
location,2533,7613,33.27%
keyword,61,7613,0.80%
id,0,7613,0.00%
text,0,7613,0.00%
target,0,7613,0.00%


In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
import numpy as np
def get_pad_sq(df, tokenizer=None, maxlen=150):
    """Turn the ``text`` column of ``df`` into padded integer sequences.

    Parameters
    ----------
    df
        Object exposing a ``text`` attribute (the notebook passes pandas
        DataFrames).
    tokenizer : optional
        An already-fitted tokenizer. When given it is used as-is and is NOT
        re-fitted, so several frames (e.g. train and test) can be encoded with
        the same word index. When omitted, a new ``Tokenizer`` is fitted on
        ``df.text`` itself, which is the original behaviour.
    maxlen : int, default 150
        ``maxlen`` forwarded to ``sequence.pad_sequences``.

    Returns
    -------
    The value returned by ``sequence.pad_sequences`` for the encoded texts.
    """
    text = np.array(df.text)
    if tokenizer is None:
        # NOTE: the cap is the number of distinct *texts* in the frame, not the
        # number of distinct words; kept unchanged for backward compatibility.
        tokenizer = Tokenizer(num_words=len(df.text.unique()))
        tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    # Return the padded result directly instead of rebinding the `df` parameter.
    return sequence.pad_sequences(sequences, maxlen=maxlen)

In [6]:
# Fit ONE tokenizer on the training text and reuse it for the test text, so a
# given word maps to the same integer id in both matrices. Calling get_pad_sq
# separately on each frame fits a separate tokenizer per frame, which gives the
# test set a word index unrelated to the one the model is trained on.
tokenizer = Tokenizer(num_words=len(df_train.text.unique()))  # same cap get_pad_sq uses for df_train
tokenizer.fit_on_texts(np.array(df_train.text))

X_train = sequence.pad_sequences(
    tokenizer.texts_to_sequences(np.array(df_train.text)), maxlen=150
)
test = sequence.pad_sequences(
    tokenizer.texts_to_sequences(np.array(df_test.text)), maxlen=150
)
y_train = np.array(df_train.target)

In [8]:
'''
Credit source: 
    https://www.tensorflow.org/tutorials/keras/keras_tuner
    https://medium.com/@mrunal68/text-sentiments-classification-with-cnn-and-lstm-f92652bc29fd
    https://blog.tensorflow.org/2020/01/hyperparameter-tuning-with-keras-tuner.html
'''

from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, MaxPooling1D, LSTM, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

def build_model(hp):
    """Build and compile the binary text classifier for one tuner trial.

    Two hyperparameters are drawn from ``hp``: ``units`` (32 to 512 in steps
    of 32), used as the width of the hidden dense layer, and ``learning_rate``
    (one of 1e-2, 1e-3, 1e-4), passed to the Adam optimizer.

    The network is Embedding -> MaxPooling1D -> LSTM(64) -> Dense(units, relu)
    -> Dropout(0.25) -> Dense(1, sigmoid), compiled with binary cross-entropy
    loss and an accuracy metric. The compiled model is returned.
    """
    seq_len = 150      # input_length given to the Embedding layer
    n_tokens = 10000   # Embedding input dimension
    embed_dim = 32     # Embedding output dimension

    # Keep the registration order: 'units' first, then 'learning_rate'.
    dense_width = hp.Int('units', min_value=32, max_value=512, step=32)
    step_size = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    net = Sequential([
        Embedding(n_tokens, embed_dim, input_length=seq_len),
        MaxPooling1D(),
        LSTM(64, return_sequences=False),
        Dense(units=dense_width, activation="relu"),
        Dropout(0.25),
        Dense(1, activation='sigmoid'),
    ])
    net.compile(
        optimizer=Adam(learning_rate=step_size),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return net

In [9]:
import kerastuner as kt

# Hyperband search over build_model's hyperparameters, ranked by validation
# accuracy; overwrite=True is passed so earlier tuner results are not reused.
tuner = kt.Hyperband(
    hypermodel=build_model,
    objective='val_accuracy',
    max_epochs=16,
    overwrite=True,
)

In [10]:
# Early stopping watches validation accuracy with a patience of 10 epochs.
es = EarlyStopping(monitor='val_accuracy', patience=10, verbose=0)
callbacks_list = [es]

In [11]:
# Run the hyperparameter search, holding out 10% of the training data for validation.
# NOTE(review): Hyperband appears to schedule the per-trial epoch count itself
# (max_epochs=16 on the tuner), so epochs=64 may be ignored -- confirm against
# the Keras Tuner documentation.
tuner.search(
    X_train,
    y_train,
    epochs=64,
    batch_size=32,
    validation_split=0.1,
    callbacks=callbacks_list,
)

Trial 30 Complete [00h 00m 39s]
val_accuracy: 0.9007092118263245

Best val_accuracy So Far: 1.0
Total elapsed time: 00h 08m 03s
INFO:tensorflow:Oracle triggered exit


In [12]:
# Get the optimal hyperparameters: request a single best set from the tuner
# and take the first (only) entry of the returned list.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

In [13]:
# Train a fresh model built from the best hyperparameters for 64 epochs, then
# record which epoch (1-based) reached the highest validation accuracy.
model = tuner.hypermodel.build(best_hps)
history = model.fit(
    X_train,
    y_train,
    epochs=64,
    batch_size=32,
    validation_split=0.1,
)

val_acc_per_epoch = history.history['val_accuracy']
# On ties the earliest epoch is chosen, because max() keeps the first maximum.
best_epoch = max(enumerate(val_acc_per_epoch, start=1), key=lambda pair: pair[1])[0]

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


In [14]:
from keras.models import save_model
# Build a fresh model from the best hyperparameters.
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model for `best_epoch` epochs (no validation split is passed here),
# then save it to 'best_model.h5'.
hypermodel.fit(X_train, y_train, epochs=best_epoch)
hypermodel.save('best_model.h5')



In [15]:
# These metrics are computed on the training data (X_train, y_train), not on a
# held-out set, so the label says "train" rather than "test".
eval_result = hypermodel.evaluate(X_train, y_train)
print("[train loss, train accuracy]:", eval_result)

[test loss, test accuracy]: [0.671535074710846, 0.5995033979415894]


In [16]:
# Reload the model saved as 'best_model.h5' (rebinding `hypermodel`) and
# predict on the padded test sequences.
from keras.models import load_model
hypermodel = load_model('best_model.h5')
y_pred = hypermodel.predict(test)

In [17]:
# Threshold the model outputs at 0.5 to get 0/1 labels, indexed by the test ids.
y_pred = pd.DataFrame(
    (np.asarray(y_pred).ravel() >= 0.5).astype('int64'),
    columns=['target'],
    index=df_test.id,
)

In [18]:
# Save the predictions frame (including its index) to 'Predictions.csv'.
y_pred.to_csv('Predictions.csv') 

In [19]:
# Audible "run finished" signal.
duration = 2000  # milliseconds
freq = 3000  # Hz
try:
    # winsound is only available on Windows; importing it elsewhere raises
    # ImportError, which would otherwise abort a Restart & Run All.
    import winsound
except ImportError:
    # Fallback: emit the ASCII bell character instead of failing.
    print('\a', end='', flush=True)
else:
    winsound.Beep(freq, duration)