In [1]:
import keras_tuner as kt
# initialize data for hyperparameter tuning
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
import regex as re
import html
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Reshape, SimpleRNN
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from tensorflow import string
from tensorflow_hub import KerasLayer

# Five per-video comment CSVs; each is kept under its own name.
psy, katy_perry, lmfao, eminem, shakira = map(pd.read_csv, [
    'datasets/Youtube01-Psy.csv',
    'datasets/Youtube02-KatyPerry.csv',
    'datasets/Youtube03-LMFAO.csv',
    'datasets/Youtube04-Eminem.csv',
    'datasets/Youtube05-Shakira.csv',
])

# Stack the five frames and keep only text + label under the shared
# column names ('comment', 'target') used by the rest of the notebook.
df_1 = pd.concat(
    [psy, katy_perry, lmfao, eminem, shakira], ignore_index=True
)[['CONTENT', 'CLASS']].rename(columns={'CONTENT': 'comment', 'CLASS': 'target'})

# Second source, projected onto the same two-column schema.
df_2 = pd.read_csv('datasets/comments_1.csv')[['Comment', 'Spam']].rename(
    columns={'Comment': 'comment', 'Spam': 'target'}
)

# Single combined frame with a fresh 0..n-1 index.
df = pd.concat([df_1, df_2], ignore_index=True)

# Shared NLTK resources consulted by pre_process().
lemmatizer = WordNetLemmatizer()
en_stopwords = stopwords.words('english')

def pre_process(text):
    """Normalise one raw comment into a space-joined string of lemmatised tokens.

    Pipeline, in order: lowercase -> decode HTML entities -> blank out HTML
    tags -> cut everything that follows an ``http`` / ``www`` prefix (the
    prefix itself survives as a token) -> replace every run of characters
    that is neither an ASCII letter nor whitespace -> tokenise -> drop
    English stop words -> lemmatise.

    Parameters
    ----------
    text : str
        Raw comment text. Any non-string value (for example the float NaN
        that pandas stores for a missing cell) is treated as an empty
        comment instead of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives or when ``text`` is not a string.
    """
    # Missing / non-text cells: report "no content" so the caller can filter
    # them out the same way it filters comments that clean down to nothing.
    if not isinstance(text, str):
        return ''

    html_entities_removed = html.unescape(text.lower())
    html_tags_removed = re.sub('<.*?>', ' ', html_entities_removed)
    # Variable-width look-behind: keeps the 'http' / 'www' marker and removes
    # the remainder of the URL up to the next whitespace.
    link_keywords = re.sub(r'(?<=(http|[wW]{3}))\S+', ' ', html_tags_removed)
    nonlatin_chars_removed = re.sub(r'[^A-Za-z\s]+', '  ', link_keywords)
    tokenized = word_tokenize(nonlatin_chars_removed)
    stopwords_removed = [word for word in tokenized if word not in en_stopwords]
    lemmatized = [lemmatizer.lemmatize(word) for word in stopwords_removed]
    return ' '.join(lemmatized)

# Clean every comment, then discard rows that cleaned down to an empty
# string (turned into NaN first so dropna() catches them), rows with any
# other missing value, and exact duplicate rows.
df['cleaned'] = df['comment'].apply(pre_process)
df = (
    df.replace('', float('NaN'))
    .dropna()
    .drop_duplicates()
)

X, y = df['cleaned'].values, df['target'].values

# 80 % train; the remaining 20 % is halved into test and validation sets.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [4]:
def build_model(hp):
	"""Build and compile the classifier for one hyperparameter configuration.

	Architecture: trainable 'nnlm_en_dim50_2' text-embedding layer (string
	input) -> reshape to a length-1 sequence of 50 features -> SimpleRNN ->
	ReLU Dense -> single sigmoid output. Compiled with Adam and binary
	cross-entropy, tracking accuracy.

	Args:
		hp: hyperparameter container exposing ``Choice(name, values)``.
			Three values are drawn from it: 'Layers' (SimpleRNN units),
			'Hidden Layers' (Dense units) and 'Learning Rate'.

	Returns:
		The compiled Sequential model.
	"""
	# Draw the tunable values up front, in the same order they are registered
	# with the search space: Layers, Hidden Layers, Learning Rate.
	rnn_units = hp.Choice('Layers',[32,64,128])
	dense_units = hp.Choice('Hidden Layers',[17,40,61])
	step_size = hp.Choice('Learning Rate', [0.0005, 0.00075, 0.001])

	model = Sequential([
		KerasLayer('nnlm_en_dim50_2', input_shape=[], dtype=string, trainable=True),
		# The recurrent layer needs a (timesteps, features) input, so the
		# 50-wide embedding is presented as a single timestep.
		Reshape((1,50)),
		SimpleRNN(rnn_units),
		Dense(dense_units, activation='relu'),
		Dense(1, activation='sigmoid'),
	])
	model.compile(
		optimizer=Adam(learning_rate=step_size),
		loss='binary_crossentropy',
		metrics=['accuracy'],
	)
	return model

# Build once with default hyperparameters so construction errors surface
# before the search starts; the resulting model is discarded.
build_model(kt.HyperParameters())

# Random search minimising validation loss. The three Choice parameters in
# build_model have three values each, so max_trials is larger than the
# number of distinct combinations available.
tuner = kt.RandomSearch(
	build_model,
	objective='val_loss',
	max_trials=300,
	seed=42,
	directory='hyperparameter_tuning',
	project_name='ytcs',
)

tuner.search_space_summary()

Search space summary
Default search space size: 3
Layers (Choice)
{'default': 32, 'conditions': [], 'values': [32, 64, 128], 'ordered': True}
Hidden Layers (Choice)
{'default': 17, 'conditions': [], 'values': [17, 40, 61], 'ordered': True}
Learning Rate (Choice)
{'default': 0.0005, 'conditions': [], 'values': [0.0005, 0.00075, 0.001], 'ordered': True}


In [5]:
# Abandon a trial as soon as validation loss fails to improve for one epoch.
stop_on_plateau = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=1)

tuner.search(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[stop_on_plateau],
)

Trial 27 Complete [00h 00m 31s]
val_loss: 0.25419744849205017

Best val_loss So Far: 0.23909778892993927
Total elapsed time: 00h 14m 30s
INFO:tensorflow:Oracle triggered exit


In [6]:
# Print the best trials found by the search, with the hyperparameters and
# objective score (val_loss) of each.
tuner.results_summary()

Results summary
Results in hyperparameter_tuning\ytcs
Showing 10 best trials
<keras_tuner.engine.objective.Objective object at 0x000002D09779BCD0>
Trial summary
Hyperparameters:
Layers: 32
Hidden Layers: 40
Learning Rate: 0.00075
Score: 0.23909778892993927
Trial summary
Hyperparameters:
Layers: 32
Hidden Layers: 40
Learning Rate: 0.0005
Score: 0.24265210330486298
Trial summary
Hyperparameters:
Layers: 32
Hidden Layers: 40
Learning Rate: 0.001
Score: 0.24302244186401367
Trial summary
Hyperparameters:
Layers: 64
Hidden Layers: 40
Learning Rate: 0.001
Score: 0.24554672837257385
Trial summary
Hyperparameters:
Layers: 64
Hidden Layers: 40
Learning Rate: 0.00075
Score: 0.24828709661960602
Trial summary
Hyperparameters:
Layers: 32
Hidden Layers: 61
Learning Rate: 0.00075
Score: 0.2494523823261261
Trial summary
Hyperparameters:
Layers: 64
Hidden Layers: 17
Learning Rate: 0.0005
Score: 0.2509346008300781
Trial summary
Hyperparameters:
Layers: 32
Hidden Layers: 61
Learning Rate: 0.001
Score: 0.2

In [7]:
# Take the top-ranked hyperparameter set and build a fresh, untrained model
# from it, then print its layer/parameter table.
best_hps = tuner.get_best_hyperparameters()[0]
model = build_model(best_hps)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer_1 (KerasLayer)  (None, 50)                48190600  
                                                                 
 reshape_1 (Reshape)         (None, 1, 50)             0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 32)                2656      
                                                                 
 dense_2 (Dense)             (None, 40)                1320      
                                                                 
 dense_3 (Dense)             (None, 1)                 41        
                                                                 
Total params: 48,194,617
Trainable params: 48,194,617
Non-trainable params: 0
_________________________________________________________________
