In [1]:
import itertools
import random

import numpy as np
import pandas as pd
import tensorflow as tf
import keras as keras
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [4]:
# Read the train / validation / test splits from CSV files in the working
# directory, in that order.
train_df, val_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "validation.csv", "test.csv")
)

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Build the vocabulary from the training split ONLY. Fitting on the validation
# texts as well would leak validation information into preprocessing and make
# the validation score optimistic.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df["Comment_Adj"])

# Convert each comment to a list of integer word indices using the training
# vocabulary.
# NOTE(review): no oov_token is configured, so validation words absent from
# the training vocabulary are presumably dropped -- confirm this is intended.
train_sequences = tokenizer.texts_to_sequences(train_df["Comment_Adj"])
val_sequences = tokenizer.texts_to_sequences(val_df["Comment_Adj"])

# +1 leaves room for index 0, which the Keras Tokenizer never assigns to a
# word and which pad_sequences uses as its default padding value.
vocab_size = len(tokenizer.word_index) + 1

# Pad (or truncate) every sequence to a fixed length; zeros are appended at
# the end of short sequences because padding='post'.
maxlen = 100
X_train = pad_sequences(train_sequences, padding='post', maxlen=maxlen)
X_val = pad_sequences(val_sequences, padding='post', maxlen=maxlen)

# Learn the label -> integer mapping on the training labels and reuse that
# same mapping for validation. Re-fitting on the validation labels could
# assign different integers if the two splits do not contain the same classes.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df["Result_Bin"])
y_val = label_encoder.transform(val_df["Result_Bin"])

In [None]:
def set_all_seeds(seed_value):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Args:
        seed_value: Seed passed unchanged to ``random.seed``,
            ``np.random.seed`` and ``tf.random.set_seed``.
    """
    # `random` is the standard-library module; it must be imported at the top
    # of the notebook for this call to resolve.
    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.random.set_seed(seed_value)

# Seed the Python, NumPy and TensorFlow RNGs once before any model is built.
set_all_seeds(42)

def CNN_model_adj(embedding=200, filter=16, kernel=4, num_1=40, lr=0.01, dropout_rate=0.5):
    """Build and compile a 1-D convolutional binary classifier.

    The network is Embedding -> Conv1D -> MaxPooling1D -> Flatten -> Dropout
    -> Dense(relu, L2-regularised) -> Dense(1, sigmoid). It reads the
    notebook-level ``vocab_size`` and ``maxlen`` to size the embedding layer,
    and re-seeds all RNGs with 42 on every call.

    Args:
        embedding: Output dimension of the embedding layer.
        filter: Number of convolution filters. (The name shadows the builtin
            inside this function; it is kept because callers pass it by
            keyword.)
        kernel: Convolution kernel size.
        num_1: Number of units in the hidden dense layer.
        lr: Learning rate for the Adam optimizer.
        dropout_rate: Dropout rate applied after flattening.

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy loss and
        the accuracy metric.
    """
    # Re-seed on every build so each model starts from the same RNG state.
    set_all_seeds(42)

    # All layers come from the tensorflow.keras `layers` namespace so they
    # match the tensorflow.keras `Sequential` container; mixing in layers from
    # the standalone `keras` package can fail on some installations.
    model = Sequential([
        layers.Embedding(input_dim=vocab_size,
                         output_dim=embedding,
                         input_length=maxlen),
        layers.Conv1D(filters=filter, kernel_size=kernel, activation="relu"),
        layers.MaxPooling1D(pool_size=2),
        layers.Flatten(),
        layers.Dropout(dropout_rate),
        layers.Dense(num_1, activation='relu', kernel_regularizer=l2(0.001)),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer=Adam(learning_rate=lr),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Stop a training run once validation loss has failed to improve for 5
# epochs; restore_best_weights=True asks Keras to put back the weights from
# the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Hyper-parameter grid: every combination below is trained once.
filters = [16, 32, 48]
kernels = [4, 6]
num_1 = [40, 70, 100]
lrs = [0.01, 0.001]
dropout_rate = [0.5, 0.6]

# Best configuration seen so far, ranked by validation accuracy.
best_accuracy = 0
best_history = None
best_model_desc = ""
best_model_cnn = None

# itertools.product visits the grid in the same order as five nested loops
# (right-most list varies fastest). The filter count is bound to `n_filters`
# so the builtin `filter` is not overwritten at notebook scope.
for n_filters, kernel, num, lr, rate in itertools.product(
        filters, kernels, num_1, lrs, dropout_rate):
    model_desc = f"filter = {n_filters}, kernel = {kernel}, num_1 = {num}, lr = {lr}, dropout_rate = {rate}"
    model = CNN_model_adj(embedding=200, filter=n_filters, kernel=kernel,
                          num_1=num, lr=lr, dropout_rate=rate)
    history = model.fit(
        X_train, y_train,
        epochs=30,
        verbose=False,
        validation_data=(X_val, y_val),
        batch_size=1000,
        callbacks=[early_stopping],
    )
    val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=False)

    # Strict '>' keeps the earliest configuration when accuracies tie.
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_history = history
        best_model_desc = model_desc
        best_model_cnn = model


print(f"The best model has parameters: {best_model_desc} with accuracy = {round(best_accuracy, 4)}.")