In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import Adam

In [None]:
# Load the cleaned dataset and carve out a per-label 90/10 train/test split.

df = pd.read_csv('clean_data.csv')

# Sample 90% of the rows inside every label group (same seed for each group)
# so the training set keeps the label proportions of the full frame.
train_df = (
    df.groupby('label', group_keys=False)
      .apply(lambda group: group.sample(frac=0.9, random_state=20723))
)

# Every row whose index was not drawn for training becomes the test set.
test_df = df.drop(index=train_df.index)

def vectorize(data, tokenizer=None, max_len=40):
    """Turn raw texts into fixed-length integer sequences.

    Args:
        data: Iterable of text strings to encode.
        tokenizer: An already-fitted Keras ``Tokenizer``. When ``None``, a new
            tokenizer is created and fitted on ``data``.
        max_len: Length every sequence is padded/truncated to.

    Returns:
        A ``(padded, tokenizer)`` tuple: the padded sequence array and the
        tokenizer that produced it (the new one if it was fitted here).
    """
    fitted = tokenizer
    if fitted is None:
        # No tokenizer supplied: learn the vocabulary from this data.
        fitted = Tokenizer()
        fitted.fit_on_texts(data)

    token_ids = fitted.texts_to_sequences(data)
    return pad_sequences(token_ids, maxlen=max_len), fitted

# Fit the tokenizer on the training texts only, then reuse it for the test
# texts so both splits share a single vocabulary.
X_train, tokenizer = vectorize(train_df['text'])
X_test, _ = vectorize(test_df['text'], tokenizer)

# Labels are shifted down by one (presumably 1..5 -> 0..4, matching the 5-unit
# softmax head) and kept as 1-D integer arrays for BOTH splits:
#   * GridSearchCV(scoring='accuracy') compares y against the class indices
#     returned by KerasClassifier.predict(); one-hot targets make accuracy_score
#     fail on mixed multilabel-indicator / multiclass inputs, and also prevent
#     stratified CV folds.
#   * The legacy KerasClassifier wrapper one-hot encodes 1-D targets itself
#     when the model is compiled with categorical_crossentropy.
#   * y_test must use the same zero-based encoding as the predictions,
#     otherwise the test accuracy compares off-by-one labels.
y_train = (train_df['label'] - 1).to_numpy()
y_test = (test_df['label'] - 1).to_numpy()

# +1 because Tokenizer word indices start at 1; index 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
def create_model(embedding_dim, lstm_units, lstm_dropout, learning_rate, num_neurons, num_layers):
    """Build and compile a stacked bidirectional-LSTM text classifier.

    Layer order: Embedding -> ``num_layers`` sequence-returning BiLSTM layers
    -> one final BiLSTM -> Dense(relu) -> Dense(5, softmax).

    Args:
        embedding_dim: Output dimension of the embedding layer.
        lstm_units: Units per LSTM direction, shared by every BiLSTM layer.
        lstm_dropout: Input dropout rate passed to every LSTM.
        learning_rate: Learning rate for the Adam optimizer.
        num_neurons: Width of the hidden Dense layer.
        num_layers: Number of *extra* BiLSTM layers stacked before the final
            one (0 gives a single BiLSTM).

    Returns:
        A compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).

    Note:
        Reads the module-level ``vocab_size`` for the embedding input size.
        The output layer is fixed at 5 classes, and the embedding is created
        without ``mask_zero``, so padded positions are not masked.
    """
    def bilstm(return_sequences):
        # One bidirectional LSTM layer with the shared width and dropout.
        return Bidirectional(
            LSTM(units=lstm_units, dropout=lstm_dropout, return_sequences=return_sequences)
        )

    stack = [Embedding(input_dim=vocab_size, output_dim=embedding_dim)]
    # Intermediate recurrent layers must emit full sequences for the next LSTM.
    stack.extend(bilstm(True) for _ in range(num_layers))
    # The last recurrent layer collapses the sequence to a single vector.
    stack.append(bilstm(False))
    stack.append(Dense(units=num_neurons, activation='relu'))
    stack.append(Dense(units=5, activation='softmax'))

    model = Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return model

In [None]:
# Hyperparameter search space: every key is an argument of create_model.
# The full Cartesian product is 2 * 2 * 2 * 3 * 3 * 2 = 144 combinations.
param_grid = dict(
    embedding_dim=[100, 200],
    lstm_units=[64, 128],
    lstm_dropout=[0.2, 0.3],
    learning_rate=[0.001, 0.01, 0.1],
    num_neurons=[64, 128, 256],
    num_layers=[0, 1],  # extra sequence-returning BiLSTM layers
)

In [None]:
# Wrap the Keras builder as a scikit-learn estimator; epochs and batch_size are
# fixed here, while the builder arguments come from the grid.
model = KerasClassifier(
    build_fn=create_model,
    verbose=1,
    epochs=10,
    batch_size=16,
)

# Exhaustive 5-fold cross-validated search over param_grid, ranked by accuracy.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameter combination and its mean cross-validated accuracy.
print("Best Hyperparameters: ", grid_search.best_params_, sep=" ")
print("Best Accuracy: ", grid_search.best_score_, sep=" ")

In [None]:
# Score the best estimator found by the grid search on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Fraction of test rows whose predicted class equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy: ", accuracy, sep=" ")