In [None]:
from itertools import product

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from tensorflow.keras.backend import clear_session
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the review dataset and pull out the cleaned text and sentiment columns.
df = pd.read_parquet("../../datasets/amazon_user_reviews_text_sentiment.parquet")

texts = df["text_cleaned"].astype(str).values
labels = df["sentiment"].values

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# repeatable between runs.
split = train_test_split(texts, labels, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Tokenization
max_vocab_size = 20000
max_length = 100

# The vocabulary is fitted on the training texts only, so the test texts do
# not influence it.
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert both splits to integer id sequences with the same fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(subset) for subset in (X_train, X_test)
)

# Bring every sequence to exactly `max_length` ids, padding at the end.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_length, padding="post")
    for seqs in (X_train_seq, X_test_seq)
)

def create_rnn_model(rnn_units=128, dense_units=64, dropout_rate=0.2, learning_rate=0.001):
    """Build and compile the SimpleRNN sentiment classifier.

    The network is: embedding (128-dimensional, index 0 masked) ->
    SimpleRNN -> dropout -> ReLU dense layer -> 3-way softmax output.
    Vocabulary size and sequence length come from the module-level
    ``max_vocab_size`` and ``max_length``.

    Args:
        rnn_units: Number of units in the SimpleRNN layer.
        dense_units: Number of units in the hidden dense layer.
        dropout_rate: Rate passed to the dropout layer after the RNN.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model using sparse categorical
        cross-entropy loss and reporting accuracy.
    """
    embedding = Embedding(
        input_dim=max_vocab_size,
        output_dim=128,
        input_length=max_length,
        mask_zero=True,  # treat id 0 as padding to be masked downstream
    )
    stack = [
        embedding,
        SimpleRNN(rnn_units),
        Dropout(dropout_rate),
        Dense(dense_units, activation='relu'),
        Dense(3, activation='softmax'),
    ]
    model = Sequential(stack)

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return model

# Hyperparameter grid. Every combination is trained once
# (3 * 3 * 3 * 3 = 81 runs), so the search is exhaustive.
param_grid = {
    'rnn_units': [64, 128, 256],
    'dense_units': [32, 64, 128],
    'dropout_rate': [0, 0.2, 0.5],
    'learning_rate': [0.0005, 0.001, 0.01]
}

best_acc = 0
best_params = None
prog = 0  # stays 0 if the grid is empty

for prog, values in enumerate(product(*param_grid.values()), start=1):
    # Pair each value with its grid key and pass them as keywords, so the
    # mapping to create_rnn_model's parameters does not depend on the order
    # in which the grid keys happen to be written.
    params = dict(zip(param_grid, values))

    # Release the global Keras state left by the previous model; without this
    # memory use keeps growing as dozens of models are built in one process.
    # Clearing *before* the build keeps the last `model` usable after the loop.
    clear_session()

    model = create_rnn_model(**params)
    history = model.fit(X_train_pad, y_train, validation_split=0.2, epochs=5, batch_size=64, verbose=0)

    # Score a configuration by its best validation accuracy over the epochs.
    val_acc = max(history.history['val_accuracy'])
    print(prog)
    if val_acc > best_acc:
        best_acc = val_acc
        best_params = params


print("Best validation accuracy:", best_acc)
print("Best hyperparameters:", best_params)



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
Best validation accuracy: 0.6195651888847351
Best hyperparameters: {'rnn_units': 128, 'dense_units': 128, 'dropout_rate': 0.2, 'learning_rate': 0.0005}
