In [None]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import Constant
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import class_weight
from sklearn.metrics import hamming_loss, f1_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt

In [None]:

# Load the dataset
# The full CSV is kept in `df_raw`; `df` is an independent copy holding only
# the label column and the review text used by the rest of the notebook.
df_raw = pd.read_csv('Reviews.csv')
needed_columns = ['Score', 'Text']
df = df_raw[needed_columns].copy()

# Preprocess text
def clean_text(text):
    """Normalize one review to lowercase ASCII letters and whitespace.

    Args:
        text: Raw review text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) are treated as an empty
            review.

    Returns:
        The text with every character other than ASCII letters and ASCII
        whitespace removed, lowercased, and stripped of leading/trailing
        whitespace.
    """
    if not isinstance(text, str):
        return ""
    # `flags` must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing the flag bits positionally would cap the
    # number of substitutions instead of configuring the pattern.
    text = re.sub(r"[^a-zA-Z\s]", "", text, flags=re.I | re.A)
    return text.lower().strip()

# Store the normalized review text as its own column on the working frame.
df['Clean_Text'] = df['Text'].apply(clean_text)

# Tokenization and padding
# `max_words` caps the tokenizer vocabulary; `max_len` is the fixed length
# every encoded review is padded or truncated to.
max_words, max_len = 20000, 200

tokenizer = Tokenizer(num_words=max_words)
clean_corpus = df['Clean_Text']
tokenizer.fit_on_texts(clean_corpus)
sequences = tokenizer.texts_to_sequences(clean_corpus)
x = pad_sequences(sequences, maxlen=max_len)

# One-hot encode labels
# `df[['Score']]` (double brackets) yields a 2-D array, which is the input
# shape OneHotEncoder expects.
y = df[['Score']].values
encoder = OneHotEncoder(sparse_output=False)  # Updated parameter
y_one_hot_encode = encoder.fit_transform(y)

# Train-test split
x_train, x_val, y_train, y_val = train_test_split(x, y_one_hot_encode, test_size=0.3, random_state=42)

# Handle class imbalance
# Recover integer class indices from the one-hot rows once and reuse them.
train_labels = y_train.argmax(axis=1)
present_classes = np.unique(train_labels)
class_weights = class_weight.compute_class_weight(
    'balanced', classes=present_classes, y=train_labels
)
# Key each weight by its actual class index rather than by its position in
# the weight array, so the mapping stays correct even if some class index
# does not occur in the training split.
class_weights_dict = {
    int(class_id): float(weight)
    for class_id, weight in zip(present_classes, class_weights)
}

# Load pretrained embeddings (GloVe example)
# Each line of the file is parsed as a token followed by its vector
# components; the components are stored as a float32 array keyed by token.
embedding_dim = 100
embedding_index = {}
with open("glove.6B.100d.txt", encoding="utf-8") as glove_file:
    for row in glove_file:
        token, *components = row.split()
        embedding_index[token] = np.array(components, dtype="float32")

# Create embedding matrix
# Row `rank` holds the pretrained vector of the word with tokenizer index
# `rank`. Words outside the `max_words` cap or missing from the pretrained
# table keep their all-zero row.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((max_words, embedding_dim))
for token, rank in word_index.items():
    if rank >= max_words:
        continue
    pretrained_vector = embedding_index.get(token)
    if pretrained_vector is None:
        continue
    embedding_matrix[rank] = pretrained_vector

# Build CNN + BiLSTM model
# Pipeline: embedding lookup -> 1D convolution -> dropout -> bidirectional
# LSTM -> dense layer -> dropout -> 5-way softmax.
model = Sequential([
    # Trainable embedding layer without pretrained weights.
    # `input_length` is deliberately not passed: Keras 3 deprecates the
    # argument (it only produces a warning), and the sequence length is taken
    # from the padded input when the model is first called.
    # NOTE(review): `embedding_matrix` is not used by this layer. If GloVe
    # initialisation is wanted, pass
    # `embeddings_initializer=Constant(embedding_matrix)` here.
    Embedding(input_dim=max_words,
              output_dim=embedding_dim,
              trainable=True),
    Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer='l2'),
    Dropout(0.5),
    # return_sequences=False: only the final LSTM output feeds the dense head.
    Bidirectional(LSTM(64, return_sequences=False)),
    Dense(128, activation='relu', kernel_regularizer='l2'),
    Dropout(0.5),
    Dense(5, activation='softmax')  # For multiclass classification
])

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model with early stopping
# Training monitors validation loss with a patience of 3 epochs and restores
# the best weights seen when it stops.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

fit_options = dict(
    epochs=10,
    batch_size=512,
    validation_data=(x_val, y_val),
    class_weight=class_weights_dict,
    callbacks=[early_stopping],
)
history = model.fit(x_train, y_train, **fit_options)

# Evaluate the model
# Collapse predicted probabilities and one-hot targets to class indices.
y_pred_prob = model.predict(x_val)
y_true = np.argmax(y_val, axis=1)
y_pred = np.argmax(y_pred_prob, axis=1)

# Calculate metrics
# F1, precision and recall are macro-averaged across classes.
macro_average = {'average': 'macro'}
conf_matrix = confusion_matrix(y_true, y_pred)
hamming = hamming_loss(y_true, y_pred)
precision = precision_score(y_true, y_pred, **macro_average)
recall = recall_score(y_true, y_pred, **macro_average)
f1 = f1_score(y_true, y_pred, **macro_average)

print(f"Hamming Loss: {hamming}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print("Confusion Matrix:")
print(conf_matrix)

# Visualize misclassified samples
# Show the first five validation reviews the model got wrong. Class indices
# are mapped back through the fitted encoder so the printed labels are the
# original `Score` values rather than internal one-hot column positions.
score_labels = encoder.categories_[0]
misclassified_idx = np.where(y_true != y_pred)[0][:5]
for idx in misclassified_idx:
    # sequences_to_texts returns one space-joined string per sequence; ids
    # with no vocabulary entry (such as the 0 used for padding) are dropped
    # because the tokenizer was created without an OOV token.
    raw_sentence = tokenizer.sequences_to_texts([x_val[idx]])[0]
    predicted_score = score_labels[y_pred[idx]]
    actual_score = score_labels[y_true[idx]]
    print(f"Predicted: {predicted_score}, Actual: {actual_score}, Sentence: {raw_sentence}")

# Plot training vs validation metrics
# One figure per metric; each row lists the history keys for the training and
# validation curves, their legend labels, the figure title and the y-axis label.
curve_specs = [
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy',
     'Accuracy Over Epochs', 'Accuracy'),
    ('loss', 'val_loss', 'Train Loss', 'Validation Loss',
     'Loss Over Epochs', 'Loss'),
]
for train_key, val_key, train_label, val_label, title, y_label in curve_specs:
    plt.figure(figsize=(10, 6))
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()




Epoch 1/10
[1m778/778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1028s[0m 1s/step - accuracy: 0.4237 - loss: 1.8050 - val_accuracy: 0.5774 - val_loss: 1.1030
Epoch 2/10
[1m778/778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m803s[0m 1s/step - accuracy: 0.6453 - loss: 1.1030 - val_accuracy: 0.6848 - val_loss: 0.8816
Epoch 3/10
[1m778/778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m743s[0m 955ms/step - accuracy: 0.6623 - loss: 1.0402 - val_accuracy: 0.6191 - val_loss: 0.9974
Epoch 4/10
[1m778/778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m762s[0m 979ms/step - accuracy: 0.6749 - loss: 0.9935 - val_accuracy: 0.5681 - val_loss: 1.0992
Epoch 5/10
[1m778/778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m761s[0m 978ms/step - accuracy: 0.6798 - loss: 0.9600 - val_accuracy: 0.6278 - val_loss: 0.9516
[1m5017/5330[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m9s[0m 29ms/step