In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler


# --- 1. Define Constants and Features ---
FILE_NAME = '../4-prep_model_data/modelling_data.csv'
TARGET_COL = 'stars_x'
GROUP_COL = 'business_id'
TEXT_COL = 'text'

# The additional features you requested to include
BOOLEAN_F = ['has_exclamation', 'has_question', 'is_shouting']
CATEGORICAL_F = ['food_sentiment', 'service_sentiment', 'atmosphere_sentiment', 'overall_sentiment']
NUMERICAL_F = ['grade_level']

# All features that will be used as input (X)
ALL_INPUT_FEATURES = [TEXT_COL] + BOOLEAN_F + CATEGORICAL_F + NUMERICAL_F

In [2]:
# --- 2. Load and Prepare Data ---
try:
    df = pd.read_csv(FILE_NAME)
    print(f"Data loaded successfully from '{FILE_NAME}'.")
except FileNotFoundError:
    print(f"ERROR: File '{FILE_NAME}' not found. Please ensure the file is uploaded.")
    # Exit the script if the file cannot be loaded
    exit()

# Data Cleaning and Preparation for Robust Modeling
df.dropna(subset=[TARGET_COL, GROUP_COL, TEXT_COL], inplace=True)
# Fill NaNs for categorical/boolean/numerical columns to prevent data loss in the remaining rows
df[CATEGORICAL_F] = df[CATEGORICAL_F].fillna('missing_category')
df[BOOLEAN_F] = df[BOOLEAN_F].fillna(False)
df[NUMERICAL_F] = df[NUMERICAL_F].fillna(df[NUMERICAL_F].mean()) # Fill numerical NaNs with the mean

# Define X, y, and groups after cleaning
y = df[TARGET_COL] 
X = df[ALL_INPUT_FEATURES]
groups = df[GROUP_COL] 

Data loaded successfully from '../4-prep_model_data/modelling_data.csv'.


In [3]:
# --- 3. Stratified Group Split (80/20) ---
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

try:
    # Get the indices for the 80/20 split, respecting star rating stratification and business grouping
    train_index, test_index = next(sgkf.split(X, y, groups))
except ValueError as e:
    # Fallback if a group or class is too small to stratify
    from sklearn.model_selection import train_test_split
    print("\nWARNING: StratifiedGroupKFold failed. Falling back to standard stratified split.")
    train_index, test_index = train_test_split(df.index, test_size=0.2, stratify=y, random_state=42)

# Apply the indices to create the training and testing sets
X_train, X_test = X.loc[train_index], X.loc[test_index]
y_train, y_test = y.loc[train_index], y.loc[test_index]
business_ids_test = groups.loc[test_index] 

print(f"\nTraining Set Size: {len(X_train)} | Testing Set Size: {len(X_test)}")
print(f"Test set unique businesses: {business_ids_test.nunique()}")


Training Set Size: 37828 | Testing Set Size: 8629
Test set unique businesses: 134


In [4]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import torch
from collections import Counter

[nltk_data] Downloading package punkt to C:\Users\Serena
[nltk_data]     Wong\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def nltk_tokenizer(text):
    tokens = word_tokenize(text)
    return [t.lower() for t in tokens if t.isalnum()]  # lowercase + remove punctuation

# Build vocabulary from training set
all_tokens = [token for text in X_train['text'] for token in nltk_tokenizer(text)]
token_counts = Counter(all_tokens)

vocab_size = 10000
most_common = token_counts.most_common(vocab_size-2)  # leave 0=<pad>, 1=<unk>
vocab = {word: i+2 for i, (word, _) in enumerate(most_common)}
vocab["<pad>"] = 0
vocab["<unk>"] = 1

In [6]:
import gensim.downloader as api
import numpy as np

# Load GloVe
glove_model = api.load("glove-wiki-gigaword-100")  # 100-dim embeddings
embed_dim = 100

# Build embedding matrix
embedding_matrix = np.zeros((vocab_size, embed_dim))

for word, idx in vocab.items():
    if word in glove_model:
        embedding_matrix[idx] = glove_model[word]
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embed_dim,))

embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)



In [7]:
def text_to_sequence(text, vocab, maxlen=200):
    tokens = [t for t in nltk_tokenizer(text) if t.isalnum()]
    seq = [vocab.get(token, vocab["<unk>"]) for token in tokens]
    # Pad or truncate
    if len(seq) < maxlen:
        seq += [vocab["<pad>"]] * (maxlen - len(seq))
    else:
        seq = seq[:maxlen]
    return seq

X_train_seq = torch.tensor([text_to_sequence(t, vocab) for t in X_train['text']], dtype=torch.long)
X_test_seq = torch.tensor([text_to_sequence(t, vocab) for t in X_test['text']], dtype=torch.long)

In [8]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np

bool_cols = ["has_exclamation", "has_question", "is_shouting"]
sentiment_cols = ["food_sentiment", "service_sentiment", "atmosphere_sentiment", "overall_sentiment"]

numeric_col = ["grade_level"]


# Fit on training data
onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
scaler = StandardScaler()

X_train_cat = onehot.fit_transform(X_train[BOOLEAN_F + CATEGORICAL_F])
X_test_cat = onehot.transform(X_test[BOOLEAN_F + CATEGORICAL_F])

X_train_num = scaler.fit_transform(X_train[numeric_col])
X_test_num = scaler.transform(X_test[numeric_col])

# Combine
X_train_meta = np.hstack([X_train_cat, X_train_num])
X_test_meta = np.hstack([X_test_cat, X_test_num])

# Convert to tensors
X_train_meta = torch.tensor(X_train_meta, dtype=torch.float32)
X_test_meta = torch.tensor(X_test_meta, dtype=torch.float32)

In [9]:
import torch.nn as nn
import torch.nn.functional as F

class ReviewLSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, meta_dim, lstm_hidden_dim=256, fc_hidden_dim=128, num_classes=5, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding.weight = nn.Parameter(embedding_matrix)
        self.embedding.weight.requires_grad = False  # or False to freeze

        self.lstm = nn.LSTM(embed_dim, lstm_hidden_dim, batch_first=True, bidirectional=True, num_layers=2, dropout=0.3)
        self.text_dropout = nn.Dropout(0.4)
        self.text_fc = nn.Sequential(
            nn.Linear(lstm_hidden_dim*2, fc_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        self.meta_fc = nn.Sequential(
            nn.Linear(meta_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 64),
            nn.ReLU()
        )

        self.combined_fc = nn.Sequential(
            nn.Linear(fc_hidden_dim + 64, fc_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(fc_hidden_dim, num_classes)
        )

    def forward(self, text, meta):
        embedded = self.embedding(text)
        _, (hidden, _) = self.lstm(embedded)
        backward_last = hidden[-1]  # last forward & backward
        forward_last = hidden[-2]
        hidden_cat = torch.cat([forward_last, backward_last], dim=1)
        hidden_cat = self.text_dropout(hidden_cat)
        text_feat = self.text_fc(hidden_cat)

        meta_feat = self.meta_fc(meta)

        combined = torch.cat([text_feat, meta_feat], dim=1)
        output = self.combined_fc(combined)
        return output

In [31]:
vocab_size = len(vocab)
embed_dim = 100
hidden_dim = 128
meta_dim = X_train_meta.shape[1]
output_dim = 5
num_epochs = 50
batch_size = 32       # smaller batch
accum_steps = 2       # gradient accumulation
patience = 5          # early stopping patience
min_delta = 1e-4  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = ReviewLSTMModel(vocab_size=vocab_size, 
                        embed_dim=embed_dim, 
                        meta_dim=meta_dim, 
                        lstm_hidden_dim=hidden_dim, 
                        fc_hidden_dim=256,
                        num_classes=output_dim)

# Load weights onto CPU
model.load_state_dict(torch.load("best_model.pth", map_location=torch.device('cpu')))
model.eval()  # Important: set to evaluation mode

# If you have meta/text tensors on CPU, you can do predictions now
device = torch.device("cpu")
model.to(device)


ReviewLSTMModel(
  (embedding): Embedding(10000, 100, padding_idx=0)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (text_dropout): Dropout(p=0.4, inplace=False)
  (text_fc): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
  )
  (meta_fc): Sequential(
    (0): Linear(in_features=18, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.1, inplace=False)
    (4): Linear(in_features=64, out_features=64, bias=True)
    (5): ReLU()
  )
  (combined_fc): Sequential(
    (0): Linear(in_features=320, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=256, out_features=5, bias=True)
  )
)

In [32]:
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    def __init__(self, text_tensors, meta_features, labels):
        self.text_tensors = text_tensors
        self.meta_features = meta_features

        if isinstance(labels, (pd.Series, pd.DataFrame)):
            self.labels = torch.tensor(labels.values - 1, dtype=torch.long)
        else:
            self.labels = labels.long()
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        text = self.text_tensors[idx]
        meta = self.meta_features[idx]
        label = self.labels[idx]
        return text, meta, label

In [33]:
from torch.utils.data import DataLoader

# Shift labels 1-5 â†’ 0-4
y_test_tensor = torch.tensor(y_test.values - 1, dtype=torch.long)

# Create datasets
test_dataset = ReviewDataset(X_test_seq, X_test_meta, y_test_tensor)

test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [34]:
model.eval()
y_pred = []
with torch.no_grad():
    for text_batch, meta_batch, _ in test_loader:
        text_batch = text_batch.to(device)
        meta_batch = meta_batch.to(device)
        outputs = model(text_batch, meta_batch)
        preds = torch.argmax(outputs, dim=1)
        y_pred.extend(preds.cpu().numpy())

# Keep both on 0-4 scale
y_pred = np.array(y_pred)
y_true = y_test_tensor.numpy()  # already 0-4



In [35]:
y_pred[:10]

array([2, 3, 3, 3, 4, 3, 4, 3, 3, 1])

In [37]:
y_true[:10]

array([3, 4, 4, 2, 4, 3, 3, 4, 4, 2])

In [40]:
from sklearn.metrics import classification_report
# Create the final DataFrame for misclassification analysis
misclassification_df = pd.DataFrame({
    'business_id': business_ids_test.values,
    'Review_Text': X_test[TEXT_COL].values,
    'True_Star_Rating': y_test.values,
    'Predicted_Star_Rating': y_pred
})

misclassification_df['Is_Misclassified'] = (misclassification_df['True_Star_Rating'] != misclassification_df['Predicted_Star_Rating'])

# Save the DataFrame to a file for your detailed analysis
OUTPUT_FILE = 'misclassification_analysis_lstm.csv'
misclassification_df.to_csv(OUTPUT_FILE, index=False)

print("\nMisclassification Analysis Complete.")
print(f"Results saved to '{OUTPUT_FILE}' for your next step.")
print("\n--- Model Performance Summary ---")
print("Accuracy:", classification_report(y_true, y_pred, output_dict=True)['accuracy'])
print("Sample of Misclassified Reviews (True vs. Predicted):")
print(misclassification_df[misclassification_df['Is_Misclassified']][['business_id', 'True_Star_Rating', 'Predicted_Star_Rating']].head())


Misclassification Analysis Complete.
Results saved to 'misclassification_analysis_lstm.csv' for your next step.

--- Model Performance Summary ---
Accuracy: 0.6307799281492641
Sample of Misclassified Reviews (True vs. Predicted):
              business_id  True_Star_Rating  Predicted_Star_Rating
0  V7IHpr1xzFIf_jp876HoAw                 4                      2
1  V7IHpr1xzFIf_jp876HoAw                 5                      3
2  V7IHpr1xzFIf_jp876HoAw                 5                      3
4  s9G06FPW74Prlp8s1h5nEA                 5                      4
5  s9G06FPW74Prlp8s1h5nEA                 4                      3
