In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler


# --- 1. Define Constants and Features ---
FILE_NAME = '../4-prep_model_data/modelling_data.csv'
TARGET_COL = 'stars_x'
GROUP_COL = 'business_id'
TEXT_COL = 'text'

BOOLEAN_F = ['has_exclamation', 'has_question', 'is_shouting']
CATEGORICAL_F = ['food_sentiment', 'service_sentiment', 'atmosphere_sentiment', 'overall_sentiment']
NUMERICAL_F = ['grade_level']

# All features that will be used as input (X)
ALL_INPUT_FEATURES = [TEXT_COL] + BOOLEAN_F + CATEGORICAL_F + NUMERICAL_F

In [2]:
# --- 2. Load and Prepare Data ---
try:
    df = pd.read_csv(FILE_NAME)
    print(f"Data loaded successfully from '{FILE_NAME}'.")
except FileNotFoundError:
    print(f"ERROR: File '{FILE_NAME}' not found. Please ensure the file is uploaded.")
    # Exit the script if the file cannot be loaded
    exit()

# Data Cleaning and Preparation for Robust Modeling
df.dropna(subset=[TARGET_COL, GROUP_COL, TEXT_COL], inplace=True)
# Fill NaNs for categorical/boolean/numerical columns to prevent data loss in the remaining rows
df[CATEGORICAL_F] = df[CATEGORICAL_F].fillna('missing_category')
df[BOOLEAN_F] = df[BOOLEAN_F].fillna(False)
df[NUMERICAL_F] = df[NUMERICAL_F].fillna(df[NUMERICAL_F].mean()) # Fill numerical NaNs with the mean

# Define X, y, and groups after cleaning
y = df[TARGET_COL] 
X = df[ALL_INPUT_FEATURES]
groups = df[GROUP_COL] 

Data loaded successfully from '../4-prep_model_data/modelling_data.csv'.


In [3]:
# --- 3. Stratified Group Split (80/20) ---
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

try:
    # Get the indices for the 80/20 split, respecting star rating stratification and business grouping
    train_index, test_index = next(sgkf.split(X, y, groups))
except ValueError as e:
    # Fallback if a group or class is too small to stratify
    from sklearn.model_selection import train_test_split
    print("\nWARNING: StratifiedGroupKFold failed. Falling back to standard stratified split.")
    train_index, test_index = train_test_split(df.index, test_size=0.2, stratify=y, random_state=42)

# Apply the indices to create the training and testing sets
X_train, X_test = X.loc[train_index], X.loc[test_index]
y_train, y_test = y.loc[train_index], y.loc[test_index]
business_ids_test = groups.loc[test_index] 

print(f"\nTraining Set Size: {len(X_train)} | Testing Set Size: {len(X_test)}")
print(f"Test set unique businesses: {business_ids_test.nunique()}")


Training Set Size: 37828 | Testing Set Size: 8629
Test set unique businesses: 134


In [4]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import torch
from collections import Counter

[nltk_data] Downloading package punkt to C:\Users\Serena
[nltk_data]     Wong\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def nltk_tokenizer(text):
    tokens = word_tokenize(text)
    return [t.lower() for t in tokens if t.isalnum()]  # lowercase + remove punctuation

# Build vocabulary from training set
all_tokens = [token for text in X_train['text'] for token in nltk_tokenizer(text)]
token_counts = Counter(all_tokens)

vocab_size = 10000
most_common = token_counts.most_common(vocab_size-2)  # leave 0=<pad>, 1=<unk>
vocab = {word: i+2 for i, (word, _) in enumerate(most_common)}
vocab["<pad>"] = 0
vocab["<unk>"] = 1

In [6]:
import gensim.downloader as api
import numpy as np

# Load GloVe
glove_model = api.load("glove-wiki-gigaword-100")  # 100-dim embeddings
embed_dim = 100

# Build embedding matrix
embedding_matrix = np.zeros((vocab_size, embed_dim))

for word, idx in vocab.items():
    if word in glove_model:
        embedding_matrix[idx] = glove_model[word]
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embed_dim,))

embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

In [7]:
def text_to_sequence(text, vocab, maxlen=200):
    tokens = [t for t in nltk_tokenizer(text) if t.isalnum()]
    seq = [vocab.get(token, vocab["<unk>"]) for token in tokens]
    # Pad or truncate
    if len(seq) < maxlen:
        seq += [vocab["<pad>"]] * (maxlen - len(seq))
    else:
        seq = seq[:maxlen]
    return seq

X_train_seq = torch.tensor([text_to_sequence(t, vocab) for t in X_train['text']], dtype=torch.long)
X_test_seq = torch.tensor([text_to_sequence(t, vocab) for t in X_test['text']], dtype=torch.long)

In [8]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np

bool_cols = ["has_exclamation", "has_question", "is_shouting"]
sentiment_cols = ["food_sentiment", "service_sentiment", "atmosphere_sentiment", "overall_sentiment"]

numeric_col = ["grade_level"]


# Fit on training data
onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
scaler = StandardScaler()

X_train_cat = onehot.fit_transform(X_train[BOOLEAN_F + CATEGORICAL_F])
X_test_cat = onehot.transform(X_test[BOOLEAN_F + CATEGORICAL_F])

X_train_num = scaler.fit_transform(X_train[numeric_col])
X_test_num = scaler.transform(X_test[numeric_col])

# Combine
X_train_meta = np.hstack([X_train_cat, X_train_num])
X_test_meta = np.hstack([X_test_cat, X_test_num])

# Convert to tensors
X_train_meta = torch.tensor(X_train_meta, dtype=torch.float32)
X_test_meta = torch.tensor(X_test_meta, dtype=torch.float32)

In [9]:
import torch.nn as nn
import torch.nn.functional as F


class ReviewsCNN(nn.Module):
    def __init__(
        self, 
        vocab_size, 
        embed_dim, 
        embedding_matrix,
        meta_input_dim,
        meta_hidden_dim=64,
        num_filters=100, 
        kernel_sizes=[3,4,5], 
        text_hidden_dim=128,
        combined_hidden_dim=128,
        dropout=0.5,
        num_class=5
    ):
        super().__init__()

        # -- Embedding --
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding.weight = nn.Parameter(embedding_matrix)
        self.embedding.weight.requires_grad = True

        # -- CNN --
        self.convs = nn.ModuleList([
            nn.Conv1d(
                in_channels=embed_dim, 
                out_channels=num_filters, 
                kernel_size=k,
                padding=k//2
            )
            for k in kernel_sizes
        ])
        self.bns = nn.ModuleList([
            nn.BatchNorm1d(num_filters) for _ in kernel_sizes
        ])
        self.text_dropout = nn.Dropout(dropout)
        self.text_fc = nn.Linear(num_filters * len(kernel_sizes)*2, text_hidden_dim)

        # -- Meta MLP --
        self.meta_fc = nn.Sequential(
            nn.Linear(meta_input_dim, meta_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(meta_hidden_dim, meta_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        # -- Combined --
        self.combined_fc = nn.Sequential(
            nn.Linear(text_hidden_dim + meta_hidden_dim, combined_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(combined_hidden_dim, num_class)
        )

    def forward(self, text, meta):
        # -- Text CNN -- 
        text = self.embedding(text)
        text = text.permute(0, 2, 1)

        conv_outputs = [
            torch.relu(self.bns[i](self.convs[i](text)))
            for i in range(len(self.convs))
        ]
        pooled = [
                torch.cat([
                    torch.max(out, dim=2)[0],
                    torch.mean(out, dim=2)
                ], dim=1)
                for out in conv_outputs
            ]
        text_features = torch.cat(pooled, dim=1)
        text_features = self.text_dropout(text_features)
        text_hidden = self.text_fc(text_features)

        # -- Meta branch -- 
        meta_features = self.meta_fc(meta)

        # -- Combined --
        combined = torch.cat([text_hidden, meta_features], dim=1)

        # -- Final --
        out = self.combined_fc(combined)
        
        return out

In [None]:

vocab_size = len(vocab)
embed_dim = 100
hidden_dim = 256
meta_dim = X_train_meta.shape[1]
output_dim = 5

num_filters = 128
kernel_sizes = [3,5]

num_epochs = 50
batch_size = 32
accum_steps = 2
patience = 5
min_delta = 1e-4

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = ReviewsCNN(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    embedding_matrix=embedding_matrix,
    meta_input_dim=meta_dim,
    meta_hidden_dim=128,
    num_filters=num_filters,
    kernel_sizes=kernel_sizes,
    text_hidden_dim=hidden_dim,
    combined_hidden_dim=300,
    dropout=0.3,
    num_class=output_dim
).to(device)

# Load weights onto CPU
model.load_state_dict(torch.load("best_cnn_model.pth", map_location=torch.device('cpu')))
model.eval()  

device = torch.device("cpu")
model.to(device)

ReviewsCNN(
  (embedding): Embedding(10000, 100, padding_idx=0)
  (convs): ModuleList(
    (0): Conv1d(100, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): Conv1d(100, 128, kernel_size=(5,), stride=(1,), padding=(2,))
  )
  (bns): ModuleList(
    (0-1): 2 x BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (text_dropout): Dropout(p=0.3, inplace=False)
  (text_fc): Linear(in_features=512, out_features=256, bias=True)
  (meta_fc): Sequential(
    (0): Linear(in_features=18, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=128, out_features=128, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
  )
  (combined_fc): Sequential(
    (0): Linear(in_features=384, out_features=300, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=300, out_features=5, bias=True)
  )
)

In [18]:
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    def __init__(self, text_tensors, meta_features, labels):
        self.text_tensors = text_tensors
        self.meta_features = meta_features

        if isinstance(labels, (pd.Series, pd.DataFrame)):
            self.labels = torch.tensor(labels.values - 1, dtype=torch.long)
        else:
            self.labels = labels.long()
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        text = self.text_tensors[idx]
        meta = self.meta_features[idx]
        label = self.labels[idx]
        return text, meta, label


In [19]:
from torch.utils.data import DataLoader

# Shift labels 1-5 â†’ 0-4
y_train_tensor = torch.tensor(y_train.values - 1, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values - 1, dtype=torch.long)

# Create datasets
train_dataset = ReviewDataset(X_train_seq, X_train_meta, y_train_tensor)
test_dataset = ReviewDataset(X_test_seq, X_test_meta, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [20]:
model.eval()
y_pred = []
with torch.no_grad():
    for text_batch, meta_batch, _ in test_loader:
        text_batch = text_batch.to(device)
        meta_batch = meta_batch.to(device)
        outputs = model(text_batch, meta_batch)
        preds = torch.argmax(outputs, dim=1)
        y_pred.extend(preds.cpu().numpy())

# Keep both on 0-4 scale
y_pred = np.array(y_pred)
y_true = y_test_tensor.numpy()  # already 0-4


In [None]:
from sklearn.metrics import classification_report
# Create the final DataFrame for misclassification analysis
misclassification_df = pd.DataFrame({
    'business_id': business_ids_test.values,
    'Review_Text': X_test[TEXT_COL].values,
    'True_Star_Rating': y_test.values,
    'Predicted_Star_Rating': y_pred
})

misclassification_df['Is_Misclassified'] = (misclassification_df['True_Star_Rating'] != misclassification_df['Predicted_Star_Rating'])

# Save the DataFrame to a file
OUTPUT_FILE = 'misclassification_analysis_cnn.csv'
misclassification_df.to_csv(OUTPUT_FILE, index=False)

print("\nMisclassification Analysis Complete.")
print(f"Results saved to '{OUTPUT_FILE}' for your next step.")
print("\n--- Model Performance Summary ---")
print("Accuracy:", classification_report(y_true, y_pred, output_dict=True)['accuracy'])
print("Sample of Misclassified Reviews (True vs. Predicted):")
print(misclassification_df[misclassification_df['Is_Misclassified']][['business_id', 'True_Star_Rating', 'Predicted_Star_Rating']].head())


Misclassification Analysis Complete.
Results saved to 'misclassification_analysis_cnn.csv' for your next step.

--- Model Performance Summary ---
Accuracy: 0.6631127593000348
Sample of Misclassified Reviews (True vs. Predicted):
              business_id  True_Star_Rating  Predicted_Star_Rating
0  V7IHpr1xzFIf_jp876HoAw                 4                      3
1  V7IHpr1xzFIf_jp876HoAw                 5                      4
2  V7IHpr1xzFIf_jp876HoAw                 5                      3
4  s9G06FPW74Prlp8s1h5nEA                 5                      4
5  s9G06FPW74Prlp8s1h5nEA                 4                      3
