In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Deep Learning specific imports
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPool1D, Dense, concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

# --- 1. Define Constants and Features ---
FILE_NAME = '../4-prep_model_data/modelling_data.csv'
TARGET_COL = 'stars_x'
GROUP_COL = 'business_id'
TEXT_COL = 'text'

# Feature lists
BOOLEAN_F = ['has_exclamation', 'has_question', 'is_shouting']
CATEGORICAL_F = ['food_sentiment', 'service_sentiment', 'atmosphere_sentiment', 'overall_sentiment']
NUMERICAL_F = ['grade_level']
ALL_INPUT_FEATURES = [TEXT_COL] + BOOLEAN_F + CATEGORICAL_F + NUMERICAL_F

# Deep Learning Hyperparameters
MAX_WORDS = 10000        # Max vocabulary size for the tokenizer
MAX_SEQ_LENGTH = 150     # Max length of a review (sentences longer than this are truncated)
EMBEDDING_DIM = 100      # Size of the word embedding vector
NUM_CLASSES = 5          # Number of star ratings (1 to 5)
FILTERS = 128            # Number of filters (feature detectors) for the CNN
KERNEL_SIZES = [3, 4, 5] # Size of the n-grams the CNN will look for

2025-11-13 11:25:45.592958: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# --- 2. Load and Prepare Data ---
try:
    df = pd.read_csv(FILE_NAME)
    print(f"Data loaded successfully from '{FILE_NAME}'.")
except FileNotFoundError:
    print(f"ERROR: File '{FILE_NAME}' not found. Please ensure the file is uploaded.")
    exit()

# Data Cleaning and Preparation
df.dropna(subset=[TARGET_COL, GROUP_COL, TEXT_COL], inplace=True)
df[CATEGORICAL_F] = df[CATEGORICAL_F].fillna('missing_category')
df[BOOLEAN_F] = df[BOOLEAN_F].fillna(False)
df[NUMERICAL_F] = df[NUMERICAL_F].fillna(df[NUMERICAL_F].mean()) 

for col in BOOLEAN_F:
    df[col] = df[col].astype(str)

# Map stars_x (1 to 5) to classes (0 to 4) for categorical cross-entropy loss
# Keras requires class indices starting from 0
df['class_label'] = df[TARGET_COL] - 1 

y = df['class_label'] 
X = df[ALL_INPUT_FEATURES]
groups = df[GROUP_COL] 

Data loaded successfully from '4-prep_model_data/modelling_data.csv'.


In [3]:
# --- 3. Stratified Group Split (80/20) ---
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

try:
    train_index, test_index = next(sgkf.split(X, y, groups))
except ValueError as e:
    from sklearn.model_selection import train_test_split
    print("\nWARNING: StratifiedGroupKFold failed. Falling back to standard stratified split.")
    # Use the class labels for stratification
    train_index, test_index = train_test_split(df.index, test_size=0.2, stratify=y, random_state=42)

X_train, X_test = X.loc[train_index], X.loc[test_index]
y_train, y_test = y.loc[train_index], y.loc[test_index]
business_ids_test = groups.loc[test_index] 

print(f"\nTraining Set Size: {len(X_train)} | Testing Set Size: {len(X_test)}")


Training Set Size: 37828 | Testing Set Size: 8629


In [4]:
# --- 4. Deep Learning Text Preprocessing (Tokenizer and Padding) ---
print("\nPerforming Deep Learning Text Preprocessing...")
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<unk>")
tokenizer.fit_on_texts(X_train[TEXT_COL])

# Convert text to sequences (integers)
X_train_seq = tokenizer.texts_to_sequences(X_train[TEXT_COL])
X_test_seq = tokenizer.texts_to_sequences(X_test[TEXT_COL])

# Pad sequences (make all reviews the same length)
X_train_padded = pad_sequences(X_train_seq, maxlen=MAX_SEQ_LENGTH, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=MAX_SEQ_LENGTH, padding='post', truncating='post')

# Convert target labels to one-hot encoding (required for multi-class classification in Keras)
Y_train_categorical = to_categorical(y_train, num_classes=NUM_CLASSES)
Y_test_categorical = to_categorical(y_test, num_classes=NUM_CLASSES)
VOCAB_SIZE = len(tokenizer.word_index) + 1 # Actual size of the vocabulary


Performing Deep Learning Text Preprocessing...


In [5]:
# --- 5. Preprocess Meta Features (One-Hot and Scaling) ---
# We use the same preprocessing setup as before for consistency
preprocessor_meta = ColumnTransformer(
    transformers=[
        ('cat_pipe', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_F + BOOLEAN_F),
        ('num_pipe', StandardScaler(), NUMERICAL_F)
    ],
    remainder='drop' 
)

# Fit and transform the meta features
X_train_meta = preprocessor_meta.fit_transform(X_train[BOOLEAN_F + CATEGORICAL_F + NUMERICAL_F])
X_test_meta = preprocessor_meta.transform(X_test[BOOLEAN_F + CATEGORICAL_F + NUMERICAL_F])
META_FEATURE_DIM = X_train_meta.shape[1]

In [12]:
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPool1D, Dense, concatenate, Dropout, SpatialDropout1D, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

text_input = Input(shape=(MAX_SEQ_LENGTH,), name='text_input')
x = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM,
              input_length=MAX_SEQ_LENGTH, weights=[embedding_matrix], trainable=False)(text_input)
x = SpatialDropout1D(0.2)(x)

conv_blocks = []
for k in KERNEL_SIZES:
    conv = Conv1D(filters=FILTERS, kernel_size=k, activation='relu')(x)
    conv = Conv1D(filters=FILTERS, kernel_size=k, activation='relu')(conv)
    pool = GlobalMaxPool1D()(conv)
    conv_blocks.append(pool)

cnn_output = concatenate(conv_blocks)
cnn_output = Dense(128, activation='relu')(cnn_output)
cnn_output = BatchNormalization()(cnn_output)
cnn_output = Dropout(0.5)(cnn_output)

meta_input = Input(shape=(META_FEATURE_DIM,), name='meta_input')
meta_output = Dense(16, activation='relu')(meta_input)

combined = concatenate([cnn_output, meta_output])
combined = Dense(64, activation='relu')(combined)
combined = Dropout(0.4)(combined)
final_output = Dense(NUM_CLASSES, activation='softmax')(combined)

model = Model(inputs=[text_input, meta_input], outputs=final_output)
optimizer = Adam(learning_rate=1e-4)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])


NameError: name 'embedding_matrix' is not defined

In [11]:
# --- 7. Model Training and Evaluation ---
print("\nStarting TextCNN Training...")
# Use a validation split to monitor for overfitting
history = model.fit(
    [X_train_padded, X_train_meta], 
    Y_train_categorical,
    epochs=10, 
    batch_size=32, 
    validation_split=0.1, # Monitor performance on 10% of the training data
    verbose=1
)
print("Training complete!")

# Evaluate on the test set
loss, accuracy = model.evaluate([X_test_padded, X_test_meta], Y_test_categorical, verbose=0)
print(f"\nTest Accuracy (TextCNN): {accuracy:.4f}")

# Generate predictions for the classification report and analysis
y_pred_proba = model.predict([X_test_padded, X_test_meta])
y_pred_labels = np.argmax(y_pred_proba, axis=1) # Convert one-hot back to single label (0 to 4)
y_true_labels = np.argmax(Y_test_categorical, axis=1) # True labels (0 to 4)

# Revert labels to original star rating (1 to 5) for the report
y_pred_stars = y_pred_labels + 1
y_true_stars = y_true_labels + 1

print("\n" + "="*50)
print("CLASSIFICATION REPORT (TextCNN: Text + Meta)")
print("="*50)
print(classification_report(y_true_stars, y_pred_stars, zero_division=0))


Starting TextCNN Training...
Epoch 1/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 65ms/step - accuracy: 0.4311 - loss: 1.3820 - val_accuracy: 0.4832 - val_loss: 1.1744
Epoch 2/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 64ms/step - accuracy: 0.5250 - loss: 1.0919 - val_accuracy: 0.6196 - val_loss: 0.9042
Epoch 3/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 64ms/step - accuracy: 0.5954 - loss: 0.9298 - val_accuracy: 0.6460 - val_loss: 0.8241
Epoch 4/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 63ms/step - accuracy: 0.6371 - loss: 0.8434 - val_accuracy: 0.6667 - val_loss: 0.7795
Epoch 5/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 65ms/step - accuracy: 0.6669 - loss: 0.7759 - val_accuracy: 0.6669 - val_loss: 0.7621
Epoch 6/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 61ms/step - accuracy: 0.6982 - loss: 0.7130 - val_accuracy: 

In [8]:
# --- 8. Misclassification Analysis Prep ---
misclassification_df_cnn = pd.DataFrame({
    'business_id': business_ids_test.values,
    'Review_Text': X_test[TEXT_COL].values,
    'True_Star_Rating': y_true_stars,
    'Predicted_Star_Rating': y_pred_stars
})

misclassification_df_cnn['Is_Misclassified'] = (misclassification_df_cnn['True_Star_Rating'] != misclassification_df_cnn['Predicted_Star_Rating'])

OUTPUT_FILE = 'misclassification_analysis_textcnn.csv'
misclassification_df_cnn.to_csv(OUTPUT_FILE, index=False)

print("\nMisclassification Analysis Complete for TextCNN.")
print(f"Results saved to '{OUTPUT_FILE}'.")
print("\nSample of Misclassified Reviews (True vs. Predicted):")
print(misclassification_df_cnn[misclassification_df_cnn['Is_Misclassified']][['business_id', 'True_Star_Rating', 'Predicted_Star_Rating']].head())


Misclassification Analysis Complete for TextCNN.
Results saved to 'misclassification_analysis_textcnn.csv'.

Sample of Misclassified Reviews (True vs. Predicted):
               business_id  True_Star_Rating  Predicted_Star_Rating
1   V7IHpr1xzFIf_jp876HoAw                 5                      4
3   V7IHpr1xzFIf_jp876HoAw                 3                      5
9   s9G06FPW74Prlp8s1h5nEA                 3                      2
15  41RbEZa99W2d_kTnYTp_mw                 5                      4
17  41RbEZa99W2d_kTnYTp_mw                 4                      5
