In [2]:
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report

# --- 1. Define Constants and Features ---
# Location of the prepared modelling table and the roles of its key columns.
FILE_NAME = '../4-prep_model_data/modelling_data.csv'
TARGET_COL = 'stars_x'        # label to predict
GROUP_COL = 'business_id'     # grouping key used when splitting
TEXT_COL = 'text'             # raw review text

# Non-text features, grouped by how they are preprocessed downstream.
BOOLEAN_F = [
    'has_exclamation',
    'has_question',
    'is_shouting',
]
CATEGORICAL_F = [
    'food_sentiment',
    'service_sentiment',
    'atmosphere_sentiment',
    'overall_sentiment',
]
NUMERICAL_F = [
    'grade_level',
]

# Every column fed to the model (X): text first, then boolean, categorical, numerical.
ALL_INPUT_FEATURES = [TEXT_COL, *BOOLEAN_F, *CATEGORICAL_F, *NUMERICAL_F]


ModuleNotFoundError: No module named 'pandas'

In [2]:
# --- 2. Load and Prepare Data ---
# Reads the modelling table, drops unusable rows, fills remaining gaps, and
# exposes X (features), y (target) and groups (business ids) for later cells.
try:
    df = pd.read_csv(FILE_NAME)
except FileNotFoundError:
    print(f"ERROR: File '{FILE_NAME}' not found. Please ensure the file is uploaded.")
    # Re-raise rather than calling exit(): in a notebook exit() is aimed at the
    # kernel session, not at stopping this cell, whereas an exception halts the
    # run right here with a traceback.
    raise
else:
    print(f"Data loaded successfully from '{FILE_NAME}'.")

# Data Cleaning and Preparation for Robust Modeling
# Rows lacking a target, a group key or review text cannot be used at all.
# The index is reset so labels are a contiguous 0..n-1 range; the positional
# indices produced by scikit-learn splitters then coincide with the labels.
df = df.dropna(subset=[TARGET_COL, GROUP_COL, TEXT_COL]).reset_index(drop=True)

# Fill NaNs in the auxiliary columns so the remaining rows are all kept.
df[CATEGORICAL_F] = df[CATEGORICAL_F].fillna('missing_category')
# Missing flags become False, then everything is stringified so the
# One-Hot Encoder receives a uniform (string) category type.
df[BOOLEAN_F] = df[BOOLEAN_F].fillna(False).astype(str)
# NOTE(review): this mean is computed over all rows, i.e. before the
# train/test split, so test-set values contribute to the fill value.
# Consider imputing inside the model pipeline instead.
df[NUMERICAL_F] = df[NUMERICAL_F].fillna(df[NUMERICAL_F].mean())

# Define X, y, and groups after cleaning
y = df[TARGET_COL]
X = df[ALL_INPUT_FEATURES]
groups = df[GROUP_COL]

Data loaded successfully from 'modelling_data.csv'.


In [3]:

# --- 3. Stratified Group Split (80/20) ---
# Takes the first of 5 stratified, group-aware folds as the test set
# (roughly an 80/20 split) so that no business appears on both sides.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

try:
    # split() yields POSITIONAL row indices (0..n-1), not index labels.
    train_index, test_index = next(sgkf.split(X, y, groups))
except ValueError as split_error:
    # Fallback if a group or class is too small to stratify.
    # NOTE: this fallback stratifies on the target only and does not keep
    # businesses separated between train and test.
    print("\nWARNING: StratifiedGroupKFold failed. Falling back to standard stratified split.")
    print(f"Reason: {split_error}")
    # Split row positions (not df.index labels) so both code paths hand the
    # same kind of index to .iloc below.
    train_index, test_index = train_test_split(
        list(range(len(X))), test_size=0.2, stratify=y, random_state=42
    )

# Positional indices must be applied with .iloc; .loc would interpret them as
# labels and select wrong rows (or raise KeyError) on a non-contiguous index.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
business_ids_test = groups.iloc[test_index]

print(f"\nTraining Set Size: {len(X_train)} | Testing Set Size: {len(X_test)}")
print(f"Test set unique businesses: {business_ids_test.nunique()}")


Training Set Size: 37828 | Testing Set Size: 8629
Test set unique businesses: 134


In [4]:
# --- 4. Build the ColumnTransformer (The Feature Combiner) ---
# Each feature family gets its own transformer; the ColumnTransformer runs
# them side by side and concatenates their outputs into one feature matrix.

# Review text -> TF-IDF over unigrams and bigrams, English stop words removed,
# vocabulary capped at 10,000 terms.
text_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
)

# Categorical and boolean columns -> one-hot indicators; categories not seen
# during fit are ignored at transform time.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# Numerical columns -> standardised.
numeric_scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('text_pipe', text_vectorizer, TEXT_COL),
        ('cat_pipe', one_hot_encoder, CATEGORICAL_F + BOOLEAN_F),
        ('num_pipe', numeric_scaler, NUMERICAL_F),
    ],
    remainder='drop',  # any column not listed above is discarded
)

In [5]:
# --- 5. Build and Train the Full SVM Pipeline ---
# Preprocessing and the classifier are chained so that fit() learns the
# feature transforms and the SVM from the training rows in one call.
svm_classifier = SVC(kernel='linear', C=1.0, random_state=42)

model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('svm', svm_classifier),
    ]
)

print("\nStarting model training with ALL combined features (Text, Sentiment, Meta Data)...")
model_pipeline.fit(X_train, y_train)
print("Training complete!")


Starting model training with ALL combined features (Text, Sentiment, Meta Data)...
Training complete!


In [6]:
# --- 6. Evaluate and Prepare for Misclassification Analysis ---
# Predicts on the held-out rows, prints the per-class report, and writes one
# row per test review (true vs. predicted rating) to a CSV for inspection.
y_pred = model_pipeline.predict(X_test)

print("\n" + "="*50)
print("CLASSIFICATION REPORT (Combined Features: Text + Meta)")
print("="*50)
print(classification_report(y_test, y_pred, zero_division=0))

# Create the final DataFrame for misclassification analysis.
# .values strips the pandas index so all four columns align by position.
misclassification_df = pd.DataFrame({
    'business_id': business_ids_test.values,
    'Review_Text': X_test[TEXT_COL].values,
    'True_Star_Rating': y_test.values,
    'Predicted_Star_Rating': y_pred
})

misclassification_df['Is_Misclassified'] = (
    misclassification_df['True_Star_Rating'] != misclassification_df['Predicted_Star_Rating']
)

# Save the DataFrame to a file for detailed analysis
OUTPUT_FILE = 'misclassification_analysis_combined_features.csv'
misclassification_df.to_csv(OUTPUT_FILE, index=False)

print("\nMisclassification Analysis Complete.")
print(f"Results saved to '{OUTPUT_FILE}' for your next step.")
print("\n--- Model Performance Summary ---")
# Same zero_division setting as the printed report above, so this second call
# does not emit undefined-metric warnings when a class is never predicted.
report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
print("Accuracy:", report_dict['accuracy'])
print("Sample of Misclassified Reviews (True vs. Predicted):")
misclassified_rows = misclassification_df[misclassification_df['Is_Misclassified']]
print(misclassified_rows[['business_id', 'True_Star_Rating', 'Predicted_Star_Rating']].head())


CLASSIFICATION REPORT (Combined Features: Text + Meta)
              precision    recall  f1-score   support

           1       0.71      0.78      0.74      1137
           2       0.45      0.41      0.43       755
           3       0.50      0.35      0.41      1024
           4       0.51      0.48      0.49      2089
           5       0.75      0.83      0.79      3624

    accuracy                           0.64      8629
   macro avg       0.59      0.57      0.57      8629
weighted avg       0.63      0.64      0.63      8629


Misclassification Analysis Complete.
Results saved to 'misclassification_analysis_combined_features.csv' for your next step.

--- Model Performance Summary ---
Accuracy: 0.6448024104763008
Sample of Misclassified Reviews (True vs. Predicted):
              business_id  True_Star_Rating  Predicted_Star_Rating
0  V7IHpr1xzFIf_jp876HoAw                 4                      3
1  V7IHpr1xzFIf_jp876HoAw                 5                      4
2  V7IHpr1