In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedGroupKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# --- Notebook configuration -------------------------------------------------
# Input CSV (path is relative to this notebook's working directory).
FILE_NAME = '../4-prep_model_data/modelling_data.csv'

# Column roles: prediction target, grouping key for the split, raw text.
TARGET_COL = 'stars_x'
GROUP_COL = 'business_id'
TEXT_COL = 'text'

# Non-text feature columns, bucketed by how they are preprocessed.
BOOLEAN_F = [
    'has_exclamation',
    'has_question',
    'is_shouting',
]
CATEGORICAL_F = [
    'food_sentiment',
    'service_sentiment',
    'atmosphere_sentiment',
    'overall_sentiment',
]
NUMERICAL_F = ['grade_level']

# Every model input, in the order: text, boolean, categorical, numerical.
ALL_INPUT_FEATURES = [TEXT_COL, *BOOLEAN_F, *CATEGORICAL_F, *NUMERICAL_F]

In [None]:
# Load the modelling table. A missing file is re-raised with a helpful message
# rather than calling exit(), which would shut down the notebook kernel and
# hide the failure from "Run All".
try:
    df_raw = pd.read_csv(FILE_NAME)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"File '{FILE_NAME}' not found. Please ensure the file is uploaded."
    ) from err
print(f"Data loaded successfully from '{FILE_NAME}'.")

# Rows without a target, a group id or review text cannot be used at all.
# Building a new frame (instead of inplace=True) keeps this cell re-runnable,
# and resetting to a contiguous RangeIndex makes positional indices and index
# labels coincide again after rows have been dropped.
df = (
    df_raw
    .dropna(subset=[TARGET_COL, GROUP_COL, TEXT_COL])
    .reset_index(drop=True)
)

# Fill the remaining gaps per feature family.
df[CATEGORICAL_F] = df[CATEGORICAL_F].fillna('missing_category')
df[BOOLEAN_F] = df[BOOLEAN_F].fillna(False)
# NOTE(review): this mean is computed over all rows, i.e. before the
# train/test split, so test rows influence the imputed value. Consider moving
# the imputation into the model pipeline.
df[NUMERICAL_F] = df[NUMERICAL_F].fillna(df[NUMERICAL_F].mean())

# Boolean flags are one-hot encoded together with the categorical columns, so
# they are converted to strings to give the encoder a uniform dtype.
df[BOOLEAN_F] = df[BOOLEAN_F].astype(str)

y = df[TARGET_COL]
X = df[ALL_INPUT_FEATURES]
groups = df[GROUP_COL]

Data loaded successfully from '../4-prep_model_data/modelling_data.csv'.


In [None]:
# Hold out one of five folds as the test set. Folds are stratified on the
# target and grouped on business id, so all reviews of one business land on
# the same side of the split.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

try:
    # split() yields *positional* row indices, not index labels.
    train_index, test_index = next(sgkf.split(X, y, groups))
except ValueError as e:
    print("\nWARNING: StratifiedGroupKFold failed. Falling back to standard stratified split.")
    print(f"Reason: {e}")
    # The fallback ignores `groups`, so reviews of the same business can end
    # up in both train and test. Positions (not labels) are split so that both
    # branches hand the same kind of index to .iloc below.
    train_index, test_index = train_test_split(
        list(range(len(X))), test_size=0.2, stratify=y, random_state=42
    )

# Positional selection: using .loc here would pick wrong rows (or raise
# KeyError) whenever the frame's index is not a contiguous 0..n-1 range.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
business_ids_test = groups.iloc[test_index]

# Every row must land in exactly one of the two partitions.
assert len(X_train) + len(X_test) == len(X)

print(f"\nTraining Set Size: {len(X_train)} | Testing Set Size: {len(X_test)}")


Training Set Size: 37828 | Testing Set Size: 8629


In [None]:

# One transformer per feature family; they are combined column-wise below.

# Review text -> TF-IDF over unigrams and bigrams, English stop words removed,
# vocabulary capped at 10,000 terms.
tfidf_text = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
)

# Categorical and (stringified) boolean columns -> one-hot indicators.
# Categories not seen during fit are ignored at transform time.
one_hot = OneHotEncoder(handle_unknown='ignore')

# Numerical columns -> standardized values.
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        # TEXT_COL is passed as a bare string (not a list) so the vectorizer
        # receives a 1-D sequence of documents.
        ('text_pipe', tfidf_text, TEXT_COL),
        ('cat_pipe', one_hot, CATEGORICAL_F + BOOLEAN_F),
        ('num_pipe', scaler, NUMERICAL_F),
    ],
    # Columns not listed above are discarded.
    remainder='drop',
)

In [None]:

# Preprocessing + multinomial logistic regression in a single pipeline, so the
# TF-IDF vocabulary, one-hot categories and scaler statistics are learned from
# the training rows only.
#
# - C is the inverse of the regularization strength.
# - max_iter is raised above the default to give saga room to converge.
# - `multi_class` is intentionally not passed: it is deprecated in recent
#   scikit-learn releases, and the default already fits a multinomial model
#   for the saga solver when there are more than two classes.
# - `n_jobs` is intentionally not passed: it only parallelizes one-vs-rest
#   fits and has no effect on a multinomial fit.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lr', LogisticRegression(
        C=1.0,
        max_iter=1000,
        solver='saga',
        random_state=42,
    )),
])

print("\nStarting model training with Logistic Regression (Combined Features)...")
model_pipeline.fit(X_train, y_train)
print("Training complete!")


Starting model training with Logistic Regression (Combined Features)...




Training complete!


In [None]:


# Score the held-out fold.
y_pred = model_pipeline.predict(X_test)

print("\n" + "="*50)
print("CLASSIFICATION REPORT (Logistic Regression: Text + Meta)")
print("="*50)
# zero_division=0 reports 0 (instead of warning) for a class that is never predicted.
print(classification_report(y_test, y_pred, zero_division=0))

# One row per test review: which business, the text, the true and the
# predicted rating. .values drops the pandas index so the columns line up
# positionally with y_pred.
misclassification_df_lr = pd.DataFrame({
    'business_id': business_ids_test.values,
    'Review_Text': X_test[TEXT_COL].values,
    'True_Star_Rating': y_test.values,
    'Predicted_Star_Rating': y_pred
})

misclassification_df_lr['Is_Misclassified'] = (
    misclassification_df_lr['True_Star_Rating']
    != misclassification_df_lr['Predicted_Star_Rating']
)

OUTPUT_FILE = 'misclassification_analysis_logistic_regression.csv'
misclassification_df_lr.to_csv(OUTPUT_FILE, index=False)

print("\nMisclassification Analysis Complete for Logistic Regression.")
print(f"Results saved to '{OUTPUT_FILE}'.")
# Accuracy is computed directly instead of rebuilding the whole classification
# report a second time just to read one number from it.
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("Sample of Misclassified Reviews:")
is_wrong = misclassification_df_lr['Is_Misclassified']
sample_columns = ['business_id', 'True_Star_Rating', 'Predicted_Star_Rating']
print(misclassification_df_lr.loc[is_wrong, sample_columns].head())


CLASSIFICATION REPORT (Logistic Regression: Text + Meta)
              precision    recall  f1-score   support

           1       0.70      0.80      0.75      1137
           2       0.45      0.36      0.40       755
           3       0.49      0.30      0.37      1024
           4       0.51      0.47      0.49      2089
           5       0.74      0.84      0.79      3624

    accuracy                           0.64      8629
   macro avg       0.58      0.56      0.56      8629
weighted avg       0.62      0.64      0.63      8629


Misclassification Analysis Complete for Logistic Regression.
Results saved to 'misclassification_analysis_logistic_regression.csv'.

Accuracy: 0.6429481979371886
Sample of Misclassified Reviews:
              business_id  True_Star_Rating  Predicted_Star_Rating
0  V7IHpr1xzFIf_jp876HoAw                 4                      3
1  V7IHpr1xzFIf_jp876HoAw                 5                      4
2  V7IHpr1xzFIf_jp876HoAw                 5             