In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Location of the prepared modelling data, relative to this notebook.
DATA_PATH = '../4-prep_model_data/modelling_data.csv'

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    # Every later cell needs `df`. Report the path that was actually tried and
    # stop here, rather than continuing and failing later with a NameError.
    print(f"Error: '{DATA_PATH}' not found. Please check the path relative to this notebook.")
    raise
else:
    print("Data loaded successfully from 'modelling_data.csv'.")

Data loaded successfully from 'modelling_data.csv'.


In [None]:
# Drop rows that lack the target, the grouping key or the review text *before*
# deriving the modelling variables, so they are only ever built from clean rows.
# The index is reset so that row labels and row positions coincide; positional
# indices (such as those produced by scikit-learn splitters) then address the
# same rows as label-based lookups.
df = df.dropna(subset=['stars_x', 'business_id', 'text']).reset_index(drop=True)

# Target variable
y = df['stars_x']

# Features for the model (only 'text', for a classic text model)
X_text = df['text']

# Grouping key for the split: the business each review belongs to
groups = df['business_id']

In [None]:

# Stratified Group Split (80/20)
# With n_splits=5 each fold holds roughly 20% of the data; only the first fold
# is used, as the test set.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

try:
    # split() yields *positional* row indices (0..n-1), not index labels.
    train_index, test_index = next(sgkf.split(X_text, y, groups))
except ValueError as e:
    print(f"\nCRITICAL ERROR during split: {e}")
    print("This often means one of your groups (business_id) has only a single star rating,")
    print("or one star rating has too few samples to be represented in all splits.")
    from sklearn.model_selection import train_test_split
    train_labels, test_labels = train_test_split(df.index, test_size=0.2, stratify=y, random_state=42)
    # train_test_split on df.index returns index *labels*; convert them to
    # positions so both branches hand positional indices to the code below.
    # (get_indexer requires a unique index.)
    train_index = df.index.get_indexer(train_labels)
    test_index = df.index.get_indexer(test_labels)
    print("Falling back to standard stratified split (business separation compromised).")


# Positional selection: correct even when the index has gaps (e.g. after rows
# were dropped), where label-based .loc would pick the wrong rows or raise.
X_train, X_test = X_text.iloc[train_index], X_text.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Business ids of the test reviews, kept for the per-business error analysis.
business_ids_test = groups.iloc[test_index]

print(f"\nTotal Data Points: {len(df)}")
print(f"Training Set Size: {len(X_train)} ({len(X_train) / len(df) * 100:.1f}%)")
print(f"Testing Set Size: {len(X_test)} ({len(X_test) / len(df) * 100:.1f}%)")
print(f"Unique businesses in training set: {groups.iloc[train_index].nunique()}")
print(f"Unique businesses in testing set: {groups.iloc[test_index].nunique()}")


Total Data Points: 46457
Training Set Size: 37828 (81.4%)
Testing Set Size: 8629 (18.6%)
Unique businesses in training set: 570
Unique businesses in testing set: 134


In [None]:
# Text-classification pipeline: TF-IDF vectorisation feeding a linear-kernel SVM.
# Chaining both steps in a Pipeline means the vectoriser is fitted on the
# training texts only and reused unchanged at prediction time.

# Unigram + bigram TF-IDF features, English stop words removed, vocabulary
# capped at 10,000 terms for efficiency.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
)

# SVM classifier (SVC) used for the multi-class star-rating target.
svm_classifier = SVC(kernel='linear', C=1.0, random_state=42)

model_pipeline = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),
    ('svm', svm_classifier),
])

print("\nStarting model training...")
model_pipeline.fit(X_train, y_train)
print("Training complete!")



Starting model training...


In [None]:


# Score the held-out reviews with the fitted pipeline.
y_pred = model_pipeline.predict(X_test)

banner = "=" * 50
print("\n" + banner)
print("CLASSIFICATION REPORT ON TEST SET (Star Rating Prediction)")
print(banner)
print(classification_report(y_test, y_pred, zero_division=0))

# One row per test review: business, text, true and predicted rating, plus a
# boolean flag marking the reviews the model got wrong.
misclassification_df = pd.DataFrame({
    'business_id': business_ids_test.values,
    'Review_Text': X_test.values,
    'True_Star_Rating': y_test.values,
    'Predicted_Star_Rating': y_pred
}).assign(
    Is_Misclassified=lambda frame: frame['True_Star_Rating'] != frame['Predicted_Star_Rating']
)

print("\nMisclassification Analysis DataFrame created.")
print("You can now analyze this DataFrame to find businesses with poor model performance.")
print("Sample of Misclassified Reviews (True vs. Predicted):")
summary_columns = ['business_id', 'True_Star_Rating', 'Predicted_Star_Rating']
wrong_predictions = misclassification_df.loc[misclassification_df['Is_Misclassified'], summary_columns]
print(wrong_predictions.head())


CLASSIFICATION REPORT ON TEST SET (Star Rating Prediction)
              precision    recall  f1-score   support

           1       0.70      0.75      0.72      1137
           2       0.42      0.32      0.36       755
           3       0.45      0.34      0.39      1024
           4       0.48      0.47      0.48      2089
           5       0.73      0.81      0.77      3624

    accuracy                           0.62      8629
   macro avg       0.56      0.54      0.54      8629
weighted avg       0.61      0.62      0.61      8629


Misclassification Analysis DataFrame created.
You can now analyze this DataFrame to find businesses with poor model performance.
Sample of Misclassified Reviews (True vs. Predicted):
              business_id  True_Star_Rating  Predicted_Star_Rating
0  V7IHpr1xzFIf_jp876HoAw                 4                      3
1  V7IHpr1xzFIf_jp876HoAw                 5                      4
2  V7IHpr1xzFIf_jp876HoAw                 5                      4

In [None]:
# Reuses `misclassification_df` (one row per test review, with the boolean
# `Is_Misclassified` column) built in the evaluation cell above, instead of
# rebuilding an identical copy here.
# The mean of the boolean flag per business is the share of that business's
# test reviews the model got wrong; businesses are listed worst first.
misclassification_rate_per_business = (
    misclassification_df
    .groupby('business_id')['Is_Misclassified']
    .mean()
    .sort_values(ascending=False)
)

print("\nTop 5 Businesses by Misclassification Rate:")
print(misclassification_rate_per_business.head())


Top 5 Businesses by Misclassification Rate:
business_id
V7IHpr1xzFIf_jp876HoAw    1.00
k1hO7GVnNQLn8Ujx3wEW1Q    1.00
x4jqqs-Hr7YXYJEqXVTR8w    0.80
PbMrgxJDW_3hVXUFg7U95w    0.75
ybQem1jFkGPI1F-PEfEZBQ    0.75
Name: Is_Misclassified, dtype: float64
