 # Error Analysis

In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [3]:
# Read the final test set into a DataFrame
test = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/data/new_test_final.csv',
)

In [4]:
# Read the final train set into a DataFrame
train = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/data/new_train_final.csv',
)

In [5]:
# Define X: remove the same three non-input columns from both splits
# (rating_category is the column the y variables are read from)
X_train, X_test = (
    split.drop(columns=['imdb_id', 'rating_category', 'cleaned_review'])
    for split in (train, test)
)

In [6]:
# Define y: the rating_category column of each split
y_train, y_test = (split['rating_category'] for split in (train, test))

In [7]:
from sklearn.preprocessing import LabelEncoder

# Learn the label-to-integer mapping from the training labels, then apply it
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)

In [8]:
# Encode y_test
# Reuse the encoder fitted on y_train so both splits share one
# label-to-integer mapping (the encoder is not re-fitted here)
y_test_encoded = le.transform(y_test)

In [9]:
import joblib

# Load the three persisted models, in the same order as the names they bind to.
# NOTE: joblib files are pickle-based, so only load model files you trust.
loaded_logistic_model, loaded_rf_model, loaded_xgb_model = map(
    joblib.load,
    (
        '/content/drive/MyDrive/data/logreg_vt_model.pkl',
        '/content/drive/MyDrive/data/rf_vt_model.pkl',
        '/content/drive/MyDrive/data/xgboost_vt_model.pkl',
    ),
)


In [10]:
# Sanity-check the loaded models on the test set.
# The logistic regression and random forest are scored against the raw
# labels (y_test); XGBoost is scored against the integer-encoded labels.
test_score_logistic = loaded_logistic_model.score(X_test, y_test)
test_score_rf = loaded_rf_model.score(X_test, y_test)
test_score_xgb = loaded_xgb_model.score(X_test, y_test_encoded)

# One print call, one line of output per model
print(
    f'Logistic Regression Test Accuracy: {test_score_logistic}',
    f'Random Forest Test Accuracy: {test_score_rf}',
    f'XGBoost Test Accuracy: {test_score_xgb}',
    sep='\n',
)


Logistic Regression Test Accuracy: 0.639601885804086
Random Forest Test Accuracy: 0.6225772655840754
XGBoost Test Accuracy: 0.6094814038763751


In [12]:
# Make predictions
y_pred_logreg = loaded_logistic_model.predict(X_test)

In [13]:
misclassified_indices = np.where(y_pred_logreg != y_test_encoded)[0]

In [14]:
# Inspect features
misclassified_samples = X_test.iloc[misclassified_indices]

In [16]:
# Check actual vs predicted
actual_labels = y_test_encoded[misclassified_indices]
predicted_labels = y_pred_logreg[misclassified_indices]

In [21]:
decoded_predicted_labels = le.inverse_transform(y_test_encoded)

In [17]:
# Create dataframe with actual and predicted labels
misclassified_df = pd.DataFrame({
    'Actual': actual_labels,
    'Predicted': predicted_labels
})


In [23]:
# Decode labels
decoded_predicted_labels = le.inverse_transform(y_test_encoded)

# Add to dataframe
misclassified_df['Predicted_Decoded'] = decoded_predicted_labels

In [24]:
# Display the actual-vs-predicted table built from the flagged test rows
misclassified_df

Unnamed: 0,Actual,Predicted,Predicted_Decoded
0,2,Good,Good
1,3,Poor,Poor
2,0,Average,Average
3,0,Average,Average
4,0,Average,Average
...,...,...,...
7631,2,Excellent,Good
7632,2,Excellent,Good
7633,2,Average,Good
7634,3,Poor,Poor


In [27]:
# Number of features to report. The heading below is built from it so the
# printed title and the number of rows shown cannot drift apart.
top_n = 20

# Extract the rows in X_test for misclassified samples
misclassified_samples = X_test.iloc[misclassified_indices]

# Sum every feature column over the misclassified rows.
# NOTE(review): columns that look like 0/1 indicators (language_*, genre_*)
# give a row count, while weighted columns (unigram_*) give a total weight,
# so the two groups are not on the same scale. Confirm before comparing them.
feature_sums = misclassified_samples.sum(axis=0)

# Sort so the largest totals come first
sorted_feature_sums = feature_sums.sort_values(ascending=False)

# Show the top_n features with the largest totals
print(f"Top {top_n} most frequent features in misclassified samples:")
print(sorted_feature_sums.head(top_n))

Top 10 most frequent features in misclassified samples:
language_en          5831.000000
genre_Drama          4120.000000
genre_Comedy         2443.000000
genre_Romance        1285.000000
genre_Action         1281.000000
genre_Crime          1270.000000
genre_Thriller       1001.000000
unigram_make          995.580326
genre_Horror          963.000000
unigram_good          943.283290
genre_Adventure       938.000000
unigram_time          727.503568
unigram_story         708.463377
unigram_character     707.180478
genre_Mystery         575.000000
unigram_great         560.436039
unigram_really        548.110402
unigram_scene         510.580939
unigram_think         493.689848
unigram_love          483.296195
dtype: float64
