In [234]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score

## Load data

In [235]:
df = pd.read_csv('data/anime_df_preprocessed.csv', index_col=0)
df.head()

Unnamed: 0,rating,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,...,Yaoi,Yuri,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV,scaled_episodes,scaled_members
0,9.37,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0.0,0.197867
1,9.26,1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,1,0.034673,0.782769
2,9.25,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0.027518,0.112683
3,9.17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0.012658,0.664323
4,9.16,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0.027518,0.14918


In [237]:
X = df.drop('rating', axis=1)
y = df['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [238]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8901, 51), (2968, 51), (8901,), (2968,))

In [239]:
regressionResult = {}

## Linear Regression

In [240]:
linearRegression = LinearRegression()
linearRegression.fit(X_train, y_train)
cv_scores = cross_val_score(linearRegression, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
y_pred = linearRegression.predict(X_test)

In [241]:
print(f"Cross-Validation MSE Scores: {cv_scores}")
print(f"Mean Cross-Validation MSE: {cv_scores.mean():.4f}")
print(f"Standard Deviation of Cross-Validation MSE: {cv_scores.std():.4f}")

Cross-Validation MSE Scores: [-0.65388622 -0.67908963 -0.65883397 -0.66485128 -0.7014031 ]
Mean Cross-Validation MSE: -0.6716
Standard Deviation of Cross-Validation MSE: 0.0171


In [242]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
regressionResult['LinearRegression'] = {"mse": mse, "r2": r2}
print(f"Test Set Mean Squared Error: {mse:.4f}")
print(f"Test Set R-squared: {r2:.4f}")

Test Set Mean Squared Error: 0.6545
Test Set R-squared: 0.3660


## Ridge Regression

In [243]:
ridgeRegression = RidgeCV()
ridgeRegression.fit(X_train, y_train)
y_pred = ridgeRegression.predict(X_test)

In [244]:
print(f"Best alpha: {ridgeRegression.alpha_}")

Best alpha: 1.0


In [245]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
regressionResult['RidgeCV'] = {"mse": mse, "r2": r2}
print(f"Test Set Mean Squared Error: {mse:.4f}")
print(f"Test Set R-squared: {r2:.4f}")

Test Set Mean Squared Error: 0.6537
Test Set R-squared: 0.3668


## GradientBoostingRegressor

In [246]:
gbRegressor = GradientBoostingRegressor()
gbRegressor.fit(X_train, y_train)
y_pred = gbRegressor.predict(X_test)

In [247]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f"Test Set Mean Squared Error: {mse:.4f}")
print(f"Test Set R-squared: {r2:.4f}")
gbRegressor.get_params()

Test Set Mean Squared Error: 0.4169
Test Set R-squared: 0.5961


{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [None]:
gbRegressor2 = GradientBoostingRegressor()

param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 1.0],
    'max_depth': [3, 5, 7, 9],
}
randomSearch = RandomizedSearchCV(estimator=gbRegressor2, param_distributions=param_grid, cv=5, n_jobs=-1,scoring='neg_mean_squared_error')
randomSearch.fit(X_train, y_train)

In [None]:
print(f"Best hyperparameters: {randomSearch.best_params_}")

In [None]:
best_gb = randomSearch.best_estimator_
y_pred = best_gb.predict(X_test)

In [None]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

regressionResult['GradientBoostingRegressor'] = {"mse": mse, "r2": r2}

# Print evaluation metrics
print(f"Test Set Mean Squared Error: {mse:.4f}")
print(f"Test Set R-squared: {r2:.4f}")

## RandomForestRegressor

In [None]:
rfRegressor = RandomForestRegressor()

param_grid= {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'bootstrap': [True, False]
}

randomSearch = RandomizedSearchCV(estimator=rfRegressor, param_distributions=param_grid, cv=5, n_jobs=-1,scoring='neg_mean_squared_error')
randomSearch.fit(X_train, y_train)

In [None]:
print(f"Best hyperparameters: {randomSearch.best_params_}")

In [None]:
best_rf = randomSearch.best_estimator_
y_pred = best_rf.predict(X_test)

In [None]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

regressionResult['RandomForestRegressor'] = {"mse": mse, "r2": r2}

# Print evaluation metrics
print(f"Test Set Mean Squared Error: {mse:.4f}")
print(f"Test Set R-squared: {r2:.4f}")

## KNN

In [None]:
knn = KNeighborsRegressor()

param_grid= {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

randomSearch = RandomizedSearchCV(estimator=knn, param_distributions=param_grid, cv=5, n_jobs=-1,scoring='neg_mean_squared_error')
randomSearch.fit(X_train, y_train)

In [None]:
print(f"Best hyperparameters: {randomSearch.best_params_}")

In [None]:
best_knn = randomSearch.best_estimator_
y_pred = best_knn.predict(X_test)

In [None]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

regressionResult['KNeighborsRegressor'] = {"mse": mse, "r2": r2}

# Print evaluation metrics
print(f"Test Set Mean Squared Error: {mse:.4f}")
print(f"Test Set R-squared: {r2:.4f}")

## Results

In [None]:
pd.DataFrame(regressionResult).T

## Bin rating into low, mid, high

In [None]:
# Define bins and labels
bins = [0, 5, 8, 10]  # Define bin edges
labels = ['low', 'mid', 'high']  # Define bin labels

# Bin the data
df['binned_rating'] = pd.cut(df['rating'], bins=bins, labels=labels, include_lowest=True)

In [None]:
df.head()

In [None]:
X = df.drop(columns=['rating','binned_rating'])
y = df['binned_rating']

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## GradientBoostingClassifier

In [None]:
gbClassifier = GradientBoostingClassifier()
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 1.0],
    'max_depth': [3, 5, 7, 9],
}
randomSearch = RandomizedSearchCV(estimator=gbClassifier, param_distributions=param_grid, cv=5, n_jobs=-1)
randomSearch.fit(X_train, y_train)

In [None]:
print(f"Best hyperparameters: {randomSearch.best_params_}")

In [None]:
best_gb = randomSearch.best_estimator_
y_pred = best_gb.predict(X_test)

In [None]:
report = classification_report(y_test, y_pred)
print(f"Classification Report:\n{report}")
cm = confusion_matrix(y_test, y_pred)

In [None]:
sns.heatmap(cm/np.sum(cm), annot=True, 
            fmt='.2%', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

## RandomForestClassifier

In [None]:
rfClassifier = RandomForestClassifier()

param_grid= {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'bootstrap': [True, False]
}

randomSearch = RandomizedSearchCV(estimator=rfClassifier, param_distributions=param_grid, cv=5, n_jobs=-1)
randomSearch.fit(X_train, y_train)

In [None]:
print(f"Best hyperparameters: {randomSearch.best_params_}")

In [None]:
best_rf = randomSearch.best_estimator_
y_pred = best_rf.predict(X_test)

In [None]:
report = classification_report(y_test, y_pred)
print(f"Classification Report:\n{report}")
cm = confusion_matrix(y_test, y_pred)

In [None]:
sns.heatmap(cm/np.sum(cm), annot=True, 
            fmt='.2%', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()