# DSTI ML Project A22 Cohort: Book Rating Predictor
The aim of the project is to train and evaluate different models that predict a book’s rating from a GoodReads dataset

In [9]:
# Ensure to select the .venv-book kernel before importing
# Kernel > Change kernel > .venv-book

import os
import pickle
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LassoLars, ElasticNet, Ridge

## 3) Model Selection

### 3.1) Variation of Feature Sets

In [4]:
df_prepro = pd.read_csv("../data/books_preprocessed.csv", on_bad_lines='skip')
# As seen previously, in all case we will drop the following columns
print(f"Dataset contains: {df_prepro.shape[0]} rows and {df_prepro.shape[1]} columns")

corr = abs(df_prepro.corr())
corr2 = pd.DataFrame(corr.iloc[1:-1, 0])
corr2.style.background_gradient(cmap='coolwarm')

Dataset contains: 11121 rows and 18 columns


Unnamed: 0,average_rating
bookID,0.036259
isbn13,0.001967
num_pages,0.15043
num_pages_p_10,0.103904
ratings_count,0.038228
ratings_count_p_10,0.115798
text_reviews_count,0.033669
text_reviews_count_p_10,0.069308
publication_date,0.030897
publication_year,0.03173


According to the above table 
- ```num_pages```
- ```ratings_count_p_10```
- ```title_len```
- ```text_reviews_count_p_10``` 
- ```author_2```

have the highest correlation with the target.

However for the "engineered" features, we have lost some information about the dataset, hence higher correlation with the target will not necessarily translate into producing a better model, thats why we will have this experimental approach.

In [5]:
df1 = df_prepro.drop(columns=["isbn13", "num_authors", "title", "publication_year"])
df2 = df_prepro.drop(columns=["isbn13", "num_authors", "title", "publication_year", "bookID"])
df3 = df_prepro.drop(columns=["isbn13", "num_authors", "title", "publication_year", "bookID", "ratings_count_p_10"])
df4 = df_prepro.drop(columns=["isbn13", "num_authors", "title", "publication_year", "bookID", "ratings_count_p_10", "text_reviews_count_p_10"])
df5 = df_prepro.drop(columns=["isbn13", "num_authors", "title", "publication_year", "bookID", "ratings_count_p_10", "text_reviews_count_p_10", "title_len"])

df_list = [df1, df2, df3, df4, df5]

### 3.2) Data split

In [6]:
def data_split(
    df: pd.DataFrame, 
    targets: str, 
    test_size: float = 0.2, 
    scaler = None
) -> tuple:
    features = []
    for var in df.columns:
        if var != targets:
            features.append(var)

    x, y = df[features], df[targets]
    if scaler:
        x = scaler.fit_transform(x)
    # split 80% of the data to the training set and 20% of the data to test set 
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=999)

    return x_train, x_test, y_train, y_test

### 3.3) Model Training & Evaluation

Because we are trying to predict a quantity (```average_rating```), our problem is a **Regression** problem, hence we will apply the different regression models from the sklearn package and compare the performance. The models will be:
- LinearRegression (Linear)
- LassoLars (Linear)
- ElasticNet (Linear)
- Ridge (Linear)
- RandomForestRegressor (Ensemble)
- SVR linear or rbf (SVM)

For evaluation we will use the following metrics:
- Mean Absolute Error (MAE): We want to **minimize**
- Root Mean Square Error (RMSE): We want to **minimize**
- R2: We want to **maximize**

<u>Note</u>:
- *MAE* and *RMSE* measure how accurate the predictions are according to the ground truth
- *R2* measures how much of the features variation is explained by the target in the model.

In [7]:
def model_comparator(train_test_data: tuple, models_dict: dict) -> pd.DataFrame:
    eval_table = {}
    models = {}
    x_train, x_test = train_test_data[0], train_test_data[1]
    y_train, y_test = train_test_data[2], train_test_data[3]
    
    for model_name, model in models_dict.items():
        model.fit(x_train, y_train)
        models[model_name] = model
        predictions = model.predict(x_test)

        mae = metrics.mean_absolute_error(y_test, predictions)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
        r2 = metrics.r2_score(y_test, predictions)

        eval_table[model_name] = [mae, rmse, r2]
    
    eval_df = pd.DataFrame.from_dict(eval_table, orient="index", columns=["MAE (min)", "RMSE (min)", "R2 (max)"])
    
    return models, eval_df

In [30]:
# Define models
regression_models = {
    "LinearRegression": LinearRegression(),
    "LassoLars": LassoLars(alpha=0.8),
    "ElasticNet": ElasticNet(alpha=0.8),
    "Ridge": Ridge(alpha=0.5),
    "RandomForest": RandomForestRegressor(),
    "SVR_rbf": SVR(kernel="rbf"),
    # "SVR_linear": SVR(kernel="linear"), # /!\ Takes too long to fit   
}
# Setup random seed
np.random.seed(42)
# Split data
train_test_data = data_split(df=df_prepro, targets="average_rating", test_size=0.2, scaler=StandardScaler())
# Train & evaluate models
trained_models, eval_df = model_comparator(train_test_data, regression_models)
# Apply heatmap for EACH column 
eval_df.style.background_gradient(cmap='coolwarm',axis=0)

Unnamed: 0,MAE (min),RMSE (min),R2 (max)
LinearRegression,0.232033,0.387602,0.063756
LassoLars,0.244431,0.401092,-0.002549
ElasticNet,0.244431,0.401092,-0.002549
Ridge,0.23202,0.387612,0.063705
RandomForest,0.217063,0.362639,0.180465
SVR_rbf,0.223324,0.378858,0.105518


We clearly see that the `RandomForestRegressor()` is the best model when training over the full dataset as it produces the minimum *MAE* and *RMSE* and the maximum *R2*. 

We will now check, which feature selection gives the best performance.

### 3.4) Feature Selection for model

In [10]:
# Define models
result = pd.DataFrame(columns=["MAE (min)", "RMSE (min)", "R2 (max)"])

for i, df in enumerate(df_list):
    regression_models = {
        f"RandomForest_df{i+1}": RandomForestRegressor(),
    }
    # Setup random seed
    np.random.seed(42)
    # Split data
    train_test_data = data_split(df=df, targets="average_rating", test_size=0.2, scaler=StandardScaler())
    # Train & evaluate models
    trained_model, eval_df = model_comparator(train_test_data, regression_models)
    result = pd.concat([result, eval_df], axis=0)
    result = result.astype(float)
# Apply heatmap for EACH column 
result.style.background_gradient(cmap='coolwarm',axis=0)

Unnamed: 0,MAE (min),RMSE (min),R2 (max)
RandomForest_df1,0.217028,0.358669,0.198312
RandomForest_df2,0.218908,0.356993,0.205788
RandomForest_df3,0.218598,0.354323,0.217625
RandomForest_df4,0.218591,0.355697,0.211545
RandomForest_df5,0.225199,0.36335,0.177248


We can see that the **dataset_3** gives the best performance.

Up to now we have used the default parameters of the ```RandomForestRegressor()``` with ```n_estimators``` = 100.

We will try to vary this parameter to increase the performance in the following using dataset_3:

### 3.5) Model Finetuning

In [11]:
# /!\ The cell takes about 8 mins to run
# Define models
regression_models = {
    'RandomForest_50': RandomForestRegressor(n_estimators=50),
    'RandomForest_100': RandomForestRegressor(n_estimators=100),
    'RandomForest_500': RandomForestRegressor(n_estimators=500),
    'RandomForest_1000': RandomForestRegressor(n_estimators=1000),
}
# Setup random seed
np.random.seed(42)
# Split data
train_test_data = data_split(df=df3, targets="average_rating", test_size=0.2, scaler=StandardScaler())
# Train & evaluate models
trained_models, eval_df = model_comparator(train_test_data, regression_models)

# Apply heatmap for EACH column 
eval_df.style.background_gradient(cmap='coolwarm',axis=0)

Unnamed: 0,MAE (min),RMSE (min),R2 (max)
RandomForest_50,0.2193,0.35855,0.198844
RandomForest_100,0.219362,0.35265,0.224994
RandomForest_500,0.218663,0.354661,0.216127
RandomForest_1000,0.21849,0.355925,0.210531


We can see that both models with ```n_estimators``` = 100 and 500 have the best balanced performances. Hence we will save them.

In [19]:
# Save the model to disk
modelpkl = '../models/book_ratings_RDFR_100_df3.pt'
if not os.path.isfile(modelpkl):
    pickle.dump(trained_models["RandomForest_100"], open(modelpkl, 'wb'))

We can see from the above table that the RandomForest with ```n_estimators``` **100**, **500** and **1000** have the best balanced performance.\
However, there are other parameters in the RandomForest that can be finetuned to select our final model, and for this we will do a gridsearch.

### 3.6) Model Finetuning through GridSearch

In [13]:
# /!\ 1h30 to run this cell
# Define the search space of parameters
param_grid = {
    'n_estimators': [100, 500, 1000],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3],
    'bootstrap': [True, False],
}
# Define the grid 
model_grid = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=3, verbose=2, n_jobs=4)
# Setup random seed
np.random.seed(42)
# Split data
x_train, x_test, y_train, y_test = data_split(df=df3, targets="average_rating", test_size=0.2,  scaler=StandardScaler())
# Fit df3
model_grid.fit(x_train, y_train)
pd.DataFrame.from_dict(model_grid.best_params_, orient="index").T

Fitting 3 folds for each of 54 candidates, totalling 162 fits


Unnamed: 0,bootstrap,min_samples_leaf,min_samples_split,n_estimators
0,True,3,6,500


In [14]:
# Evaluate "Best" Model from GridSearch
predictions = model_grid.predict(x_test)
r2 = metrics.r2_score(y_test, predictions)
mae = metrics.mean_absolute_error(y_test, predictions)
rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))

best_perf = pd.DataFrame([mae, rmse, r2], index=["MAE (min)", "RMSE (min)", "R2 (max)"], columns=["RandomForest_Grid"]).T

if "RandomForest_Grid" not in eval_df.index:
    eval_df = pd.concat([eval_df, best_perf], axis=0)
# Apply heatmap across EACH column
eval_df.style.background_gradient(cmap='coolwarm',axis=0)

Unnamed: 0,MAE (min),RMSE (min),R2 (max)
RandomForest_50,0.2193,0.35855,0.198844
RandomForest_100,0.219362,0.35265,0.224994
RandomForest_500,0.218663,0.354661,0.216127
RandomForest_1000,0.21849,0.355925,0.210531
RandomForest_Grid,0.218436,0.35544,0.212682


### 3.6) Conclusion
- **Model Selection**: We will use a `RandomForestRegressor()` with <u>**default parameters**</u>.
- **Feature Selection**: We will use `dataset_3` for our model i.e we will <u>keep</u> the following features:
    - `num_pages`
    - `num_pages_p_10`
    - `ratings_count`
    - `text_reviews_count`
    - `text_reviews_count_p_10`
    - `publication_date`
    - `language_code`
    - `title_len`
    - `author_1`
    - `author_2`
    - `publisher`


