# DSTI ML Project A22 Cohort: Book Rating Predictor
The aim of this project is to train and evaluate different models that predict a book’s average rating from a Goodreads dataset.

In [1]:
# Make sure the .venv-book kernel is selected before running the imports
# Kernel > Change kernel > .venv-book

import datetime
import pickle
import dvc.api
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import preprocessing

## 4) Deployment
### 4.1) Preprocessing Function

In [2]:
def _days_since_publication(raw_date, ref_date):
    """Return the number of days from ``raw_date`` to ``ref_date``.

    ``raw_date`` is parsed with the ``%m/%d/%Y`` format. ``None`` is returned
    when it is not a string or does not describe a real calendar date
    (for instance "11/31/2000"), so the caller can discard that row.
    """
    try:
        published = datetime.datetime.strptime(raw_date, "%m/%d/%Y").date()
    except (TypeError, ValueError):
        # Only parsing problems are treated as "bad row"; anything else
        # should surface instead of silently shrinking the dataset.
        return None
    return (ref_date - published).days


def books_prepro(df: pd.DataFrame) -> pd.DataFrame:
    """Convert a raw books frame into the numeric table used for prediction.

    The input frame is left untouched; a new frame is returned. Processing:

    * drop the identifier columns ``isbn``, ``isbn13`` and ``bookID``;
    * add decile ranks (``<name>_p_10``) for ``num_pages``, ``ratings_count``
      and ``text_reviews_count``;
    * replace ``language_code`` by 1 when the code contains "en", else 0;
    * add ``title_len`` (number of characters of the title);
    * split ``authors`` on "/" and keep the first two names as ``author_1``
      and ``author_2`` (``author_2`` is missing for single-author books);
    * replace ``publication_date`` by the number of days between that date
      and the reference date 2023-08-04, discarding rows whose date cannot
      be parsed;
    * label-encode ``author_1``, ``author_2`` and ``publisher``;
    * drop ``authors``, ``title`` and the ``ratings_count`` decile column.

    Column order of the result: the surviving original columns, then the
    decile columns, then ``title_len``, ``author_1``, ``author_2``.

    NOTE(review): the quantile bins and the label encoders are fitted on the
    frame passed in, so the produced codes depend on that frame's content.
    Confirm this matches how the model's training features were built.

    Args:
        df: raw books data with the columns listed above plus
            ``average_rating``.

    Returns:
        The preprocessed frame (original index labels are kept for the rows
        that survive).
    """
    # Constants
    p = 10
    REF_DATE = datetime.date(2023, 8, 4)
    labelEncode = preprocessing.LabelEncoder()
    # "title" is dropped at the end, so encoding it would be wasted work.
    labelEncodeVar = ["author_1", "author_2", "publisher"]
    Qvars = ["num_pages", "ratings_count", "text_reviews_count"]

    # Drop identifier columns (drop() returns a new frame, so the caller's
    # frame is never modified by the steps below)
    df = df.drop(columns=["isbn", "isbn13", "bookID"])

    # Apply quantile regrouping; the new columns are appended in Qvars order
    quantile_cols = {
        f"{var}_p_{p}": pd.qcut(df[var], p, labels=False) for var in Qvars
    }
    df = df.assign(**quantile_cols)

    # Date transformation: days elapsed up to REF_DATE; rows with an
    # unparseable date are removed
    day_counts = [
        _days_since_publication(raw, REF_DATE) for raw in df["publication_date"]
    ]
    has_valid_date = [count is not None for count in day_counts]
    df = df.loc[has_valid_date].copy()
    df["publication_date"] = [count for count in day_counts if count is not None]
    df["publication_date"] = df["publication_date"].astype(int)

    # Language code standardization: 1 for any code containing "en", else 0
    df["language_code"] = (
        df["language_code"].str.contains("en", regex=False, na=False).astype(int)
    )

    # Title length
    df["title_len"] = df["title"].str.len().astype(int)

    # Split authors and keep the first two. Both columns are always created,
    # even when no book in the frame has a second author.
    author_lists = df["authors"].str.split("/")
    df["author_1"] = author_lists.str[0]
    df["author_2"] = author_lists.str[1]

    df = df.drop(columns=["authors", "title"])

    # Label encoding
    for var in labelEncodeVar:
        df[var] = labelEncode.fit_transform(df[var])

    # The ratings_count decile is computed with the others but is not part of
    # the returned feature set; its name is derived from p.
    df = df.drop(columns=[f"ratings_count_p_{p}"])

    return df

### 4.2) Loading Data From the Cloud
We use the DVC API to connect to the GitHub repository, which holds the tracking information DVC needs to locate the dataset in cloud storage (Google Drive) and fetch it.

In [3]:
# Stream the raw CSV through DVC (revision "v1" of the project repository)
# and let pandas parse it, skipping malformed lines.
# NOTE(review): `encoding` is passed together with binary mode ('rb');
# DVC documents `encoding` as a text-mode option -- confirm it is needed.
with dvc.api.open(
    path='data/books.csv',
    repo="https://github.com/yoelturner99/book_ratings.git",
    mode='rb',
    rev="v1",
    encoding="utf-8"
) as csv_file:
    df = pd.read_csv(csv_file, on_bad_lines='skip')

# Clean the column names: strip surrounding whitespace from every header
df = df.rename(columns=str.strip)
print(f"Shape of original dataset: {df.shape}")

# Local alternative (reads the CSV from disk instead of going through DVC):
# df = pd.read_csv("../data/books.csv", on_bad_lines='skip')
# df = df.rename(columns=str.strip)
# print(f"Shape of original dataset: {df.shape}")

Shape of original dataset: (11123, 12)


In [4]:
# Apply the deployment preprocessing to the freshly loaded frame. The result
# is bound to a new name, so `df` still refers to the loaded data.
df_prepro = books_prepro(df)
# Quick sanity check on the number of rows and feature columns produced
print(f"Shape of preprocessed dataset: {df_prepro.shape}")

Shape of preprocessed dataset: (11121, 12)


### 4.3) Loading Model From Cloud
We use the DVC API to connect to the GitHub repository, which holds the tracking information DVC needs to locate the trained model in cloud storage (Google Drive) and fetch it.

In [5]:
# /!\ About 2-4 mins to run the cell
# Read the serialized model as raw bytes through DVC (revision "v1" of the
# project repository), then deserialize it with pickle.
# SECURITY: pickle.loads() can execute arbitrary code while deserializing.
# The bytes come from a remote location, so only run this against a
# repository/remote you trust.
modelpkl = dvc.api.read(
    path='models/book_ratings_RDFR_100_df3.pt',
    repo="https://github.com/yoelturner99/book_ratings.git",
    mode='rb',
    rev="v1"
)
model = pickle.loads(modelpkl)

# Local alternative (loads the same pickle from disk instead of through DVC):
# with open("../models/book_ratings_RDFR_100_df3.pt", 'rb') as modelpkl:
#     model = pickle.load(modelpkl)

### 4.4) Making Predictions

In [6]:
# The model is fed plain NumPy arrays rather than DataFrames: during training
# the inputs were scaled with scaler.fit_transform(), which yields arrays.
# NOTE(review): no scaler is applied in this cell -- confirm whether the
# fitted training scaler has to be loaded and applied before predicting.
target = df_prepro["average_rating"].to_numpy()
features = df_prepro.drop(columns="average_rating").to_numpy()
predictions = model.predict(features)

In [7]:
# Regression metrics of the predictions against the observed ratings
r2 = metrics.r2_score(target, predictions)
rmse = np.sqrt(metrics.mean_squared_error(target, predictions))
mae = metrics.mean_absolute_error(target, predictions)

print(f"MAE: {mae:.4}     RMSE: {rmse:.4}     R2: {r2:.4}")

# Side-by-side table of observed vs. predicted ratings and their difference
result = pd.DataFrame(
    {
        'Observed': target.tolist(),
        'Predicted': predictions.tolist(),
    }
)
result['Diff'] = result['Observed'].sub(result['Predicted'])
result.head(10)

MAE: 0.2473     RMSE: 0.3708     R2: -0.1195


Unnamed: 0,Observed,Predicted,Diff
0,4.57,4.0533,0.5167
1,4.49,4.0533,0.4367
2,4.42,4.0533,0.3667
3,4.56,4.0533,0.5067
4,4.78,4.0533,0.7267
5,3.74,4.0502,-0.3102
6,4.73,4.0533,0.6767
7,4.38,4.0533,0.3267
8,4.38,4.0533,0.3267
9,4.22,4.0533,0.1667
