### Rating Prediction using Machine Learning

We will extract features and build a model for ratings prediction.

In [None]:
# Mount Google Drive at /content/drive (Colab-specific helper) so that files
# stored there can be opened through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMRanker, LGBMRegressor, LGBMClassifier
from tqdm import tqdm
from xgboost import XGBRanker
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error


# Read the ml-latest-small ratings table from the mounted Drive folder.
ratings_csv_path = "/content/drive/My Drive/datasets/ml-latest-small/ratings.csv"
ratings = pd.read_csv(ratings_csv_path, sep=",")

ratings.head()

We will only use the ratings dataset to extract the features. Since we are not using any other content information such as movie plot, genres, directors, etc., this might look rather limited; however, as the results below show, the outcome is not bad.

Below is some general information about these features.

In [None]:
# The raw column is interpreted as seconds (unit="s"); turn it into pandas
# datetimes so the .dt accessor can be used in the cells below.
parsed_timestamps = pd.to_datetime(ratings["timestamp"], unit="s")
ratings["timestamp"] = parsed_timestamps
ratings.head()

In [None]:
# How often each distinct rating value occurs.
ratings["rating"].value_counts()

In [None]:
# Number of ratings submitted on each day of the week.
weekday_names = ratings["timestamp"].dt.day_name()
weekday_names.value_counts()


In [None]:
# Number of ratings submitted in each hour of the day.
ratings["timestamp"].dt.hour.value_counts()

Now, we will extract features for each user and item and prepare a dataset for model fitting.

In [None]:
# Copied from bturan19's kaggle nb.
def get_feature_by_user(df):
    """Summarise the rating activity of every user in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``userId``, ``movieId``, ``rating`` and a
        datetime-typed ``timestamp`` (the ``.dt`` accessor is used on it).

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``userId`` with: the number of rated movies, the
        number of ratings exactly equal to 5, 4, 3, 2 and 1 (other values,
        e.g. half stars, are not counted in any of these columns), the number
        of ratings per weekday (``dayofweek`` 0 through 6, labelled Monday
        through Sunday) and the number of ratings whose hour is greater
        than 17.
    """
    column_names = [
        'userId', 'reviewed_products', '5_star_ratings_gave', '4_star_ratings_gave',
        '3_star_ratings_gave', '2_star_ratings_gave', '1_star_ratings_gave',
        'monday_review_count_user', 'tuesday_review_count_user', 'wednesday_review_count_user', 'thursday_review_count_user',
        'friday_review_count_user', 'saturday_review_count_user', 'sunday_review_count_user', 'evening_reviews_by_user'
    ]

    rows = []
    for user_id, user_ratings in tqdm(df.groupby('userId')):
        stars = user_ratings['rating']
        weekday = user_ratings['timestamp'].dt.dayofweek
        hour = user_ratings['timestamp'].dt.hour

        row = [user_id, len(user_ratings['movieId'])]
        # Star counts from 5 down to 1, matching the order of the labels.
        row.extend((stars == star).sum() for star in (5, 4, 3, 2, 1))
        # One count per weekday, 0 (Monday) through 6 (Sunday).
        row.extend((weekday == day).sum() for day in range(7))
        # "Evening" means an hour strictly greater than 17.
        row.append((hour > 17).sum())
        rows.append(tuple(row))

    return pd.DataFrame(rows, columns=column_names)

In [None]:
# Per-user aggregates computed over the complete ratings table.
user_features = get_feature_by_user(df=ratings)

In [None]:
# Preview the first five rows of the per-user feature table.
user_features.head(5)

In [None]:
# Copied from bturan19's kaggle nb.

def get_feature_by_product(df):
    """Summarise the ratings received by every movie in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``movieId``, ``userId``, ``rating`` and a
        datetime-typed ``timestamp`` (the ``.dt`` accessor is used on it).

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``movieId`` with: the number of ratings received
        (``user_count``), the number of ratings exactly equal to 1, 2, 3, 4
        and 5 (other values, e.g. half stars, are not counted in any of these
        columns), the number of ratings per weekday (``dayofweek`` 0 through
        6, labelled Monday through Sunday) and the number of ratings whose
        hour is greater than 17.
    """
    res = list()
    for movie_id, movie_ratings in tqdm(df.groupby('movieId')):
        stars = movie_ratings['rating']
        weekday = movie_ratings['timestamp'].dt.dayofweek
        res.append(
            (
                movie_id,
                len(movie_ratings['userId']),
                # Star counts are emitted in ascending order (1 -> 5) so that
                # each value lands in the column that carries its name; they
                # were previously emitted 5 -> 1 under 1 -> 5 labels.
                (stars == 1).sum(),
                (stars == 2).sum(),
                (stars == 3).sum(),
                (stars == 4).sum(),
                (stars == 5).sum(),
                (weekday == 0).sum(),
                (weekday == 1).sum(),
                (weekday == 2).sum(),
                (weekday == 3).sum(),
                (weekday == 4).sum(),
                (weekday == 5).sum(),
                (weekday == 6).sum(),
                # "Evening" means an hour strictly greater than 17.
                (movie_ratings['timestamp'].dt.hour > 17).sum()
            )
        )

    res = pd.DataFrame(
        res,
        columns=[
            'movieId', 'user_count', '1_star_ratings_recieved', '2_star_ratings_recieved',
            '3_star_ratings_recieved', '4_star_ratings_recieved', '5_star_ratings_recieved',
            'monday_review_count_item', 'tuesday_review_count_item', 'wednesday_review_count_item', 'thursday_review_count_item',
            'friday_review_count_item', 'saturday_review_count_item', 'sunday_review_count_item','evening_reviews_by_movie'
        ])
    return res

In [None]:
# Per-movie aggregates computed over the complete ratings table.
movie_features = get_feature_by_product(df=ratings)

In [None]:
# Preview the first five rows of the per-movie feature table.
movie_features.head()

In [None]:
# Attach the per-user aggregates, then the per-movie aggregates, to every rating row.
merged = (
    ratings
    .merge(user_features, on=["userId"])
    .merge(movie_features, on=["movieId"])
)
merged.head()

In [None]:
# The rating is the prediction target; identifiers and the raw timestamp are
# removed so that only the aggregated columns remain as model inputs.
y = merged["rating"]
non_feature_columns = ["userId", "movieId", "rating", "timestamp"]
merged = merged.drop(columns=non_feature_columns)
merged.head()

We will use LightGBM (LGBM), a fast and easy-to-use gradient boosting framework.

In [None]:
# Score a default LightGBM regressor with cross_val_score's default settings
# (no cv or scoring argument is passed).
model = LGBMRegressor()
cross_val_score(estimator=model, X=merged, y=y)

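These results are very good; however, we did not take care to avoid leaking data from the test set: the features were computed on the full ratings table before cross-validation, so every fold's features already include its held-out ratings. Below is a better approach.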

In [None]:
# Hold out 5000 ratings for testing. A fixed random_state makes the split, and
# every number computed from it below, reproducible across kernel restarts.
ratings_train, ratings_test = train_test_split(ratings, test_size=5000, random_state=42)

In [None]:
# Rebuild both feature tables from the training ratings only, then join them
# onto the training rows.
user_features = get_feature_by_user(ratings_train)
movie_features = get_feature_by_product(ratings_train)
merged = (
    ratings_train
    .merge(user_features, on=["userId"])
    .merge(movie_features, on=["movieId"])
)
merged.head()

In [None]:
# Training target and feature matrix: keep the rating as y and drop the
# identifier / raw timestamp columns from the inputs.
y = merged["rating"]
non_feature_columns = ["userId", "movieId", "rating", "timestamp"]
merged = merged.drop(columns=non_feature_columns)
merged.head()

In [None]:
# Fit a fresh default LightGBM regressor on the training features.
model = LGBMRegressor()
model.fit(X=merged, y=y)

In [None]:
# Build the features for the test rows from the TRAINING ratings only.
# Aggregating over ratings_test itself would fold the held-out ratings (the
# very labels being predicted) into the star-count features, and would yield
# counts on a very different scale from the ones the model was fitted on.
user_features = get_feature_by_user(ratings_train)
movie_features = get_feature_by_product(ratings_train)
# Left joins keep every test row: users or movies that do not occur in the
# training split get NaN features, which LightGBM handles as missing values.
merged = pd.merge(ratings_test, user_features, on=['userId'], how='left')
merged = pd.merge(merged, movie_features, on=['movieId'], how='left')
merged.head()

In [None]:
# Test target and feature matrix, prepared the same way as for training.
y = merged["rating"]
non_feature_columns = ["userId", "movieId", "rating", "timestamp"]
merged = merged.drop(columns=non_feature_columns)
merged.head()

In [None]:
# Predict a rating for every row of the test feature matrix.
preds = model.predict(X=merged)

In [None]:
# Display the predictions returned by model.predict for a quick sanity check.
preds

In [None]:
# Mean absolute error between the held-out ratings and the predictions.
mean_absolute_error(y_true=y, y_pred=preds)

In [None]:
# Rank the model's feature importances from highest to lowest and show the
# top 20 next to the matching column names.
imps = model.feature_importances_
sorted_idx = np.argsort(imps)[::-1]
# Indexing with the descending order yields the same values as sorting and reversing.
sorted_vals = imps[sorted_idx]

d = dict(feature_name=merged.columns[sorted_idx], value=sorted_vals)
imp_df = pd.DataFrame(d)
imp_df.head(20)