### Rating Prediction using Machine Learning

We will extract features and build a model for ratings prediction.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMRanker, LGBMRegressor, LGBMClassifier
from tqdm import tqdm
from xgboost import XGBRanker
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error


ratings = pd.read_csv("/content/drive/My Drive/datasets/ml-latest-small/ratings.csv", sep=",")

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


We will only use the ratings dataset to extract the features. Since we are not using any other content information such as movie plot, genres, directors, etc., this might look like rather limited, however, the results as we will show below are not bad.

Below are some general information about these features.

In [4]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30 18:45:03
1,1,3,4.0,2000-07-30 18:20:47
2,1,6,4.0,2000-07-30 18:37:04
3,1,47,5.0,2000-07-30 19:03:35
4,1,50,5.0,2000-07-30 18:48:51


In [5]:
ratings.rating.value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
4.0,26818
3.0,20047
5.0,13211
3.5,13136
4.5,8551
2.0,7551
2.5,5550
1.0,2811
1.5,1791
0.5,1370


In [6]:
ratings['timestamp'].dt.day_name().value_counts()


Unnamed: 0_level_0,count
timestamp,Unnamed: 1_level_1
Monday,17583
Tuesday,16411
Sunday,16209
Friday,14455
Wednesday,14014
Saturday,11458
Thursday,10706


In [7]:
ratings.timestamp.dt.hour.value_counts()

Unnamed: 0_level_0,count
timestamp,Unnamed: 1_level_1
20,6533
17,6347
21,6271
19,6202
18,5726
22,5387
16,5166
2,5135
1,4959
0,4481


Now, we will extract features for each user and item and prepare a dataset for model fitting.

In [12]:
# Copied from bturan19's kaggle nb.
def get_feature_by_user(df):
    res = list()
    for i, v in tqdm(df.groupby('userId')):
        res.append(
            (
                i,
                len(v['movieId']),
                (v['rating'] == 5).sum(),
                (v['rating'] == 4).sum(),
                (v['rating'] == 3).sum(),
                (v['rating'] == 2).sum(),
                (v['rating'] == 1).sum(),
                (v['timestamp'].dt.dayofweek == 0).sum(),
                (v['timestamp'].dt.dayofweek == 1).sum(),
                (v['timestamp'].dt.dayofweek == 2).sum(),
                (v['timestamp'].dt.dayofweek == 3).sum(),
                (v['timestamp'].dt.dayofweek == 4).sum(),
                (v['timestamp'].dt.dayofweek == 5).sum(),
                (v['timestamp'].dt.dayofweek == 6).sum(),
                (v['timestamp'].dt.hour > 17).sum()

            )
        )

    res = pd.DataFrame(
        res,
        columns=[
            'userId', 'reviewed_products', '5_star_ratings_gave', '4_star_ratings_gave',
            '3_star_ratings_gave', '2_star_ratings_gave', '1_star_ratings_gave',
            'monday_review_count_user', 'tuesday_review_count_user', 'wednesday_review_count_user', 'thursday_review_count_user',
            'friday_review_count_user', 'saturday_review_count_user', 'sunday_review_count_user','evening_reviews_by_user'
        ])
    return res

In [13]:
user_features = get_feature_by_user(ratings)

100%|██████████| 610/610 [00:01<00:00, 338.40it/s]


In [14]:
user_features.head()

Unnamed: 0,userId,reviewed_products,5_star_ratings_gave,4_star_ratings_gave,3_star_ratings_gave,2_star_ratings_gave,1_star_ratings_gave,monday_review_count_user,tuesday_review_count_user,wednesday_review_count_user,thursday_review_count_user,friday_review_count_user,saturday_review_count_user,sunday_review_count_user,evening_reviews_by_user
0,1,232,124,76,26,5,1,0,1,0,0,0,0,231,231
1,2,29,6,9,4,1,0,0,0,0,0,0,29,0,29
2,3,39,10,1,1,1,0,0,0,0,0,39,0,0,0
3,4,216,64,64,39,26,23,86,85,37,0,3,0,5,60
4,5,44,10,13,17,3,1,0,0,0,0,44,0,0,0


In [10]:
# Copied from bturan19's kaggle nb.

def get_feature_by_product(df):
    res = list()
    for i, v in tqdm(df.groupby('movieId')):
        res.append(
            (
                i,
                len(v['userId']),
                (v['rating'] == 5).sum(),
                (v['rating'] == 4).sum(),
                (v['rating'] == 3).sum(),
                (v['rating'] == 2).sum(),
                (v['rating'] == 1).sum(),
                (v['timestamp'].dt.dayofweek == 0).sum(),
                (v['timestamp'].dt.dayofweek == 1).sum(),
                (v['timestamp'].dt.dayofweek == 2).sum(),
                (v['timestamp'].dt.dayofweek == 3).sum(),
                (v['timestamp'].dt.dayofweek == 4).sum(),
                (v['timestamp'].dt.dayofweek == 5).sum(),
                (v['timestamp'].dt.dayofweek == 6).sum(),
                (v['timestamp'].dt.hour > 17).sum()
            )
        )

    res = pd.DataFrame(
        res,
        columns=[
            'movieId', 'user_count', '1_star_ratings_recieved', '2_star_ratings_recieved',
            '3_star_ratings_recieved', '4_star_ratings_recieved', '5_star_ratings_recieved',
            'monday_review_count_item', 'tuesday_review_count_item', 'wednesday_review_count_item', 'thursday_review_count_item',
            'friday_review_count_item', 'saturday_review_count_item', 'sunday_review_count_item','evening_reviews_by_movie'
        ])
    return res

In [16]:
movie_features = get_feature_by_product(ratings)

100%|██████████| 9724/9724 [00:35<00:00, 273.41it/s]


In [17]:
movie_features.head(5)

Unnamed: 0,movieId,user_count,1_star_ratings_recieved,2_star_ratings_recieved,3_star_ratings_recieved,4_star_ratings_recieved,5_star_ratings_recieved,monday_review_count_item,tuesday_review_count_item,wednesday_review_count_item,thursday_review_count_item,friday_review_count_item,saturday_review_count_item,sunday_review_count_item,evening_reviews_by_movie
0,1,215,47,82,34,6,0,42,31,25,24,38,21,34,68
1,2,110,7,36,28,5,1,16,19,10,10,24,10,21,35
2,3,52,6,12,20,2,3,8,11,7,4,10,6,6,16
3,4,7,0,0,4,1,1,0,0,1,1,0,3,2,3
4,5,49,3,8,25,6,0,7,9,5,7,6,5,10,11


In [18]:
merged = pd.merge(ratings, user_features, on=['userId'])
merged = pd.merge(merged, movie_features, on=['movieId'])
merged.head()

Unnamed: 0,userId,movieId,rating,timestamp,reviewed_products,5_star_ratings_gave,4_star_ratings_gave,3_star_ratings_gave,2_star_ratings_gave,1_star_ratings_gave,...,4_star_ratings_recieved,5_star_ratings_recieved,monday_review_count_item,tuesday_review_count_item,wednesday_review_count_item,thursday_review_count_item,friday_review_count_item,saturday_review_count_item,sunday_review_count_item,evening_reviews_by_movie
0,1,1,4.0,2000-07-30 18:45:03,232,124,76,26,5,1,...,6,0,42,31,25,24,38,21,34,68
1,1,3,4.0,2000-07-30 18:20:47,232,124,76,26,5,1,...,2,3,8,11,7,4,10,6,6,16
2,1,6,4.0,2000-07-30 18:37:04,232,124,76,26,5,1,...,2,1,12,16,17,13,15,10,19,29
3,1,47,5.0,2000-07-30 19:03:35,232,124,76,26,5,1,...,8,2,36,31,22,30,26,24,34,59
4,1,50,5.0,2000-07-30 18:48:51,232,124,76,26,5,1,...,3,2,32,35,27,26,33,20,31,68


In [19]:
y = merged.rating
merged = merged.drop(columns=["userId","movieId","rating","timestamp"])
merged.head()

Unnamed: 0,reviewed_products,5_star_ratings_gave,4_star_ratings_gave,3_star_ratings_gave,2_star_ratings_gave,1_star_ratings_gave,monday_review_count_user,tuesday_review_count_user,wednesday_review_count_user,thursday_review_count_user,...,4_star_ratings_recieved,5_star_ratings_recieved,monday_review_count_item,tuesday_review_count_item,wednesday_review_count_item,thursday_review_count_item,friday_review_count_item,saturday_review_count_item,sunday_review_count_item,evening_reviews_by_movie
0,232,124,76,26,5,1,0,1,0,0,...,6,0,42,31,25,24,38,21,34,68
1,232,124,76,26,5,1,0,1,0,0,...,2,3,8,11,7,4,10,6,6,16
2,232,124,76,26,5,1,0,1,0,0,...,2,1,12,16,17,13,15,10,19,29
3,232,124,76,26,5,1,0,1,0,0,...,8,2,36,31,22,30,26,24,34,59
4,232,124,76,26,5,1,0,1,0,0,...,3,2,32,35,27,26,33,20,31,68


We will use LGBM which is a fast and easy to use gradient boosting framework.

In [20]:
model = LGBMRegressor()
cross_val_score(model,merged,y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008652 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2204
[LightGBM] [Info] Number of data points in the train set: 80668, number of used features: 28
[LightGBM] [Info] Start training from score 3.476323
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008494 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2212
[LightGBM] [Info] Number of data points in the train set: 80669, number of used features: 28
[LightGBM] [Info] Start training from score 3.492110
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015981 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

array([0.29339961, 0.33092996, 0.37279246, 0.34712955, 0.34633895])

These results are very good, however, we did not pay attention not to leak data from test set. Below is a better approach.

In [21]:
ratings_train, ratings_test = train_test_split(ratings, test_size=5000)

In [22]:
user_features = get_feature_by_user(ratings_train)
movie_features = get_feature_by_product(ratings_train)
merged = pd.merge(ratings_train, user_features, on=['userId'])
merged = pd.merge(merged, movie_features, on=['movieId'])
merged.head()

100%|██████████| 610/610 [00:03<00:00, 170.89it/s]
100%|██████████| 9560/9560 [00:32<00:00, 289.71it/s]


Unnamed: 0,userId,movieId,rating,timestamp,reviewed_products,5_star_ratings_gave,4_star_ratings_gave,3_star_ratings_gave,2_star_ratings_gave,1_star_ratings_gave,...,4_star_ratings_recieved,5_star_ratings_recieved,monday_review_count_item,tuesday_review_count_item,wednesday_review_count_item,thursday_review_count_item,friday_review_count_item,saturday_review_count_item,sunday_review_count_item,evening_reviews_by_movie
0,414,176371,5.0,2017-11-24 15:02:59,2570,234,860,631,384,36,...,0,1,3,4,2,1,1,3,2,5
1,568,2313,5.0,2009-05-29 05:43:41,19,6,5,4,1,0,...,0,0,3,3,1,2,1,2,3,6
2,21,3578,2.5,2015-07-15 19:52:21,428,11,90,55,21,10,...,7,3,30,21,22,15,29,17,23,49
3,18,176101,3.5,2017-12-12 20:31:55,477,7,162,63,6,1,...,0,1,0,1,1,3,1,0,2,4
4,376,1610,4.0,2013-04-03 13:07:43,126,24,48,0,6,0,...,0,0,16,18,12,5,14,9,13,25


In [23]:
y = merged.rating
merged = merged.drop(columns=["userId","movieId","rating","timestamp"])
merged.head()

Unnamed: 0,reviewed_products,5_star_ratings_gave,4_star_ratings_gave,3_star_ratings_gave,2_star_ratings_gave,1_star_ratings_gave,monday_review_count_user,tuesday_review_count_user,wednesday_review_count_user,thursday_review_count_user,...,4_star_ratings_recieved,5_star_ratings_recieved,monday_review_count_item,tuesday_review_count_item,wednesday_review_count_item,thursday_review_count_item,friday_review_count_item,saturday_review_count_item,sunday_review_count_item,evening_reviews_by_movie
0,2570,234,860,631,384,36,719,889,214,185,...,0,1,3,4,2,1,1,3,2,5
1,19,6,5,4,1,0,0,0,0,4,...,0,0,3,3,1,2,1,2,3,6
2,428,11,90,55,21,10,36,2,60,14,...,7,3,30,21,22,15,29,17,23,49
3,477,7,162,63,6,1,40,176,109,83,...,0,1,0,1,1,3,1,0,2,4
4,126,24,48,0,6,0,0,49,77,0,...,0,0,16,18,12,5,14,9,13,25


In [24]:
model = LGBMRegressor()
model.fit(merged, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007515 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2301
[LightGBM] [Info] Number of data points in the train set: 95836, number of used features: 28
[LightGBM] [Info] Start training from score 3.499770


In [25]:
user_features = get_feature_by_user(ratings_test)
movie_features = get_feature_by_product(ratings_test)
merged = pd.merge(ratings_test, user_features, on=['userId'])
merged = pd.merge(merged, movie_features, on=['movieId'])
merged.head()

100%|██████████| 551/551 [00:01<00:00, 319.76it/s]
100%|██████████| 2440/2440 [00:07<00:00, 312.06it/s]


Unnamed: 0,userId,movieId,rating,timestamp,reviewed_products,5_star_ratings_gave,4_star_ratings_gave,3_star_ratings_gave,2_star_ratings_gave,1_star_ratings_gave,...,4_star_ratings_recieved,5_star_ratings_recieved,monday_review_count_item,tuesday_review_count_item,wednesday_review_count_item,thursday_review_count_item,friday_review_count_item,saturday_review_count_item,sunday_review_count_item,evening_reviews_by_movie
0,137,1237,4.5,2008-03-07 04:23:03,6,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,339,72641,4.5,2016-04-11 03:47:13,21,4,7,0,0,0,...,0,0,1,0,1,0,1,0,0,1
2,18,2058,3.5,2016-10-19 21:06:04,25,0,8,2,0,0,...,0,0,0,0,1,0,1,0,0,2
3,318,27904,4.5,2012-03-17 21:43:16,44,2,15,4,0,0,...,0,0,1,0,0,1,1,2,1,3
4,322,4027,3.5,2008-08-02 11:40:59,6,0,3,1,0,0,...,0,0,0,0,1,0,0,1,0,0


In [26]:
y = merged.rating
merged = merged.drop(columns=["userId","movieId","rating","timestamp"])
merged.head()

Unnamed: 0,reviewed_products,5_star_ratings_gave,4_star_ratings_gave,3_star_ratings_gave,2_star_ratings_gave,1_star_ratings_gave,monday_review_count_user,tuesday_review_count_user,wednesday_review_count_user,thursday_review_count_user,...,4_star_ratings_recieved,5_star_ratings_recieved,monday_review_count_item,tuesday_review_count_item,wednesday_review_count_item,thursday_review_count_item,friday_review_count_item,saturday_review_count_item,sunday_review_count_item,evening_reviews_by_movie
0,6,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,21,4,7,0,0,0,2,1,6,0,...,0,0,1,0,1,0,1,0,0,1
2,25,0,8,2,0,0,1,9,8,4,...,0,0,0,0,1,0,1,0,0,2
3,44,2,15,4,0,0,4,2,5,4,...,0,0,1,0,0,1,1,2,1,3
4,6,0,3,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [27]:
preds = model.predict(merged)

In [28]:
mean_absolute_error(y, preds)

0.6265630115752401

In [29]:
imps = model.feature_importances_
sorted_idx = np.argsort(imps)[::-1]
sorted_vals = np.sort(imps)[::-1]

d = {"feature_name":merged.columns[sorted_idx], "value":sorted_vals}
imp_df = pd.DataFrame(d)
imp_df[:20]

Unnamed: 0,feature_name,value
0,5_star_ratings_gave,267
1,1_star_ratings_recieved,244
2,4_star_ratings_gave,239
3,2_star_ratings_recieved,198
4,3_star_ratings_gave,195
5,3_star_ratings_recieved,178
6,4_star_ratings_recieved,163
7,1_star_ratings_gave,152
8,5_star_ratings_recieved,149
9,2_star_ratings_gave,148
