### Rating Prediction using Machine Learning

We will extract features and build a model for ratings prediction.

In [2]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMRanker, LGBMRegressor, LGBMClassifier
from tqdm import tqdm
from xgboost import XGBRanker
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error


ratings = pd.read_csv("../../datasets/ml-latest-small/ratings.csv", sep=",")

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30 18:45:03
1,1,3,4.0,2000-07-30 18:20:47
2,1,6,4.0,2000-07-30 18:37:04
3,1,47,5.0,2000-07-30 19:03:35
4,1,50,5.0,2000-07-30 18:48:51


In [4]:
ratings.rating.value_counts()

4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: rating, dtype: int64

In [5]:
ratings['timestamp'].dt.day_name().value_counts()


Monday       17583
Tuesday      16411
Sunday       16209
Friday       14455
Wednesday    14014
Saturday     11458
Thursday     10706
Name: timestamp, dtype: int64

In [6]:
ratings.timestamp.dt.hour.value_counts()

20    6533
17    6347
21    6271
19    6202
18    5726
22    5387
16    5166
2     5135
1     4959
0     4481
3     4257
15    4217
23    4154
14    3824
7     3633
4     3274
8     3130
13    3041
12    2952
11    2663
5     2566
10    2394
9     2356
6     2168
Name: timestamp, dtype: int64

In [7]:
# Copied from bturan19's kaggle nb.
def get_feature_by_user(df):
    res = list()
    for i, v in tqdm(df.groupby('userId')):
        res.append(
            (
                i,
                len(v['movieId']),
                (v['rating'] == 5).sum(),
                (v['rating'] == 4).sum(),
                (v['rating'] == 3).sum(),
                (v['rating'] == 2).sum(),
                (v['rating'] == 1).sum(),
                (v['timestamp'].dt.dayofweek == 0).sum(),
                (v['timestamp'].dt.dayofweek == 1).sum(),
                (v['timestamp'].dt.dayofweek == 2).sum(),
                (v['timestamp'].dt.dayofweek == 3).sum(),
                (v['timestamp'].dt.dayofweek == 4).sum(),
                (v['timestamp'].dt.dayofweek == 5).sum(),
                (v['timestamp'].dt.dayofweek == 6).sum(),
                (v['timestamp'].dt.hour > 17).sum()

            )
        )
    
    res = pd.DataFrame(
        res,
        columns=[
            'userId', 'revired_products', '5_star_ratings_gave', '4_star_ratings_gave',
            '3_star_ratings_gave', '2_star_ratings_gave', '1_star_ratings_gave',
            'monday_review_count_user', 'tuesday_review_count_user', 'wednesday_review_count_user', 'thursday_review_count_user',
            'friday_review_count_user', 'saturday_review_count_user', 'sunday_review_count_user','evening_reviews_by_user'
        ])
    return res

In [8]:
# Copied from bturan19's kaggle nb.

def get_feature_by_product(df):
    res = list()
    for i, v in tqdm(df.groupby('movieId')):
        res.append(
            (
                i,
                len(v['userId']),
                (v['rating'] == 5).sum(),
                (v['rating'] == 4).sum(),
                (v['rating'] == 3).sum(),
                (v['rating'] == 2).sum(),
                (v['rating'] == 1).sum(),
                (v['timestamp'].dt.dayofweek == 0).sum(),
                (v['timestamp'].dt.dayofweek == 1).sum(),
                (v['timestamp'].dt.dayofweek == 2).sum(),
                (v['timestamp'].dt.dayofweek == 3).sum(),
                (v['timestamp'].dt.dayofweek == 4).sum(),
                (v['timestamp'].dt.dayofweek == 5).sum(),
                (v['timestamp'].dt.dayofweek == 6).sum(),
                (v['timestamp'].dt.hour > 17).sum()
            )
        )
    
    res = pd.DataFrame(
        res,
        columns=[
            'movieId', 'user_count', '1_star_ratings_recieved', '2_star_ratings_recieved',
            '3_star_ratings_recieved', '4_star_ratings_recieved', '5_star_ratings_recieved',
            'monday_review_count_item', 'tuesday_review_count_item', 'wednesday_review_count_item', 'thursday_review_count_item',
            'friday_review_count_item', 'saturday_review_count_item', 'sunday_review_count_item','evening_reviews_by_movie'
        ])
    return res

In [9]:
user_features = get_feature_by_user(ratings)

  0%|          | 0/610 [00:00<?, ?it/s]

100%|██████████| 610/610 [00:02<00:00, 301.75it/s]


In [10]:
user_features.head()

Unnamed: 0,userId,revired_products,5_star_ratings_gave,4_star_ratings_gave,3_star_ratings_gave,2_star_ratings_gave,1_star_ratings_gave,monday_review_count_user,tuesday_review_count_user,wednesday_review_count_user,thursday_review_count_user,friday_review_count_user,saturday_review_count_user,sunday_review_count_user,evening_reviews_by_user
0,1,232,124,76,26,5,1,0,1,0,0,0,0,231,231
1,2,29,6,9,4,1,0,0,0,0,0,0,29,0,29
2,3,39,10,1,1,1,0,0,0,0,0,39,0,0,0
3,4,216,64,64,39,26,23,86,85,37,0,3,0,5,60
4,5,44,10,13,17,3,1,0,0,0,0,44,0,0,0


In [11]:
movie_features = get_feature_by_product(ratings)

100%|██████████| 9724/9724 [00:29<00:00, 330.61it/s]


In [12]:
movie_features

Unnamed: 0,movieId,user_count,1_star_ratings_recieved,2_star_ratings_recieved,3_star_ratings_recieved,4_star_ratings_recieved,5_star_ratings_recieved,monday_review_count_item,tuesday_review_count_item,wednesday_review_count_item,thursday_review_count_item,friday_review_count_item,saturday_review_count_item,sunday_review_count_item,evening_reviews_by_movie
0,1,215,47,82,34,6,0,42,31,25,24,38,21,34,68
1,2,110,7,36,28,5,1,16,19,10,10,24,10,21,35
2,3,52,6,12,20,2,3,8,11,7,4,10,6,6,16
3,4,7,0,0,4,1,1,0,0,1,1,0,3,2,3
4,5,49,3,8,25,6,0,7,9,5,7,6,5,10,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9719,193581,1,0,1,0,0,0,0,0,0,0,0,0,1,0
9720,193583,1,0,0,0,0,0,0,0,0,0,0,0,1,0
9721,193585,1,0,0,0,0,0,0,0,0,0,0,0,1,0
9722,193587,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [13]:
merged = pd.merge(ratings, user_features, on=['userId'])    
merged = pd.merge(merged, movie_features, on=['movieId'])
merged.head() 

Unnamed: 0,userId,movieId,rating,timestamp,revired_products,5_star_ratings_gave,4_star_ratings_gave,3_star_ratings_gave,2_star_ratings_gave,1_star_ratings_gave,...,4_star_ratings_recieved,5_star_ratings_recieved,monday_review_count_item,tuesday_review_count_item,wednesday_review_count_item,thursday_review_count_item,friday_review_count_item,saturday_review_count_item,sunday_review_count_item,evening_reviews_by_movie
0,1,1,4.0,2000-07-30 18:45:03,232,124,76,26,5,1,...,6,0,42,31,25,24,38,21,34,68
1,5,1,4.0,1996-11-08 06:36:02,44,10,13,17,3,1,...,6,0,42,31,25,24,38,21,34,68
2,7,1,4.5,2005-01-25 06:52:26,152,13,28,16,11,12,...,6,0,42,31,25,24,38,21,34,68
3,15,1,2.5,2017-11-13 12:59:30,135,25,21,25,13,6,...,6,0,42,31,25,24,38,21,34,68
4,17,1,4.5,2011-05-18 05:28:03,105,16,33,2,0,0,...,6,0,42,31,25,24,38,21,34,68


In [14]:
y = merged.rating
merged = merged.drop(columns=["userId","movieId","rating","timestamp"])
merged.head()

Unnamed: 0,revired_products,5_star_ratings_gave,4_star_ratings_gave,3_star_ratings_gave,2_star_ratings_gave,1_star_ratings_gave,monday_review_count_user,tuesday_review_count_user,wednesday_review_count_user,thursday_review_count_user,...,4_star_ratings_recieved,5_star_ratings_recieved,monday_review_count_item,tuesday_review_count_item,wednesday_review_count_item,thursday_review_count_item,friday_review_count_item,saturday_review_count_item,sunday_review_count_item,evening_reviews_by_movie
0,232,124,76,26,5,1,0,1,0,0,...,6,0,42,31,25,24,38,21,34,68
1,44,10,13,17,3,1,0,0,0,0,...,6,0,42,31,25,24,38,21,34,68
2,152,13,28,16,11,12,19,80,36,6,...,6,0,42,31,25,24,38,21,34,68
3,135,25,21,25,13,6,119,0,0,0,...,6,0,42,31,25,24,38,21,34,68
4,105,16,33,2,0,0,0,0,91,3,...,6,0,42,31,25,24,38,21,34,68


In [15]:
model = LGBMRegressor()
cross_val_score(model,merged,y)

array([0.27667688, 0.32383443, 0.36147763, 0.37306925, 0.43811485])

These results are very good, however, we did not pay attention not to leak data from test set. Below is a better approach.

In [16]:
ratings_train, ratings_test = train_test_split(ratings, test_size=5000)

In [17]:
user_features = get_feature_by_user(ratings_train)
movie_features = get_feature_by_product(ratings_train)
merged = pd.merge(ratings_train, user_features, on=['userId'])    
merged = pd.merge(merged, movie_features, on=['movieId'])
merged.head() 

  0%|          | 0/610 [00:00<?, ?it/s]

100%|██████████| 610/610 [00:01<00:00, 317.71it/s]
100%|██████████| 9552/9552 [00:28<00:00, 331.34it/s]


Unnamed: 0,userId,movieId,rating,timestamp,revired_products,5_star_ratings_gave,4_star_ratings_gave,3_star_ratings_gave,2_star_ratings_gave,1_star_ratings_gave,...,4_star_ratings_recieved,5_star_ratings_recieved,monday_review_count_item,tuesday_review_count_item,wednesday_review_count_item,thursday_review_count_item,friday_review_count_item,saturday_review_count_item,sunday_review_count_item,evening_reviews_by_movie
0,280,3114,4.0,2012-09-25 00:18:19,182,20,60,15,3,0,...,4,0,19,22,11,7,15,10,11,32
1,45,3114,5.0,2000-02-16 19:05:14,379,106,137,88,24,9,...,4,0,19,22,11,7,15,10,11,32
2,474,3114,4.0,2001-10-12 23:05:41,1978,57,536,364,165,33,...,4,0,19,22,11,7,15,10,11,32
3,448,3114,5.0,2002-04-18 10:44:21,1777,71,317,464,320,83,...,4,0,19,22,11,7,15,10,11,32
4,414,3114,5.0,2001-05-29 14:20:26,2569,234,854,630,380,40,...,4,0,19,22,11,7,15,10,11,32


In [18]:
y = merged.rating
merged = merged.drop(columns=["userId","movieId","rating","timestamp"])
merged.head()

Unnamed: 0,revired_products,5_star_ratings_gave,4_star_ratings_gave,3_star_ratings_gave,2_star_ratings_gave,1_star_ratings_gave,monday_review_count_user,tuesday_review_count_user,wednesday_review_count_user,thursday_review_count_user,...,4_star_ratings_recieved,5_star_ratings_recieved,monday_review_count_item,tuesday_review_count_item,wednesday_review_count_item,thursday_review_count_item,friday_review_count_item,saturday_review_count_item,sunday_review_count_item,evening_reviews_by_movie
0,182,20,60,15,3,0,14,97,1,0,...,4,0,19,22,11,7,15,10,11,32
1,379,106,137,88,24,9,140,57,132,8,...,4,0,19,22,11,7,15,10,11,32
2,1978,57,536,364,165,33,513,224,261,307,...,4,0,19,22,11,7,15,10,11,32
3,1777,71,317,464,320,83,215,90,102,530,...,4,0,19,22,11,7,15,10,11,32
4,2569,234,854,630,380,40,718,880,213,186,...,4,0,19,22,11,7,15,10,11,32


In [19]:
model = LGBMRegressor()
model.fit(merged, y)

In [20]:
user_features = get_feature_by_user(ratings_test)
movie_features = get_feature_by_product(ratings_test)
merged = pd.merge(ratings_test, user_features, on=['userId'])    
merged = pd.merge(merged, movie_features, on=['movieId'])
merged.head() 

100%|██████████| 566/566 [00:02<00:00, 277.58it/s]
100%|██████████| 2493/2493 [00:08<00:00, 301.31it/s]


Unnamed: 0,userId,movieId,rating,timestamp,revired_products,5_star_ratings_gave,4_star_ratings_gave,3_star_ratings_gave,2_star_ratings_gave,1_star_ratings_gave,...,4_star_ratings_recieved,5_star_ratings_recieved,monday_review_count_item,tuesday_review_count_item,wednesday_review_count_item,thursday_review_count_item,friday_review_count_item,saturday_review_count_item,sunday_review_count_item,evening_reviews_by_movie
0,465,1617,5.0,2000-06-01 21:43:41,5,4,1,0,0,0,...,0,0,1,0,0,2,0,0,0,1
1,156,1617,5.0,1999-10-14 10:44:21,21,4,6,9,0,0,...,0,0,1,0,0,2,0,0,0,1
2,275,1617,4.0,2003-03-31 02:34:48,17,11,3,1,2,0,...,0,0,1,0,0,2,0,0,0,1
3,465,2109,4.0,2000-06-01 21:37:55,5,4,1,0,0,0,...,0,0,0,0,0,2,0,1,0,2
4,448,2109,3.0,2002-04-18 11:01:51,87,4,20,18,14,2,...,0,0,0,0,0,2,0,1,0,2


In [None]:
y = merged.rating
merged = merged.drop(columns=["userId","movieId","rating","timestamp"])
merged.head()

In [22]:
preds = model.predict(merged)

In [23]:
mean_absolute_error(y, preds)

0.6079837323052947

In [26]:
imps = model.feature_importances_
sorted_idx = np.argsort(imps)[::-1]
sorted_vals = np.sort(imps)[::-1]

d = {"feature_name":merged.columns[sorted_idx], "value":sorted_vals}
imp_df = pd.DataFrame(d)
imp_df[:20]

Unnamed: 0,feature_name,value
0,1_star_ratings_recieved,266
1,5_star_ratings_gave,256
2,4_star_ratings_gave,254
3,2_star_ratings_recieved,199
4,3_star_ratings_recieved,192
5,1_star_ratings_gave,167
6,5_star_ratings_recieved,167
7,4_star_ratings_recieved,155
8,3_star_ratings_gave,154
9,2_star_ratings_gave,152
