# Important

`make data` has to be run before running any notebook cell

# Imports

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import numpy as np

In [None]:
# Read the raw book and ratings tables from ../data/raw
# (the note at the top of the notebook says `make data` must run first).
book_df = pd.read_csv(filepath_or_buffer='../data/raw/book.csv')
ratings_df = pd.read_csv(filepath_or_buffer='../data/raw/ratings.csv')

In [None]:
# Peek at a single row to see which columns the ratings table carries.
ratings_df.head(n=1)

Make sure there are no duplicates in ratings.

In [None]:
# Flag every row whose (user_id, book_id) pair occurs more than once;
# keep=False marks all members of a duplicated pair, not just the repeats.
duplicated_pair_mask = ratings_df.duplicated(subset=['user_id', 'book_id'], keep=False)
ratings_df.loc[duplicated_pair_mask]

# Visualization settings

In [None]:
# Matplotlib rc overrides applied on top of the seaborn theme.
plot_rc_overrides = {
    "axes.labelsize": 16,
    "ytick.labelsize": 14,
    "xtick.labelsize": 14,
    "font.family": "sans-serif",
}

# Global look-and-feel for every figure drawn later in the notebook.
sns.set(
    context='paper',
    font_scale=1.2,
    style='ticks',
    palette='muted',
    rc=plot_rc_overrides,
)

# Ratings user and book coverage

In [None]:
# Count of rated books per user, summarised across all users.
books_per_user = ratings_df.groupby('user_id')['book_id'].count()
books_per_user.describe()

All users rated at least 19 books. Such a situation is rarely encountered in similar datasets.

In [None]:
# Count of ratings received per book, summarised across all books.
users_per_book = ratings_df.groupby('book_id')['user_id'].count()
users_per_book.describe()

All books have been rated at least 8 times.

# How do users rate books?

In [None]:
# Summary statistics of the rating values themselves.
rating_values = ratings_df['rating']
rating_values.describe()

In [None]:
# Bar chart of how many times each rating value was given.
# The column is passed by keyword: seaborn deprecated positional data vectors
# in 0.11, and from 0.12 the first positional parameter is `data`, so the
# positional form no longer plots the rating counts.
sns.countplot(x='rating', data=ratings_df)

In [None]:
# Mean rating given by each user, summarised across all users.
mean_rating_per_user = ratings_df.groupby('user_id')['rating'].mean()
mean_rating_per_user.describe()

In [None]:
# Number of users whose mean rating is exactly 0.0.
# The per-user means are compared in one vectorised step; the previous
# `groupby().filter()` ran a Python lambda once per user and materialised
# every matching row just to count the distinct users.
user_mean_ratings = ratings_df.groupby('user_id')['rating'].mean()
int((user_mean_ratings == 0.0).sum())

In [None]:
# Number of users whose mean rating is exactly 5.0 (i.e. every rating is 5).
# The per-user means are compared in one vectorised step; the previous
# `groupby().filter()` ran a Python lambda once per user and materialised
# every matching row just to count the distinct users.
user_mean_ratings = ratings_df.groupby('user_id')['rating'].mean()
int((user_mean_ratings == 5.0).sum())

In [None]:
# Histogram (no KDE overlay) of the mean rating given by each user.
# NOTE(review): `distplot` is deprecated since seaborn 0.11 - switch to
# `histplot` once the project's seaborn version allows it.
mean_rating_per_user = ratings_df.groupby('user_id')['rating'].mean()
user_mean_ratings_plot = sns.distplot(mean_rating_per_user, kde=False)
user_mean_ratings_plot.set(xlabel='Ratings mean', ylabel='Frequency')

People rate differently: some give only 5-star reviews, some are harsher than others, and for some only a perfect book deserves a 5-star rating. Generally, people tend to use only the upper part of the scale. These tendencies can be observed in the mean user rating distribution plot.

To correct for biases caused by the varying mean ratings of different users and items (e.g. long or hard-to-watch movies can be rated far lower than others), special factors are introduced in the form of `user bias`, `item bias` or `baseline`. [Section 5.2.1 Recommender Systems Handbook, Ricci]

In [None]:
# Histogram (no KDE overlay) of how many ratings each user has given.
# NOTE(review): `distplot` is deprecated since seaborn 0.11 - switch to
# `histplot` once the project's seaborn version allows it.
ratings_per_user = ratings_df.groupby('user_id')['rating'].count()
user_ratings_count_plot = sns.distplot(ratings_per_user, kde=False)
user_ratings_count_plot.set(xlabel='Ratings count', ylabel='Frequency')

In [None]:
# Number of books that received fewer than 10000 ratings.
# The per-book counts are computed once (the original ran the same group-by
# twice, once for the values and once for the mask) and the qualifying books
# are counted with a boolean sum.
book_ratings_count = ratings_df.groupby('book_id')['rating'].count()
int((book_ratings_count < 10000).sum())

In [None]:
# Histogram (no KDE overlay) of how many ratings each book has received,
# drawn with a logarithmic y axis.
# NOTE(review): `distplot` is deprecated since seaborn 0.11 - switch to
# `histplot` once the project's seaborn version allows it.
ratings_per_book = ratings_df.groupby('book_id')['rating'].count()
book_ratings_count_plot = sns.distplot(ratings_per_book, kde=False)
book_ratings_count_plot.set_yscale('log')
book_ratings_count_plot.set(xlabel='Ratings count', ylabel='Frequency')

In [None]:
# Running total of ratings when books are added from least- to most-rated.
book_ratings_cum_count = (
    ratings_df
    .groupby('book_id')['rating']
    .count()
    .sort_values()
    .cumsum()
)

In [None]:
# Cumulative number of ratings as more books (least-rated first) are included.
# The x axis is derived from the series length instead of a hard-coded 10000,
# so the cell keeps working if the dataset holds a different number of books.
books_considered = np.arange(1, len(book_ratings_cum_count) + 1)
book_ratings_cum_count_plot = sns.lineplot(x=books_considered, y=book_ratings_cum_count.values)
book_ratings_cum_count_plot.set(xlabel='Number of books considered', ylabel='Cumulative sum of ratings')
# Show y ticks in millions, e.g. 2000000 -> "2M".
book_ratings_cum_count_plot.yaxis.set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda value, pos: '{:,.0f}'.format(value/(10**6)) + 'M'))

# Train and test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the ratings for testing; the fixed seed keeps the split
# reproducible across runs.
TEST_FRACTION = 0.1
SPLIT_SEED = 44
train_df, test_df = train_test_split(
    ratings_df,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

Some of the methods used do not generalize well to new (unseen) users and items, so we have to make sure that the training set contains all users and items.

In [None]:
# True when every user present in the full ratings table also appears
# in the training split.
users_in_train = set(train_df['user_id'].unique())
users_in_all_ratings = set(ratings_df['user_id'].unique())
users_in_train == users_in_all_ratings

In [None]:
# True when every book present in the full ratings table also appears
# in the training split.
books_in_train = set(train_df['book_id'].unique())
books_in_all_ratings = set(ratings_df['book_id'].unique())
books_in_train == books_in_all_ratings

In [None]:
# Count of rated books per user within the training split.
train_books_per_user = train_df.groupby('user_id')['book_id'].count()
train_books_per_user.describe()

In [None]:
# Count of ratings received per book within the training split.
train_users_per_book = train_df.groupby('book_id')['user_id'].count()
train_users_per_book.describe()