# Important

Run `make data` before executing any cell in this notebook.

# Imports

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Global plot styling for the notebook; matplotlib figure sizes are in inches.
sns.set(
    context='paper',
    style='white',
    palette='muted',
    rc={'figure.figsize': (11.7, 8.27)},
)

In [None]:
# Load the raw CSV inputs; the relative paths resolve against the kernel's
# working directory.
book_df = pd.read_csv('../data/raw/book.csv')
ratings_df = pd.read_csv('../data/raw/ratings.csv')

In [None]:
# Show only the first row - enough to see which columns the ratings table has.
ratings_df.head(1)

Make sure there are no duplicates in ratings.

In [None]:
# Every row that shares its (user_id, book_id) pair with another row;
# an empty result means there are no duplicated ratings.
is_duplicated = ratings_df.duplicated(subset=['user_id', 'book_id'], keep=False)
ratings_df[is_duplicated]

# Ratings user and book coverage

In [None]:
# Number of rated books per user, summarised with describe().
books_rated_per_user = ratings_df.groupby('user_id')['book_id'].count()
books_rated_per_user.describe()

Every user has rated at least 19 books. Such a situation is rarely encountered in similar datasets.

In [None]:
# Number of ratings received per book, summarised with describe().
raters_per_book = ratings_df.groupby('book_id')['user_id'].count()
raters_per_book.describe()

All books have been rated at least 8 times.

In [None]:
# Ratings received per book, ordered from the least- to the most-rated book.
tmp = (
    ratings_df
    .groupby('book_id')['user_id']
    .count()
    .sort_values()
)

In [None]:
# Slice the sorted per-book counts into five consecutive groups.
# np.array_split tolerates a length that is not a multiple of 5 (np.split
# raises ValueError in that case); for evenly divisible lengths the groups
# are the same as before.
dfs = [tmp.loc[idx] for idx in np.array_split(tmp.index.values, 5)]

In [None]:
# Display the list of slices held in `dfs`.
dfs

# How do users rate books?

In [None]:
# Summary statistics of the rating column across all rows.
ratings_df['rating'].describe()

In [None]:
# Re-apply the plot styling (same settings as the call near the top of the notebook).
sns.set(
    context='paper',
    style='white',
    palette='muted',
    rc={'figure.figsize': (11.7, 8.27)},
)

In [None]:
# Bar chart of how often each rating value occurs.
# The column is passed by keyword: recent seaborn releases accept only `data`
# positionally, so a bare positional Series is no longer read as `x`.
sns.countplot(x='rating', data=ratings_df)

In [None]:
# Summary of the per-user mean rating.
mean_rating_per_user = ratings_df.groupby('user_id')['rating'].mean()
mean_rating_per_user.describe()

In [None]:
# Number of users whose mean rating is exactly 0.0.
# Compare the per-user means directly instead of running a Python lambda per
# group and materialising every matching row just to count distinct users.
# NOTE(review): the KDE clip of (1.0, 5.0) used further down suggests ratings
# start at 1 - if so this is always 0; confirm whether 1.0 was intended.
int((ratings_df.groupby('user_id')['rating'].mean() == 0.0).sum())

In [None]:
# Number of users whose mean rating is exactly 5.0 (every rating they gave is
# a 5 if 5 is the top of the scale - TODO confirm the scale).
# Compare the per-user means directly instead of running a Python lambda per
# group and materialising every matching row just to count distinct users.
int((ratings_df.groupby('user_id')['rating'].mean() == 5.0).sum())

In [None]:
# Distribution of each user's mean rating; the KDE is clipped to (1.0, 5.0).
user_mean_ratings = ratings_df.groupby('user_id')['rating'].mean()
x = sns.distplot(user_mean_ratings, kde_kws={'clip': (1.0, 5.0)})

In [None]:
# Write the figure that owns the axes `x` to a PDF in the working directory.
x.get_figure().savefig("users-mean_rating-dist.pdf", bbox_inches='tight')

People rate differently: some give only 5-star reviews, some are harsher than others, for some only a perfect book deserves a 5-star rating, and so on. Generally, people tend to use only the upper part of the scale. Such tendencies can be observed in the distribution of mean user ratings plotted above.

To correct for biases caused by the varying mean ratings of different users and items (e.g. long or hard-to-watch movies can be rated far lower than others), special factors are introduced in the form of a `user bias`, an `item bias` or a `baseline`. [Section 5.2.1 Recommender Systems Handbook, Ricci]

In [None]:
# Distribution of the number of ratings given per user, x axis limited to 0-210.
ratings_per_user = ratings_df.groupby('user_id')['rating'].count()
y = sns.distplot(ratings_per_user)
y.set_xlim(left=0, right=210)
plt.xlabel("Ratings count")

In [None]:
# Write the figure that owns the axes `y` to a PDF in the working directory.
y.get_figure().savefig("users_ratings_count-dist.pdf", bbox_inches='tight')

In [None]:
# How many books have fewer than 10000 ratings.
ratings_per_book = ratings_df.groupby('book_id')['rating'].count()
len(ratings_per_book[ratings_per_book < 10000])

In [None]:
# Books with fewer than 9 ratings (shown with their counts).
ratings_per_book = ratings_df.groupby('book_id')['rating'].count()
ratings_per_book[ratings_per_book < 9]

In [None]:
# Distribution of ratings-per-book over the 9000 least-rated books,
# with the x axis limited to 0-1200.
ratings_per_book = ratings_df.groupby('book_id')['rating'].count()
z = sns.distplot(ratings_per_book.sort_values().head(9000))
z.set_xlim(left=0, right=1200)

In [None]:
# Write the figure that owns the axes `z` to a PDF in the working directory.
z.get_figure().savefig("book_ratings_count-dist.pdf", bbox_inches='tight')

In [None]:
def chunk(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    Args:
        seq: Any object supporting ``len()`` and slicing.
        size: Step between slice starts, which is also the slice length.

    Returns:
        A generator yielding ``seq[start:start + size]`` for each start
        offset; the final slice is shorter when ``len(seq)`` is not a
        multiple of ``size``.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

In [None]:
# Running total of ratings over books ordered from least- to most-rated,
# cut into consecutive pieces of up to 1000 entries each.
cumulative_counts = (
    ratings_df
    .groupby('book_id')['rating']
    .count()
    .sort_values()
    .cumsum()
)
chunks = list(chunk(cumulative_counts.values, 1000))

In [None]:
# Keep the last element of every chunk.
sums = [piece[-1] for piece in chunks]

In [None]:
# Display the per-chunk values collected in `sums`.
sums

In [None]:
# One bar per chunk, drawn on a logarithmic y axis.
# `sums` is a plain Python list, so it has no `.values` attribute and its
# `.index` is a method rather than an axis; build the bar positions
# explicitly and pass both vectors by keyword.
a = sns.barplot(x=list(range(len(sums))), y=sums)
a.set_yscale('log')

In [None]:
# Cumulative number of ratings over books sorted from least- to most-rated.
s = (
    ratings_df
    .groupby('book_id')['rating']
    .count()
    .sort_values()
    .cumsum()
)

In [None]:
# Display the array of values behind `s`.
s.values

In [None]:
# Cumulative rating count against book rank (1-based, least-rated book first).
# The x positions follow len(s) instead of a hard-coded 10000, so the cell
# keeps working when the number of books differs.
sns.lineplot(x=np.arange(1, len(s) + 1), y=s.values)

In [None]:
# Cumulative, filled-step histogram of the values in `s` using 50 bins.
plt.hist(
    s.values,
    bins=50,
    histtype="stepfilled",
    alpha=0.7,
    cumulative=True,
)

In [None]:
# Cumulative histogram (1000 bins, raw counts) with a cumulative KDE on top.
sns.distplot(
    s.values,
    bins=1000,
    hist=True,
    norm_hist=False,
    hist_kws={'cumulative': True},
    kde_kws={'cumulative': True},
)

# Train and test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the ratings as the test set; the fixed random_state makes
# the split repeatable.
train_df, test_df = train_test_split(
    ratings_df,
    test_size=0.1,
    random_state=44,
)

Some of the methods used do not generalize well to new (unseen) users and items, so we have to make sure that the training set contains all users and items.

In [None]:
# True when every user present in the full ratings table also appears in the
# training split.
users_in_train = set(train_df['user_id'].unique())
users_overall = set(ratings_df['user_id'].unique())
users_in_train == users_overall

In [None]:
# True when every book present in the full ratings table also appears in the
# training split.
books_in_train = set(train_df['book_id'].unique())
books_overall = set(ratings_df['book_id'].unique())
books_in_train == books_overall

In [None]:
# Rated books per user within the training split, summarised with describe().
train_books_per_user = train_df.groupby('user_id')['book_id'].count()
train_books_per_user.describe()

In [None]:
# Ratings per book within the training split, summarised with describe().
train_raters_per_book = train_df.groupby('book_id')['user_id'].count()
train_raters_per_book.describe()