### Libraries to import

In [88]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Constants

In [89]:
# Factor that converts a byte count to megabytes (1 MB = 10**6 bytes).
# NOTE: despite the "_DIV" suffix, byte counts are multiplied by this value.
BYTES_TO_MB_DIV = 1e-6

### Function to check the memory usage of the dataframe

In [90]:
def df_mem_usage(df):
    """Print the total memory footprint of ``df`` in megabytes.

    The byte counts reported by ``df.memory_usage()`` are summed, scaled by
    ``BYTES_TO_MB_DIV`` and rounded to three decimals. A blank line is
    printed before the result. Nothing is returned.
    """
    print()
    total_bytes = df.memory_usage().sum()
    megabytes = round(total_bytes * BYTES_TO_MB_DIV, 3)
    print("Memory usage is " + str(megabytes) + " MB")

### Reading the dataset

In [None]:
%%time

cols = ['%%MatrixMarket','matrix','coordinate']

dtypes = {
    '%%MatrixMarket':'int32', 
    'matrix':'int16', 
    'coordinate':'int8'
}

df = pd.read_csv('data/netflix_mm', delim_whitespace=True, usecols=cols, dtype=dtypes, skiprows=range(1, 3))
df.columns = ['user_id', 'movie_id', 'rating']

print(df.head())
df_mem_usage(df)

In [None]:
# Show the first five rows of the ratings table.
print(df.head(n=5))

# EDA

### Check for missing ratings

In [None]:
# True when at least one row carries a rating of 0, the value this notebook
# treats as "missing".
(df.rating == 0).any()

### Total number of ratings in the dataset

In [None]:
# Each row is one rating, so the row count is the number of ratings.
num_ratings = df.shape[0]

print('This dataset contains {:,} ratings in total.'.format(num_ratings))

### Total number of viewers in the dataset

In [None]:
# Number of distinct user ids; dropna=False keeps the count identical to
# len(pd.unique(...)), which would also count a NaN entry.
num_viewers = df['user_id'].nunique(dropna=False)

print('This dataset contains {:,} viewers in total.'.format(num_viewers))

### Total number of movies in the dataset

In [None]:
# Number of distinct movie ids; dropna=False keeps the count identical to
# len(pd.unique(...)), which would also count a NaN entry.
num_movies = df['movie_id'].nunique(dropna=False)

print('This dataset contains {:,} movies in total.'.format(num_movies))

### Average number of ratings per movie

In [None]:
# Mean number of ratings per movie, rounded to the nearest integer.
avg_ratings_per_movie = round(num_ratings / num_movies)

# Message grammar fixed: "Each movies has" -> "Each movie has".
print('Each movie has on average {:,} ratings.'.format(avg_ratings_per_movie))

### Average number of ratings per viewer

In [None]:
# Mean number of ratings per viewer, rounded to the nearest integer.
avg_ratings_per_viewer = round(num_ratings / num_viewers)

print('Each user has rated on average {:,} movies.'.format(avg_ratings_per_viewer))

### Checking the distribution of the number of ratings per movie

In [None]:
# Number of ratings received by each movie, in ascending order of count.
dist_ratings_movies = (
    df.groupby(by='movie_id')
    .size()
    .sort_values(ascending=True)
)

In [None]:
# Histogram of ratings-per-movie, restricted to movies with 0-2000 ratings.
fig, ax = plt.subplots(figsize=(10, 7))
dist_ratings_movies.hist(ax=ax, bins=1000, range=(0, 2000))

ax.set_xlabel("Numbers of ratings given", fontsize=15)
ax.set_ylabel("Numbers of movies", fontsize=15)
plt.show()

### Checking the distribution of the number of ratings per user

In [None]:
# Number of ratings given by each user, in ascending order of count.
dist_ratings_user = (
    df.groupby(by='user_id')
    .size()
    .sort_values(ascending=True)
)

In [None]:
# Histogram of ratings-per-user, restricted to users with 0-1000 ratings.
fig, ax = plt.subplots(figsize=(10, 7))
dist_ratings_user.hist(ax=ax, bins=1000, range=(0, 1000))

ax.set_xlabel("Numbers of movies seen", fontsize=15)
ax.set_ylabel("Numbers of viewers", fontsize=15)
plt.show()

### Distribution of the ratings (counts)

In [None]:
# Distinct rating values in ascending order; used to order the bars.
movie_ratings = sorted(df['rating'].unique())

sns.countplot(data=df, x='rating', order=movie_ratings)
plt.show()

### Distribution of the ratings (percentages)

In [None]:
# Number of occurrences of each rating value (most frequent first).
ratings = df.rating.value_counts()
# Hoisted out of the loop: the total does not change between iterations.
total_ratings = len(df.index)

for rating in ratings.index:
    # Share of all ratings carrying this value. Rounded to the nearest
    # integer so that e.g. 19.9 % is not truncated down to 19 %.
    percentage = int(round(ratings[rating] / total_ratings * 100))
    # The figure is a share of ratings, not of movies, so the label says so.
    print('Percentage of ratings with a value of {}: {} %'.format(rating, percentage))