# Imports


In [3]:
import pandas as pd
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from tqdm.notebook import tqdm
import re
import os
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise import KNNBasic
from surprise import NormalPredictor
from surprise.model_selection import train_test_split
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_selection import VarianceThreshold
from surprise.model_selection import LeaveOneOut
import requests
from io import BytesIO
import nltk
from nltk.tag.stanford import StanfordNERTagger
import plotly.io as pio
import ipyplot

In [7]:
# Register tqdm with pandas so progress-aware pandas helpers are available.
tqdm.pandas()
# Default renderer used when a plotly figure is displayed in this notebook.
pio.renderers.default = "notebook_connected"
# Root folder that the data paths in the cells below are built from.
BASE_FOLDER = '..'

# EDA


Three datasets need to be explored: `movies.dat`, `ratings.dat`, and `users.dat`.

## Movies

In [12]:
# movies.dat uses "::" as the field separator and has no header row.
# Only two column names are supplied; the ids shown as the index in the
# output (1, 2, 3, ...) presumably come from a leading id field that pandas
# turns into the index because it has no name here -- TODO confirm.
movies_df = pd.read_csv(
    f"{BASE_FOLDER}/data/raw/movies.dat",
    header=None,
    names=["movie_name", "genre"],
    sep="::",
    engine="python",  # multi-character separators need the python engine
    encoding="ISO-8859-1",
)
movies_df.head()

Unnamed: 0,movie_name,genre
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


Check the distribution of movies by genre. The genre column is split into a list and then exploded, so a movie with multiple genres is counted once for each of its genres.

In [13]:
# Turn the '|'-separated genre string into a list of genres.
# The isinstance guard makes the cell safe to re-run: once the column already
# holds lists it is left untouched instead of failing with
# "'list' object has no attribute 'split'".
movies_df['genre'] = movies_df['genre'].apply(
    lambda genres: genres.split('|') if isinstance(genres, str) else genres
)
# One row per (movie, genre) pair: a multi-genre movie appears once per genre.
movies_df_exploded = movies_df.explode('genre')
px.histogram(
    movies_df_exploded,
    x='genre',
    height=400,
    title='Movie count by genre',
).update_xaxes(categoryorder="total descending")


Dramas and comedies are the top movie genres, while westerns and noirs appeal to smaller groups of people. A drama can also be a western or a noir, but not every western or noir is a drama or a comedy.

In [14]:
# Pull the release year out of the title, e.g. "Toy Story (1995)" -> "1995".
# A raw string avoids invalid-escape warnings for "\(" and "\d"; the pattern is
# anchored to the end of the title and requires exactly four digits, so an
# earlier parenthesised group in the title cannot be mistaken for the year.
# The year is kept as a string, so the histogram treats it as a category.
movies_df['year'] = movies_df['movie_name'].str.extract(r'\((\d{4})\)\s*$', expand=False)
# Fail loudly (rather than silently carrying NaN) if a title has no year.
assert movies_df['year'].notna().all(), "some movie titles have no '(YYYY)' suffix"
movie_count_by_year = px.histogram(
    movies_df,
    x='year',
    height=400,
    title='Movie count by year',
).update_xaxes(categoryorder="total descending")
movie_count_by_year

As expected, the more recent the year, the more movies were produced, presumably due to progress in the industry.

## Users

Let's take a look at occupations.

In [17]:
# The occupation codes are only documented in the README, so parse them from there.
# The context manager closes the file handle once the text has been read.
with open(f'{BASE_FOLDER}/data/raw/README.txt') as readme_file:
    readme_text = np.array(readme_file.read().splitlines())

# Index of the first line containing each marker; np.char.find is the public
# NumPy API (np.core.* is a private path that is deprecated in NumPy 2.0).
start_index = np.flatnonzero(np.char.find(readme_text, 'Occupation is chosen') != -1)[0]
end_index = np.flatnonzero(np.char.find(readme_text, 'MOVIES FILE DESCRIPTION') != -1)[0]

# [2:-1] drops the first two lines and the last line of the matched section
# (presumably the intro sentence and blank lines around the list -- verify
# against the README). Each remaining line carries the occupation name in
# double quotes.
occupation_list = [line.split('"')[1] for line in readme_text[start_index:end_index][2:-1].tolist()]
# Occupation code (position in the list, starting at 0) -> occupation name.
occupation_dict = dict(enumerate(occupation_list))

users_df = pd.read_csv(f'{BASE_FOLDER}/data/raw/users.dat',
                       delimiter='::', engine='python', header=None,
                       names=['user_id', 'gender', 'age', 'occupation', 'zip_code'])
# Swap numeric occupation codes for their names; unknown codes are left as is.
users_df['occupation'] = users_df['occupation'].replace(occupation_dict)
users_df.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,K-12 student,48067
1,2,M,56,self-employed,70072
2,3,M,25,scientist,55117
3,4,M,45,executive/managerial,2460
4,5,M,25,writer,55455


## Ratings 

In [19]:
# ratings.dat is "::"-separated with no header row; one row per rating.
ratings_df = pd.read_csv(
    f'{BASE_FOLDER}/data/raw/ratings.dat',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'time'],
    sep='::',
    engine='python',  # multi-character separators need the python engine
)
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


Rank movie genres by their average rating

In [20]:
# Attach genres to every rating (ratings' movie_id column against the index of
# the exploded movies frame), then compute mean rating and rating count per
# genre, ordered from the lowest to the highest mean.
rating_by_genre_df = (
    ratings_df
    .join(movies_df_exploded, on='movie_id')
    .groupby('genre')
    .agg(rating_mean=('rating', 'mean'), rating_count=('rating', 'count'))
    .sort_values('rating_mean')
    .reset_index()
    # The genre column is named 'genre_' (with the trailing underscore).
    .rename(columns={'genre': 'genre_'})
)
px.bar(rating_by_genre_df, x='genre_', y='rating_mean', height=300)

It seems that people who watch noirs appreciate them the most, perhaps because it is the rarest genre...

## Combine

Check the differences between male and female ratings: for each genre, compare the share of each gender's ratings that falls into that genre.

In [None]:
# One row per (movie genre, rating, user): movies -> ratings on movie_id
# (the movies index is named movie_id for the merge), then -> users on user_id.
combined_ratings_df = (
    movies_df_exploded.rename_axis('movie_id')
    .merge(ratings_df, on='movie_id')
    .merge(users_df, on='user_id')
)

# Mean rating and number of ratings for every (genre, gender) pair.
combined_ratings_data = combined_ratings_df.groupby(['genre', 'gender']).agg({'rating': ['mean', 'count']}).reset_index()
# Flatten the two-level column index: 'genre', 'gender', 'rating mean', 'rating count'.
combined_ratings_data.columns = [' '.join(col).strip() for col in combined_ratings_data.columns.values]

# Turn raw counts into each gender's share of its own ratings so the two groups
# are comparable despite their different sizes. The whole column is reassigned
# at once: dividing only a .loc slice in place would write floats into an
# int64 column, which pandas >= 2.1 deprecates.
ratings_per_gender = combined_ratings_df['gender'].value_counts()
combined_ratings_data['rating count'] = (
    combined_ratings_data['rating count']
    / combined_ratings_data['gender'].map(ratings_per_gender)
)

px.bar(
    combined_ratings_data,
    x='genre',
    y='rating count',
    color='gender',
    barmode='group',
    title='Share of ratings by genre and gender',
)

# Preprocessing
