## MovieLens 1M Dataset

In [None]:
import pandas as pd
import numpy as np
import requests
from io import StringIO

In [None]:
# Locations of the three MovieLens 1M source files (movies, ratings, users).
urlmovies="https://gitlab.gitlab.svc.cent-su.org/ccaicedo/652public/-/raw/master/datasets/movielens/movies.dat"
urlratings="https://gitlab.gitlab.svc.cent-su.org/ccaicedo/652public/-/raw/master/datasets/movielens/ratings.dat"
urlusers="https://gitlab.gitlab.svc.cent-su.org/ccaicedo/652public/-/raw/master/datasets/movielens/users.dat"


def _fetch_text(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Parameters
    ----------
    url : str
        Address of the file to download.
    timeout : float, optional
        Seconds to wait for the server before giving up, so an unreachable
        host cannot stall the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        error page would be handed to the parser as if it were data.
    """
    # SECURITY NOTE: verify=False disables TLS certificate validation. It is kept
    # because this server needs the workaround, and it makes requests emit an
    # InsecureRequestWarning (safe to ignore here). Do not copy this to other hosts.
    response = requests.get(url, verify=False, timeout=timeout)
    response.raise_for_status()
    return response.text


# Raw file contents as strings; they are parsed into DataFrames later.
moviesdat = _fetch_text(urlmovies)
ratingsdat = _fetch_text(urlratings)
usersdat = _fetch_text(urlusers)

In [None]:
# Keep displayed DataFrames short: show at most 10 rows before truncating.
pd.options.display.max_rows = 10


def _read_dat(raw_text, column_names):
    """Parse the text of one '::'-delimited .dat file into a DataFrame.

    Parameters
    ----------
    raw_text : str
        Full file contents, one record per line, fields separated by '::'.
    column_names : list of str
        Column labels to assign, since the files carry no header row.

    Returns
    -------
    pandas.DataFrame
    """
    # A separator longer than one character is only supported by the python
    # parser. Requesting it explicitly avoids the ParserWarning pandas emits
    # when it has to fall back from the default C engine.
    return pd.read_table(
        StringIO(raw_text),
        sep='::',
        header=None,
        names=column_names,
        engine='python',
    )


unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = _read_dat(usersdat, unames)

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = _read_dat(ratingsdat, rnames)

mnames = ['movie_id', 'title', 'genres']
movies = _read_dat(moviesdat, mnames)

In [None]:
# Print a structural summary of the users frame (columns, dtypes, non-null counts).
users.info()

In [None]:
# Preview the first five user records.
users.head(5)

In [None]:
# Preview the first five rating records.
ratings.head(5)

In [None]:
# Preview the first five movie records.
movies.head(5)

In [None]:
# First merge: attach each rater's demographic columns to their ratings.
# The join key is spelled out instead of being inferred from shared column names,
# and validate='many_to_one' makes pandas raise a MergeError if user_id is not
# unique in `users` (which would otherwise silently duplicate rating rows).
data = pd.merge(ratings, users, on='user_id', validate='many_to_one')
data.head()

# To inspect a single merged record instead, uncomment:
#data.iloc[0]

In [None]:
# Second merge: bring in the movie columns (title, genres).
# No key is passed, so pandas joins on the columns both frames share.
data = data.merge(movies)
data.head(5)

More information on merging and/or concatenating datasets can be found at: https://realpython.com/pandas-merge-join-and-concat/

## Using a pivot table ##

Pivot table documentation 
https://pandas.pydata.org/docs/reference/api/pandas.pivot_table.html

In [None]:
# Naive pivot: group by title and apply the default aggregation (mean) to every
# numeric column. Only numeric columns are passed as values because recent pandas
# versions raise a TypeError when asked to average string columns instead of
# silently dropping them.
numeric_columns = data.select_dtypes(include='number').columns.tolist()
table = pd.pivot_table(data, values=numeric_columns, index='title')
table
# The pivot table produced has issues: it averages every numeric column, including
# identifier columns such as user_id and movie_id, whose means are meaningless.

In [None]:
# Average rating per movie, one row per title and one column per gender.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)

# The function form gives the same result:
#mean_ratings = pd.pivot_table(data, values='rating', index='title', columns='gender', aggfunc='mean')

mean_ratings.head(5)

In [None]:
# Count the ratings recorded for each movie title.
ratings_by_title = data.groupby('title').size()
ratings_by_title.head(10)

In [None]:
# Keep only the titles that have at least 250 ratings.
ratings_by_title.loc[ratings_by_title.ge(250)]

In [None]:
# Build a pandas Index holding the titles of films with at least 250 ratings.
active_titles = ratings_by_title[ratings_by_title >= 250].index
active_titles

In [None]:
# Restrict the per-gender means to the rows whose index label is an active title.
mean_ratings = mean_ratings.loc[active_titles, :]
mean_ratings

In [None]:
# Rank movies by the mean rating in the 'F' column, highest first, and show the top ten.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
top_female_ratings.head(10)

### Measuring Rating Disagreement

In [None]:
# Rating disagreement per movie: mean 'M' rating minus mean 'F' rating.
# Positive values mean the 'M' column is higher, negative values the 'F' column.
mean_ratings['diff'] = mean_ratings['M'].sub(mean_ratings['F'])

In [None]:
# Ascending sort on 'diff': the rows at the top have the most negative M - F gap.
sorted_by_diff = mean_ratings.sort_values('diff', ascending=True)
sorted_by_diff.head(5)

In [None]:
# Flip the row order so the largest M - F gaps come first, then show ten rows.
sorted_by_diff.iloc[::-1].head(10)

In [None]:
# Disagreement among viewers, independent of gender, measured as the standard
# deviation of each movie's ratings.
# Compute the per-title standard deviation and keep only the active titles.
rating_std_by_title = (
    data.groupby('title')['rating']
    .std()
    .loc[active_titles]
)

# Show the ten titles with the largest standard deviation.
rating_std_by_title.sort_values(ascending=False).head(10)