In [1]:
import pandas as pd

# Module Imports
from utils import *
from local_files import data_directory

In [2]:
# Column-name constants shared by every cell that touches the movie / rating tables.
TITLE, UID, IID = 'title', 'userId', 'movieId'

# Input CSV locations, rooted at the project-local `data_directory`.
rating_data_csv = f"{data_directory}/DataSets/rating.csv"
movie_data_csv = f"{data_directory}/DataSets/movie.csv"

In [3]:
# Load the two raw CSV inputs into DataFrames: movie metadata first, then user ratings.
movie_df, rating_df = map(pd.read_csv, (movie_data_csv, rating_data_csv))

In [4]:
# Distinct identifiers: movie ids are taken from the movie table,
# user ids from the ratings table (i.e. only users with at least one rating row).
movies_id = movie_df.loc[:, IID].unique()
users_id = rating_df.loc[:, UID].unique()

In [5]:
# Preview the first four rows of the raw movie table.
movie_df.head(n=4)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance


In [6]:
# Build the item (movie) feature table from the raw movie table, keyed by IID.
# Judging by the item_df outputs displayed later in this notebook, the result carries a
# lower-cased `title`, plus `clean_title`, `year` and per-genre indicator columns
# -- TODO confirm against process_item_df in utils.
item_df = process_item_df(item_df=movie_df, item_id_col=IID)

In [7]:
# Preview the first three rows of the raw ratings table.
rating_df.head(n=3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39


In [8]:
# Aggregate the `rating` column of rating_df twice with the same helper:
# once keyed by movie id (IID) and once keyed by user id (UID), in that order.
# The user-level frame displayed in the next cell has rating_mean / rating_median /
# rating_std / rating_min / rating_max columns; the item-level frame presumably
# mirrors that layout keyed by movieId -- TODO confirm.
item_rating_agg_df, user_rating_agg_df = (
    get_agg_feature_df(source_df=rating_df, ui_id_col=id_col, agg_feature_col='rating')
    for id_col in (IID, UID)
)

In [9]:
# Inspect the per-user rating aggregates (the rendered output below is truncated with "..." rows).
user_rating_agg_df

Unnamed: 0_level_0,rating_mean,rating_median,rating_std,rating_min,rating_max
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3.742857,4.0,0.382284,3.0,5.0
2,4.000000,4.0,1.064581,2.0,5.0
3,4.122995,4.0,0.910427,1.0,5.0
4,3.571429,4.0,0.790151,1.0,5.0
5,4.272727,5.0,0.969464,2.0,5.0
...,...,...,...,...,...
138489,3.986842,4.0,0.662579,2.0,5.0
138490,3.556291,4.0,0.868990,1.0,5.0
138491,2.681818,2.5,1.305168,0.5,5.0
138492,4.097561,4.5,0.904171,1.0,5.0


In [11]:
# Derive the target columns for every (user, movie) rating row, using the
# per-user rating aggregates computed earlier as the second input.
targets_df = get_ratings_targets_df(
    user_id_col=UID,
    item_id_col=IID,
    ratings_interaction_df=rating_df,
    user_ratings_df=user_rating_agg_df,
)

In [12]:
# Inspect the target flags per (userId, movieId) rating row.
# NOTE(review): in the output below a rating of 3.5 is flagged rating_atleast_4=True and a
# rating of 4.5 is flagged rating_atleast_5=True, while 3.0 gives rating_atleast_4=False.
# Confirm whether get_ratings_targets_df intentionally rounds ratings up before
# thresholding, or whether the threshold comparison is off.
targets_df

Unnamed: 0,userId,movieId,rating,rating_atleast_3,rating_atleast_4,rating_atleast_5,rating_atleast_user_mean,rating_atleast_user_median
0,1,2,3.5,True,True,False,False,False
1,1,29,3.5,True,True,False,False,False
2,1,32,3.5,True,True,False,False,False
3,1,47,3.5,True,True,False,False,False
4,1,50,3.5,True,True,False,False,False
...,...,...,...,...,...,...,...,...
20000258,138493,68954,4.5,True,True,True,True,True
20000259,138493,69526,4.5,True,True,True,True,True
20000260,138493,69644,3.0,True,False,False,False,False
20000261,138493,70286,5.0,True,True,True,True,True


In [14]:
# Feature Building
# Attach the item (movie) features to every rating row via a left join on the movie id,
# so each rating row is kept even if its movie id were absent from item_df.
# `validate='many_to_one'` makes pandas raise a MergeError if item_df ever contains a
# duplicated movie id, instead of silently duplicating rating rows in the joined frame.
rating_df.merge(
    item_df, on=IID, how='left', validate='many_to_one'
)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,clean_title,year,action,adventure,...,horror,imax,musical,mystery,no_genres_listed,romance,scifi,thriller,war,western
0,1,2,3.5,2005-04-02 23:53:47,jumanji (1995),Adventure|Children|Fantasy,jumanji,1995,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,29,3.5,2005-04-02 23:31:16,"city of lost children, the (cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi,city of lost children the cit des enfants perd...,1995,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1,32,3.5,2005-04-02 23:33:39,twelve monkeys (a.k.a. 12 monkeys) (1995),Mystery|Sci-Fi|Thriller,twelve monkeys aka 12 monkeys,1995,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,1,47,3.5,2005-04-02 23:32:07,seven (a.k.a. se7en) (1995),Mystery|Thriller,seven aka se7en,1995,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,50,3.5,2005-04-02 23:29:40,"usual suspects, the (1995)",Crime|Mystery|Thriller,usual suspects the,1995,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00,up (2009),Adventure|Animation|Children|Drama,up,2009,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20000259,138493,69526,4.5,2009-12-03 18:31:48,transformers: revenge of the fallen (2009),Action|Adventure|Sci-Fi|IMAX,transformers revenge of the fallen,2009,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
20000260,138493,69644,3.0,2009-12-07 18:10:57,ice age: dawn of the dinosaurs (2009),Action|Adventure|Animation|Children|Comedy|Rom...,ice age dawn of the dinosaurs,2009,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
20000261,138493,70286,5.0,2009-11-13 15:42:24,district 9 (2009),Mystery|Sci-Fi|Thriller,district 9,2009,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


In [17]:
# List the distinct values of `year` in the item table (first occurrence of each is kept).
item_df.loc[:, ['year']].drop_duplicates()

Unnamed: 0,year
0,1995
50,1994
60,1996
109,1976
119,1992
...,...
23796,1895
23865,2015
24984,1900
26199,1905


In [18]:
# Feature ideas: movie age (from year), age (from first view), number of years viewed.
# Show the item rows whose `year` is missing, since year-based features cannot be
# derived for them directly.
item_df.loc[item_df['year'].isna()]

Unnamed: 0,movieId,title,genres,clean_title,year,action,adventure,animation,children,comedy,...,horror,imax,musical,mystery,no_genres_listed,romance,scifi,thriller,war,western
10593,40697,babylon 5,Sci-Fi,babylon 5,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
23617,112406,brazil: in the shadow of the stadiums,Documentary,brazil in the shadow of the stadiums,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23824,113190,slaying the badger,Documentary,slaying the badger,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24286,115133,tatort: im schmerz geboren,Crime,tatort im schmerz geboren,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24412,115685,national theatre live: frankenstein,Drama|Fantasy,national theatre live frankenstein,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26115,125571,the court-martial of jackie robinson,(no genres listed),the courtmartial of jackie robinson,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
26127,125632,in our garden,(no genres listed),in our garden,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
26180,125958,stephen fry in america - new world,(no genres listed),stephen fry in america new world,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
26335,126438,two: the story of roman & nyro,Documentary|Drama,two the story of roman nyro,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26395,126929,li'l quinquin,(no genres listed),lil quinquin,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
