In [1]:
import unicodedata
import re
import math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the three pre-cleaned metadata tables and the raw user ratings.
# NOTE(review): paths are relative to the notebook's working directory, so the
# notebook must be run from its own folder for these reads to succeed.
credits_df = pd.read_csv('../data/clean/clean_credits.csv')
keywords_df = pd.read_csv('../data/clean/clean_keywords.csv')
movies_metadata_df = pd.read_csv('../data/clean/clean_movies_metadata.csv')
rating_df = pd.read_csv('../data/raw/ratings.csv')

In [3]:
# Join credits, metadata and keywords into one frame keyed on the movie `id`.
df = (
    credits_df
    .merge(movies_metadata_df, on='id')
    .merge(keywords_df, on='id')
)

In [4]:
# Preview the first three merged rows to sanity-check the join.
df.head(n=3)

Unnamed: 0.1,Unnamed: 0_x,cast,id,director,Unnamed: 0_y,belongs_to_collection,budget,genres,original_language,popularity,...,runtime,status,title,vote_average,vote_count,release_year,profit,weighted_rating,Unnamed: 0,keywords
0,0,"['Tom Hanks', 'Tim Allen', 'Don Rickles']",862,John Lasseter,0,1,0.078947,"['Animation', 'Comedy', 'Family']",en,21.946943,...,81.0,Released,Toy Story,7.7,5415.0,1995.0,0.90409,7.684684,0,"['new toy', 'boy next door', 'toy comes to lif..."
1,1,"['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",8844,Joe Johnston,1,0,0.171053,"['Adventure', 'Fantasy', 'Family']",en,17.01554,...,104.0,Released,Jumanji,6.9,2413.0,1995.0,0.520519,6.877012,1,"['recluse', 'board game', 'new home', 'giant i..."
2,2,"['Walter Matthau', 'Jack Lemmon', 'Ann-Margret']",15602,Howard Deutch,2,1,,"['Romance', 'Comedy']",en,11.7129,...,101.0,Released,Grumpier Old Men,6.5,92.0,1995.0,,6.163983,2,"['fishing', 'best friend', 'duringcreditssting..."


In [5]:
# Remove every column whose name matches "Unname" (the leftover
# "Unnamed: 0*" columns visible in the preview), then summarise the frame.
df.drop(columns=df.filter(regex="Unname").columns, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31641 entries, 0 to 31640
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   cast                   30752 non-null  object 
 1   id                     31641 non-null  int64  
 2   director               31508 non-null  object 
 3   belongs_to_collection  31641 non-null  int64  
 4   budget                 7815 non-null   float64
 5   genres                 31641 non-null  object 
 6   original_language      31641 non-null  object 
 7   popularity             31641 non-null  float64
 8   production_companies   31641 non-null  object 
 9   production_countries   31641 non-null  object 
 10  release_date           31641 non-null  object 
 11  revenue                6838 non-null   float64
 12  runtime                31641 non-null  float64
 13  status                 31641 non-null  object 
 14  title                  31641 non-null  object 
 15  vo

In [6]:
# Count movies per release status; the next cell keeps only 'Released'.
df['status'].value_counts()

Released           31470
Rumored              101
Post Production       51
Planned               10
In Production          8
Canceled               1
Name: status, dtype: int64

In [7]:
# Keep only released movies and discard the now-constant `status` column.
# Done as one non-mutating expression: an in-place drop on a freshly filtered
# slice is the chained-assignment pattern pandas warns about.
df = df.loc[df['status'] == 'Released'].drop(columns='status')

In [8]:
# Narrow the frame to the movie id, its title and the text features
# (cast, director, genres, keywords) that are combined into `bow` later.
df = df[['id','title','cast','director','genres','keywords']]

In [9]:
# Missing values per column before cleaning.
df.isnull().sum()

id            0
title         0
cast        876
director    132
genres        0
keywords      0
dtype: int64

In [10]:
# Drop rows with any missing value. Rebinding instead of `inplace=True`
# avoids mutating a frame that was produced by column selection, which is
# what triggers pandas' copy-of-a-slice warnings.
df = df.dropna()

In [11]:
# Re-check: every column should now report zero missing values.
df.isnull().sum()

id          0
title       0
cast        0
director    0
genres      0
keywords    0
dtype: int64

In [12]:
# Bind a second name to the same frame (no copy is made here); `df` keeps the
# `id` column that the rating lookup relies on later.
training_df = df

In [13]:
# Keep the title plus the text feature columns; `id` is dropped from
# `training_df` but remains available on `df`.
training_df=training_df[['title','cast','director','genres','keywords']]

In [14]:
# Preview the feature frame.
training_df.head(n=3)

Unnamed: 0,title,cast,director,genres,keywords
0,Toy Story,"['Tom Hanks', 'Tim Allen', 'Don Rickles']",John Lasseter,"['Animation', 'Comedy', 'Family']","['new toy', 'boy next door', 'toy comes to lif..."
1,Jumanji,"['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",Joe Johnston,"['Adventure', 'Fantasy', 'Family']","['recluse', 'board game', 'new home', 'giant i..."
2,Grumpier Old Men,"['Walter Matthau', 'Jack Lemmon', 'Ann-Margret']",Howard Deutch,"['Romance', 'Comedy']","['fishing', 'best friend', 'duringcreditssting..."


In [15]:
# Row count and dtypes of the feature frame before de-duplication.
training_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30462 entries, 0 to 31640
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     30462 non-null  object
 1   cast      30462 non-null  object
 2   director  30462 non-null  object
 3   genres    30462 non-null  object
 4   keywords  30462 non-null  object
dtypes: object(5)
memory usage: 1.4+ MB


In [16]:
# Keep the first row for each title so that a title identifies exactly one
# row, then renumber the rows 0..n-1. As in the original cell, the previous
# row labels are preserved in a new `index` column.
# Written without `inplace=True`: `training_df` is a column slice of `df`,
# and mutating it in place raises SettingWithCopyWarning.
training_df = training_df.drop_duplicates("title").reset_index()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [17]:
def clean_data(value):
    """Normalise a free-text label into a lowercase, underscore-joined token.

    Steps: decompose accented characters (NFD) and discard anything that is
    not ASCII, trim surrounding whitespace, turn spaces into underscores,
    collapse every run of non-alphanumeric characters into a single
    underscore, and lowercase the result.

    Args:
        value: The string to normalise.

    Returns:
        The normalised token. If the token is exactly "_" (the input held
        only separators/punctuation), a single space is returned instead.
        An input that normalises to the empty string is returned as "".
    """
    ascii_text = (
        unicodedata.normalize('NFD', value)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )
    underscored = ascii_text.strip().replace(" ", "_")
    token = re.sub(r"[^a-zA-Z0-9]+", '_', underscored).lower()
    return " " if token == '_' else token

In [18]:
# `director` holds one plain string per movie; normalise it directly.
training_df['director'] = training_df['director'].apply(clean_data)

# These columns hold Python list literals stored as text: parse each one
# with `literal_eval`, then normalise every entry of the resulting list.
for column in ('cast', 'genres', 'keywords'):
    training_df[column] = training_df[column].apply(
        lambda raw: [clean_data(entry) for entry in literal_eval(raw)]
    )

In [19]:
# Check the normalised tokens.
training_df.head(n=3)

Unnamed: 0,index,title,cast,director,genres,keywords
0,0,Toy Story,"[tom_hanks, tim_allen, don_rickles]",john_lasseter,"[animation, comedy, family]","[new_toy, boy_next_door, toy_comes_to_life, to..."
1,1,Jumanji,"[robin_williams, jonathan_hyde, kirsten_dunst]",joe_johnston,"[adventure, fantasy, family]","[recluse, board_game, new_home, giant_insect, ..."
2,2,Grumpier Old Men,"[walter_matthau, jack_lemmon, ann_margret]",howard_deutch,"[romance, comedy]","[fishing, best_friend, duringcreditsstinger, o..."


In [20]:
# Build one space-separated token string per movie: cast, keywords and genres
# (list concatenation, then joined) followed by the director token.
training_df['bow'] = (
    (training_df['cast'] + training_df['keywords'] + training_df['genres']).apply(' '.join)
    + " "
    + training_df['director']
)

In [21]:
# Inspect the combined token strings.
training_df["bow"].head(n=3)

0    tom_hanks tim_allen don_rickles new_toy boy_ne...
1    robin_williams jonathan_hyde kirsten_dunst rec...
2    walter_matthau jack_lemmon ann_margret fishing...
Name: bow, dtype: object

In [22]:
# Convert each movie's token string into term counts, with scikit-learn's
# built-in English stop-word list removed.
# NOTE(review): presumably the underscore-joined names survive as single
# tokens under the default tokenizer - confirm via vectorizer.get_feature_names.
vectorizer = CountVectorizer(stop_words='english')
train_array = vectorizer.fit_transform(training_df['bow'])

In [23]:
# Pairwise cosine similarity between every pair of movies; later code reads
# it as cosine_sim[row][col] using row positions of `training_df`.
# NOTE(review): the matrix is movies x movies, so memory grows quadratically
# with the catalogue size.
cosine_sim = cosine_similarity(train_array, train_array)

In [24]:
# Make sure rows are labelled 0..n-1 so the labels line up with the rows of
# `cosine_sim`. `drop=True` keeps the frame's columns unchanged: a plain
# `reset_index()` would add a stray `level_0` column here (an `index` column
# already exists) and would raise if this cell were run a second time.
training_df = training_df.reset_index(drop=True)

# Title -> row position lookup used to translate rated movies into
# similarity-matrix rows.
indices = pd.Series(training_df.index, index=training_df['title'])

In [25]:
def get_items_rated_by_user(user_id):
    """Return the catalogue positions and ratings of the movies a user rated.

    Reads the notebook globals `rating_df`, `df` and `indices`.

    Args:
        user_id: Value matched against `rating_df['userId']`.

    Returns:
        A tuple `(positions, ratings)` of two Series sharing the row labels
        of the matching `rating_df` rows. `positions` holds the value that
        `indices` stores for each rated movie's title; `ratings` is the
        `rating` column. Ratings whose `movieId` does not occur in
        `df['id']` are dropped.

    NOTE(review): this assumes `movieId` in the ratings file uses the same
    id space as the metadata `id` column - confirm against the dataset.
    """
    is_user = rating_df['userId'] == user_id
    user_ratings = rating_df[is_user]
    # Explicit copy: the frame is modified below, and writing into a filtered
    # slice is what raises SettingWithCopyWarning.
    user_ratings = user_ratings[user_ratings['movieId'].isin(df['id'])].copy()

    # Build the id -> title lookup once per call instead of scanning `df` for
    # every rating. Keeping the first row per id matches the original
    # "first match" behaviour.
    id_to_title = df.drop_duplicates('id').set_index('id')['title']
    rated_titles = user_ratings['movieId'].map(id_to_title)
    user_ratings['movieId'] = rated_titles.map(indices)

    return (user_ratings['movieId'], user_ratings['rating'])

In [26]:
def predict_rating(known_ids, known_scores, unknown_id, unknown_score):
    """Predict a rating as the similarity-weighted mean of known ratings.

    Reads the notebook global `cosine_sim`.

    Args:
        known_ids: Series of similarity-matrix row positions of rated items.
        known_scores: Series of ratings carrying the same labels as `known_ids`.
        unknown_id: Similarity-matrix position of the item to score.
        unknown_score: Value returned when the similarity weights sum to zero
            (including when `known_ids` is empty).

    Returns:
        sum(score * similarity) / sum(similarity) over the known items, or
        `unknown_score` when the denominator is zero.
    """
    weighted_total = 0
    weight_total = 0
    for label in known_ids.index:
        similarity = cosine_sim[known_ids[label]][unknown_id]
        weighted_total += known_scores[label] * similarity
        weight_total += similarity

    if weight_total != 0:
        return weighted_total / weight_total
    return unknown_score

In [27]:
def get_recommendations(user_id, top_n=10):
    """Recommend the unrated movies with the highest predicted rating.

    Reads the notebook globals `indices` and `training_df` and calls
    `get_items_rated_by_user` and `predict_rating`.

    Args:
        user_id: User to recommend for.
        top_n: Number of titles to return (defaults to 10, the value that was
            previously hard-coded).

    Returns:
        A Series of titles taken from `training_df['title']`, ordered from
        highest to lowest predicted rating. Ties keep ascending row order.
    """
    ids, scores = get_items_rated_by_user(user_id)

    # Materialise the rated positions once; rebuilding the list for every
    # candidate made the loop quadratic.
    already_rated = set(ids.tolist())

    # Movies without any similar rated item fall back to a score of 0.
    predicted = {
        position: predict_rating(ids, scores, position, 0)
        for position in range(len(indices))
        if position not in already_rated
    }

    # `sorted` is stable, so equal scores stay in ascending position order.
    ranked_positions = sorted(predicted, key=predicted.get, reverse=True)
    return training_df['title'].iloc[ranked_positions[:top_n]]

In [28]:
# Example: recommended titles for user 1.
get_recommendations(user_id=1)

148                       Judge Dredd
272                    The Specialist
378                       Cliffhanger
647                  Independence Day
656                          Daylight
1098    Amityville II: The Possession
1388                        Anastasia
1561          Plan 9 from Outer Space
1979       Rambo: First Blood Part II
1980                      First Blood
Name: title, dtype: object

The evaluation (test) part starts here.

In [29]:
def calculate_error(ids, scores):
    """Leave-one-out RMSE of `predict_rating` over one user's ratings.

    Each rated item is held out in turn, predicted from the user's remaining
    ratings, and compared with the true rating.

    Args:
        ids: Series of similarity-matrix positions of the rated items.
        scores: Series of the matching ratings, carrying the same labels.

    Returns:
        The root-mean-square error as a float, or NaN when `ids` is empty
        (previously this case raised ZeroDivisionError).
    """
    if ids.size == 0:
        # An error averaged over zero ratings is undefined.
        return float('nan')

    squared_error = 0
    for label in ids.index:
        # NOTE(review): the true score is passed as the fallback, so a
        # held-out item with zero total similarity contributes zero error.
        # This makes the reported RMSE optimistic; consider a neutral
        # fallback such as the mean of the remaining scores.
        predicted = predict_rating(
            ids.drop(label), scores.drop(label), ids[label], scores[label]
        )
        squared_error += (scores[label] - predicted) ** 2

    return (squared_error / ids.size) ** (1/2)

In [30]:
def calculate_all_error(n_user):
    """Average the per-user leave-one-out RMSE over users 1..n_user.

    Calls `get_items_rated_by_user` and `calculate_error` for each user id.

    Args:
        n_user: Highest user id to evaluate (ids start at 1).

    Returns:
        The mean of the per-user errors over the users that have at least
        one usable rating; 0.0 when no user could be evaluated.
    """
    total_error = 0
    evaluated_users = 0
    for user_id in range(1, n_user + 1):
        ids, scores = get_items_rated_by_user(user_id)
        if ids.size == 0:
            # No ratings map into the catalogue, so the RMSE is undefined.
            continue
        total_error += calculate_error(ids, scores)
        evaluated_users += 1

    if evaluated_users == 0:
        return 0.0
    # Divide by the number of users actually evaluated; the divisor used to
    # be a hard-coded 10, which was only right for n_user == 10.
    return total_error / evaluated_users

In [31]:
# Evaluate the recommender on users 1 through 10.
calculate_all_error(10)

1.0109529162552133