In [3]:
## for data
import pandas as pd
import numpy as np
import re
from datetime import datetime
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for machine learning
from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [4]:
df_ratings = pd.read_csv('datasets/to_use/ratings_cleaned.csv')
df_ratings.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
df_ratings

Unnamed: 0,userId,movieId,rating
0,147413,1,3.5
1,103254,1,3.0
2,5320,1,5.0
3,1317,1,3.0
4,29725,1,3.5
...,...,...,...
13524164,39183,208941,3.0
13524165,92412,208943,2.0
13524166,84238,209041,3.0
13524167,15152,209053,3.5


In [6]:
df_movies_full = pd.read_csv("datasets/to_use/movies_web_app_with_id_final.csv", dtype = {'movieId': int}) #still have UI information 
df_movies = df_movies_full.iloc[:, 7:]
df_movies_original = df_movies.copy() #still have movieId

In [7]:
df_movies_full

Unnamed: 0,title,imdb_rating,num_of_rating,title_only,year,poster_url,plot,movieId,Action,Adult,...,certificate_nan,num_rating_super_low,num_rating_low,num_rating_medium,num_rating_high,num_rating_super_high,imdb_rating_low,imdb_rating_medium,imdb_rating_high,imdb_rating_super_high
0,Toy Story (1995),8.3,997000.0,Toy Story,1995,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,A cowboy doll is profoundly threatened and jea...,1,0,0,...,0,0,0,0,0,1,0,0,0,1
1,Jumanji (1995),7.0,351000.0,Jumanji,1995,https://m.media-amazon.com/images/M/MV5BZTk2Zm...,When two kids find and play a magical board ga...,2,0,0,...,0,0,0,0,0,1,0,0,1,0
2,Waiting to Exhale (1995),5.9,11000.0,Waiting to Exhale,1995,,,4,0,0,...,0,0,0,0,1,0,0,0,1,0
3,Tom and Huck (1995),5.5,11000.0,Tom and Huck,1995,,,8,0,0,...,0,0,0,0,1,0,0,0,1,0
4,Nixon (1995),7.1,32000.0,Nixon,1995,,,14,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82665,Üç Sevgili (1972),5.5,187.0,Üç Sevgili,1972,,,269115,0,0,...,1,1,0,0,0,0,0,0,1,0
82666,Üç arkadas (1972),6.9,494.0,Üç arkadas,1972,,,269116,0,0,...,1,1,0,0,0,0,0,0,1,0
82667,Üçüncü Sayfa (1999),7.3,3900.0,Üçüncü Sayfa,1999,,,269117,0,0,...,1,0,0,1,0,0,0,0,1,0
82668,Þrestir (2015),6.8,1600.0,Þrestir,2015,,,269118,0,0,...,1,0,0,1,0,0,0,0,1,0


In [8]:
df_movies_original

Unnamed: 0,movieId,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,certificate_nan,num_rating_super_low,num_rating_low,num_rating_medium,num_rating_high,num_rating_super_high,imdb_rating_low,imdb_rating_medium,imdb_rating_high,imdb_rating_super_high
0,1,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,2,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,4,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,1,0
3,8,0,0,1,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,1,0
4,14,0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82665,269115,0,0,0,0,0,1,0,0,0,...,1,1,0,0,0,0,0,0,1,0
82666,269116,0,0,0,0,0,1,0,0,1,...,1,1,0,0,0,0,0,0,1,0
82667,269117,0,0,0,0,0,0,0,0,1,...,1,0,0,1,0,0,0,0,1,0
82668,269118,0,0,0,0,0,0,0,0,1,...,1,0,0,1,0,0,0,0,1,0


In [9]:
df_movies.set_index('movieId', inplace=True)

In [10]:
df_movies

Unnamed: 0_level_0,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,certificate_nan,num_rating_super_low,num_rating_low,num_rating_medium,num_rating_high,num_rating_super_high,imdb_rating_low,imdb_rating_medium,imdb_rating_high,imdb_rating_super_high
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,0,0,1,0,0,1,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
8,0,0,1,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
14,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269115,0,0,0,0,0,1,0,0,0,0,...,1,1,0,0,0,0,0,0,1,0
269116,0,0,0,0,0,1,0,0,1,0,...,1,1,0,0,0,0,0,0,1,0
269117,0,0,0,0,0,0,0,0,1,0,...,1,0,0,1,0,0,0,0,1,0
269118,0,0,0,0,0,0,0,0,1,0,...,1,0,0,1,0,0,0,0,1,0


In [11]:
df_movies_full[df_movies_full.title_only == 'Frozen']

Unnamed: 0,title,imdb_rating,num_of_rating,title_only,year,poster_url,plot,movieId,Action,Adult,...,certificate_nan,num_rating_super_low,num_rating_low,num_rating_medium,num_rating_high,num_rating_super_high,imdb_rating_low,imdb_rating_medium,imdb_rating_high,imdb_rating_super_high
7262,Frozen (2007),7.0,218.0,Frozen,2007,,,82452,0,0,...,0,1,0,0,0,0,0,0,1,0


In [12]:
#dummy data for 1 movie only
movie_id = int('55820')
movie_name = 'No Country for Old Men (2007)'

movie_id = int('82452')
movie_name = 'Frozen (2007)	'


In [13]:
target_movie_cosine_sim_index = df_movies_original.index[df_movies_original['movieId'] == movie_id].values[0] #get 0-based index in cosine similarity array
target_movie_cosine_sim_index

7262

In [14]:
# df_movies_original[df_movies_original['movieId'] == movie_id]

In [15]:
target_movie = df_movies.loc[[movie_id]]
target_movie

Unnamed: 0_level_0,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,certificate_nan,num_rating_super_low,num_rating_low,num_rating_medium,num_rating_high,num_rating_super_high,imdb_rating_low,imdb_rating_medium,imdb_rating_high,imdb_rating_super_high
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
82452,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0


# combining multiple movies into 1 vector

In [16]:
df_ratings.userId.value_counts()

72315     12252
80974      4956
137293     3888
33844      3797
20055      3493
          ...  
67696         2
82608         2
26880         2
3784          2
117741        1
Name: userId, Length: 162541, dtype: int64

In [17]:
userId = 72315

movie_id_df = df_ratings.loc[df_ratings['userId']==userId]
# movie_id_df

user_movies = df_movies_original[df_movies_original['movieId'].isin(movie_id_df['movieId'].tolist())]
# user_movies.set_index('movieId', inplace=True)
user_movies

Unnamed: 0,movieId,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,certificate_nan,num_rating_super_low,num_rating_low,num_rating_medium,num_rating_high,num_rating_super_high,imdb_rating_low,imdb_rating_medium,imdb_rating_high,imdb_rating_super_high
0,1,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,2,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,14,0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,1,0
5,15,1,0,1,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
6,16,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22505,206337,0,0,0,0,0,0,0,1,1,...,0,1,0,0,0,0,0,0,1,0
22506,206347,0,0,0,0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
22508,206353,0,0,0,0,0,1,0,0,1,...,1,1,0,0,0,0,0,1,0,0
22543,206755,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [18]:
# USING DUMMY DATA

userId = 696969

df_ratings = pd.read_csv('datasets/to_use/ratings_custom_wilson.csv')

movie_id_df = df_ratings.loc[df_ratings['userId']==userId]
# movie_id_df

user_movies = df_movies_original[df_movies_original['movieId'].isin(movie_id_df['movieId'].tolist())]
# user_movies.set_index('movieId', inplace=True)
user_movies

Unnamed: 0,movieId,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,certificate_nan,num_rating_super_low,num_rating_low,num_rating_medium,num_rating_high,num_rating_super_high,imdb_rating_low,imdb_rating_medium,imdb_rating_high,imdb_rating_super_high
171,364,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
3763,8360,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5444,50872,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
5867,59315,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
6612,72998,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
6819,77561,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
7850,89745,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
8022,91630,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
8319,95167,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
8977,103688,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [19]:
rating_df=df_ratings.loc[df_ratings['userId']==userId][['movieId','rating']]
rating_df = rating_df[rating_df['movieId'].isin(user_movies['movieId'].tolist())]

rating_df.set_index('movieId', inplace=True)
rating_df

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
59315,5
77561,5
89745,3
206959,5
72998,4
104908,1
103688,2
263394,4
91630,4
225702,5


In [20]:
user_movies.set_index('movieId', inplace=True)
user_movies

Unnamed: 0_level_0,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,certificate_nan,num_rating_super_low,num_rating_low,num_rating_medium,num_rating_high,num_rating_super_high,imdb_rating_low,imdb_rating_medium,imdb_rating_high,imdb_rating_super_high
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
364,0,0,1,1,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1
8360,0,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
50872,0,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
59315,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
72998,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
77561,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
89745,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
91630,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
95167,0,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
103688,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [21]:
user_movies.shape[0]

15

In [22]:
userProfile = rating_df.transpose().dot(user_movies)
userProfile = userProfile / user_movies.shape[0]
userProfile

Unnamed: 0,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,certificate_nan,num_rating_super_low,num_rating_low,num_rating_medium,num_rating_high,num_rating_super_high,imdb_rating_low,imdb_rating_medium,imdb_rating_high,imdb_rating_super_high
rating,2.0,0.0,2.866667,1.466667,0.0,1.2,0.333333,0.0,0.533333,0.0,...,0.0,0.0,0.0,0.0,0.0,3.666667,0.0,0.0,2.133333,1.533333


In [23]:
userProfile.transpose().sort_values(by='rating', ascending=False)

Unnamed: 0,rating
num_rating_super_high,3.666667
Adventure,2.866667
imdb_rating_high,2.133333
PG-13,2.066667
Action,2.000000
...,...
GP,0.000000
Documentary,0.000000
M,0.000000
M/PG,0.000000


In [24]:
target_movie = userProfile.copy()

# checkpoint before applying cosine sim

In [25]:
normalised_df_movies = df_movies.astype(np.float32)

cosine_sim = cosine_similarity(normalised_df_movies, target_movie)
# euclid_dist = euclidean_distances(normalised_df_movies, target_movie)

In [26]:
cosine_sim
# euclid_dist

array([[0.69246711],
       [0.67185797],
       [0.24730968],
       ...,
       [0.20192751],
       [0.20192751],
       [0.22576182]])

In [27]:
similar_movies = list(enumerate(cosine_sim.flatten()))
# similar_movies
sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)

sorted_similar_movies

[(292, 0.862410154282746),
 (5877, 0.862410154282746),
 (78295, 0.862410154282746),
 (232, 0.861462063894709),
 (888, 0.861462063894709),
 (1681, 0.861462063894709),
 (1842, 0.861462063894709),
 (2222, 0.861462063894709),
 (2626, 0.861462063894709),
 (3214, 0.861462063894709),
 (4904, 0.861462063894709),
 (5475, 0.861462063894709),
 (5920, 0.861462063894709),
 (6959, 0.861462063894709),
 (8974, 0.861462063894709),
 (9779, 0.861462063894709),
 (13269, 0.861462063894709),
 (14005, 0.861462063894709),
 (18907, 0.861462063894709),
 (20304, 0.861462063894709),
 (21996, 0.861462063894709),
 (40255, 0.861462063894709),
 (53982, 0.861462063894709),
 (55828, 0.861462063894709),
 (55829, 0.861462063894709),
 (71400, 0.861462063894709),
 (76270, 0.861462063894709),
 (56834, 0.8408529236101467),
 (100, 0.8408529236101466),
 (693, 0.8408529236101466),
 (848, 0.8408529236101466),
 (1710, 0.8408529236101466),
 (2173, 0.8408529236101466),
 (2175, 0.8408529236101466),
 (2451, 0.8408529236101466),
 (255

In [28]:
#EUCLIDIAN DISTANCE
normalised_df_movies = df_movies.astype(np.float32)

euclid_dist = euclidean_distances(normalised_df_movies, target_movie)

similar_movies = list(enumerate(euclid_dist.flatten()))
# similar_movies
sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)

def get_title_from_index(index):
    movieId = df_movies_original.iloc[index]['movieId']
    return df_movies_full[df_movies_full.movieId == movieId]["title"].values[0]
target_movie_cosine_sim_index = df_movies_original.index[df_movies_original['movieId'] == movie_id].values[0] #get 0-based index in cosine similarity array of the target movie
i=0
final_dict = {}
for movie in sorted_similar_movies:
    if movie[0] == target_movie_cosine_sim_index:
        continue
#     print(get_title_from_index(movie[0]))
    title = get_title_from_index(movie[0])
    score = movie[1]
    final_dict[i] = {
        'title': title,
        'score': score
    }
    i=i+1
    if i>=50:
        break
        
for movie in final_dict:
    print(final_dict[movie])

{'title': 'Cinderela Pop (2019)', 'score': 7.042726744663604}
{'title': 'Duets (2012)', 'score': 7.042726744663604}
{'title': 'Exatlon Türkiye (2020)', 'score': 7.042726744663604}
{'title': 'High School Musical: El Desafío (I) (2008)', 'score': 7.042726744663604}
{'title': 'Iranium (2011)', 'score': 7.042726744663604}
{'title': 'Joe Millionaire (2003)', 'score': 7.042726744663604}
{'title': 'Junior Rodeo Daredevils (1949)', 'score': 7.042726744663604}
{'title': 'One Nation Under Trump (2016)', 'score': 7.042726744663604}
{'title': 'President McKinley Taking the Oath (1901)', 'score': 7.042726744663604}
{'title': 'President McKinley and Escort Going to the Capitol (1901)', 'score': 7.042726744663604}
{'title': 'The Bachelor Winter Games (2018)', 'score': 7.042726744663604}
{'title': 'The Celebrity Dating Game (2021)', 'score': 7.042726744663604}
{'title': 'The Derby (I) (1896)', 'score': 7.042726744663604}
{'title': 'The Derby 1895 (1895)', 'score': 7.042726744663604}
{'title': 'The Hid

In [29]:
# COSINE SIMILARITY

normalised_df_movies = df_movies.astype(np.float32)

cosine_sim = cosine_similarity(normalised_df_movies, target_movie)

similar_movies = list(enumerate(cosine_sim.flatten()))
# similar_movies
sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)

sorted_similar_movies

def get_title_from_index(index):
    movieId = df_movies_original.iloc[index]['movieId']
    return df_movies_full[df_movies_full.movieId == movieId]["title"].values[0]
# target_movie_cosine_sim_index = df_movies_original.index[df_movies_original['movieId'] == movie_id].values[0] #get 0-based index in cosine similarity array of the target movie
i=0
final_dict = {}
for movie in sorted_similar_movies:
#     if movie[0] == target_movie_cosine_sim_index:
#         continue
#     print(get_title_from_index(movie[0]))
    title = get_title_from_index(movie[0])
    score = movie[1]
    final_dict[i] = {
        'title': title,
        'score': score
    }
    i=i+1
    if i>=50:
        break
        
for movie in final_dict:
    print(final_dict[movie])

{'title': 'Batman (1989)', 'score': 0.862410154282746}
{'title': 'Indiana Jones and the Kingdom of the Crystal Skull (2008)', 'score': 0.862410154282746}
{'title': 'Uncharted (2022)', 'score': 0.862410154282746}
{'title': 'Last Action Hero (1993)', 'score': 0.861462063894709}
{'title': 'The Mask of Zorro (1998)', 'score': 0.861462063894709}
{'title': 'Shanghai Noon (2000)', 'score': 0.861462063894709}
{'title': "Charlie's Angels (2000)", 'score': 0.861462063894709}
{'title': 'Rat Race (2001)', 'score': 0.861462063894709}
{'title': 'Austin Powers in Goldmember (2002)', 'score': 0.861462063894709}
{'title': 'The Rundown (2003)', 'score': 0.861462063894709}
{'title': 'The Brothers Grimm (2005)', 'score': 0.861462063894709}
{'title': 'Wild Hogs (2007)', 'score': 0.861462063894709}
{'title': 'Get Smart (2008)', 'score': 0.861462063894709}
{'title': 'Knight and Day (2010)', 'score': 0.861462063894709}
{'title': 'R.I.P.D. (2013)', 'score': 0.861462063894709}
{'title': 'Teenage Mutant Ninja Tu

In [38]:
# df_movies_full[df_movies_full.title_only == 'Frozen']
df_movies_full[df_movies_full['title_only'].str.lower().str.contains('dolittle')]

Unnamed: 0,title,imdb_rating,num_of_rating,title_only,year,poster_url,plot,movieId,Action,Adult,...,certificate_nan,num_rating_super_low,num_rating_low,num_rating_medium,num_rating_high,num_rating_super_high,imdb_rating_low,imdb_rating_medium,imdb_rating_high,imdb_rating_super_high
954,Doctor Dolittle (1967),6.2,9700.0,Doctor Dolittle,1967,,,2135,0,0,...,0,0,0,1,0,0,0,0,1,0
36201,Doctor Dolittle (1998),5.4,98000.0,Doctor Dolittle,1998,,,222651,0,0,...,0,0,0,0,1,0,0,0,1,0
36299,Dolittle (2020),5.6,66000.0,Dolittle,2020,,,222749,0,0,...,0,0,0,0,1,0,0,0,1,0


In [33]:
df_movies_full[df_movies_full.movieId == 2355]

Unnamed: 0,title,imdb_rating,num_of_rating,title_only,year,poster_url,plot,movieId,Action,Adult,...,certificate_nan,num_rating_super_low,num_rating_low,num_rating_medium,num_rating_high,num_rating_super_high,imdb_rating_low,imdb_rating_medium,imdb_rating_high,imdb_rating_super_high
