In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np

from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip

--2023-11-05 09:53:26--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip.7’


2023-11-05 09:53:27 (2.68 MB/s) - ‘ml-latest-small.zip.7’ saved [978202/978202]

Archive:  ml-latest-small.zip
replace ml-latest-small/links.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Load the four MovieLens tables extracted by the download cell.
ratings, movies, tags, links = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ml-latest-small/ratings.csv',
        'ml-latest-small/movies.csv',
        'ml-latest-small/tags.csv',
        'ml-latest-small/links.csv',
    )
)

In [None]:
# The rating timestamp is not used by any later cell.
# Non-mutating drop with errors='ignore' keeps the cell idempotent: the original
# in-place drop raised KeyError when the cell was executed a second time.
ratings = ratings.drop(columns='timestamp', errors='ignore')

In [None]:
# Distinct genre labels found in the pipe-delimited `genres` column, without the
# '(no genres listed)' placeholder.
# sorted() makes the order (and therefore the order of the indicator columns
# built from this list) reproducible; iterating a set of strings changes order
# between kernel sessions. Set difference also tolerates a dataset that lacks
# the placeholder, where list.remove() would raise ValueError.
genres = sorted(
    set(movies['genres'].str.split('|').explode()) - {'(no genres listed)'}
)

In [None]:
# Replace the pipe-delimited `genres` column with one 0/1 indicator column per
# genre label, and keep the raw strings in `genres` for the TF-IDF based
# recommender further down.
if 'genres' in movies.columns:  # guard: re-running the cell after the swap is a no-op
    genre_labels = list(genres)
    # str.get_dummies splits on '|' and matches whole labels; the previous
    # `genre in x` test was a substring match on the raw string.
    genre_flags = (
        movies['genres']
        .str.get_dummies(sep='|')
        .reindex(columns=genre_labels, fill_value=0)
    )
    # From here on `genres` is the Series of raw genre strings (indexed like
    # `movies` at this point), no longer the list of labels.
    genres = movies['genres']
    movies = pd.concat([movies.drop(columns='genres'), genre_flags], axis=1)

In [None]:
# Titles listed under more than one movieId. duplicated(keep=False) flags every
# copy, so titles with three or more ids are caught as well (the previous
# len(x) == 2 filter only handled exact pairs).
duplicate_movies = movies.loc[movies['title'].duplicated(keep=False), ['movieId', 'title']]
duplic_ids = duplicate_movies['movieId'].values

# Number of ratings received by each duplicated id
review_count = (
    ratings.loc[ratings['movieId'].isin(duplic_ids), 'movieId']
    .value_counts()
    .rename_axis('movieId')
    .reset_index(name='count')
)

# Left merge so that a duplicated id without any rating is kept with count 0
# instead of silently disappearing from the comparison.
duplicated_df = pd.merge(duplicate_movies, review_count, on='movieId', how='left')
duplicated_df['count'] = duplicated_df['count'].fillna(0).astype(int)
display(duplicated_df)

# Within each title put the most-reviewed id first (lowest movieId wins ties).
# sort_values returns a new frame, so the result has to be assigned; before,
# the sorted frame was discarded and the selection depended on row order.
duplicated_df = duplicated_df.sort_values(
    by=['title', 'count', 'movieId'], ascending=[True, False, True]
)
# Every id except the first (most-reviewed) one of each title is to be removed
duplicated_ids = duplicated_df.loc[
    duplicated_df.duplicated(subset='title', keep='first'), 'movieId'
]

# Removing duplicated ids with low review count from movie database
movies = movies.loc[~movies['movieId'].isin(duplicated_ids)]
# Removing duplicated ids with low review count from rating database
ratings = ratings.loc[~ratings['movieId'].isin(duplicated_ids)]

Unnamed: 0,movieId,title,count
0,838,Emma (1996),30
1,2851,Saturn 3 (1980),4
2,6003,Confessions of a Dangerous Mind (2002),15
3,26958,Emma (1996),1
4,32600,Eros (2004),1
5,34048,War of the Worlds (2005),50
6,64997,War of the Worlds (2005),2
7,144606,Confessions of a Dangerous Mind (2002),1
8,147002,Eros (2004),1
9,168358,Saturn 3 (1980),1


In [None]:
# One row per (movie, user) rating, carrying the movie's title and genre flags.
df = movies.merge(ratings, on='movieId')

In [None]:
# df.head()

In [None]:
# Per-user rating summary for the first 20 users.
# String aggregation names replace the callables (max, min, np.mean, np.median),
# whose use in agg() is deprecated in recent pandas; named aggregation keeps the
# original column labels, and 'size' counts rows per group like len() did.
df.groupby("userId")["rating"].agg(
    max="max", min="min", mean="mean", median="median", len="size"
).head(20)

Unnamed: 0_level_0,max,min,mean,median,len
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.0,1.0,4.366379,5.0,232
2,5.0,2.0,3.948276,4.0,29
3,5.0,0.5,2.435897,0.5,39
4,5.0,1.0,3.555556,4.0,216
5,5.0,1.0,3.636364,4.0,44
6,5.0,1.0,3.493631,3.0,314
7,5.0,0.5,3.230263,3.5,152
8,5.0,1.0,3.574468,3.0,47
9,5.0,1.0,3.26087,3.0,46
10,5.0,0.5,3.278571,3.5,140


In [None]:
# Preview the first rows of the merged movie/rating frame.
df.head()

Unnamed: 0,movieId,title,Children,Action,Film-Noir,War,Drama,Western,Sci-Fi,Documentary,...,Animation,Crime,Comedy,IMAX,Mystery,Adventure,Musical,Thriller,userId,rating
0,1,Toy Story (1995),1,0,0,0,0,0,0,0,...,1,0,1,0,0,1,0,0,1,4.0
1,1,Toy Story (1995),1,0,0,0,0,0,0,0,...,1,0,1,0,0,1,0,0,5,4.0
2,1,Toy Story (1995),1,0,0,0,0,0,0,0,...,1,0,1,0,0,1,0,0,7,4.5
3,1,Toy Story (1995),1,0,0,0,0,0,0,0,...,1,0,1,0,0,1,0,0,15,2.5
4,1,Toy Story (1995),1,0,0,0,0,0,0,0,...,1,0,1,0,0,1,0,0,17,4.5


In [None]:
def item_based_recommendation(df, movieName, top_n=10):
  """Rank movies by cosine similarity of their user-rating vectors to one movie.

  Parameters
  ----------
  df : DataFrame with at least the columns 'title', 'userId' and 'rating',
      holding at most one rating per (title, userId) pair (pd.pivot raises
      otherwise).
  movieName : title of the query movie; must occur in df['title'].
  top_n : number of similar movies to return (default 10).

  Returns
  -------
  (cosine_df, pivot_item)
      cosine_df has the columns 'title' and 'cosine_sim', sorted by descending
      similarity and indexed 1..len(cosine_df); the query movie is excluded.
      pivot_item is the title x userId rating matrix (NaN = not rated).

  Raises
  ------
  KeyError if movieName has no rating in df.
  """
  pivot_item = pd.pivot(df, index = 'title', columns = ['userId'], values = 'rating')
  if movieName not in pivot_item.index:
    raise KeyError(f"{movieName!r} has no ratings in the supplied frame")

  # Unrated entries count as 0 for the similarity computation
  ratings_filled = pivot_item.fillna(0)
  # Only the query movie's row is needed, so compare that single row against
  # all movies instead of building the full title x title matrix.
  similarities = cosine_similarity(ratings_filled.loc[[movieName]], ratings_filled)[0]

  cosine_df = (
      pd.Series(similarities, index=pivot_item.index, name='cosine_sim')
      # Drop the query explicitly: slicing off the first sorted row is wrong
      # when another movie ties with it at similarity 1.0.
      .drop(movieName)
      .sort_values(ascending=False)
      .head(top_n)
      .reset_index()
  )
  cosine_df.columns = ['title','cosine_sim']
  # Rank labels start at 1, matching the index callers saw from the former [1:11] slice
  cosine_df.index = range(1, len(cosine_df) + 1)
  return cosine_df, pivot_item

In [None]:
# Column labels between the two leading columns (movieId, title) and the two
# trailing ones (userId, rating), i.e. the genre indicator columns.
df.columns[2:-2].values

array(['Children', 'Action', 'Film-Noir', 'War', 'Drama', 'Western',
       'Sci-Fi', 'Documentary', 'Fantasy', 'Romance', 'Horror',
       'Animation', 'Crime', 'Comedy', 'IMAX', 'Mystery', 'Adventure',
       'Musical', 'Thriller'], dtype=object)

In [None]:
# Spot check: recommendations for Toy Story, then its mean rating taken from
# the returned title x user pivot (missing ratings are skipped by mean()).
c, p = item_based_recommendation(df, 'Toy Story (1995)')
p.loc['Toy Story (1995)'].mean()

3.9209302325581397

In [None]:
movie_name = 'Die Hard'

# regex=False: match the text literally. With the default regex matching,
# titles containing characters such as '(' ')' '.' '*' or '?' do not match as typed.
matching_titles = movies.loc[movies.title.str.contains(movie_name, regex=False), 'title']
# item_based_recommendation raises KeyError for a title that has no rating in df
rated_titles = set(df['title'])

for movie in matching_titles:
  if movie not in rated_titles:
    print(movie, "has no user ratings, skipping.")
    continue
  c, p = item_based_recommendation(df, movie)
  print(movie)
  print("Number of user ratings for this movie is: \n", p.loc[movie, :].count())
  print("Average user rating for this movie is: \n", p.loc[movie, :].mean())
  print("Similar movies for you based on user ratings: ")
  print(c.title)
  print("***********************************************************************")

Die Hard: With a Vengeance (1995)
Number of user ratings for this movie is: 
 144
Average user rating for this movie is: 
 3.5555555555555554
Similar movies for you based on user ratings: 
1                      True Lies (1994)
2                          Speed (1994)
3                    Cliffhanger (1993)
4     Ace Ventura: Pet Detective (1994)
5                      GoldenEye (1995)
6       Clear and Present Danger (1994)
7                  Fugitive, The (1993)
8                         Batman (1989)
9                       Outbreak (1995)
10                Batman Forever (1995)
Name: title, dtype: object
***********************************************************************
Die Hard (1988)
Number of user ratings for this movie is: 
 145
Average user rating for this movie is: 
 3.8620689655172415
Similar movies for you based on user ratings: 
1             Indiana Jones and the Last Crusade (1989)
2                                Terminator, The (1984)
3                            

In [None]:
# Attach the full rating rows to the most recently computed recommendation list.
df_item = c.merge(df, on = 'title')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Tokenise on the '|' delimiter so every genre label is exactly one token.
# The default token pattern (words of 2+ alphanumeric characters) would break
# 'Sci-Fi' into 'sci' + 'fi', 'Film-Noir' into 'film' + 'noir' and
# '(no genres listed)' into three unrelated words, giving those genres extra weight.
vectorizer = TfidfVectorizer(token_pattern=r'[^|]+')

In [None]:
def genre_based_recommendations(user_id):
  """Print the ten movies whose genre description is closest to one movie.

  Parameters
  ----------
  user_id : int
      Despite its name this is NOT a user identifier: it is used as the
      zero-based row position of the query movie in `movies`. The name is
      kept so existing calls continue to work.

  Reads the notebook-level `genres` (Series of pipe-delimited genre strings),
  `movies` and `vectorizer`. Titles are printed from most to least similar;
  nothing is returned.
  """
  # `genres` was captured before duplicate titles were removed from `movies`,
  # so its row positions no longer match. Re-align on the shared index labels
  # (assumes `movies` still carries its original index - verify if it is ever reset).
  aligned_genres = genres.reindex(movies.index).fillna('')
  encoded_genres = vectorizer.fit_transform(aligned_genres)
  # Compare only the query row against all movies rather than materialising
  # the full dense movie x movie similarity matrix.
  scores = cosine_similarity(encoded_genres[user_id], encoded_genres).ravel()
  scores[user_id] = -np.inf  # never recommend the query movie itself
  # Stable sort on the negated scores: best match first, ties keep row order
  recommendations = np.argsort(-scores, kind='stable')[:10]
  for recommendation in recommendations:
      print(movies['title'].iloc[recommendation])

In [None]:
# The argument is used by the function as a row position in the genre matrix
# (not as a user id, despite the parameter's name).
genre_based_recommendations(1)

Alan Partridge: Alpha Papa (2013)
NeverEnding Story, The (1984)
Battlestar Galactica: Razor (2007)
NeverEnding Story II: The Next Chapter, The (1990)
Indian in the Cupboard, The (1995)
Santa Claus: The Movie (1985)
Black Mirror: White Christmas (2014)
NeverEnding Story III, The (1994)
Piper (2016)
I Am Legend (2007)


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0
