In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Only the first 200 rows of the movie catalogue are kept; all ratings are loaded.
movies = pd.read_csv("movies.csv").head(200)
ratings = pd.read_csv("ratings.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
def data_preperation_for_simMatrix(movies_df=None, ratings_df=None):
  """Build one row per rated movie with its mean rating and a genre text field.

  Args:
    movies_df: DataFrame with ``movieId``, ``title`` and ``genres`` columns.
      Defaults to the notebook-level ``movies`` frame.
    ratings_df: DataFrame with ``movieId`` and ``rating`` columns.
      Defaults to the notebook-level ``ratings`` frame.

  Returns:
    DataFrame with columns ``movieId``, ``title``, ``genres`` and ``rating``,
    one row per movie that has at least one rating (inner join). ``rating``
    is the mean rating of the movie and ``genres`` is the string form of the
    list of genre names, e.g. ``"['Comedy', 'Drama']"``.
  """
  # Fall back to the frames loaded at the top of the notebook.
  if movies_df is None:
    movies_df = movies
  if ratings_df is None:
    ratings_df = ratings

  # Join on the movie key explicitly; without ``on`` pandas joins on every
  # column name the two frames share, which silently changes the result if
  # they ever have another column in common.
  dataset = pd.merge(movies_df, ratings_df, on="movieId")
  # Collapse the per-user ratings into the mean rating of every movie.
  dataset = dataset.groupby(["movieId", "title", "genres"], as_index=False)['rating'].mean()
  # Break up the pipe-separated genre string into a list of genre names ...
  dataset['genres'] = dataset['genres'].str.split('|')
  # ... and store its string form, which is what the text vectorizer consumes.
  dataset['genres'] = dataset['genres'].fillna("").astype('str')
  return dataset

In [5]:
def similarityMatrix():
  """Compute the pairwise genre similarity between all prepared movies.

  Returns:
    A square array in which entry ``[i, j]`` is the similarity between the
    genre text of row ``i`` and row ``j`` of the frame returned by
    ``data_preperation_for_simMatrix()``.
  """
  dataset = data_preperation_for_simMatrix()
  # Unigrams and bigrams of the genre words. min_df=1 keeps every term that
  # occurs in at least one movie; it is the smallest integer the vectorizer
  # accepts (recent scikit-learn releases reject the integer 0).
  tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
  tfidf_matrix = tf.fit_transform(dataset['genres'])
  # TfidfVectorizer L2-normalises rows by default, so the plain dot product
  # computed by linear_kernel is the cosine similarity.
  cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
  return cosine_sim

In [6]:
def genre_recommendations(title, top_n=20):
  """Return the titles whose genres are most similar to those of ``title``.

  Args:
    title: Exact movie title as stored in the ``title`` column.
    top_n: Maximum number of recommendations to return (default 20).

  Returns:
    A pandas Series of titles ordered from most to least similar. The
    queried movie itself is never part of the result.

  Raises:
    KeyError: If ``title`` is not present in the prepared dataset.
  """
  # Getting dataset and similarity matrix.
  dataset = data_preperation_for_simMatrix()
  cosine_sim = similarityMatrix()
  titles = dataset['title']

  # Row position of the requested title. The first match is used so that a
  # duplicated title still resolves to a single row of the matrix.
  positions = [pos for pos, name in enumerate(titles) if name == title]
  if not positions:
    raise KeyError(f"Movie title not found: {title!r}")
  idx = positions[0]

  scores = cosine_sim[idx]
  # Exclude the queried row explicitly instead of dropping whatever sorts
  # first: every movie with an identical genre set ties with the query at the
  # top, so the query is not guaranteed to be the first element.
  candidates = [pos for pos in range(len(scores)) if pos != idx]
  # Stable sort: equally similar movies keep their original row order.
  candidates.sort(key=lambda pos: scores[pos], reverse=True)
  return titles.iloc[candidates[:max(int(top_n), 0)]]

In [7]:
# Show ten genre-based recommendations for Toy Story.
genre_recommendations('Toy Story (1995)').head(10)

12                           Balto (1995)
160                       Reckless (1995)
34                    It Takes Two (1995)
49                  Big Green, The (1995)
78               Dunston Checks In (1996)
44                      Pocahontas (1995)
95          Muppet Treasure Island (1996)
1                          Jumanji (1995)
53     Indian in the Cupboard, The (1995)
109     NeverEnding Story III, The (1994)
Name: title, dtype: object

In [8]:
# Show ten genre-based recommendations for Waiting to Exhale.
genre_recommendations('Waiting to Exhale (1995)').head(10)

10                  American President, The (1995)
47                         Mighty Aphrodite (1995)
52               Postman, The (Postino, Il) (1994)
83                          Beautiful Girls (1996)
165                 Something to Talk About (1995)
191                        Don Juan DeMarco (1995)
198    Eat Drink Man Woman (Yin shi nan nu) (1994)
16                    Sense and Sensibility (1995)
24                        Leaving Las Vegas (1995)
27                               Persuasion (1995)
Name: title, dtype: object

In [13]:
def user_movies_recommender(userID):
  """Recommend three movies based on the user's best-rated known movie.

  The user's ratings are ordered from highest to lowest, the first rated
  movie that exists in the loaded ``movies`` table is picked, and the top
  three genre-based recommendations for that movie are returned.

  Args:
    userID: Value to match against the ``userId`` column of ``ratings``.

  Returns:
    A pandas Series with up to three recommended titles.

  Raises:
    ValueError: If the user has not rated any movie that is present in the
      loaded ``movies`` table (including users with no ratings at all).
  """
  user = ratings[ratings["userId"] == userID].sort_values("rating", ascending=False)

  # Keep only rated movies that are actually present in the movies table.
  # Membership is checked directly: comparing movieId against a fixed number
  # is not the same as "is one of the loaded rows", because ids have gaps.
  known = user[user["movieId"].isin(movies["movieId"])]
  if known.empty:
    raise ValueError(f"User {userID!r} has no rated movie in the loaded movies table")

  best_movie_id_user = known["movieId"].iloc[0]
  best_title = movies.loc[movies["movieId"] == best_movie_id_user, "title"].iloc[0]
  return genre_recommendations(best_title).head(3)

In [15]:
# Recommend movies for the user with userId 200.
user_movies_recommender(200)

24                Leaving Las Vegas (1995)
27                       Persuasion (1995)
42    How to Make an American Quilt (1995)
Name: title, dtype: object