In [243]:
import numpy as np
import pandas as pd
import re
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [244]:
# Load the movie catalogue and the per-user ratings from the working directory.
df_movies, df_ratings = (
    pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv')
)

In [245]:
# One row per rating, with the rated movie's catalogue columns attached
# (inner join on the shared movieId key).
dataset = df_movies.merge(df_ratings, on="movieId")

In [246]:
# Collapse `dataset` to one row per consecutive run of equal movieId values:
# the run counter increases whenever movieId differs from the previous row,
# and the first non-null value of each column is kept for every run.
# Only the first three columns survive, and the run counter becomes an index
# called 'index'.
df = (
    dataset
    .groupby((dataset['movieId'] != dataset['movieId'].shift()).cumsum())
    .first()
    .iloc[:, :3]
    .rename_axis('index')
)

In [247]:
# Per-movie rating statistics (mean and number of ratings), indexed by movieId.
ratings_mean, ratings_count = (
    dataset.groupby('movieId')['rating'].agg(statistic)
    for statistic in ('mean', 'count')
)
rating = pd.DataFrame({
    'Mean Ratings': ratings_mean,
    'Ratings count': ratings_count,
})

In [248]:
# Attach the per-movie rating statistics (`rating` is indexed by movieId).
df = pd.merge(df, rating, on="movieId")

# Genres arrive pipe-separated ("Comedy|Drama"); use spaces so they read as words.
# A literal (non-regex) replace avoids the invalid "\|" escape sequence.
df['genres'] = df['genres'].astype(str).str.replace('|', ' ', regex=False)

# The release year is the four-digit group that closes the raw title.
# Anchoring at the end of the string keeps a parenthesised alternate title
# (e.g. "Name, The (Other Name) (1995)") from being mistaken for the year.
# Titles without a year keep the 'None' placeholder string so that later
# string concatenation with this column keeps working.
df['movie year'] = (
    df['title']
    .str.extract(r"\((\d{4})\)\s*$", expand=False)
    .fillna('None')
)

# Display title: everything before the first " (" of the raw title.
df['title'] = df['title'].str.split(r" \(").str[0]

# Text fed to the vectoriser: title words followed by genre words.
df['vectors'] = df['title'] + " " + df['genres']

# Explicit positional index as the first column.
df.insert(0, 'Index', list(range(len(df))))

In [249]:
# Display the assembled per-movie table (long frames are truncated in the rendered output).
df

Unnamed: 0,Index,movieId,title,genres,Mean Ratings,Ratings count,movie year,vectors
0,0,1,Toy Story,Adventure Animation Children Comedy Fantasy,3.907328,232,1995,Toy Story Adventure Animation Children Comedy ...
1,1,2,Jumanji,Adventure Children Fantasy,3.353261,92,1995,Jumanji Adventure Children Fantasy
2,2,3,Grumpier Old Men,Comedy Romance,3.189655,58,1995,Grumpier Old Men Comedy Romance
3,3,4,Waiting to Exhale,Comedy Drama Romance,2.818182,11,1995,Waiting to Exhale Comedy Drama Romance
4,4,5,Father of the Bride Part II,Comedy,3.250000,62,1995,Father of the Bride Part II Comedy
...,...,...,...,...,...,...,...,...
10320,10320,146684,Cosmic Scrat-tastrophe,Animation Children Comedy,4.000000,1,2015,Cosmic Scrat-tastrophe Animation Children Comedy
10321,10321,146878,Le Grand Restaurant,Comedy,2.500000,1,1966,Le Grand Restaurant Comedy
10322,10322,148238,A Very Murray Christmas,Comedy,3.000000,1,2015,A Very Murray Christmas Comedy
10323,10323,148626,The Big Short,Drama,4.333333,3,2015,The Big Short Drama


In [250]:
def get_rating(user_id, movie_id):
    """Return the rating that `user_id` gave to `movie_id`.

    The pair is looked up in the module-level `dataset` frame and the first
    matching value of its 'rating' column is returned.

    Raises
    ------
    IndexError
        If `dataset` holds no row for this user/movie pair.
    """
    is_match = (dataset['userId'] == user_id) & (dataset['movieId'] == movie_id)
    matching_ratings = dataset.loc[is_match, 'rating']
    return matching_ratings.iloc[0]

In [251]:
def pearson_corr(user1, user2):
    """Pearson correlation of two users' ratings over the movies both rated.

    Ratings are read from the module-level `dataset` frame (columns
    'userId', 'movieId', 'rating').  The co-rated movies are found with a
    single inner join instead of a per-movie scan of the whole frame.

    Parameters
    ----------
    user1, user2 : user ids as stored in ``dataset['userId']``.

    Returns
    -------
    0
        When the users have no movie in common.
    float('nan')
        When the correlation is undefined: fewer than two co-rated movies,
        or one of the users gave the same rating to every co-rated movie.
    float
        Otherwise, the Pearson coefficient in [-1, 1].

    Notes
    -----
    Degenerate inputs are detected up front, so numpy emits no
    RuntimeWarning and numpy's global error state is left untouched.
    """
    columns = ['movieId', 'rating']
    # Keep the first rating per movie for each user.
    first_user = dataset.loc[dataset.userId == user1, columns].drop_duplicates('movieId')
    second_user = dataset.loc[dataset.userId == user2, columns].drop_duplicates('movieId')

    # An inner join keeps only co-rated movies, in user1's order.
    common = first_user.merge(second_user, on='movieId', suffixes=('_1', '_2'))
    if common.empty:
        return 0

    ratings_1 = common['rating_1'].to_numpy(dtype=float)
    ratings_2 = common['rating_2'].to_numpy(dtype=float)

    # Pearson's r needs at least two points and non-zero variance on both sides.
    if (
        len(common) < 2
        or ratings_1.min() == ratings_1.max()
        or ratings_2.min() == ratings_2.max()
    ):
        return float('nan')

    return float(np.corrcoef(ratings_1, ratings_2)[0, 1])

In [252]:
def find_similar_users(user1):
    """Rank every other user in `dataset` by similarity to `user1`.

    Similarity is `pearson_corr(user1, other)`.  Users whose correlation is
    nan are left out.

    Returns
    -------
    list of (correlation, userId) tuples, sorted in descending order
    (highest correlation first; equal correlations ordered by higher userId).
    """
    np.seterr(invalid='ignore')
    scored = (
        (pearson_corr(user1, other), other)
        for other in dataset['userId'].unique()
        if other != user1
    )
    ranked = [pair for pair in scored if not math.isnan(pair[0])]
    ranked.sort(reverse=True)
    return ranked

In [253]:
# Rank all other users by similarity to user 1; the cell shows the returned (correlation, userId) list.
find_similar_users(1)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


[(1.0, 533),
 (1.0, 487),
 (1.0, 377),
 (1.0, 67),
 (1.0, 34),
 (0.9999999999999999, 472),
 (0.9999999999999999, 212),
 (0.9999999999999999, 171),
 (0.9999999999999999, 154),
 (0.9999999999999997, 117),
 (0.975342893301088, 13),
 (0.9707253433941508, 552),
 (0.9669875568304565, 526),
 (0.9514478340855806, 657),
 (0.9453431006169687, 103),
 (0.9405816697932301, 26),
 (0.8782297740303605, 96),
 (0.873228053325973, 165),
 (0.8696263565463044, 384),
 (0.8660254037844385, 254),
 (0.8522088634540257, 571),
 (0.8497568598529867, 287),
 (0.8427009716003845, 596),
 (0.836698656172646, 573),
 (0.8344094626730117, 134),
 (0.8315218406202999, 553),
 (0.8304547985373998, 416),
 (0.8207677342949548, 363),
 (0.8077844534716178, 325),
 (0.8029550685469663, 12),
 (0.8000946913656628, 406),
 (0.7969704699300443, 417),
 (0.777713771047819, 94),
 (0.7756859981099772, 389),
 (0.7667215117146418, 621),
 (0.7584592386322859, 152),
 (0.7572401854185358, 595),
 (0.7559289460184544, 82),
 (0.7434844114105215, 3

In [233]:
def user_to_user_recommendation(user_id, n_neighbours=10, n_recommendations=10,
                                min_candidates=8,
                                ratings_count_thresholds=(150, 125, 100, 75, 50, 25, 10)):
    """Recommend movies to `user_id` from the ratings of the most similar users.

    Steps:
      1. Take the `n_neighbours` most similar users (`find_similar_users`).
      2. For every movie those users rated and `user_id` has not, average
         the neighbours' ratings.
      3. Keep only sufficiently popular movies: walk
         `ratings_count_thresholds` in order and stop at the first threshold
         whose surviving set has at least `min_candidates` movies.  If no
         threshold qualifies, the set of the last threshold is used; if the
         sequence is empty, no popularity filter is applied.
      4. Sort by (score, movieId) descending and keep `n_recommendations`.

    Parameters
    ----------
    user_id : id of the user to recommend for.
    n_neighbours : number of most-similar users consulted (default 10).
    n_recommendations : maximum number of rows returned (default 10).
    min_candidates : candidate count that makes a popularity threshold
        acceptable (default 8).
    ratings_count_thresholds : popularity cut-offs on the 'Ratings count'
        column of `df`, tried in the given order.

    Returns
    -------
    pandas.DataFrame with columns 'Movie Id', 'Movie' ("title (year)") and
    'Rating Score' (mean neighbour rating).  Empty when nothing qualifies.

    Relies on the module-level `dataset` (ratings) and `df` (per-movie
    catalogue with 'movieId', 'title', 'movie year', 'Ratings count').
    """
    seen_movies = dataset.loc[dataset.userId == user_id, 'movieId']
    neighbours = [user for _, user in find_similar_users(user_id)[:n_neighbours]]

    # All neighbour ratings for movies the target user has not rated,
    # one row per (user, movie) pair.
    unseen_by_neighbours = dataset[
        dataset.userId.isin(neighbours) & ~dataset.movieId.isin(seen_movies)
    ].drop_duplicates(['userId', 'movieId'])
    mean_score = unseen_by_neighbours.groupby('movieId')['rating'].mean()

    # Catalogue indexed by movieId for direct lookups (first row wins).
    catalogue = df.drop_duplicates('movieId').set_index('movieId')
    popularity = catalogue['Ratings count'].reindex(mean_score.index)

    # Relax the popularity requirement until enough candidates survive.
    eligible = mean_score
    for threshold in ratings_count_thresholds:
        eligible = mean_score[(popularity >= threshold).to_numpy()]
        if len(eligible) >= min_candidates:
            break

    # Highest score first; equal scores are ordered by higher movieId.
    ranked = sorted(eligible.items(), key=lambda item: (item[1], item[0]), reverse=True)
    ranked = ranked[:n_recommendations]

    mov_ids = [movie for movie, _ in ranked]
    scores = [score for _, score in ranked]
    movies = [
        catalogue.at[movie, 'title'] + ' (' + catalogue.at[movie, 'movie year'] + ')'
        for movie in mov_ids
    ]

    return pd.DataFrame({'Movie Id': mov_ids, 'Movie': movies, 'Rating Score': scores})


In [234]:
# Show the recommendation table produced for user 1.
user_to_user_recommendation(1)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Unnamed: 0,Movie Id,Movie,Rating Score
0,1527,"Fifth Element, The (1997)",5.0
1,750,Dr. Strangelove or: How I Learned to Stop Worr...,5.0
2,6,Heat (1995),5.0
3,2918,Ferris Bueller's Day Off (1986),4.5
4,2115,Indiana Jones and the Temple of Doom (1984),4.5
5,1682,"Truman Show, The (1998)",4.5
6,1,Toy Story (1995),4.5
7,2997,Being John Malkovich (1999),4.0
8,2987,Who Framed Roger Rabbit? (1988),4.0
9,2291,Edward Scissorhands (1990),4.0
