# Movie Recommendations using Movie Lens Dataset

In [1]:
import pandas as pd
movies = pd.read_csv('https://raw.githubusercontent.com/swa19231/Datasets/master/movies.csv')
data = pd.read_csv('https://raw.githubusercontent.com/swa19231/Datasets/master/ratings.csv')

In [2]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
user_movies_dict={} #dictionary of users as keys and tuples of movies watched by them and the ratings given by them as values
movie_id_dict={} #dictionary of movieId as keys and movie title as values
movie_user_dict={} #dictionary of movie as keys and the list of userId's who watched this movie as values
for x in set(data['userId']): user_movies_dict[x]=[]
for y in set(movies['movieId']): movie_user_dict[y]=[]
for x in range(len(data)):
    user_movies_dict[data['userId'][x]].append([data['movieId'][x],data['rating'][x]])
    movie_user_dict[data['movieId'][x]].append(data['userId'][x])
for x in range(len(movies)):
    movie_id_dict[movies['movieId'][x]]=movies['title'][x]

Pearson Correlation formula:![435a23c499a2450f0752112e69a9b808336a7cce.svg](attachment:435a23c499a2450f0752112e69a9b808336a7cce.svg)

where:
$$
\text{n is sample size}\\
x_{i},y_{i} \text{are the individual sample points indexed with i}\\
{\displaystyle {\bar {x}}={\frac {1}{n}}\sum _{i=1}^{n}x_{i}} \\{\bar {y}}={\frac {1}{n}}\sum _{i=1}^{n}y_{i} $$

Pearson correlation is used to measure the similarity between any two users.

In [5]:
import math
def pearson_score(m,n):
    u=[]
    for x in user_movies_dict[m] : u.append(x[0]) #u is the list of movies watched by m
    v=[]
    for x in user_movies_dict[n] : v.append(x[0]) #v is the list of movies watched by n
    f=set.intersection(set(u),set(v)) #common movies watched by userId m and userId n
    c=0 #sum of xi
    d=0 #sum of yi
    csqr=0 #sum of xi^2
    dsqr=0 #sum of yi^2
    e=0 #sum of xi*yi
    for x in f: #manipulations to evaluate pearson correlation
        for y in user_movies_dict[m]:
            if y[0]==x:
                k=y[1]
                c=c+k
                csqr=csqr+(k*k)
        for z in user_movies_dict[n]:
            if z[0]==x:
                l=z[1]
                d=d+l
                dsqr=dsqr+(l*l)
        e=e+(k*l)
    u=(math.sqrt(len(f)*csqr-(c*c))*math.sqrt(len(f)*dsqr-(d*d)))
    if (u==0): #dividing by zero is taken careof
        return(0)
    else:
        return(((len(f)*e)-(d*c))/u)

Pearson_list contains the pearson correlation value between an two users.

In [6]:
pearson_list=[]
for i in range(len(user_movies_dict)):
    pearson_list.append([])
    for j in range(i+1,len(user_movies_dict)):
        pearson_list[i].append(pearson_score(i+1,j+1))

In [7]:
def find_rating(user,movie):
    for x in user_movies_dict[user]:
        if x[0]==movie:
            return(x[1])

top_10 function takes userId as input and outputs the top 10 suggested movies that was not watched by the user.

In [8]:
def top_10(user):
    dictio={} #dictionary having weighted pearson rating as keys and the list of all movies having that number/value as values.
              #Note:There might be two or more movies with same weighted pearson rating.
    dictio[0]=[]
    p=[]
    for x in user_movies_dict[user]:
        p.append(x[0])
    for x in set(movies['movieId']).difference(set(p)):
        c=0
        d=0
        for y in movie_user_dict[x]:
            c=c+pearson_list[min(user,y)-1][abs(user-y)-1]
            d=d+pearson_list[min(user,y)-1][abs(user-y)-1]*find_rating(y,x)
        if c!=0:
            if d/c in dictio.keys():
                dictio[d/c].append(x)
            else:       
                dictio[d/c]=[x]
        else: dictio[0].append(x)
    r=list(dictio.keys())
    b=sorted(r,reverse=True)
    n=1
    for x in b[0:10]:
        for j in range(len(dictio[x])): #there might be two or more movies having the same key(d/c value)
                                        #this loop takes care of not omitting such movies
                if n!=11: #takes care of printing only 10 movies
                    print(n,movie_id_dict[dictio[x][j]])
                    n=n+1

Top 10 movie recommendations of user 1 which he hasn't watched earlier

In [9]:
top_10(1)

1 Female Perversions (1996)
2 Kull the Conqueror (1997)
3 Showgirls (1995)
4 Theodore Rex (1995)
5 Extremely Loud and Incredibly Close (2011)
6 Phantom of the Opera, The (2004)
7 Get Out (2017)
8 17 Again (2009)
9 Big Boss, The (Fists of Fury) (Tang shan da xiong) (1971)
10 La Cérémonie (1995)


Top 10 movie recommendations of user 101 which he hasn't watched earlier

In [10]:
top_10(101)

1 I Origins (2014)
2 Alphaville (Alphaville, une étrange aventure de Lemmy Caution) (1965)
3 First Kid (1996)
4 Fiddler on the Roof (1971)
5 Swept Away (Travolti da un insolito destino nell'azzurro mare d'Agosto) (1975)
6 Welcome to Sarajevo (1997)
7 Gods Must Be Crazy II, The (1989)
8 Room (2015)
9 Claymation Christmas Celebration, A (1987)
10 Morning Glory (2010)
