# Item-Based Collaborative Filtering

In [1]:
import pandas as pd
import numpy as np

movie_columns = ['movie_id', 'title']
rating_columns = ['user_id', 'movie_id', 'rating']


movies = pd.read_csv( 'ml-100k/u.item', sep='|', names = movie_columns, usecols = range(2), encoding='ISO-8859-1'  )
ratings = pd.read_csv( 'ml-100k/u.data', sep='\t', names = rating_columns, usecols = range(3), encoding='ISO-8859-1'  )

ratings = pd.merge( movies, ratings )

ratings.head()

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


In [2]:
# movie ratings pivot against user id and movie for movie rating.

pivot_movie_rating = ratings.pivot_table( index = 'user_id', columns = 'title', values = 'rating' )
pivot_movie_rating.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [3]:
# Here corr will give us the correlation score for every pair of movies.
# If at least one user has given rating to movies then it will show value else it will show NaN
correlationMatrix = pivot_movie_rating.corr()
correlationMatrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),1.0,,-1.0,-0.5,-0.5,0.522233,,-0.426401,,,...,,,,,,,,,,
1-900 (1994),,1.0,,,,,,-0.981981,,,...,,,,-0.944911,,,,,,
101 Dalmatians (1996),-1.0,,1.0,-0.04989,0.269191,0.048973,0.266928,-0.043407,,0.111111,...,,-1.0,,0.15884,0.119234,0.680414,0.0,0.707107,,
12 Angry Men (1957),-0.5,,-0.04989,1.0,0.666667,0.256625,0.274772,0.178848,,0.457176,...,,,,0.096546,0.068944,-0.361961,0.144338,1.0,1.0,
187 (1997),-0.5,,0.269191,0.666667,1.0,0.596644,,-0.5547,,1.0,...,,0.866025,,0.455233,-0.5,0.5,0.475327,,,


we will drop the ratings which are lesser than 100, that is nothing but the pairs to which less than 100 users have given rating 
it will show NaN

In [4]:
correlationMatrix = correlationMatrix.corr( method = 'pearson', min_periods = 150 )
correlationMatrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),1.0,,-0.083122,-0.001783,0.00781,0.016478,-0.045689,-0.106936,,-0.050076,...,,-0.126438,,0.025007,0.235476,0.200514,-0.171901,,0.169189,
1-900 (1994),,1.0,-0.117646,0.307351,0.116903,0.183626,0.150794,-0.108262,,0.034537,...,,,,-0.327217,-0.426902,-0.305907,0.110891,,-0.182892,
101 Dalmatians (1996),-0.083122,-0.117646,1.0,-0.08072,-0.001623,-0.107862,0.133264,-0.21546,,-0.061033,...,,-0.033482,,-0.026113,0.031005,0.12369,-0.002398,0.268281,0.09006,
12 Angry Men (1957),-0.001783,0.307351,-0.08072,1.0,0.061435,0.174111,0.318366,-0.038083,,0.232662,...,,-0.037358,,-0.310915,-0.175203,-0.198362,0.06727,0.021777,-0.006793,
187 (1997),0.00781,0.116903,-0.001623,0.061435,1.0,0.176649,0.017449,-0.077263,,0.000854,...,,0.31611,,0.013541,0.022529,0.021274,0.094839,-0.120914,0.150086,


In [5]:
myRatings = pivot_movie_rating.loc[0].dropna()
myRatings

title
Empire Strikes Back, The (1980)    5.0
Gone with the Wind (1939)          1.0
Star Wars (1977)                   5.0
Name: 0, dtype: float64

Now we will go through all the movies we like (We are user wwith id 0), we will map each movie for the movie we rated from the corr matrix and if we find the simmilaritiy between movie we rated and movie in corr matrix, liked count more than movies similar to ones we hated

In [6]:
similarMovieCandidates = pd.Series()
for i in range(0, len(myRatings)) :
    print( 'Adding Similarity for -> ' + myRatings.index[i] )
    #retrive the same we rated from corr matrix
    similarCandidates = correlationMatrix[myRatings.index[i]].dropna()
    # Scale the similarity by how well we rated this movie
    similarCandidates = similarCandidates.map( lambda x : x * myRatings[i] )
    similarMovieCandidates = similarMovieCandidates.append( similarCandidates )

print( 'Sorting....' )
similarMovieCandidates.sort_values( inplace = True, ascending = False )
similarMovieCandidates.head(10)

Adding Similarity for -> Empire Strikes Back, The (1980)
Adding Similarity for -> Gone with the Wind (1939)
Adding Similarity for -> Star Wars (1977)
Sorting....


Star Wars (1977)                   5.000000
Empire Strikes Back, The (1980)    5.000000
Star Wars (1977)                   3.826474
Empire Strikes Back, The (1980)    3.826474
Return of the Jedi (1983)          3.807545
Return of the Jedi (1983)          3.404667
Raiders of the Lost Ark (1981)     3.302913
Back to the Future (1985)          3.245897
Back to the Future (1985)          3.236432
Raiders of the Lost Ark (1981)     3.227294
dtype: float64

In [7]:
# Same name showing for movies as we rated the same type of movie, using groupy to remove duplicates
similarMovieCandidates = similarMovieCandidates.groupby(similarMovieCandidates.index).sum()
similarMovieCandidates.sort_values( inplace = True, ascending = False )
similarMovieCandidates.head(10)

Empire Strikes Back, The (1980)      9.041985
Star Wars (1977)                     9.030547
Return of the Jedi (1983)            7.353962
Raiders of the Lost Ark (1981)       6.791547
Back to the Future (1985)            6.756945
Usual Suspects, The (1995)           5.418716
Jaws (1975)                          5.317281
E.T. the Extra-Terrestrial (1982)    5.240905
Shawshank Redemption, The (1994)     5.125241
Batman (1989)                        4.923013
dtype: float64

In [8]:
# Drop the moview we already rated
similarMovieCandidates = similarMovieCandidates.drop( myRatings.index )
similarMovieCandidates.head(10)

Return of the Jedi (1983)                    7.353962
Raiders of the Lost Ark (1981)               6.791547
Back to the Future (1985)                    6.756945
Usual Suspects, The (1995)                   5.418716
Jaws (1975)                                  5.317281
E.T. the Extra-Terrestrial (1982)            5.240905
Shawshank Redemption, The (1994)             5.125241
Batman (1989)                                4.923013
Field of Dreams (1989)                       4.832605
Indiana Jones and the Last Crusade (1989)    4.719969
dtype: float64