# Item-based Collaborative Filtering

## Import Libraries

In [2]:
#importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error # Summation(Predicted - actual)^2/N
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import matplotlib.pyplot as plt
%matplotlib inline


## Data Preparation

In [3]:
# Load the tab-separated ratings file; it is read without a header row, so the
# four column names are supplied here.
rating_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv("u.data", sep="\t", names=rating_columns)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Collapse to one row per (movie, user) pair; when a pair occurs more than once,
# keep its highest rating.
pair_keys = ["movie_id", "user_id"]
ratings_by_pair = ratings.groupby(pair_keys, as_index=False)
ratings02 = ratings_by_pair.agg({"rating": "max"})
ratings02.head(20)

Unnamed: 0,movie_id,user_id,rating
0,1,1,5
1,1,2,4
2,1,5,4
3,1,6,4
4,1,10,4
5,1,13,3
6,1,15,1
7,1,16,5
8,1,17,4
9,1,18,5


In [5]:
# Column names for u.item: five descriptive fields followed by the genre columns.
info_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDb URL']
genre_cols = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
i_cols = info_cols + genre_cols

# The file is pipe-delimited and is decoded as latin-1.
movies = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')
movies.head()

Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# Attach each movie's title to its rating rows. The left join keeps every rating
# row even if no title matches its movie_id.
title_lookup = movies[["movie_id", "title"]]
user_rating = pd.merge(ratings02, title_lookup, on="movie_id", how="left")
user_rating.head()

Unnamed: 0,movie_id,user_id,rating,title
0,1,1,5,Toy Story (1995)
1,1,2,4,Toy Story (1995)
2,1,5,4,Toy Story (1995)
3,1,6,4,Toy Story (1995)
4,1,10,4,Toy Story (1995)


## Create User-Item Interaction Matrix

In [7]:
# Build the user-item interaction matrix: one row per user, one column per movie
# title, each cell holding that user's rating (NaN where the user has no rating).
user_rating02 = user_rating.pivot_table(
    values="rating",
    index="user_id",
    columns="title",
)
print(user_rating02.shape)
user_rating02.head()

(943, 1664)


title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


In [8]:
# Keep only movies (columns) that have at least MIN_RATINGS_PER_MOVIE ratings,
# then fill the remaining unrated cells with 0.
# Note: axis=1 makes dropna filter columns, i.e. movies -- no user row is
# removed by this step.
MIN_RATINGS_PER_MOVIE = 10
user_rating03 = (
    user_rating02
    .dropna(thresh=MIN_RATINGS_PER_MOVIE, axis=1)
    .fillna(0)
)
print(user_rating03.shape)
user_rating03.head()

(943, 1144)


title,101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),"39 Steps, The (1935)",8 1/2 (1963),Absolute Power (1997),"Abyss, The (1989)",...,Wolf (1994),"Women, The (1939)","Wonderful, Horrible Life of Leni Riefenstahl, The (1993)",Wonderland (1997),"Wrong Trousers, The (1993)",Wyatt Earp (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)"
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,5.0,0.0,5.0,3.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,4.0,0.0,0.0,0.0


In [10]:
#user_rating03

## Check Similarity

In [11]:
# Item-item similarity: Pearson correlation between every pair of movie columns.
# (Either correlation or cosine similarity can be used here.)
similarity_method = "pearson"
item_similarity_df = user_rating03.corr(method=similarity_method)
item_similarity_df

title,101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),"39 Steps, The (1935)",8 1/2 (1963),Absolute Power (1997),"Abyss, The (1989)",...,Wolf (1994),"Women, The (1939)","Wonderful, Horrible Life of Leni Riefenstahl, The (1993)",Wonderland (1997),"Wrong Trousers, The (1993)",Wyatt Earp (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101 Dalmatians (1996),1.000000,0.059246,-0.001097,0.052877,0.128745,0.078260,0.015504,0.005750,0.221025,0.121153,...,0.057741,0.051950,-0.034417,0.000720,0.074635,0.109054,0.155450,0.115195,0.039172,-0.005917
12 Angry Men (1957),0.059246,1.000000,-0.014343,0.066339,0.230274,0.298716,0.339195,0.174500,0.019790,0.156721,...,0.048741,0.145040,0.138347,-0.004803,0.170189,0.160140,0.290188,0.164959,0.079341,0.038111
187 (1997),-0.001097,-0.014343,1.000000,0.078770,-0.010333,-0.039939,-0.021414,-0.006246,0.127531,0.017270,...,0.084986,-0.024094,-0.020299,0.115321,-0.025833,-0.000838,-0.021873,0.006811,0.053843,0.063789
2 Days in the Valley (1996),0.052877,0.066339,0.078770,1.000000,0.056283,0.090983,-0.019962,-0.008210,0.245191,0.129204,...,0.087568,0.069019,-0.001841,-0.026994,0.028208,0.116497,0.061328,0.197620,0.176032,0.146777
"20,000 Leagues Under the Sea (1954)",0.128745,0.230274,-0.010333,0.056283,1.000000,0.384535,0.274525,0.118108,0.117512,0.231247,...,0.244088,0.130652,0.062662,-0.001717,0.101581,0.286848,0.309511,0.243306,0.057977,0.071111
2001: A Space Odyssey (1968),0.078260,0.298716,-0.039939,0.090983,0.384535,1.000000,0.266171,0.200358,0.037861,0.264770,...,0.180504,0.078725,0.113774,-0.023812,0.262674,0.183051,0.429298,0.168440,0.071705,0.065870
"39 Steps, The (1935)",0.015504,0.339195,-0.021414,-0.019962,0.274525,0.266171,1.000000,0.295960,0.040896,0.094085,...,0.077421,0.081457,0.086217,0.023274,0.127264,0.137189,0.205543,0.057132,0.025560,0.016375
8 1/2 (1963),0.005750,0.174500,-0.006246,-0.008210,0.118108,0.200358,0.295960,1.000000,0.020419,0.051034,...,0.133663,0.141878,0.165348,-0.018537,0.094927,0.041004,0.170677,0.035488,0.022169,0.074051
Absolute Power (1997),0.221025,0.019790,0.127531,0.245191,0.117512,0.037861,0.040896,0.020419,1.000000,0.142817,...,0.128878,0.066173,-0.038617,-0.018542,0.038565,0.121793,0.100104,0.136609,0.135415,0.042236
"Abyss, The (1989)",0.121153,0.156721,0.017270,0.129204,0.231247,0.264770,0.094085,0.051034,0.142817,1.000000,...,0.337522,0.076574,0.074891,-0.013863,0.147020,0.166022,0.204965,0.310994,0.210445,0.046175


## Standardize the User Rating

In [12]:
def get_similar_movies(movie_name, user_rating, similarity_df=None, neutral_rating=2.5):
    """Score every movie by its similarity to ``movie_name``, weighted by a rating.

    The rating is centered by subtracting ``neutral_rating``, so a rating above
    that value gives similar movies a positive score and a rating below it
    gives them a negative score.

    Parameters
    ----------
    movie_name : str
        Column label of the reference movie in the similarity matrix.
    user_rating : int or float
        The rating the user gave to ``movie_name``.
    similarity_df : pandas.DataFrame, optional
        Square item-item similarity matrix whose columns are movie titles.
        When omitted, the notebook-level ``item_similarity_df`` is used.
    neutral_rating : float, optional
        Value subtracted from ``user_rating`` before weighting (default 2.5).

    Returns
    -------
    pandas.Series
        One score per movie, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the similarity matrix.
    """
    if similarity_df is None:
        # Fall back to the matrix computed earlier in the notebook.
        similarity_df = item_similarity_df
    if movie_name not in similarity_df.columns:
        raise KeyError("{!r} is not in the similarity matrix".format(movie_name))

    centered_rating = user_rating - neutral_rating
    weighted_scores = similarity_df[movie_name] * centered_rating
    return weighted_scores.sort_values(ascending=False)

## Movie Recommendation

In [13]:
# Rank all movies for a viewer who rated this title 5.
liked_title = "Indiana Jones and the Last Crusade (1989)"
liked_rating = 5
print(get_similar_movies(liked_title, liked_rating))

title
Indiana Jones and the Last Crusade (1989)      2.500000
Empire Strikes Back, The (1980)                1.648923
Back to the Future (1985)                      1.626152
Raiders of the Lost Ark (1981)                 1.607513
Jurassic Park (1993)                           1.461949
True Lies (1994)                               1.442496
Terminator, The (1984)                         1.435339
Blues Brothers, The (1980)                     1.393882
Speed (1994)                                   1.390704
Groundhog Day (1993)                           1.387334
Fugitive, The (1993)                           1.374923
Monty Python and the Holy Grail (1974)         1.347504
Batman (1989)                                  1.340349
Terminator 2: Judgment Day (1991)              1.333490
Hunt for Red October, The (1990)               1.309458
Forrest Gump (1994)                            1.305628
Dances with Wolves (1990)                      1.300896
Braveheart (1995)                         

In [14]:
# Example viewer profile as (title, rating) pairs.
action_lover = [("Independence Day (ID4) (1996)", 5), ("Star Wars (1977)", 4), ("Rock, The (1996)", 2)]

# One Series of scores per rated movie. DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic, so collect all rows
# first and build the frame once.
score_rows = [get_similar_movies(movie, rating) for movie, rating in action_lover]
similar_movies = (
    pd.DataFrame(score_rows)
    .reset_index(drop=True)   # rows numbered 0..n-1, as ignore_index=True did
    .sort_index(axis=1)       # alphabetical title columns for a stable display
)

similar_movies.head()

Unnamed: 0,101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),"39 Steps, The (1935)",8 1/2 (1963),Absolute Power (1997),"Abyss, The (1989)",...,Wolf (1994),"Women, The (1939)","Wonderful, Horrible Life of Leni Riefenstahl, The (1993)",Wonderland (1997),"Wrong Trousers, The (1993)",Wyatt Earp (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)"
0,0.742668,0.186368,0.036481,0.642908,0.540931,0.363039,0.046834,0.005766,0.924352,0.710321,...,0.464351,0.070712,-0.04294,-0.055636,0.211521,0.457901,0.500482,0.6534,0.403043,0.10856
1,0.23367,0.319713,-0.051884,0.147901,0.330366,0.474809,0.188469,0.056574,0.216112,0.404983,...,0.235223,0.088844,0.129454,-0.032357,0.299163,0.209856,0.438636,0.371071,0.206536,-0.004082
2,-0.09678,-0.036341,-0.005202,-0.149391,-0.061774,-0.05108,0.019825,0.043598,-0.17399,-0.0874,...,-0.066823,0.000243,0.014841,0.02583,-0.04138,-0.068613,-0.061578,-0.118659,-0.073557,-0.008072


In [15]:
# Add up each movie's scores across all rated titles and list the totals from
# highest to lowest.
total_scores = similar_movies.sum()
total_scores.sort_values(ascending=False)

Independence Day (ID4) (1996)                     2.827289
Star Wars (1977)                                  2.351633
Return of the Jedi (1983)                         2.075307
Mission: Impossible (1996)                        1.833478
Twister (1996)                                    1.774402
Star Trek: First Contact (1996)                   1.628493
Toy Story (1995)                                  1.606982
Empire Strikes Back, The (1980)                   1.537807
Indiana Jones and the Last Crusade (1989)         1.492317
Raiders of the Lost Ark (1981)                    1.481678
Men in Black (1997)                               1.470867
Jurassic Park (1993)                              1.452315
Eraser (1996)                                     1.446412
Broken Arrow (1996)                               1.406468
Rock, The (1996)                                  1.396564
Willy Wonka and the Chocolate Factory (1971)      1.384965
Dragonheart (1996)                                1.3642