# Recommender Systems - User Based Matrix Factorization

In [41]:
import pandas as pd
import numpy as np

In [42]:
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', header=None, usecols=range(3), names=r_cols)

In [43]:
ratings

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [44]:
r_cols = ['movie_id', 'title']
movies = pd.read_csv('ml-100k/u.item', sep='|', header=None, encoding='latin1', usecols=range(2), names=r_cols)

In [45]:
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [46]:
r_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
user = pd.read_table('ml-100k/u.user', sep='|', header=None, names=r_cols)

In [47]:
user

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [48]:
user.iloc[195]

user_id          196
age               49
gender             M
occupation    writer
zip_code       55105
Name: 195, dtype: object

In [49]:
movies.iloc[241]

movie_id             242
title       Kolya (1996)
Name: 241, dtype: object

In [50]:
ratings= pd.merge(movies, ratings)

In [51]:
ratings

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3
...,...,...,...,...
99995,1678,Mat' i syn (1997),863,1
99996,1679,B. Monkey (1998),863,3
99997,1680,Sliding Doors (1998),863,2
99998,1681,You So Crazy (1994),896,3


In [52]:
ratings.title.value_counts()

title
Star Wars (1977)                                583
Contact (1997)                                  509
Fargo (1996)                                    508
Return of the Jedi (1983)                       507
Liar Liar (1997)                                485
                                               ... 
Tigrero: A Film That Was Never Made (1994)        1
Eye of Vichy, The (Oeil de Vichy, L') (1993)      1
Promise, The (Versprechen, Das) (1994)            1
To Cross the Rubicon (1991)                       1
Scream of Stone (Schrei aus Stein) (1991)         1
Name: count, Length: 1664, dtype: int64

In [53]:
ratings.movie_id.nunique()

1682

In [54]:
movie_ratings = ratings.pivot_table(index=['user_id'], columns=['title'], values='rating') 

In [55]:
movie_ratings # Matrix Factorization

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,,,,,,,,...,,,,,,,,,,
941,,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,3.0,,3.0,...,,,,,,,,,,


In [56]:
movie_ratings['Star Wars (1977)']

user_id
1      5.0
2      5.0
3      NaN
4      5.0
5      4.0
      ... 
939    NaN
940    4.0
941    NaN
942    5.0
943    4.0
Name: Star Wars (1977), Length: 943, dtype: float64

In [57]:
starwars_ratings = movie_ratings['Star Wars (1977)']

In [58]:
starwars_ratings.value_counts()

Star Wars (1977)
5.0    325
4.0    176
3.0     57
2.0     16
1.0      9
Name: count, dtype: int64

In [59]:
movie_ratings[['101 Dalmatians (1996)', 'Star Wars (1977)']].corr()

title,101 Dalmatians (1996),Star Wars (1977)
title,Unnamed: 1_level_1,Unnamed: 2_level_1
101 Dalmatians (1996),1.0,0.211132
Star Wars (1977),0.211132,1.0


In [60]:
movie_ratings.corrwith(starwars_ratings)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


title
'Til There Was You (1997)                0.872872
1-900 (1994)                            -0.645497
101 Dalmatians (1996)                    0.211132
12 Angry Men (1957)                      0.184289
187 (1997)                               0.027398
                                           ...   
Young Guns II (1990)                     0.228615
Young Poisoner's Handbook, The (1995)   -0.007374
Zeus and Roxanne (1997)                  0.818182
unknown                                  0.723123
Á köldum klaka (Cold Fever) (1994)            NaN
Length: 1664, dtype: float64

In [61]:
similar_movies = movie_ratings.corrwith(starwars_ratings)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [62]:
similar_movies = similar_movies.dropna()

In [63]:
similar_movies.sort_values(ascending=False)

title
Hollow Reed (1996)                        1.0
Commandments (1997)                       1.0
Cosi (1996)                               1.0
No Escape (1994)                          1.0
Stripes (1981)                            1.0
                                         ... 
Roseanna's Grave (For Roseanna) (1997)   -1.0
For Ever Mozart (1996)                   -1.0
American Dream (1990)                    -1.0
Frankie Starlight (1995)                 -1.0
Fille seule, La (A Single Girl) (1995)   -1.0
Length: 1410, dtype: float64

In [64]:
movie_stats = ratings.groupby('title').agg({'rating': [np.size, np.mean]})

  movie_stats = ratings.groupby('title').agg({'rating': [np.size, np.mean]})


In [65]:
movie_stats

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
'Til There Was You (1997),9,2.333333
1-900 (1994),5,2.600000
101 Dalmatians (1996),109,2.908257
12 Angry Men (1957),125,4.344000
187 (1997),41,3.024390
...,...,...
Young Guns II (1990),44,2.772727
"Young Poisoner's Handbook, The (1995)",41,3.341463
Zeus and Roxanne (1997),6,2.166667
unknown,9,3.444444


In [66]:
movie_stats.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
'Til There Was You (1997),9,2.333333
1-900 (1994),5,2.6
101 Dalmatians (1996),109,2.908257
12 Angry Men (1957),125,4.344
187 (1997),41,3.02439


In [67]:
popular_movies = movie_stats[movie_stats['rating']['size'] > 100]

In [68]:
popular_movies.sort_values([('rating', 'mean')], ascending=False)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Close Shave, A (1995)",112,4.491071
Schindler's List (1993),298,4.466443
"Wrong Trousers, The (1993)",118,4.466102
Casablanca (1942),243,4.456790
"Shawshank Redemption, The (1994)",283,4.445230
...,...,...
Spawn (1997),143,2.615385
Event Horizon (1997),127,2.574803
Crash (1996),128,2.546875
Jungle2Jungle (1997),132,2.439394


In [69]:
similar_movies_df = pd.DataFrame(similar_movies, columns=['similarity'])

In [70]:
popular_movies.columns = popular_movies.columns.get_level_values(0)
df = popular_movies.join(similar_movies_df)

In [71]:
df.head()

Unnamed: 0_level_0,rating,rating,similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101 Dalmatians (1996),109,2.908257,0.211132
12 Angry Men (1957),125,4.344,0.184289
2001: A Space Odyssey (1968),259,3.969112,0.230884
Absolute Power (1997),127,3.370079,0.08544
"Abyss, The (1989)",151,3.589404,0.203709


In [72]:
df.sort_values(['similarity'], ascending=False)

Unnamed: 0_level_0,rating,rating,similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Star Wars (1977),583,4.358491,1.000000
"Empire Strikes Back, The (1980)",367,4.204360,0.747981
Return of the Jedi (1983),507,4.007890,0.672556
Raiders of the Lost Ark (1981),420,4.252381,0.536117
Austin Powers: International Man of Mystery (1997),130,3.246154,0.377433
...,...,...,...
"Edge, The (1997)",113,3.539823,-0.127167
As Good As It Gets (1997),112,4.196429,-0.130466
Crash (1996),128,2.546875,-0.148507
G.I. Jane (1997),175,3.360000,-0.176734
