### SVD matrix factorization

In [37]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD

In [38]:
frame = pd.read_table('ml-100k/u.data', names=['user_id','item_id','rating','timestamp'])
frame.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [39]:
columns = ['item_id','movie title','release date','x', 'video release date' 
           'IMDb URL', 'movie id',' Action' , 'Adventure' , 'Animation' ,
              "Children's" , 'Comedy' , 'Crime' , 'Documentary' , 'Drama' , 'Fantasy' ,
              'Film-Noir' , 'Horror' , 'Musical' , 'Mystery' , 'Romance' , 'Sci-Fi' ,
              'Thriller' , 'War' , 'Western']
movies = pd.read_table('ml-100k/u.item', sep='|', names=columns, encoding='latin-1')
movies.head()

Unnamed: 0,item_id,movie title,release date,x,video release dateIMDb URL,movie id,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [40]:
combined_movie_data = pd.merge(frame, movies, on='item_id')
combined_movie_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie title,release date,x,video release dateIMDb URL,movie id,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
1,63,242,3,875747190,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
2,226,242,5,883888671,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
3,154,242,3,879138235,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
4,306,242,5,876503793,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
combined_movie_data.groupby('item_id')['rating'].count().sort_values(ascending=False).head()

item_id
50     583
258    509
100    508
181    507
294    485
Name: rating, dtype: int64

In [42]:
combined_movie_data[combined_movie_data.item_id == 50].nunique()

user_id                       583
item_id                         1
rating                          5
timestamp                     583
movie title                     1
release date                    1
x                               0
video release dateIMDb URL      1
movie id                        1
 Action                         1
Adventure                       1
Animation                       1
Children's                      1
Comedy                          1
Crime                           1
Documentary                     1
Drama                           1
Fantasy                         1
Film-Noir                       1
Horror                          1
Musical                         1
Mystery                         1
Romance                         1
Sci-Fi                          1
Thriller                        1
War                             1
Western                         1
dtype: int64

In [43]:
combined_movie_data[combined_movie_data.item_id == 50]['movie title'].unique()

array(['Star Wars (1977)'], dtype=object)

In [46]:
# bulding a utility matrix
rating_crosstab = combined_movie_data.pivot_table(values='rating', 
                                                  index='user_id', 
                                                  columns='movie title',
                                                  fill_value='0')
rating_crosstab.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2.0,5.0,0.0,0,3.0,4.0,0.0,0,...,0,0,0,5.0,3.0,0,0,0,4.0,0
2,0,0,0.0,0.0,0.0,0,0.0,0.0,1.0,0,...,0,0,0,0.0,0.0,0,0,0,0.0,0
3,0,0,0.0,0.0,2.0,0,0.0,0.0,0.0,0,...,0,0,0,0.0,0.0,0,0,0,0.0,0
4,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,...,0,0,0,0.0,0.0,0,0,0,0.0,0
5,0,0,2.0,0.0,0.0,0,0.0,4.0,0.0,0,...,0,0,0,4.0,0.0,0,0,0,4.0,0


In [48]:
# Transposing the matrix
rating_crosstab.shape

(943, 1664)

In [50]:
# Transposing the matrix
X = rating_crosstab.values.T
X.shape

(1664, 943)

In [51]:
# decomposing the matrix
sdv = TruncatedSVD(n_components=12, random_state=17)
resultant_matrix = sdv.fit_transform(X)
resultant_matrix.shape

(1664, 12)

In [52]:
# Generating a correlation matrix
corr_mat = np.corrcoef(resultant_matrix)
corr_mat.shape

(1664, 1664)

In [54]:
# isolating Star wars from the correlatin matrix 
movies_names = rating_crosstab.columns
movies_list = list(movies_names)
star_wars = movies_list.index('Star Wars (1977)')
star_wars

1398

In [55]:
corr_star = corr_mat[star_wars]
corr_star.shape

(1664,)

In [56]:
# Recommending a highly correlated movie
list(movies_names[(corr_star < 1.0)&(corr_star > 0.9)])

['Die Hard (1988)',
 'Empire Strikes Back, The (1980)',
 'Fugitive, The (1993)',
 'Raiders of the Lost Ark (1981)',
 'Return of the Jedi (1983)',
 'Terminator 2: Judgment Day (1991)',
 'Terminator, The (1984)',
 'Toy Story (1995)']

In [57]:
# Recommending a highly correlated movie
list(movies_names[(corr_star < 1.0)&(corr_star > 0.95)])

['Return of the Jedi (1983)']