In [1]:
#Part 1
#Importing headers
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
movie_df = pd.read_csv("ml-1m/ml-1m/movies.dat", delimiter="::", names=["movie_id","movie_name","genre"],engine="python")
movie_df.head(10)

Unnamed: 0,movie_id,movie_name,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [3]:
ratings_df = pd.read_csv("ml-1m/ml-1m/ratings.dat", delimiter="::", names=["user_id","movie_id","ratings","time"],engine="python")
ratings_df

Unnamed: 0,user_id,movie_id,ratings,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [4]:
# 1. Create m x u matrix with movies as row and users as column. 
shape1 = (max(ratings_df["movie_id"]), max(ratings_df["user_id"]))
mu_matrix = np.ndarray(shape = shape1)  
#Sorting the dataframe
mu_matrix_df = ratings_df.sort_values(["user_id","movie_id"], ascending=[True, True]).reset_index(drop=True)
mu_matrix_df["user_id"] = mu_matrix_df["user_id"]-1
mu_matrix_df["movie_id"] = mu_matrix_df["movie_id"]-1
mu_matrix[mu_matrix_df["movie_id"],mu_matrix_df["user_id"]] = mu_matrix_df["ratings"]
print("m*u matrix: ")
print(mu_matrix)

"""
Comments and Observations:

1) Creating a dummy numpy array with the shape of maximum value movie_id and user_id
2) Sorting the dataframe based in user id and movie id
3) Since movie id and user ids start from 1, making it in compatible with index(By subtracting 1).
4) Filling the matrix index with ratings

"""

m*u matrix: 
[[5. 0. 0. ... 0. 0. 3.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


'\nComments and Observations:\n\n1) Creating a dummy numpy array with the shape of maximum value movie_id and user_id\n2) Sorting the dataframe based in user id and movie id\n3) Since movie id and user ids start from 1, making it in compatible with index(By subtracting 1).\n4) Filling the matrix index with ratings\n\n'

In [5]:
# 1. Normalize the matrix
mean_col = np.mean(mu_matrix, 1)
matrix_normal = mu_matrix - np.array([mean_col]).T
print(matrix_normal)

"""
Comments and Observations:

Normalising the matrix means taking the mean and subtracting the mean across columns.

"""

[[ 3.57400662 -1.42599338 -1.42599338 ... -1.42599338 -1.42599338
   1.57400662]
 [-0.37152318 -0.37152318 -0.37152318 ... -0.37152318 -0.37152318
  -0.37152318]
 [-0.23874172 -0.23874172 -0.23874172 ... -0.23874172 -0.23874172
  -0.23874172]
 ...
 [-0.03278146 -0.03278146 -0.03278146 ... -0.03278146 -0.03278146
  -0.03278146]
 [-0.02582781 -0.02582781 -0.02582781 ... -0.02582781 -0.02582781
  -0.02582781]
 [-0.24288079 -0.24288079 -0.24288079 ... -0.24288079 -0.24288079
  -0.24288079]]


'\nComments and Observations:\n\nNormalising the matrix means taking the mean and subtracting the mean across columns.\n\n'

In [6]:
# 2. Compute SVD to get U, S and V. Use np.linalg.svd()
mat_shape = np.sqrt(mu_matrix.shape[0] - 1)
de_matrix = matrix_normal.T / mat_shape
U, S, V = np.linalg.svd(de_matrix)
print(U)
print(S)
print(V)

"""

Comments and Observations:
Singular-Value Decomposition(SVD) is a matrix decomposition method for reducing a matrix to its constituent parts in order to make certain subsequent matrix calculations simpler.
SVD decomposes the matrix in analagous to an algebric expression factorization(eg.,30=>(5*3*2))

"""

[[ 7.13393053e-03  1.64099327e-03  2.14622406e-03 ...  4.52266370e-03
   6.00783778e-03  8.85674875e-03]
 [ 6.40383513e-04 -2.70126226e-03 -2.00478360e-04 ...  2.54577717e-03
   5.40385541e-04 -1.55791869e-02]
 [ 6.72473390e-03 -3.34737240e-03 -3.95617989e-03 ...  1.71114905e-03
   5.91124462e-03  8.44640940e-03]
 ...
 [ 1.13666709e-02  1.80896437e-03 -5.62198713e-04 ...  7.73909103e-01
  -1.48780057e-02 -2.19959777e-03]
 [ 3.49381899e-03  1.87620989e-02 -1.08962191e-02 ... -9.78238175e-03
   3.32703088e-01 -3.14995403e-03]
 [-1.32856412e-02  4.08015550e-02 -3.63311909e-03 ... -1.09584563e-03
   5.44124149e-03  8.83627250e-02]]
[2.06617808e+01 1.06804930e+01 9.14055972e+00 ... 1.71271486e-15
 1.71271486e-15 3.80173012e-16]
[[-5.72245537e-02 -2.69349804e-02 -1.44300959e-02 ... -3.50288960e-03
  -8.19971409e-04 -1.25419482e-02]
 [-2.09461200e-02 -2.97843268e-02 -1.66998921e-02 ...  1.87545849e-03
   2.26486119e-03  5.02192377e-03]
 [ 2.86333511e-02 -8.62001932e-03  1.36557569e-02 ...  1.

'\n\nComments and Observations:\nSingular-Value Decomposition(SVD) is a matrix decomposition method for reducing a matrix to its constituent parts in order to make certain subsequent matrix calculations simpler.\nSVD decomposes the matrix in analagous to an algebric expression factorization(eg.,30=>(5*3*2))\n\n'

In [8]:
# 3. From your V.T select 50 components.
Vt = V.T[:,0:50]
Vt

array([[-0.05722455, -0.02094612,  0.02863335, ...,  0.01362589,
        -0.00193866,  0.05997806],
       [-0.02693498, -0.02978433, -0.00862002, ...,  0.00439231,
         0.01130363, -0.02143723],
       [-0.0144301 , -0.01669989,  0.01365576, ...,  0.00700238,
        -0.02176034,  0.01457793],
       ...,
       [-0.00350289,  0.00187546,  0.00191671, ...,  0.00069592,
        -0.00668185, -0.006408  ],
       [-0.00081997,  0.00226486,  0.00347437, ..., -0.00036645,
        -0.00401153, -0.00217063],
       [-0.01254195,  0.00502192,  0.02208081, ...,  0.02609   ,
        -0.02060116, -0.01451355]])

In [20]:
# 4. Implement a function that take movieID as input and then implement cosine similarity along with sorting to recommend top 10 movies
def implement_cosine_similarity(vt, movieID):
    movie_row = vt[movieID-1, :]
    norm = np.sqrt(np.diag(np.dot(vt,vt.T)))
    cos_sim_num = np.dot(movie_row, vt.T)
    cos_sim_de =  (norm[movieID-1] * norm)
    cos_sim = cos_sim_num/ cos_sim_de
    index = np.argsort(-cos_sim)
    return index[:10]


indexes = implement_cosine_similarity(Vt, 5)# movie id : 5
print("Recommended Movies similar to {} are:".format(movie_df[movie_df["movie_id"]==5]["movie_name"].iloc[0]))
for i in indexes:
    print(movie_df[movie_df["movie_id"]==i]["movie_name"].iloc[0])

"""
Comments and Observations:

Implementations of Cosine Similarity:
Cosine similarity = dot(a,b)/(norm(a)*norm(b))

"""

Recommended Movies similar to Father of the Bride Part II (1995) are:
Waiting to Exhale (1995)
Brady Bunch Movie, The (1995)
Hard 8 (a.k.a. Sydney, a.k.a. Hard Eight) (1996)
Heavy (1995)
Cobb (1994)
Red Rock West (1992)
Visitors, The (Les Visiteurs) (1993)
Flubber (1997)
Net, The (1995)
Little Mermaid, The (1989)


  cos_sim = cos_sim_num/ cos_sim_de


'\nComments and Observations:\n\nImplementations of Cosine Similarity:\nCosine similarity = dot(a,b)/(norm(a)*norm(b))\n\n'

In [21]:
# 5. Repeat the same process except now instead of using SVD you will use PCA to get the eigenvectors.
# 6. You will require co-variance matrix as an input to your eig function.
# Use np.cov() for getting co-variance matrix.
# Use np.linalg.eig() for getting eigen vectors.
mean_col = np.mean(mu_matrix, 1)
trans_mat = np.matrix(mean_col).T
mat_norm = mu_matrix - trans_mat
cov_matrix = np.cov(mat_norm)
result = np.linalg.eig(cov_matrix)
print(type(result))

<class 'tuple'>


In [22]:
eigenvectors = result[1]

"""
Comments and Observations:

np.linalg.eig() provides both eigen values and eigen vectors

"""

'\nComments and Observations:\n\nnp.linalg.eig() provides both eigen values and eigen vectors\n\n'

In [23]:
#7. Use that same steps after that to get 50 components. Use cosine similarity to get the results
evt = eigenvectors[:,0:50] 
indexes = implement_cosine_similarity(evt, 5)# movie id : 5
print("Recommended Movies similar to {} are:".format(movie_df[movie_df["movie_id"]==5]["movie_name"].iloc[0]))
for i in indexes:
    print(movie_df[movie_df["movie_id"]==i]["movie_name"].iloc[0])
    
"""
Comments and Observations:

Insead of SVD, PCA has been used. Eigen vectors has been passed as input for cosine similarity. 

"""

Recommended Movies similar to Father of the Bride Part II (1995) are:
Waiting to Exhale (1995)
Brady Bunch Movie, The (1995)
Hard 8 (a.k.a. Sydney, a.k.a. Hard Eight) (1996)
Heavy (1995)
Cobb (1994)
Red Rock West (1992)
Visitors, The (Les Visiteurs) (1993)
Flubber (1997)
Net, The (1995)
Little Mermaid, The (1989)


  cos_sim = cos_sim_num/ cos_sim_de


'\nComments and Observations:\n\nInsead of SVD, PCA has been used. Eigen vectors has been passed as input for cosine similarity. \n\n'

In [26]:
# 8. Compare the results for SVD and PCA.


**Comments and Observations:**

From the results of both SVD and PCA, it is evident that both SVD and PCA provides similar recommendations.

Few observations are:

1) In general, SVD helps us to decompose and untangle the data into separate components whereas PCA shrinks the number of component by omitting the less significant component. In both the cases, significant components are being considered. In our case, Covariance Matrix of normalised matrix(mat_norm) equals the eigen vector. Hence the results are same.


2) SVD is comparatively faster and singular values from SVD are sorted.

3) PCA is a special case of SVD. PCA needs the data to be normalised which used SVD in calculation with extra analysis. 
