In [39]:
#import packages
import pandas as pd
import numpy as np
import os
from scipy.linalg import svd

In [10]:
#return a dataframe of all ratings
def get_ratings(part='u.data'):
    """Return a DataFrame of user-movie ratings."""
    return pd.read_csv(
        os.path.join('ml-100k', part), header=None, sep='\t',
        names=['user_id', 'item_id', 'rating', 'timestamp'],
    ).rename(columns={'item_id': 'movie_id'})

get_ratings()
#read data file and transform it into data frame
df_train = get_ratings('ua.base')
df_test = get_ratings('ua.test')

df_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [11]:
ITEM_PROPS = ['movie_id', 'movie_title', 'video_release_date', 'unknown', 'IMDb_URL']
GENRES = ['Action', 'Adventure', 'Animation',
          'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
          'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
          'Thriller', 'War', 'Western']

def get_movies():
    """Return a DataFrame of all movies."""
    return pd.read_csv(
        os.path.join('ml-100k', 'u.item'), header=None, index_col=False, sep='|', encoding="utf-16",
        names=ITEM_PROPS + GENRES,
    )
movies = get_movies()
df_train = pd.merge(df_train,movies[movies.columns[0:2]],on=['movie_id'])
df_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title
0,1,1,5,874965758,Toy Story (1995)
1,2,1,4,888550871,Toy Story (1995)
2,6,1,4,883599478,Toy Story (1995)
3,10,1,4,877888877,Toy Story (1995)
4,13,1,3,882140487,Toy Story (1995)


In [None]:
# apply svd
# U, sigma, V = np.linalg.svd(post_words)


In [23]:
user_item_matrix = df_train.pivot_table(index='user_id', columns='movie_title',values='rating')
user_item_matrix.head(20)
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix.head(20)

movie_title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,� k�ldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0
6,0.0,0.0,0.0,4.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,4.0,0.0,0.0,5.0,5.0,0.0,4.0,...,0.0,0.0,0.0,5.0,3.0,0.0,3.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# Extract the user and movies as lists
user_list = list(user_item_matrix.index)
movie_list = list(user_item_matrix.columns)

In [74]:
from scipy.sparse import coo_matrix

R = coo_matrix(user_item_matrix.values)

print ("R Shape::", R.shape)
print ("R Columns::", R.col)
print ("R Rows::",R.row)
print(R)

R Shape:: (943, 1662)
R Columns:: [   2    3    6 ... 1642 1643 1657]
R Rows:: [  0   0   0 ... 942 942 942]
  (0, 2)	2.0
  (0, 3)	5.0
  (0, 6)	3.0
  (0, 7)	4.0
  (0, 16)	3.0
  (0, 17)	3.0
  (0, 31)	1.0
  (0, 35)	4.0
  (0, 36)	4.0
  (0, 41)	5.0
  (0, 44)	5.0
  (0, 46)	1.0
  (0, 50)	5.0
  (0, 84)	5.0
  (0, 87)	3.0
  (0, 88)	4.0
  (0, 93)	2.0
  (0, 94)	4.0
  (0, 104)	4.0
  (0, 109)	1.0
  (0, 113)	5.0
  (0, 115)	2.0
  (0, 127)	3.0
  (0, 131)	1.0
  (0, 133)	1.0
  :	:
  (942, 1454)	4.0
  (942, 1458)	4.0
  (942, 1459)	4.0
  (942, 1476)	3.0
  (942, 1486)	1.0
  (942, 1493)	5.0
  (942, 1495)	2.0
  (942, 1505)	4.0
  (942, 1511)	4.0
  (942, 1514)	4.0
  (942, 1522)	5.0
  (942, 1523)	4.0
  (942, 1531)	4.0
  (942, 1532)	5.0
  (942, 1557)	5.0
  (942, 1558)	3.0
  (942, 1570)	5.0
  (942, 1579)	2.0
  (942, 1613)	4.0
  (942, 1630)	2.0
  (942, 1631)	4.0
  (942, 1639)	1.0
  (942, 1642)	3.0
  (942, 1643)	2.0
  (942, 1657)	3.0


In [77]:
M,N = R.shape
factor = 3
P=np.random.rand(M,factor)
Q=np.random.rand(factor,N)

In [81]:
from numpy.linalg import norm

def error(R,P,Q,lamda=0.02):
    ratings = R.data
    rows = R.row
    cols = R.col
    e = 0 
    for ui in range(len(ratings)):
        rui=ratings[ui]
        u = rows[ui]
        i = cols[ui]
        if rui>0:
            e= e + pow(rui-np.dot(P[u,:],Q[:,i]),2)+\
                lamda*(pow(norm(P[u,:]),2)+pow(norm(Q[:,i]),2))
    return e

# Calculate the initial rmse
rmse = np.sqrt(error(R,P,Q)/len(R.data))
rmse

3.0278075530101587

In [82]:
def SGD(R, K, lamda=0.02,steps=10, gamma=0.001):
    
    M,N = R.shape
    P = np.random.rand(M,K)
    Q = np.random.rand(K,N)
    
    rmse = np.sqrt(error(R,P,Q,lamda)/len(R.data))
    print("Initial RMSE: "+str(rmse))
    
    for step in range(steps):
        for ui in range(len(R.data)):
            rui=R.data[ui]
            u = R.row[ui]
            i = R.col[ui]
            if rui>0:
                eui=rui-np.dot(P[u,:],Q[:,i])
                P[u,:]=P[u,:]+gamma*2*(eui*Q[:,i]-lamda*P[u,:])
                Q[:,i]=Q[:,i]+gamma*2*(eui*P[u,:]-lamda*Q[:,i])
        rmse = np.sqrt(error(R,P,Q,lamda)/len(R.data))
        if rmse<0.5:
            break
    print("Final RMSE: "+str(rmse))
    return P,Q
P,Q=SGD(R,K=3,gamma=0.0007,lamda=0.01, steps=100)

Initial RMSE: 3.0314629898374514
Final RMSE: 0.9268945832989074


In [87]:
all_user_ratings =np.matmul(P, Q)
all_user_ratings_df = pd.DataFrame(np.round(all_user_ratings,4),columns=movie_list, index=user_list).round(2)
all_user_ratings_df.shape
all_user_ratings_df.head(10)

Unnamed: 0,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,� k�ldum klaka (Cold Fever) (1994)
1,2.23,3.28,3.02,4.53,3.28,3.43,3.6,3.98,1.54,4.36,...,1.96,3.35,2.07,4.22,3.36,3.08,3.83,2.56,3.46,2.64
2,2.19,3.2,3.0,4.51,3.19,3.4,3.61,4.11,1.53,4.29,...,1.85,3.3,2.09,4.13,3.36,2.98,3.67,2.51,3.46,2.73
3,2.23,3.02,2.68,3.58,2.47,2.77,3.03,3.1,1.63,3.7,...,1.64,2.62,1.72,3.42,3.06,2.35,3.13,2.01,2.89,2.51
4,2.58,3.87,3.69,5.71,4.06,4.28,4.51,5.27,1.77,5.32,...,2.28,4.17,2.62,5.18,4.1,3.78,4.57,3.18,4.32,3.33
5,2.09,2.88,2.7,3.75,2.5,2.86,3.16,3.53,1.54,3.7,...,1.51,2.69,1.83,3.44,3.08,2.34,2.98,2.05,3.0,2.66
6,2.15,3.06,2.82,4.07,2.84,3.1,3.33,3.69,1.53,3.98,...,1.71,2.97,1.92,3.78,3.19,2.67,3.36,2.27,3.18,2.62
7,2.22,3.31,3.17,4.87,3.43,3.65,3.87,4.54,1.54,4.54,...,1.93,3.55,2.25,4.41,3.53,3.19,3.86,2.7,3.7,2.91
8,2.32,3.42,3.15,4.75,3.44,3.59,3.76,4.17,1.59,4.56,...,2.05,3.51,2.16,4.42,3.5,3.24,4.02,2.69,3.62,2.74
9,2.26,3.37,3.13,4.79,3.46,3.61,3.78,4.28,1.55,4.54,...,2.01,3.53,2.18,4.42,3.48,3.24,3.98,2.7,3.64,2.75
10,2.26,3.35,3.14,4.77,3.41,3.59,3.79,4.32,1.56,4.52,...,1.98,3.5,2.19,4.38,3.49,3.19,3.92,2.68,3.64,2.8


In [89]:
## Test dataframe RMSE
test_user_item_matrix = df_test.pivot_table(index='user_id', columns='movie_id',values='rating')
test_user_item_matrix = test_user_item_matrix.fillna(0)

In [91]:
Rt = coo_matrix(test_user_item_matrix.values)

In [92]:
Mt,Nt = Rt.shape
factor = 3
Pt=np.random.rand(Mt,factor)
Qt=np.random.rand(factor,Nt)

In [93]:
Pt,Qt=SGD(Rt,K=3,gamma=0.0007,lamda=0.01, steps=100)

Initial RMSE: 3.071264441417131
Final RMSE: 0.8947554434342804
