In [1]:
import sys, codecs, re, pdb, time, datetime, math, tqdm, pickle, numpy as np
from numpy import genfromtxt
from numpy import linalg as LA
from scipy.sparse import csr_matrix

def build_movies_dict(movies_file):
    i = 0
    movie_id_dict = {}
    with codecs.open(movies_file, 'r', 'utf-8') as f:
        for line in f:
            if i == 0:
                i = i+1
            else:
                line1 = line.split(',')
                movieId = line1[0]
                genres = line1[2][:-1]
                title = "".join(line1[1:-1])
                movie_id_dict[int(movieId)] = i-1
                i = i+1
    return movie_id_dict

def read_data(input_file,movies_dict):
    users =  138493
    movies = 27278
    i = 0
    #col, row, data = [], [], []
    ret = []

    prev_user = 0;
    with open(input_file,'r') as f:
        for line in f:
            #print(line)
            if i == 0:
                i = i +1
            else:
                user,movie_id,rating,timestamp = line.split(',')
                id = movies_dict[int(movie_id)]
                cur_row = int(user) - 1
                cur_col = id
                #cur_data = int(float(rating) * 2)
                cur_data = float(rating)
                ret.append((cur_row, cur_col, cur_data))
    
    #X = csr_matrix((data, (row, col)), shape=(users, movies))
    #print(X.shape, X.shape)
    #print(len(col), len(row), len(data))
    
    #return X
    return ret

def loader(test_ratings_file, train_ratings_file, no_of_features, movies_mapping_file):
    movies_dict = build_movies_dict(movies_mapping_file)
    print("Reading dicitionary")
    test_numpy_arr = read_data(test_ratings_file,movies_dict)
    train_numpy_arr = read_data(train_ratings_file,movies_dict)
    print("Reading ratings")
    return test_numpy_arr, train_numpy_arr


In [2]:
print(datetime.datetime.now().time())
R_train, R_test = loader("1_test_ratings.txt", "1_train_ratings.txt", "20", "ml-20m/movies.csv")
print(datetime.datetime.now().time())

16:14:08.517171
Reading dicitionary
Reading ratings
16:14:34.509455


In [40]:
def matrix_factorization(R_train, R_test, V, W, rank, steps, lambd, eta):
    RMSE_old = 0.0
    for i in range(steps):
        print("Step" + str(i + 1), "\t", datetime.datetime.now().time())
        
        error_sum = 0.0
        for data in R_train:
            (user,movie,rating) = data
            error = rating - np.dot(V[user,:],W[movie,:])
            error_sum = error_sum + error**2
            
            V[user, :] = V[user, :] + eta * ((error * W[movie,:]) - (lambd * V[user,:]))
            W[movie, :] = W[movie, :] + eta * ((error * V[user,:]) - (lambd * W[movie,:]))

        RMSE_train = math.sqrt(error_sum / len(R_train))
        print("Train:\t", datetime.datetime.now().time(), "\t", RMSE_train)
        
        error_sum = 0.0
        predicted_ranking=[[] for i in range(138493)]
        real_rating=[[] for i in range(138493)]
        
        for data in R_test:
            (user,movie,rating) = data
            calc_rating = np.dot(V[user,:],W[movie,:])
            
            error = rating - calc_rating
            error_sum = error_sum + error**2
            
            predicted_ranking[user].append((movie, calc_rating))
            if(rating >= 3.0):
                real_rating[user].append(movie)
        

        RMSE_test = math.sqrt(error_sum / len(R_test))
        print("Test:\t", datetime.datetime.now().time(), "\t", RMSE_test)
        
        MRR = 0.0
        for idx, user_movies in enumerate(real_rating):
            predicted_ranking[idx].sort(key=lambda x: x[1], reverse=True)
            RR = 0.0
            for movie in user_movies:
                RR = RR + 1.0/(next((i for i, v in enumerate(predicted_ranking[idx]) if v[0] == movie), None) + 1)
            
            cnt = len(user_movies)
            if(cnt > 0):
                MRR = MRR + RR / cnt
        MRR = MRR / 138493.0
        print("MRR:\t", datetime.datetime.now().time(), "\t", MRR, "\n")
        
        with open("output/rank_" + str(rank) + "_lambda_" + str(lambd) + "_iter_" + str(i+1) + "_step_" + str(eta) + ".pkl", 'wb') as f:
            pickle.dump([V, W, rank, lambd, i+1, eta, RMSE_train, RMSE_test, MRR], f)

        '''
        with open('objs.pkl', 'rb') as f:
            A1, B1 = pickle.load(f)
            print(A1, B1)
        '''
        
        if(abs(RMSE_train - RMSE_old) < 0.001):
           break
        RMSE_train
        
    return V, W, RMSE_train, RMSE_test, MRR

def train_V_W():
    n = 138493
    m = 27278
    rank=10
    steps=15
    lambd=0.02
    eta=0.002
    
    V = np.random.rand(n,rank)
    W = np.random.rand(m,rank)
    
    est_V, est_W, RMSE_train, RMSE_test, MRR = matrix_factorization(R_train, R_test, V, W, rank, steps, lambd, eta)

train_V_W()


Step1 	 19:04:00.409123
Train:	 19:05:45.254858 	 1.0374772348432586
Test:	 19:06:04.946873 	 0.9506846901931513
MRR:	 19:07:05.267936 	 0.14467265850482763 

Step2 	 19:07:05.283894
Train:	 19:08:49.922779 	 0.9193230880409824
Test:	 19:09:07.725400 	 0.9061679712382976
MRR:	 19:10:12.563627 	 0.1447838849155008 

Step3 	 19:10:12.583711
Train:	 19:11:57.777240 	 0.8915218493664653
Test:	 19:12:15.312098 	 0.8903523337298446
MRR:	 19:13:17.131527 	 0.1448083220384447 

Step4 	 19:13:17.153909
Train:	 19:14:57.571975 	 0.8788086035179083
Test:	 19:15:15.287643 	 0.8824449008929348
MRR:	 19:16:19.091973 	 0.14483250758640848 

Step5 	 19:16:19.124074
Train:	 19:18:01.587700 	 0.8715796757920908
Test:	 19:18:19.200475 	 0.8778818371369137
MRR:	 19:19:22.393752 	 0.14483842222187862 

Step6 	 19:19:22.420642
Train:	 19:21:02.641238 	 0.8669794375165654
Test:	 19:21:21.313349 	 0.8750170194725415
MRR:	 19:22:24.498863 	 0.14482708194764515 

Step7 	 19:22:24.517405
Train:	 19:24:09.387056 