In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import scipy.sparse as ss
import pandas as pd
from matrixfactorization import MatrixFactorizationRec

In [2]:
def initialize(file_name,cols,num_movies,num_users,header=True):
    """Load a ratings CSV and compute baseline predictors.

    Parameters
    ----------
    file_name : str
        Path of a comma-separated ratings file.
    cols : sequence of int
        The three column indices to read, in the order
        (user_id, movie_id, rating).
    num_movies : int
        Length of the returned movie-average array; movie ids are expected
        to lie in 1..num_movies.
    num_users : int
        Length of the returned user-bias array; user ids are expected to
        lie in 1..num_users.
    header : bool, optional
        When true (the default) the first line of the file is skipped.

    Returns
    -------
    movies : ndarray, shape (n_ratings, 3)
        The loaded rows as floats: (user_id, movie_id, rating).
    user_bias : ndarray, shape (num_users,)
        ``user_bias[u-1]`` is user ``u``'s mean rating minus the global mean;
        0 for users that have no rating in the file.
    movie_averages : ndarray, shape (num_movies,)
        ``movie_averages[m-1]`` is the average for movie ``m`` shrunk towards
        the global mean, ``(global_mean*k + sum) / (k + count)`` with
        ``k = movie_var / global_var``; the global mean for movies that have
        no rating in the file.

    Raises
    ------
    ValueError
        If the file contains no rating rows.
    IndexError
        If a user or movie id falls outside 1..num_users / 1..num_movies.
    """
    skip = 1 if header else 0
    # ndmin=2 keeps a single-row file two-dimensional so the column slices
    # below keep working.
    movies = np.loadtxt(file_name, delimiter=',', skiprows=skip, usecols=cols,
                        ndmin=2)
    # Parenthesised form is valid under both Python 2 and Python 3.
    print(type(movies))

    if movies.size == 0:
        raise ValueError("no ratings found in %r" % (file_name,))

    rating_values = movies[:, 2]
    global_mean = np.mean(rating_values)
    global_var = np.var(rating_values)

    # loadtxt returns floats; array indices must be integers (0-based).
    user_idx = movies[:, 0].astype(int) - 1
    movie_idx = movies[:, 1].astype(int) - 1
    # Reject out-of-range ids explicitly: an id of 0 would otherwise wrap
    # around to the last slot instead of failing.
    if movie_idx.min() < 0 or movie_idx.max() >= num_movies:
        raise IndexError("movie ids must lie in 1..%d" % num_movies)
    if user_idx.min() < 0 or user_idx.max() >= num_users:
        raise IndexError("user ids must lie in 1..%d" % num_users)

    # Movie baseline prediction
    # Per-movie count / sum / sum of squares in a single pass over the data,
    # instead of one boolean mask over the whole table per movie.
    movie_counts = np.bincount(movie_idx, minlength=num_movies)
    movie_sums = np.bincount(movie_idx, weights=rating_values,
                             minlength=num_movies)
    movie_sq_sums = np.bincount(movie_idx, weights=rating_values ** 2,
                                minlength=num_movies)
    rated = movie_counts > 0
    counts = movie_counts[rated]
    sums = movie_sums[rated]
    means = sums / counts
    # Population variance as E[x^2] - E[x]^2; clip tiny negative values
    # caused by floating-point rounding.
    movie_vars = np.maximum(movie_sq_sums[rated] / counts - means ** 2, 0.0)
    if global_var > 0:
        k = movie_vars / global_var
    else:
        # All ratings are identical: every movie variance is 0 as well, so no
        # shrinkage is needed (and 0/0 would otherwise produce NaN).
        k = np.zeros_like(movie_vars)

    # Movies without any rating fall back to the global mean.
    movie_averages = np.full(num_movies, global_mean, dtype=float)
    movie_averages[rated] = (global_mean * k + sums) / (k + counts)

    # User bias baseline prediction
    # (bias = users avg rating - global avg rating)
    user_counts = np.bincount(user_idx, minlength=num_users)
    user_sums = np.bincount(user_idx, weights=rating_values,
                            minlength=num_users)
    active = user_counts > 0
    user_bias = np.zeros(num_users)
    user_bias[active] = user_sums[active] / user_counts[active] - global_mean

    return movies,user_bias,movie_averages

In [3]:
# Load the training ratings and build the baseline arrays; 3952 and 6040 are
# passed as num_movies and num_users, i.e. the lengths of the returned
# movie_averages and bias arrays.
ratings,bias,movie_averages = initialize('data/training_ratings_for_kaggle_comp.csv',(0,1,2),3952,6040)
# Columns of `ratings` (the array named `movies` inside initialize):
# (user_id, movie_id, rating)

<type 'numpy.ndarray'>


In [4]:
# Sanity check: show the second loaded row as (user_id, movie_id, rating).
ratings[1,:]

array([ 2783.,   589.,     5.])

In [5]:
# Per-user bias vector: index u-1 holds user u's mean rating minus the global
# mean; entries stay 0 for user ids that never appear in the file.
bias

array([ 0.        ,  0.        ,  0.        , ...,  0.19777644,
        0.27582523, -0.0581984 ])

In [6]:
# Baseline averages stored at indices 1..4, i.e. movie ids 2..5
# (initialize writes movie m at index m-1).
movie_averages[1:5]

array([ 3.23153168,  2.98056173,  2.77824185,  2.96398931])

In [7]:
# NOTE(review): the arguments are presumably (number of latent factors,
# learning rate, regularization strength) -- TODO confirm against the
# matrixfactorization module, which is not visible from this notebook.
mfr = MatrixFactorizationRec(30,0.001,0.02)

In [8]:
# Fit the model on the loaded ratings together with the movie and user
# baselines. In the recorded run this cell was stopped by hand
# (KeyboardInterrupt) after roughly 2.4 million progress steps.
mfr.fit(ratings, movie_averages, bias)

0 , 428728.003057 , -100.0
100000 , 428405.787464 , 0.0752127077796
200000 , 428099.590248 , 0.0715247624967
300000 , 427809.894242 , 0.0677160605328
400000 , 427530.546606 , 0.0653398073593
500000 , 427274.027551 , 0.0600361918285
600000 , 427021.648621 , 0.059102139431
700000 , 426781.586719 , 0.0562493578922
800000 , 426550.473365 , 0.0541819475692
900000 , 426331.29397 , 0.0514105809558
1000000 , 426116.26254 , 0.0504630891691
1100000 , 425912.570601 , 0.047824824561
1200000 , 425720.026748 , 0.0452278118283
1300000 , 425529.402047 , 0.0447970692511
1400000 , 425341.547125 , 0.0441656649521
1500000 , 425160.78296 , 0.0425166601011
1600000 , 424985.573383 , 0.041227182173
1700000 , 424820.927846 , 0.0387564563363
1800000 , 424657.591959 , 0.0384629615799
1900000 , 424500.811119 , 0.0369329894509
2000000 , 424349.427965 , 0.0356741743168
2100000 , 424199.086874 , 0.0354411632929
2200000 , 424052.233242 , 0.0346310242538
2300000 , 423906.82007 , 0.0343030980557
2400000 , 423769.121643

KeyboardInterrupt: 

In [9]:
# NOTE(review): presumably returns the predicted rating of every movie for a
# single user -- confirm whether 15 is a user id or a 0-based index.
mfr.pred_one_user(15)

array([ 4.14776508,  3.3061175 ,  3.06619199, ...,  3.68242466,
        3.68296147,  4.04829151])