In [1]:
import numpy as np
from scipy.sparse import csr_matrix
from numba import jit
from pickle import dump, load
import pandas as pd

In [25]:
def load_data():
    with open('data/train.txt', 'r') as f:
        data = f.readlines()
    data = np.array([line.strip().split('\t') for line in data], dtype=np.int64)
    users, items, ratings = data.T
    users -= 1
    items -= 1
    data_matrix = csr_matrix((ratings,(users,items))).toarray()
    
    with open('test.txt', 'r') as f:
        test = f.readlines()
    test = np.array([line.strip().split('\t') for line in test], dtype=np.int64)
    
    return data, data_matrix, test

In [26]:
data, data_matrix, test = load_data()

In [27]:
data_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]], dtype=int64)

In [4]:
class ALS:
    def __init__(self, data_matrix, alpha=7):
        were_rated = data_matrix > 0
        ratings = data_matrix#.astype(np.float64)
        ratings[np.logical_not(were_rated)] = np.mean(ratings[were_rated])
        self.were_rated = were_rated
        self.preference = ratings
        self.alpha = alpha
        self.confidence = 1 + self.alpha*ratings
        self.n_users, self.n_items = data_matrix.shape
    
    def fit(self, n_factors=5, iterations=15, l=0.1, initializer_coefficient=0.01):
        self.n_factors = n_factors
        self.iterations = iterations
        self.l = l
        self.initializer_coefficient = initializer_coefficient
        
        user_factor = self.initializer_coefficient * np.random.rand(self.n_users, self.n_factors) # X
        item_factor = self.initializer_coefficient * np.random.rand(self.n_factors, self.n_items) # Y
        lI = l * np.identity(n_factors)


        for i in range(self.iterations):
    
            for user, Cu in enumerate(self.confidence):
                YtCuY = np.dot(Cu*item_factor, item_factor.T)#np.dot(ItemFactor, np.dot(np.diag(Cu), ItemFactor.T)
                user_factor[user] = np.linalg.solve(YtCuY + lI,
                               np.dot(item_factor, np.dot(np.diag(Cu), self.preference[user].T))).T #
            for item, Ci in enumerate(self.confidence.T):
                XtCiX = np.dot(Ci*user_factor.T, user_factor)#np.dot(UserFactor.T, np.dot(np.diag(Ci), UserFactor)
                item_factor[:,item] = np.linalg.solve(XtCiX + lI,
                                 np.dot(user_factor.T, np.dot(np.diag(Ci), self.preference[:, item]))) #
        self.user_factor, self.item_factor = user_factor, item_factor

In [5]:
model = ALS(data_matrix=data_matrix)

In [6]:
model.fit()

In [8]:
def get_error(Q, Predictions, W):
    return np.sqrt(np.sum((W * (Q - Predictions))**2)/W.sum())

In [9]:
U, I = model.user_factor, model.item_factor

In [10]:
predictions = np.dot(U, I)

In [11]:
get_error(data_matrix.astype(np.float64), predictions, data_matrix > 0)

0.26554033082126893

In [28]:
class ImplicitALS:
    def __init__(self, data_matrix, alpha=7):
        were_rated = data_matrix > 0
        ratings = data_matrix.astype(np.float64)
        ratings[np.logical_not(were_rated)] = np.mean(ratings[were_rated])
        self.were_rated = were_rated
        self.preference = ratings
        self.alpha = alpha
        self.confidence = 1 + self.alpha*ratings
        self.n_users, self.n_items = data_matrix.shape
    
    def fit(self, n_factors=5, iterations=15, l=0.1, initializer_coefficient=0.01):
        self.n_factors = n_factors
        self.iterations = iterations
        self.l = l
        self.initializer_coefficient = initializer_coefficient
        
        user_factor = self.initializer_coefficient * np.random.rand(self.n_users, self.n_factors) # X
        item_factor = self.initializer_coefficient * np.random.rand(self.n_items, self.n_factors) # Y
        lI = l * np.identity(n_factors)


        for i in range(self.iterations):
    
            for user, Cu in enumerate(self.confidence):
                YtCuY = np.dot(Cu*item_factor.T, item_factor)#np.dot(ItemFactor, np.dot(np.diag(Cu), ItemFactor.T)
                user_factor[user] = np.linalg.solve(YtCuY + lI,
                               np.dot(item_factor.T, np.dot(np.diag(Cu), self.preference[user].T))).T #
            for item, Ci in enumerate(self.confidence.T):
                XtCiX = np.dot(Ci*user_factor.T, user_factor)#np.dot(UserFactor.T, np.dot(np.diag(Ci), UserFactor)
                item_factor[item] = np.linalg.solve(XtCiX + lI,
                                np.dot(user_factor.T, np.dot(np.diag(Ci), self.preference[:, item]))) #
        self.user_factor, self.item_factor = user_factor, item_factor

In [29]:
model = ImplicitALS(data_matrix=data_matrix)

In [30]:
model.fit(iterations=1)

In [31]:
U, I = model.user_factor, model.item_factor

In [32]:
predictions = np.dot(U, I.T)

In [33]:
get_error(data_matrix.astype(np.float64), predictions, data_matrix > 0)

1.065890998442184

In [199]:
class ImplicitALSBiasesPlain:
    def __init__(self, data_matrix, test_indices, alpha=7, eps=0.02):
        self.eps = eps
        self.alpha = alpha
        
        self.were_rated_train = data_matrix > 0
        self.were_rated_test = np.full_like(data_matrix, False)
        self.were_rated_test[test_indices[:,0], test_indices[:,1]] = True
        
        ratings = data_matrix.astype(np.float64)
        self.mean = np.mean(ratings[self.were_rated_train])
        
        
        self.preference = data_matrix.astype(np.float64) - self.were_rated_train*self.mean
        #self.preference[np.logical_not(self.were_rated_train)] = 0.5*self.mean # is fixed for test in the next line
        #self.preference[self.were_rated_test] = self.mean
        
        self.confidence = data_matrix.astype(np.float64)
        self.confidence[self.were_rated_test] = 0.85
        #self.confidence = np.log(self.confidence/eps + 1)
        self.confidence = 1 + alpha * np.log(1 + self.confidence / eps)
        
        #self.confidence = 1 + self.alpha*ratings
        self.n_users, self.n_items = data_matrix.shape
    
    def fit(self, n_factors=5, iterations=15, l=0.1, initializer_coefficient=0.01):
        self.n_factors = n_factors
        self.iterations = iterations
        self.l = l
        self.initializer_coefficient = initializer_coefficient
        
        user_factor = self.initializer_coefficient * np.random.rand(self.n_users, self.n_factors + 1) # X change init
        item_factor = self.initializer_coefficient * np.random.rand(self.n_items, self.n_factors + 1) # Y change init
        lI = l * np.identity(n_factors + 1)
        

        user_bias = np.zeros(self.n_users)
        item_bias = np.zeros(self.n_items)

       

        for i in range(self.iterations):
            preference_user = self.preference - user_bias[:, None]
            preference_item = self.preference - item_bias[None, :]
            
            user_factor[:, 0] = user_bias
            item_factor[:, 0] = np.ones(self.n_items)
            

            YtY = np.dot(item_factor.T, item_factor)
            
            for user in range(self.n_users):
                left = YtY + np.dot(item_factor.T * (self.confidence[user, :] - 1), item_factor) + lI
                right = np.dot(item_factor.T * self.confidence[user, :], preference_item[user, :])
                user_factor[user, :] = np.linalg.solve(left, right)

            
            user_bias = np.copy(user_factor[:, 0])
            
            
            
            
            user_factor[:, 0] = np.ones(self.n_users)
            item_factor[:, 0] = item_bias

            XtX = np.dot(user_factor.T, user_factor)
        
            for item in range(self.n_items):
                left = XtX + np.dot(user_factor.T * (self.confidence[:, item] - 1), user_factor) + lI
                right = np.dot(user_factor.T * self.confidence[:, item], preference_user[:, item])
                item_factor[item, :] = np.linalg.solve(left, right)
                
                
            
            item_bias = np.copy(item_factor[:, 0])
            #for user, Cu in enumerate(self.confidence):
                #YtCuY = np.dot(Cu*item_factor.T, item_factor)#np.dot(ItemFactor, np.dot(np.diag(Cu), ItemFactor.T)
                #user_factor[user] = np.linalg.solve(YtCuY + lI,
                               #np.dot(item_factor.T, np.dot(np.diag(Cu), self.preference[user].T))).T #
            #for item, Ci in enumerate(self.confidence.T):
                #XtCiX = np.dot(Ci*user_factor.T, user_factor)#np.dot(UserFactor.T, np.dot(np.diag(Ci), UserFactor)
                #item_factor[item] = np.linalg.solve(XtCiX + lI,
                                #np.dot(user_factor.T, np.dot(np.diag(Ci), self.preference[:, item]))) #
        self.user_bias = user_bias
        self.item_bias = item_bias
        self.user_factor = user_factor[:, 1:]
        self.item_factor = item_factor[:, 1:]

In [224]:
model = ImplicitALSBiasesPlain(data_matrix=data_matrix, test_indices=test-1, eps=0.025, alpha=500)

In [225]:
model.fit(iterations=200, n_factors=4, l=0.6)

In [226]:
U, I = model.user_factor, model.item_factor

In [227]:
predictions = np.dot(U, I.T)

In [228]:
predictions

array([[-4.64997646e-02,  1.02345605e-01,  4.94662143e-01, ...,
        -9.37188287e-02, -3.31794427e-02, -2.93368447e-02],
       [-1.61304809e-02,  1.45198707e-01,  5.46676761e-01, ...,
        -2.19745803e-02, -2.92519173e-02,  1.65252444e-03],
       [-4.38621432e-01, -4.83082161e-01,  6.94401243e-01, ...,
         1.64419643e-01, -7.38786687e-02, -2.66580612e-01],
       ...,
       [-6.16087760e-02, -1.06124213e-01, -7.68298320e-01, ...,
        -3.83247863e-01, -1.54888376e-02, -1.01624711e-01],
       [ 1.29125985e-01,  4.19480086e-02, -1.34165719e+00, ...,
         1.55533839e-01,  1.55007720e-02,  8.87717330e-02],
       [ 1.25983403e-01,  3.73870034e-01,  1.69776376e+00, ...,
        -9.12020317e-01,  2.34864697e-02, -5.80149037e-03]])

In [214]:
version_name = 'biased_1log_eps0_02_alpha7_l0_6_factor4'

In [215]:
with open ('models/'+version_name+'.pkl', 'wb') as f:
    dump(model, f)

In [216]:
def make_csv(predictions, test, version_name):
    predictions = np.clip(predictions, 1, 5)
    df = pd.DataFrame(predictions[test[:,0] - 1, test[:,1] - 1], columns=['Score'])
    df.index += 1
    df.index.name='Id' 
    df.to_csv('submissions/'+version_name+'.csv',index=True)

In [217]:
make_csv(try_pred, test, version_name)