In [1]:
from pyspark import SparkContext
from scipy import sparse as sm
from sklearn.preprocessing import normalize
import numpy as np
import csv
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm_notebook as tqdm
import time
from sparsesvd import sparsesvd
sc = SparkContext.getOrCreate()

### Data initialization and matrices creation

In [2]:
# Raw text RDDs: one element per line of each CSV file, header line included.
train_rdd, icm_rdd, test_rdd = (
    sc.textFile(csv_path)
    for csv_path in ("data/train.csv", "data/icm_fede.csv", "data/target_users.csv")
)

# The first line of every file is its header.
train_header = train_rdd.first()
icm_header = icm_rdd.first()
test_header = test_rdd.first()


def _split_without_header(rdd, header):
    """Drop every line equal to `header` and split the remaining lines on commas."""
    return rdd.filter(lambda line: line != header).map(lambda line: line.split(','))


# (user, item, rating) triples.
train_clean_data = _split_without_header(train_rdd, train_header).map(
    lambda fields: (int(fields[0]), int(fields[1]), float(fields[2])))
# (item, feature) pairs.
icm_clean_data = _split_without_header(icm_rdd, icm_header).map(
    lambda fields: (int(fields[0]), int(fields[1])))
# Target-user rows, still as lists of strings.
test_clean_data = _split_without_header(test_rdd, test_header)

# Ids of the users we must produce recommendations for, in file order.
test_users = test_clean_data.map(lambda fields: int(fields[0])).collect()


# Items every target user has already rated: {user_id: [item_id, ...]}.
# Membership is tested against a set: `x in list` is a linear scan that would be
# repeated for every training row inside the Spark filter.
test_users_set = set(test_users)
grouped_rates = (
    train_clean_data
    .filter(lambda x: x[0] in test_users_set)
    .map(lambda x: (x[0], x[1]))
    .groupByKey()
    .mapValues(list)
    .collect()
)
grouped_rates_dic = dict(grouped_rates)
# A target user without any training interaction never reaches groupByKey, so give
# it an explicit empty list; later cells index this dict by every test user.
for target_user in test_users:
    grouped_rates_dic.setdefault(target_user, [])


def _add_rating(acc, rating):
    """Fold one rating into a (sum, count) accumulator."""
    return (acc[0] + rating, acc[1] + 1)


def _merge_accumulators(left, right):
    """Combine two (sum, count) accumulators coming from different partitions."""
    return (left[0] + right[0], left[1] + right[1])


# Despite its name, `item_ratings` is keyed by x[0], i.e. by user id:
# it holds (user, (sum of ratings, number of ratings)).
item_ratings = (
    train_clean_data
    .map(lambda x: (x[0], x[2]))
    .aggregateByKey((0, 0), _add_rating, _merge_accumulators)
)
# Plain average rating per user, as a list of pairs and as a dict.
user_ratings_mean = item_ratings.mapValues(lambda acc: acc[0] / acc[1]).collect()
user_ratings_mean_dic = dict(user_ratings_mean)


# (item, (sum of ratings, number of ratings)), keyed by x[1], the item id.
item_ratings_forTop = (
    train_clean_data
    .map(lambda x: (x[1], x[2]))
    .aggregateByKey((0, 0), _add_rating, _merge_accumulators)
)
# Added to the rating count in the denominator, so items with few ratings get a
# lower mean.
shrinkage_factor = 5
# Item ids ordered by shrunk mean rating, best first.
item_ratings_mean = (
    item_ratings_forTop
    .mapValues(lambda acc: acc[0] / (acc[1] + shrinkage_factor))
    .sortBy(lambda pair: pair[1], ascending=False)
    .map(lambda pair: pair[0])
    .collect()
)


#matrix interactions with global effect creation
# Collect the triples once and split them locally. Three separate collect() calls
# would run three Spark jobs over an RDD that is not cached, and would rely on all
# three returning the rows in the same order.
interactions = train_clean_data.collect()
users = [row[0] for row in interactions]
items = [row[1] for row in interactions]
ratings = [row[2] for row in interactions]
# Users x items matrix holding the raw ratings.
UxI_basic = sm.csr_matrix((ratings, (users, items)))

#remove global effect
global_mean = train_clean_data.map(lambda x: x[2]).mean()


def _mean_by_key(keyed_values):
    """Average the values of a (key, value) RDD per key."""
    return (keyed_values
            .mapValues(lambda value: (value, 1))
            .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
            .mapValues(lambda total: total[0] / total[1]))


# Item bias: mean of (rating - global mean) over each item's ratings.
trainr2 = train_clean_data.map(lambda x: (x[0], x[1], x[2] - global_mean))
temp = _mean_by_key(trainr2.map(lambda x: (x[1], x[2])))
i_Bias = temp.collectAsMap()

# User bias: mean of (rating - global mean - item bias) over each user's ratings.
trainr3 = trainr2.map(lambda x: (x[0], x[1], x[2] - i_Bias.get(x[1], 0)))
temp2 = _mean_by_key(trainr3.map(lambda x: (x[0], x[2])))
u_Bias = temp2.collectAsMap()

# NOTE(review): only the item and user biases are subtracted here, global_mean is
# not, although both biases were estimated after removing it. Confirm whether
# leaving the global mean in the ratings is intentional.
train_clean_data = train_clean_data.map(lambda x: (x[0], x[1], x[2] - i_Bias.get(x[1],0) - u_Bias.get(x[0],0)))

#matrix interactions without global effect creation
# Collect the bias-corrected triples once and split them locally rather than
# running three separate Spark jobs over the same uncached RDD.
interactions = train_clean_data.collect()
users = [row[0] for row in interactions]
items = [row[1] for row in interactions]
ratings = [row[2] for row in interactions]
# Users x items matrix holding the ratings after item and user bias removal.
UxI_global1 = sm.csr_matrix((ratings, (users, items)))

#matrix icm creations
# One extra (item 37142, feature 0) entry with weight 1.0 is added to the collected
# pairs.
# NOTE(review): presumably this pads the matrix so that it has at least 37143 rows
# and a non-empty column 0 - confirm, since it also records a feature for that item.
items_for_features = icm_clean_data.map(lambda pair: pair[0]).collect() + [37142]
features = icm_clean_data.map(lambda pair: pair[1]).collect() + [0]
ones = [1.0] * len(items_for_features)
# Items x features matrix; duplicate (item, feature) pairs are summed by the constructor.
IxF = sm.csr_matrix((ones, (items_for_features, features)))

In [3]:
# Matrix dimensions used by the following cells.
n_users = UxI_basic.shape[0]
n_items = UxI_basic.shape[1]
n_features = IxF.shape[1]

### From this point on we no longer use RDDs, because matrix computations on them required too many resources.

# SVD

In [4]:
'''matrix factorization -  TUNE PARAMETER HERE'''
K = 800
%time U, S, Vt = sparsesvd(UxI_global1.tocsc(), K)
S=np.sqrt(S.data)
S=np.diag(S)

CPU times: user 1min 27s, sys: 156 ms, total: 1min 27s
Wall time: 1min 27s


In [5]:
'''calc predictions'''
# Multiply the SVD factors back together (U transposed, the diagonal matrix S built
# from the square-rooted singular values, and Vt) and store the product as CSR.
# NOTE(review): if U and Vt are dense arrays the product is dense too, so the CSR
# wrapper probably saves no memory - confirm.
%time UxI_pred_SVD=sm.csr_matrix(U.T.dot(S).dot(Vt))

CPU times: user 47.5 s, sys: 5.79 s, total: 53.3 s
Wall time: 27.2 s


# Content Based

In [6]:
IDF=[0]*n_features
for i in tqdm(range(n_features)):
    IDF[i]=np.log10(n_items/len(IxF.getcol(i).nonzero()[1]))
%time IxF=normalize(IxF,axis=1) #apply TF
%time IxF_idf=IxF.multiply(IDF) #apply IDF
%time UxF=UxI_global1.dot(IxF_idf) #calc users profiles


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 2.6 ms
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 5.28 ms
CPU times: user 52 ms, sys: 4 ms, total: 56 ms
Wall time: 56.9 ms


In [7]:
'''calc predictions'''
# Content-based scores: user profiles (UxF) times the transposed item-feature matrix.
# IxF at this point is the row-normalized matrix; the IDF-weighted copy is IxF_idf.
%time UxI_pred_CB=UxF.dot(IxF.T)
# Drop the name so the matrix can be freed.
del IxF

CPU times: user 8.09 s, sys: 948 ms, total: 9.04 s
Wall time: 9.03 s


# Item Based

In [8]:
'''calc item similarities based on features'''
# Item-item similarity: dot products between the IDF-weighted feature rows.
%time IxI_sim_f=IxF_idf.dot(IxF_idf.T)
# Zero the diagonal so that an item is not counted as similar to itself.
IxI_sim_f.setdiag(0)

CPU times: user 6.5 s, sys: 1.5 s, total: 8 s
Wall time: 8 s




In [9]:
'''calc item similarities based on collaborative'''
%time UxI_glob_unbiased=UxI_global1.copy().tolil()
bias=sm.lil_matrix((n_users,n_items))
for usr in tqdm(range(n_users)):
    bias[usr,UxI_global1.getrow(usr).nonzero()]=(UxI_global1[usr,:].data.mean())
UxI_glob_unbiased-=bias
UxI_glob_unbiased=UxI_glob_unbiased.tocsr()
%time IxI_sim_c=UxI_glob_unbiased.T.dot(UxI_glob_unbiased) #"pearson" similarity without denominator
IxI_sim_c.setdiag(0)
del bias

CPU times: user 76 ms, sys: 4 ms, total: 80 ms
Wall time: 80.1 ms


  """
  ret = ret.dtype.type(ret / rcount)



CPU times: user 416 ms, sys: 52 ms, total: 468 ms
Wall time: 466 ms




In [10]:
'''merge item similarities -  TUNE PARAMETER HERE'''
%time IxI_sim = IxI_sim_f + IxI_sim_c*0.0000005
del IxI_sim_f
del IxI_sim_c

CPU times: user 6.7 s, sys: 1.64 s, total: 8.34 s
Wall time: 8.34 s


In [11]:
'''take knn items -  TUNE PARAMETER HERE'''
# Number of neighbours kept for every item.
k = 200
IxI_sim_knn = sm.lil_matrix((n_items, n_items))
for item in tqdm(range(n_items)):
    # Densify one row and take the positions of its k largest similarities
    # (argpartition leaves them unordered in the last k slots).
    dense_row = IxI_sim.getrow(item).toarray()[0]
    top_k_idx = dense_row.argpartition(-k)[-k:]
    IxI_sim_knn[item, top_k_idx] = IxI_sim[item, top_k_idx]
del IxI_sim




In [12]:
'''calc predictions'''
# Item-based scores: the bias-corrected ratings times the transposed top-k item
# similarity matrix.
%time UxI_pred_CI=UxI_global1.dot(IxI_sim_knn.T)
# Drop the name so the neighbour matrix can be freed.
del IxI_sim_knn

CPU times: user 3.46 s, sys: 344 ms, total: 3.8 s
Wall time: 3.8 s


# User Based

In [13]:
'''calc user similarities based on profiles'''
%time UxU_sim_p=UxF.dot(UxF.T) #numerators of cosine
%time cos=sm.csr_matrix(cosine_similarity(UxF)) #cosine
cos.data=1/cos.data
denominators=UxU_sim_p.multiply(cos) #get denominators of cosine
del cos
denominators.data+=1 #add shrinkage to cosine denominator
denominators.data=1/denominators.data
%time UxU_sim_p=UxU_sim_p.multiply(denominators) #calc shrinked cosine
del denominators
UxU_sim_p.setdiag(0)

CPU times: user 5.75 s, sys: 536 ms, total: 6.28 s
Wall time: 6.29 s
CPU times: user 14.5 s, sys: 2.95 s, total: 17.5 s
Wall time: 17.5 s
CPU times: user 2.34 s, sys: 540 ms, total: 2.88 s
Wall time: 2.88 s




In [14]:
'''calc user similarities based on collaborative'''
# User-user dot products over the mean-centred ratings.
%time UxU_sim_c=UxI_glob_unbiased.dot(UxI_glob_unbiased.T) #pearson similarity only numerator
# Zero the diagonal so that a user is not counted as similar to itself.
UxU_sim_c.setdiag(0)
# Drop the name so the centred matrix can be freed.
del UxI_glob_unbiased

CPU times: user 1.42 s, sys: 564 ms, total: 1.99 s
Wall time: 1.99 s




In [15]:
'''merge user similarities -  TUNE PARAMETER HERE'''
%time UxU_sim = UxU_sim_p + UxU_sim_c*0.0000005
del UxU_sim_p
del UxU_sim_c

CPU times: user 2.94 s, sys: 1.15 s, total: 4.09 s
Wall time: 4.09 s


In [16]:
'''take knn users -  TUNE PARAMETER HERE'''
# Number of neighbours kept for every user.
k = 100
UxU_sim_knn = sm.lil_matrix((n_users, n_users))
for user_row in tqdm(range(n_users)):
    # Densify one row and take the positions of its k largest similarities
    # (argpartition leaves them unordered in the last k slots).
    dense_row = UxU_sim.getrow(user_row).toarray()[0]
    top_k_idx = dense_row.argpartition(-k)[-k:]
    UxU_sim_knn[user_row, top_k_idx] = UxU_sim[user_row, top_k_idx]
del UxU_sim




In [17]:
'''calc_predictions'''
# User-based scores: top-k user similarities times the bias-corrected ratings.
%time UxI_pred_CU=UxU_sim_knn.dot(UxI_global1)

CPU times: user 2.82 s, sys: 176 ms, total: 3 s
Wall time: 3 s


In [18]:
# Release the matrices that were only needed to build the four prediction matrices.
del UxU_sim_knn, UxI_basic, UxI_global1, UxF, IxF_idf

### Removal of already voted items

In [19]:
#create matrix for test users predictions
# Four independent, empty users x items LIL matrices, one per algorithm.
UxI_pred_CB_test, UxI_pred_CI_test, UxI_pred_CU_test, UxI_pred_SVD_test = (
    sm.lil_matrix((n_users, n_items)) for _ in range(4)
)

In [20]:
#take only test users predictions
# Copy the rows of the target users from each full prediction matrix into the
# matching LIL matrix; all other rows stay empty.
%time UxI_pred_CB_test[test_users,:]=UxI_pred_CB[test_users,:]
%time UxI_pred_CI_test[test_users,:]=UxI_pred_CI[test_users,:]
%time UxI_pred_CU_test[test_users,:]=UxI_pred_CU[test_users,:]
%time UxI_pred_SVD_test[test_users,:]=UxI_pred_SVD[test_users,:]

CPU times: user 21.1 s, sys: 3.32 s, total: 24.4 s
Wall time: 24.4 s
CPU times: user 11 s, sys: 328 ms, total: 11.3 s
Wall time: 11.4 s
CPU times: user 13.6 s, sys: 924 ms, total: 14.5 s
Wall time: 14.5 s
CPU times: user 21.7 s, sys: 4.19 s, total: 25.9 s
Wall time: 25.9 s


In [21]:
#remove items already voted
test_prediction_matrices = (
    UxI_pred_CB_test,
    UxI_pred_CI_test,
    UxI_pred_CU_test,
    UxI_pred_SVD_test,
)
for user in tqdm(test_users):
    # A target user with no training interaction has no entry in the dict (or an
    # empty one): there is nothing to mask, and indexing would raise KeyError.
    already_rated = grouped_rates_dic.get(user)
    if not already_rated:
        continue
    # Zero the score of every item the user has already rated, in all four matrices.
    for predictions in test_prediction_matrices:
        predictions[user, already_rated] = 0




In [22]:
#convert to csr to improve future operations performances
# The LIL format was used for the row and element assignments above; the cells
# below only add, scale and read rows.
%time UxI_pred_CB_test=UxI_pred_CB_test.tocsr()
%time UxI_pred_CI_test=UxI_pred_CI_test.tocsr()
%time UxI_pred_CU_test=UxI_pred_CU_test.tocsr()
%time UxI_pred_SVD_test=UxI_pred_SVD_test.tocsr()

CPU times: user 16.2 s, sys: 908 ms, total: 17.1 s
Wall time: 17.1 s
CPU times: user 592 ms, sys: 20 ms, total: 612 ms
Wall time: 613 ms
CPU times: user 3.04 s, sys: 44 ms, total: 3.08 s
Wall time: 3.08 s
CPU times: user 21.5 s, sys: 1.12 s, total: 22.6 s
Wall time: 22.6 s


In [23]:
# The full prediction matrices are no longer needed once the target users' rows
# have been copied out.
del UxI_pred_CB, UxI_pred_CI, UxI_pred_CU, UxI_pred_SVD

### Algorithms adjusted with SVD scores in order to break ties between equal scores, which would otherwise be ordered arbitrarily

In [24]:
# Content-based scores plus the SVD scores weighted by 180.0.
# NOTE(review): this weight is many orders of magnitude larger than the 0.000001
# used for the item-based and user-based blends - confirm that it is intentional.
%time UxI_pred_CB_SVD = UxI_pred_CB_test + UxI_pred_SVD_test*180.0

CPU times: user 1.31 s, sys: 348 ms, total: 1.66 s
Wall time: 1.66 s


In [25]:
# Item-based scores plus the SVD scores weighted by 0.000001.
%time UxI_pred_CI_SVD = UxI_pred_CI_test + UxI_pred_SVD_test*0.000001

CPU times: user 840 ms, sys: 216 ms, total: 1.06 s
Wall time: 1.05 s


In [26]:
# User-based scores plus the SVD scores weighted by 0.000001.
%time UxI_pred_CU_SVD = UxI_pred_CU_test + UxI_pred_SVD_test*0.000001

CPU times: user 948 ms, sys: 324 ms, total: 1.27 s
Wall time: 1.27 s


In [27]:
# Only the three blended matrices are used from here on.
del UxI_pred_CB_test, UxI_pred_CI_test, UxI_pred_CU_test, UxI_pred_SVD_test

In [28]:
'''TUNE PARAMETERS HERE'''
def bordaLikeAggr(rank1, rank2, rank3, weights=(6, 6, 0.1), depth=150):
    """Merge three score vectors into one with a Borda-like positional vote.

    Each ranking is walked from its best item downwards for `depth` positions.
    An item found at position p (0-based) with a strictly positive score adds
    ``weight / (p + 1)`` to its total, where `weight` is the ranking's entry in
    `weights`. Items whose score is not positive contribute nothing.

    Parameters
    ----------
    rank1, rank2, rank3 : 1-D array-like of scores, one entry per item, all of
        the same length. They are copied, so the caller's arrays are not changed.
    weights : three numbers, the vote weight of rank1, rank2 and rank3.
    depth : number of top positions taken from every ranking.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (1, len(rank1)) with the summed votes.
    """
    # Work on float copies: entries are overwritten while walking the rankings.
    rankings = [np.array(rank, dtype=float) for rank in (rank1, rank2, rank3)]
    result = [0] * len(rankings[0])
    for position in range(depth):
        for ranking, weight in zip(rankings, weights):
            best = ranking.argmax()
            if ranking[best] > 0.0:
                result[best] += float(weight) / (position + 1)
            # Remove the item from the ranking it came from. Previously the third
            # ranking's pick was removed from the second ranking instead, so the
            # same item was voted for at every position.
            ranking[best] = -np.inf

    return sm.csr_matrix(result)

In [29]:
def _pick_top_items(scores, seen, popular_items, how_many=5):
    """Choose `how_many` item ids for one user.

    Items are taken in decreasing order of `scores` while a strictly positive
    score is left; after that the list is filled from `popular_items` in order.
    Items in `seen`, items already chosen and item id 0 are never returned
    (id 0 was used as the "no prediction" placeholder and was never emitted).
    NOTE(review): this assumes 0 is not a valid item id - confirm.

    Raises IndexError if `popular_items` runs out before the list is full.
    """
    remaining = np.array(scores, dtype=float)  # private copy, entries get masked
    chosen = []
    next_popular = 0
    while len(chosen) < how_many:
        best = int(remaining.argmax())
        if remaining[best] > 0:
            # Consume this entry whether or not it is usable, so that an unusable
            # item (already rated, or id 0) cannot block the ones ranked after it.
            remaining[best] = -np.inf
            if best != 0 and best not in seen:
                chosen.append(best)
            continue
        # No positive prediction left: fall back to the globally best-rated items.
        candidate = popular_items[next_popular]
        next_popular += 1
        if candidate != 0 and candidate not in seen and candidate not in chosen:
            chosen.append(candidate)
    return chosen


# The with-block closes the file even if a user fails half-way through.
with open('submission.csv', 'wt') as f:
    writer = csv.writer(f)
    writer.writerow(('userId','RecommendedItemIds'))
    for user in tqdm(test_users):
        user_predictions = bordaLikeAggr(
            UxI_pred_CI_SVD.getrow(user).toarray()[0],
            UxI_pred_CB_SVD.getrow(user).toarray()[0],
            UxI_pred_CU_SVD.getrow(user).toarray()[0])
        # Users without training interactions simply have nothing to exclude.
        already_rated = set(grouped_rates_dic.get(user, ()))
        top = _pick_top_items(user_predictions.toarray()[0], already_rated, item_ratings_mean)
        writer.writerow((user, ' '.join(str(item) for item in top)))


