In [1]:
from pyspark import SparkContext
from scipy import sparse as sm
from sklearn.preprocessing import normalize
import numpy as np
import csv
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from scipy.stats import spearmanr
from scipy.stats import pearsonr as pears
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
import time
from sparsesvd import sparsesvd
import math as mt
sc = SparkContext.getOrCreate()

In [2]:
def computeSVD(urm, K):
    """Factorise ``urm`` with ``sparsesvd`` and return the factors as sparse matrices.

    Parameters
    ----------
    urm : matrix handed straight to ``sparsesvd`` (the caller passes a CSC matrix).
    K : number of singular values/vectors requested from ``sparsesvd``.

    Returns
    -------
    (U, S, Vt) : three float32 CSR matrices.
        ``U`` is the transpose of the first factor returned by ``sparsesvd``,
        ``S`` is a diagonal matrix holding the *square roots* of the singular
        values, and ``Vt`` is the third factor unchanged.
    """
    left_t, sigma, right_t = sparsesvd(urm, K)

    # Diagonal matrix of sqrt(singular value); mt.sqrt is kept so a negative
    # value still raises ValueError exactly as before.
    root_sigma = np.diag([mt.sqrt(value) for value in sigma]).astype(np.float32)

    return (
        sm.csr_matrix(np.transpose(left_t), dtype=np.float32),
        sm.csr_matrix(root_sigma, dtype=np.float32),
        sm.csr_matrix(right_t, dtype=np.float32),
    )

In [3]:
# Load the three CSV inputs as Spark RDDs of raw text lines.
train_rdd = sc.textFile("data/train.csv")
icm_rdd = sc.textFile("data/icm_fede.csv")
test_rdd= sc.textFile("data/target_users.csv")

# The first line of each file is treated as its header and filtered out below.
train_header = train_rdd.first()
icm_header = icm_rdd.first()
test_header= test_rdd.first()

# train rows -> (int, int, float), used below as (user, item, rating);
# icm rows -> (int, int), used below as (item, feature).
train_clean_data = train_rdd.filter(lambda x: x != train_header).map(lambda line: line.split(',')).map(lambda x: (int(x[0]), int(x[1]), float(x[2])))
icm_clean_data = icm_rdd.filter(lambda x: x != icm_header).map(lambda line: line.split(',')).map(lambda x: (int(x[0]), int(x[1])))
test_clean_data= test_rdd.filter(lambda x: x != test_header).map(lambda line: line.split(','))

test_users=test_clean_data.map( lambda x: int(x[0])).collect()
# Set copy used only for membership tests inside the Spark filter: a list
# lookup there is a linear scan per training row. test_users itself stays a
# list because later cells iterate it in file order.
test_users_set = set(test_users)


# For every target user, the list of items that user rated in the training data.
grouped_rates = train_clean_data.filter(lambda x: x[0] in test_users_set).map(lambda x: (x[0],x[1])).groupByKey().map(lambda x: (x[0], list(x[1]))).collect()
grouped_rates_dic = dict(grouped_rates)


# (sum of ratings, number of ratings) keyed by x[0], i.e. per *user* despite
# the variable name; the mean per user follows.
item_ratings = train_clean_data.map(lambda x: (x[0], x[2])).aggregateByKey((0,0), lambda x,y: (x[0] + y, x[1] + 1),lambda x,y: (x[0] + y[0], x[1] + y[1]))
user_ratings_mean = item_ratings.mapValues(lambda x: (x[0] / (x[1]))).collect()
user_ratings_mean_dic=dict(user_ratings_mean)


# (sum of ratings, number of ratings) per item, then a shrunk mean
# sum / (count + shrinkage_factor). Only the item ids are kept, ordered from
# the highest shrunk mean to the lowest.
item_ratings_forTop = train_clean_data.map(lambda x: (x[1], x[2])).aggregateByKey((0,0), lambda x,y: (x[0] + y, x[1] + 1),lambda x,y: (x[0] + y[0], x[1] + y[1]))#.sortBy(lambda x: x[1][1], ascending=False)
shrinkage_factor = 5
item_ratings_mean = item_ratings_forTop.mapValues(lambda x: (x[0] / (x[1] + shrinkage_factor))).sortBy(lambda x: x[1], ascending = False).map(lambda x: x[0]).collect()


# Parallel coordinate lists for the user x item rating matrix.
users = train_clean_data.map(lambda x: x[0]).collect()
items = train_clean_data.map(lambda x: x[1]).collect()
ratings = train_clean_data.map(lambda x: x[2]).collect()
#ratings_unbiased = train_clean_data.map(lambda x: x[2]-user_ratings_mean_dic[x[0]]).collect()

# Parallel coordinate lists for the item x feature matrix.
items_for_features= icm_clean_data.map(lambda x:x[0]).collect()
features = icm_clean_data.map(lambda x:x[1]).collect()
# NOTE(review): adds one extra (item 37142, feature 0) entry. Presumably this
# forces IxF to have at least 37143 rows so its row count lines up with the
# columns of UxI -- TODO confirm. It also marks feature 0 as present for that item.
items_for_features.append(37142)
features.append(0)


# Every (item, feature) pair gets the value 1.
unos=[1]*len(items_for_features)

# Matrix shapes are inferred from the largest index in each coordinate list.
UxI= sm.csr_matrix((ratings, (users, items)))
#UxI_unbiased= sm.csr_matrix((ratings_unbiased, (users, items)))
IxF= sm.csr_matrix((unos, (items_for_features, features)))

In [4]:
# Matrix dimensions reused by the cells that follow.
n_users = UxI.shape[0]
n_items = UxI.shape[1]
n_features = IxF.shape[1]

In [5]:
'''SVD'''
# Number of singular values/vectors requested from computeSVD.
K = 870
# computeSVD is given the CSC form of the user x item matrix; %time reports the cost.
%time U, S, Vt = computeSVD(UxI.tocsc(), K)


CPU times: user 1min 44s, sys: 560 ms, total: 1min 45s
Wall time: 1min 44s


In [None]:
#calc predictions
# NOTE(review): this forms only the partial product U.S. The following cell
# assigns UxI_pred_SVD again from the full U.S.Vt product, so this value is
# overwritten when the notebook is run top to bottom.
%time UxI_pred_SVD=U.dot(S)

In [7]:
%time UxI_pred_SVD = sm.lil_matrix(np.dot(np.dot(U, S), Vt))

CPU times: user 19min 42s, sys: 13.3 s, total: 19min 55s
Wall time: 19min 55s


In [8]:
# Convert the SVD score matrix to CSR format; the later cells call getrow()
# and .T.dot() on it.
%time UxI_pred_SVD=UxI_pred_SVD.tocsr()

CPU times: user 1min 42s, sys: 4.43 s, total: 1min 47s
Wall time: 1min 46s


In [9]:
# content based
# Number of items carrying each feature = stored entries per column of IxF
# (every stored value is a positive count, so this matches a nonzero count).
feature_counts = np.asarray(IxF.getnnz(axis=0), dtype=np.float64)
# IDF = log10(n_items / items_with_feature). A feature id that no item uses
# keeps an IDF of 0 instead of triggering a division by zero.
IDF = np.zeros(n_features, dtype=np.float64)
used_features = feature_counts > 0
IDF[used_features] = np.log10(n_items / feature_counts[used_features])
# Normalise each item's feature row, then weight every feature column by its IDF.
IxF=normalize(IxF,axis=1)
IxF_idf=IxF.multiply(IDF)
# User profiles in feature space: ratings times IDF-weighted item features.
UxF=UxI.dot(IxF_idf)
# Content-based score of every item for every user.
UxI_pred_CB=UxF.dot(IxF.T)






In [10]:
# Drop the normalised item x feature matrix; the cells below use IxF_idf only.
del IxF

In [11]:
'''collaborative filtering item based via content'''
#calc similarities
# Item-item similarity: dot products between the IDF-weighted feature rows.
IxI_sim=IxF_idf.dot(IxF_idf.T)
# The string literal below is disabled code and is never executed. It holds an
# alternative similarity that adds 2 to the cosine denominator before dividing.
'''cos=sm.csr_matrix(cosine_similarity(IxF_idf))#
cos.data=1/cos.data
denominators=IxI_sim.multiply(cos)
del cos
denominators.data+=2
denominators.data=1/denominators.data
IxI_sim=IxI_sim.multiply(denominators)
del denominators'''
# Zero the diagonal so an item's similarity with itself contributes nothing.
IxI_sim.setdiag(0)



In [12]:
# Keep, for every item, only its k largest similarities.
k=200
IxI_sim_knn=sm.lil_matrix((n_items,n_items))
for item_idx in tqdm(range(n_items)):
    dense_row = IxI_sim.getrow(item_idx).toarray()[0]
    # argpartition(-k) places the k largest values in the last k slots (unordered).
    neighbour_idx = dense_row.argpartition(-k)[-k:]
    IxI_sim_knn[item_idx,neighbour_idx]=IxI_sim[item_idx,neighbour_idx]




In [13]:
# Drop the full item-item similarity matrix; only the k-NN version is used below.
del IxI_sim 

In [14]:
#calc predictions
# Item-based scores: user ratings times the transposed k-NN item similarity matrix.
%time UxI_pred_CI=UxI.dot(IxI_sim_knn.T) #k=200->582

CPU times: user 3.62 s, sys: 176 ms, total: 3.8 s
Wall time: 3.8 s


In [15]:
# Drop the k-NN item similarity matrix now that UxI_pred_CI has been computed.
del IxI_sim_knn

In [16]:
'''collaborative filtering user based via content'''
#calc similarities
# Dot products between the users' feature-space profiles (rows of UxF).
%time UxU_sim=UxF.dot(UxF.T) #numerators of cosine

CPU times: user 5.83 s, sys: 304 ms, total: 6.14 s
Wall time: 6.13 s


In [17]:
# Cosine similarity between user profiles, kept sparse. With the default
# dense_output=True, cosine_similarity materialises a dense n_users x n_users
# array before it is converted back to CSR.
cos=sm.csr_matrix(cosine_similarity(UxF, dense_output=False)) #cosine
# Drop stored zeros so the sparsity pattern equals that of the dense -> CSR
# conversion and the reciprocal below never divides by zero.
cos.eliminate_zeros()
cos.data=1/cos.data
# numerator * (1 / cosine) = product of the two norms, i.e. the cosine denominator.
denominators=UxU_sim.multiply(cos) #get denominators of cosine
del cos

In [18]:
# Shrunk cosine: numerator / (denominator + 1). The stored denominators are
# replaced by the reciprocal of (denominator + shrinkage) and then multiplied
# element-wise into the numerators.
denominators.data = 1/(denominators.data + 1)
UxU_sim = UxU_sim.multiply(denominators)
del denominators

In [19]:
# Zero the diagonal so a user's similarity with themselves contributes nothing.
UxU_sim.setdiag(0)



In [20]:
# Keep, for every user, only their k largest similarities.
k=50
UxU_sim_knn=sm.lil_matrix((n_users,n_users))
for user_idx in tqdm(range(n_users)):
    dense_row = UxU_sim.getrow(user_idx).toarray()[0]
    # argpartition(-k) places the k largest values in the last k slots (unordered).
    neighbour_idx = dense_row.argpartition(-k)[-k:]
    UxU_sim_knn[user_idx,neighbour_idx]=UxU_sim[user_idx,neighbour_idx]




In [21]:
# Drop the full user-user similarity matrix; only the k-NN version is used below.
del UxU_sim

In [22]:
#calc_predictions
# User-based scores: k-NN user similarity matrix times the user x item ratings.
%time UxI_pred_CU=UxU_sim_knn.dot(UxI) #k=75->336 k=50->382 k=30->378

CPU times: user 1.42 s, sys: 0 ns, total: 1.42 s
Wall time: 1.42 s


In [23]:
# Drop the intermediates; the cells below work on the UxI_pred_* score matrices.
del UxU_sim_knn
del UxI
del UxF
del IxF_idf

In [24]:
#remove already voted
# Zero the score of every item a target user has already rated, in all four
# prediction matrices.
for user in tqdm(test_users):
    # grouped_rates_dic only has entries for target users that appear in the
    # training data; a user without ratings has nothing to mask.
    rated_items = grouped_rates_dic.get(user)
    if not rated_items:
        continue
    UxI_pred_CB[user,rated_items]=0
    UxI_pred_CI[user,rated_items]=0
    UxI_pred_CU[user,rated_items]=0
    UxI_pred_SVD[user,rated_items]=0






In [31]:
# Rescale the content-based scores so every user row has a maximum of 1.
# Rows whose maximum is 0 get a scale factor of 0.
row_scale = []
for user_idx in tqdm(range(n_users)):
    row_peak = UxI_pred_CB.getrow(user_idx).max()
    row_scale.append(1/row_peak if row_peak != 0 else row_peak)
diagonal_idx = list(range(n_users))
# Diagonal matrix of per-row factors; X.T.dot(diag).T multiplies row u of X by factor u.
diag = sm.csr_matrix((row_scale, (diagonal_idx, diagonal_idx)))
UxI_pred_CB = UxI_pred_CB.T.dot(diag).T




In [32]:
# Rescale the item-based CF scores so every user row has a maximum of 1.
# Rows whose maximum is 0 get a scale factor of 0.
row_scale = []
for user_idx in tqdm(range(n_users)):
    row_peak = UxI_pred_CI.getrow(user_idx).max()
    row_scale.append(1/row_peak if row_peak != 0 else row_peak)
diagonal_idx = list(range(n_users))
# Diagonal matrix of per-row factors; X.T.dot(diag).T multiplies row u of X by factor u.
diag = sm.csr_matrix((row_scale, (diagonal_idx, diagonal_idx)))
UxI_pred_CI = UxI_pred_CI.T.dot(diag).T




In [34]:
# Rescale the user-based CF scores so every user row has a maximum of 1.
# Rows whose maximum is 0 get a scale factor of 0.
row_scale = []
for user_idx in tqdm(range(n_users)):
    row_peak = UxI_pred_CU.getrow(user_idx).max()
    row_scale.append(1/row_peak if row_peak != 0 else row_peak)
diagonal_idx = list(range(n_users))
# Diagonal matrix of per-row factors; X.T.dot(diag).T multiplies row u of X by factor u.
diag = sm.csr_matrix((row_scale, (diagonal_idx, diagonal_idx)))
UxI_pred_CU = UxI_pred_CU.T.dot(diag).T




In [35]:
# Rescale the SVD scores so every user row is divided by its maximum.
# Rows whose maximum is 0 get a scale factor of 0.
row_scale = []
for user_idx in tqdm(range(n_users)):
    row_peak = UxI_pred_SVD.getrow(user_idx).max()
    row_scale.append(1/row_peak if row_peak != 0 else row_peak)
diagonal_idx = list(range(n_users))
# Diagonal matrix of per-row factors; X.T.dot(diag).T multiplies row u of X by factor u.
diag = sm.csr_matrix((row_scale, (diagonal_idx, diagonal_idx)))
UxI_pred_SVD = UxI_pred_SVD.T.dot(diag).T




In [40]:
# Hybrid score: 75% content-based plus 25% SVD (both rescaled per user above).
UxI_pred = UxI_pred_CB * 0.75 + UxI_pred_SVD * 0.25

In [41]:
# Write the five recommended items per target user for the CB/SVD blend.
# The context manager closes the file even if a row fails part-way through.
# (Alternative output name kept from the original: 'submission_SVD.csv')
with open('submission_CB-SVD_075-025.csv', 'wt') as f:
    writer = csv.writer(f)
    writer.writerow(('userId','RecommendedItemIds'))

    for user in tqdm(test_users):
        # Items this user already rated; empty for users absent from the
        # training data. A set gives constant-time membership tests.
        already_rated = set(grouped_rates_dic.get(user, ()))
        top=[0,0,0,0,0]

        user_predictions=UxI_pred.getrow(user)
        # Cursor into item_ratings_mean, the fallback list of item ids ordered
        # by shrunk mean rating.
        iterator = 0
        for i in range(5):
            prediction = user_predictions.argmax()
            # Push already-rated items below every other score and look again.
            while prediction in already_rated and prediction != 0:
                user_predictions[0,prediction]=-9
                prediction=user_predictions.argmax()
            if prediction == 0:
                # Index 0 is treated as "no usable prediction left": take the
                # next fallback item the user has not rated and that is not
                # already in this user's list.
                prediction = item_ratings_mean[iterator]
                while prediction in already_rated or prediction in top:
                    iterator += 1
                    prediction = item_ratings_mean[iterator]
                iterator += 1
            else:
                # Mark the chosen item so the next argmax returns a different one.
                user_predictions[0,prediction]=-9
            top[i]=prediction
        writer.writerow((user, '{0} {1} {2} {3} {4}'.format(top[0], top[1], top[2], top[3], top[4])))




In [None]:
def bordaLikeAggr(rank1, rank2, rank3, weights=(0.40, 0.35, 0.25), depth=150):
    """Merge three score vectors with a weighted, position-based vote.

    For each position ``p`` in ``0 .. depth-1`` the current best item of every
    score vector is taken; if its score is positive it receives
    ``weight / (p + 1)`` votes. The item is then removed from that vector so
    the next position sees the next-best item.

    Parameters
    ----------
    rank1, rank2, rank3 : 1-D numeric arrays of per-item scores, indexed by item.
        They are copied first, so the caller's arrays are left untouched.
    weights : three vote weights, applied to rank1, rank2 and rank3 in order.
    depth : number of top positions taken from every score vector.

    Returns
    -------
    scipy CSR matrix built from the list of ``n_items`` accumulated votes
    (``n_items`` is read from the notebook's global namespace).
    """
    # Work on private copies: candidates are consumed by overwriting their score.
    working = [np.array(scores, copy=True) for scores in (rank1, rank2, rank3)]
    result = [0] * n_items
    for position in range(depth):
        for scores, weight in zip(working, weights):
            best = scores.argmax()
            # Only items with a strictly positive score earn votes.
            if scores[best] > 0.0:
                result[best] += weight / (position + 1)
            # -9 pushes the consumed item below every remaining candidate.
            scores[best] = -9

    return sm.csr_matrix(result)

In [None]:
# Write the five recommended items per target user for the Borda-like
# aggregation of the item-based, content-based and user-based scores.
# The context manager closes the file even if a row fails part-way through.
with open('submission_bordalike_40_35_25.csv', 'wt') as f:
    writer = csv.writer(f)
    writer.writerow(('userId','RecommendedItemIds'))
    for user in tqdm(test_users):
        # Items this user already rated; empty for users absent from the
        # training data. A set gives constant-time membership tests.
        already_rated = set(grouped_rates_dic.get(user, ()))
        top=[0,0,0,0,0]

        user_predictions=bordaLikeAggr(UxI_pred_CI.getrow(user).toarray()[0],UxI_pred_CB.getrow(user).toarray()[0],UxI_pred_CU.getrow(user).toarray()[0])
        # Cursor into item_ratings_mean, the fallback list of item ids ordered
        # by shrunk mean rating.
        iterator = 0
        for i in range(5):
            prediction = user_predictions.argmax()
            # Push already-rated items below every other score and look again.
            while prediction in already_rated and prediction != 0:
                user_predictions[0,prediction]=-9
                prediction=user_predictions.argmax()
            if prediction == 0:
                # Index 0 is treated as "no usable prediction left": take the
                # next fallback item the user has not rated and that is not
                # already in this user's list.
                prediction = item_ratings_mean[iterator]
                while prediction in already_rated or prediction in top:
                    iterator += 1
                    prediction = item_ratings_mean[iterator]
                iterator += 1
            else:
                # Mark the chosen item so the next argmax returns a different one.
                user_predictions[0,prediction]=-9
            top[i]=prediction
        writer.writerow((user, '{0} {1} {2} {3} {4}'.format(top[0], top[1], top[2], top[3], top[4])))