In [1]:
from pyspark import SparkContext
from scipy import sparse as sm
from sklearn.preprocessing import normalize
import numpy as np
import csv
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from scipy.stats import spearmanr
from scipy.stats import pearsonr as pears
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
import time
# Reuse the already-running SparkContext if there is one, otherwise start a new one,
# so that re-running this cell does not fail with "context already exists".
sc = SparkContext.getOrCreate()

In [2]:
# --- Raw inputs ---------------------------------------------------------------
# Each CSV is read as an RDD of text lines; the first line of every file is a
# header and is filtered out before parsing.
train_rdd = sc.textFile("data/train.csv")
icm_rdd = sc.textFile("data/icm_fede.csv")
test_rdd = sc.textFile("data/target_users.csv")

train_header = train_rdd.first()
icm_header = icm_rdd.first()
test_header = test_rdd.first()

# Training rows are parsed to (user, item, rating). The RDD is cached because
# several independent actions below would otherwise re-read and re-parse the file.
train_clean_data = (
    train_rdd.filter(lambda x: x != train_header)
    .map(lambda line: line.split(','))
    .map(lambda x: (int(x[0]), int(x[1]), float(x[2])))
    .cache()
)
# Item-content rows are parsed to (item, feature).
icm_clean_data = (
    icm_rdd.filter(lambda x: x != icm_header)
    .map(lambda line: line.split(','))
    .map(lambda x: (int(x[0]), int(x[1])))
)
test_clean_data = test_rdd.filter(lambda x: x != test_header).map(lambda line: line.split(','))

# Users we must produce recommendations for (kept as a list, in file order).
test_users = test_clean_data.map(lambda x: int(x[0])).collect()
# Set copy used only for membership tests inside the Spark filter: a list lookup
# would cost O(len(test_users)) for every training record.
test_users_set = frozenset(test_users)

# --- Items already rated by each target user ------------------------------------
# NOTE: target users with no training rating do not appear in this dictionary.
grouped_rates = (
    train_clean_data.filter(lambda x: x[0] in test_users_set)
    .map(lambda x: (x[0], x[1]))
    .groupByKey()
    .mapValues(list)
    .collect()
)
grouped_rates_dic = dict(grouped_rates)

# --- Mean rating per user ---------------------------------------------------------
# Despite its name, `item_ratings` is keyed by *user* (x[0]); each value is the
# running pair (sum of ratings, number of ratings).
item_ratings = train_clean_data.map(lambda x: (x[0], x[2])).aggregateByKey(
    (0, 0),
    lambda acc, rating: (acc[0] + rating, acc[1] + 1),
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)
user_ratings_mean = item_ratings.mapValues(lambda x: x[0] / x[1]).collect()
user_ratings_mean_dic = dict(user_ratings_mean)

# --- Items ranked by shrunk mean rating ---------------------------------------------
# Same (sum, count) aggregation, this time keyed by item (x[1]).
item_ratings_forTop = train_clean_data.map(lambda x: (x[1], x[2])).aggregateByKey(
    (0, 0),
    lambda acc, rating: (acc[0] + rating, acc[1] + 1),
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)
# Adding the shrinkage factor to the denominator pulls the mean of rarely rated
# items towards zero, so they do not dominate the ranking.
shrinkage_factor = 5
# Item ids only, ordered from highest to lowest shrunk mean.
item_ratings_mean = (
    item_ratings_forTop.mapValues(lambda x: x[0] / (x[1] + shrinkage_factor))
    .sortBy(lambda x: x[1], ascending=False)
    .map(lambda x: x[0])
    .collect()
)

# --- Coordinate lists for the sparse matrices -----------------------------------------
# Collect the training triples once and derive every column from the same list, so
# the four sequences are aligned by construction instead of relying on four separate
# Spark jobs returning rows in the same order.
train_rows = train_clean_data.collect()
users = [user for user, _, _ in train_rows]
items = [item for _, item, _ in train_rows]
ratings = [rating for _, _, rating in train_rows]
# Ratings with the user's own mean subtracted.
ratings_unbiased = [rating - user_ratings_mean_dic[user] for user, _, rating in train_rows]

icm_rows = icm_clean_data.collect()
items_for_features = [item for item, _ in icm_rows]
features = [feature for _, feature in icm_rows]
# NOTE(review): this appends an extra (item 37142, feature 0) entry. Presumably it is
# here to force IxF to have as many rows as UxI has columns; as a side effect item
# 37142 is marked as having feature 0. Passing an explicit `shape=` to csr_matrix
# would avoid the fake entry -- confirm before changing, later cells rely on it.
items_for_features.append(37142)
features.append(0)

# Every (item, feature) pair is stored with value 1.
unos = [1] * len(items_for_features)

UxI = sm.csr_matrix((ratings, (users, items)))
UxI_unbiased = sm.csr_matrix((ratings_unbiased, (users, items)))
IxF = sm.csr_matrix((unos, (items_for_features, features)))

In [3]:
# Matrix dimensions reused by the following cells.
n_users = UxI.shape[0]
n_items = UxI.shape[1]
n_features = IxF.shape[1]

In [4]:
# Inverse document frequency of every feature: log10(n_items / number of items that
# have the feature).
#
# The per-feature item counts are obtained in one vectorised pass over the non-zero
# coordinates of IxF instead of slicing one column at a time.
items_per_feature = np.bincount(IxF.nonzero()[1], minlength=n_features)

# Features that no item has would cause a division by zero; they keep weight 0, which
# is harmless because such a column contributes nothing to any product anyway.
IDF = np.zeros(n_features)
used_features = items_per_feature > 0
# float() forces true division regardless of the Python version in use.
IDF[used_features] = np.log10(float(n_items) / items_per_feature[used_features])




In [5]:
# Rescale every item row of IxF to unit L2 norm (this rebinds IxF to the
# normalised matrix).
IxF = normalize(IxF, norm="l2", axis=1)
# Weight each feature column by its IDF value (element-wise, broadcast over rows).
IxF_idf = IxF.multiply(IDF)
# User-by-feature matrix: the ratings matrix times the weighted item features.
UxF = UxI.dot(IxF_idf)



In [31]:
# Cosine similarity between the feature profiles of every pair of users.
# dense_output=False keeps the result sparse when UxF is sparse, so the full dense
# n_users x n_users array is never materialised just to be converted back.
UxU_sim = sm.csr_matrix(cosine_similarity(UxF, dense_output=False))
# A user must never be selected as their own nearest neighbour.
UxU_sim.setdiag(0)



In [30]:
top_rec = 5     # number of items to recommend to each user
threshold = 8   # minimum rating a neighbour must have given for an item to qualify


def recommend_from_neighbours(user, sim_matrix, ratings_matrix, seen_items, n_rec, min_rating):
    """Pick up to ``n_rec`` items for ``user`` from the users most similar to them.

    Neighbours are visited from most to least similar. From each neighbour, every
    item rated at least ``min_rating`` is taken (best rated first), skipping items
    the user has already rated and items already picked from a closer neighbour.
    The search stops after the first neighbour that brings the list to ``n_rec``
    items, or when every other user has been visited.

    Parameters
    ----------
    user : int
        Row index of the target user in both matrices.
    sim_matrix : scipy.sparse matrix, shape (n_users, n_users)
        User-user similarity. It is only read, never modified.
    ratings_matrix : scipy.sparse.csr_matrix, shape (n_users, n_items)
        User-item ratings.
    seen_items : iterable of int
        Items already rated by ``user``; they are never recommended.
    n_rec : int
        Maximum number of items to return.
    min_rating : float
        Minimum neighbour rating for an item to be eligible. Must be positive:
        only stored (non-zero) ratings are examined.

    Returns
    -------
    list of int
        At most ``n_rec`` distinct item ids, best recommendation first.
    """
    similarities = sim_matrix.getrow(user).toarray().ravel()
    excluded = set(seen_items)
    picked = []
    # mergesort is stable, so ties are broken deterministically by user id.
    for neighbour in np.argsort(-similarities, kind="mergesort"):
        if neighbour == user:
            continue
        neighbour_row = ratings_matrix.getrow(neighbour)
        liked = neighbour_row.data >= min_rating
        liked_items = neighbour_row.indices[liked]
        liked_ratings = neighbour_row.data[liked]
        for position in np.argsort(-liked_ratings, kind="mergesort"):
            item = int(liked_items[position])
            if item not in excluded:
                excluded.add(item)
                picked.append(item)
        if len(picked) >= n_rec:
            break
    return picked[:n_rec]


UxI_pred = sm.lil_matrix((n_users, n_items))
for user in tqdm(test_users):
    recommended = recommend_from_neighbours(
        user,
        UxU_sim,
        UxI,
        # Target users without any training rating are absent from the dictionary.
        grouped_rates_dic.get(user, ()),
        top_rec,
        threshold,
    )
    # NOTE(review): the original cell stopped at an unfinished `UxI_pred[user,]`.
    # A descending rank score (top_rec, top_rec - 1, ..., 1) is stored so that sorting
    # a row of UxI_pred reproduces the recommendation order -- confirm this is the
    # intended content of UxI_pred.
    for rank, item in enumerate(recommended):
        UxI_pred[user, item] = top_rec - rank




2396/|/ 57%|| 2396/4196 [01:11<00:53, 33.71it/s]

KeyboardInterrupt: 

          2589/|/ 62%|| 2589/4196 [01:31<00:56, 28.42it/s]

In [13]:
# Scratch check: item indices of user 4's row, read in reverse argsort order
# (highest stored rating first).
np.argsort(UxI.getrow(4).toarray().ravel())[::-1]

array([23217, 25656, 37142, ..., 24759, 24758,     0])

In [27]:
# Scratch variable: row 4 of the rating matrix, as returned by getrow.
pippo=UxI.getrow(4)


In [24]:
# Scratch check: set to 0, in place, the columns of the scratch row that are listed
# in grouped_rates_dic[4].
pippo[0,grouped_rates_dic[4]]=0

In [28]:
# Scratch check: (row, column) indices of the non-zero entries of the scratch row.
# NOTE(review): the execution counts of these scratch cells are out of order
# (27, 24, 28), so the output shown depends on which cells were run last.
pippo.nonzero()

(array([0, 0], dtype=int32), array([23217, 25656], dtype=int32))