In [1]:
from pyspark import SparkContext
from scipy import sparse as sm
from sklearn.preprocessing import normalize
import numpy as np
import csv
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from scipy.stats import spearmanr
from scipy.stats import pearsonr as pears
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
import time
# Obtain the SparkContext used by every RDD operation below
# (getOrCreate returns an existing context or instantiates a new one).
sc = SparkContext.getOrCreate()

In [2]:
# --- Load the raw CSV files as RDDs of text lines ---
train_rdd = sc.textFile("data/train.csv")
icm_rdd = sc.textFile("data/icm_fede.csv")
test_rdd= sc.textFile("data/target_users.csv")

# The first line of each file is treated as its header and filtered out below.
train_header = train_rdd.first()
icm_header = icm_rdd.first()
test_header= test_rdd.first()

# Training rows are parsed to (int, int, float); the rest of the notebook uses
# them as (user, item, rating). Cached because several independent Spark jobs
# below re-read this RDD.
train_clean_data = (
    train_rdd.filter(lambda x: x != train_header)
    .map(lambda line: line.split(','))
    .map(lambda x: (int(x[0]), int(x[1]), float(x[2])))
    .cache()
)
# Item-content rows are parsed to (int, int), used below as (item, feature).
icm_clean_data = (
    icm_rdd.filter(lambda x: x != icm_header)
    .map(lambda line: line.split(','))
    .map(lambda x: (int(x[0]), int(x[1])))
)
test_clean_data= test_rdd.filter(lambda x: x != test_header).map(lambda line: line.split(','))

# Target users, kept as a list because later cells iterate over it in file order.
test_users=test_clean_data.map( lambda x: int(x[0])).collect()
# Set copy for O(1) membership tests inside the Spark filter (the list would be
# scanned linearly for every training record).
test_users_set = set(test_users)


# user -> list of items that user already has in the training data
# (restricted to the target users).
# NOTE(review): a target user with no training rows has no key here, so later
# `grouped_rates_dic[user]` lookups would raise KeyError - confirm that cannot happen.
grouped_rates = (
    train_clean_data.filter(lambda x: x[0] in test_users_set)
    .map(lambda x: (x[0],x[1]))
    .groupByKey()
    .map(lambda x: (x[0], list(x[1])))
    .collect()
)
grouped_rates_dic = dict(grouped_rates)


# Despite its name, `item_ratings` is keyed by x[0] (the user):
# it holds (sum of ratings, number of ratings) per user.
item_ratings = train_clean_data.map(lambda x: (x[0], x[2])).aggregateByKey(
    (0,0),
    lambda x,y: (x[0] + y, x[1] + 1),
    lambda x,y: (x[0] + y[0], x[1] + y[1]),
)
user_ratings_mean = item_ratings.mapValues(lambda x: (x[0] / (x[1]))).collect()
user_ratings_mean_dic=dict(user_ratings_mean)


# (sum of ratings, number of ratings) per item, keyed by x[1].
item_ratings_forTop = train_clean_data.map(lambda x: (x[1], x[2])).aggregateByKey(
    (0,0),
    lambda x,y: (x[0] + y, x[1] + 1),
    lambda x,y: (x[0] + y[0], x[1] + y[1]),
)
# Added to the rating count in the denominator, so items with few ratings get
# a lower mean.
shrinkage_factor = 5
# Item ids ordered by shrunk mean rating, best first (used as the fallback list).
item_ratings_mean = (
    item_ratings_forTop.mapValues(lambda x: (x[0] / (x[1] + shrinkage_factor)))
    .sortBy(lambda x: x[1], ascending = False)
    .map(lambda x: x[0])
    .collect()
)


# Collect the training triples once and split them locally, so the parallel
# lists are aligned by construction instead of relying on three separate jobs.
train_rows = train_clean_data.collect()
users = [row[0] for row in train_rows]
items = [row[1] for row in train_rows]
ratings = [row[2] for row in train_rows]
# Ratings with the user's own mean rating subtracted.
ratings_unbiased = [row[2] - user_ratings_mean_dic[row[0]] for row in train_rows]

items_for_features= icm_clean_data.map(lambda x:x[0]).collect()
features = icm_clean_data.map(lambda x:x[1]).collect()
# Adds one extra (item 37142, feature 0) entry.
# NOTE(review): presumably this forces IxF to have as many rows as UxI has
# columns - confirm; it also gives item 37142 feature 0.
items_for_features.append(37142)
features.append(0)


# Every (item, feature) pair gets value 1.
unos=[1]*len(items_for_features)

# Sparse matrices; shapes are inferred from the largest index present.
UxI= sm.csr_matrix((ratings, (users, items)))
UxI_unbiased= sm.csr_matrix((ratings_unbiased, (users, items)))
IxF= sm.csr_matrix((unos, (items_for_features, features)))

In [3]:
'''content based shared'''
# L2-normalise every item's feature vector (sklearn `normalize` default norm).
IxF_normalized=normalize(IxF,axis=1)
NumItems,NumFeatures=IxF.shape

# Number of items carrying each feature, computed in one vectorised pass
# instead of extracting every column of the CSR matrix in a Python loop.
feature_counts = np.bincount(IxF.nonzero()[1], minlength=NumFeatures).astype(float)

# IDF[f] = log10(NumItems / number of items having feature f).
# A feature that no item carries gets weight 0 instead of dividing by zero.
idf_values = np.zeros(NumFeatures)
used_features = feature_counts > 0
idf_values[used_features] = np.log10(NumItems / feature_counts[used_features])
# Kept as a plain list, as before, for the cells that consume it.
IDF = list(idf_values)



In [4]:
'''content based with item similarity'''
# IDF-weighted, row-normalised item/feature matrix.
IxF_idf=sm.csr_matrix(IxF_normalized.multiply(IDF))
# Item-item cosine similarity. dense_output=False keeps the result sparse;
# the default would first materialise a dense n_items x n_items array.
IxI_sim=sm.csr_matrix(cosine_similarity(IxF_idf, dense_output=False))
# An item must not be its own neighbour.
IxI_sim.setdiag(0)
# Drop the explicit zeros that setdiag leaves stored on the diagonal.
IxI_sim.eliminate_zeros()
n_items=UxI.shape[1]
# Empty matrix that the next cell fills with each item's top-k similarities.
IxI_sim_knn=sm.lil_matrix((n_items,n_items))



In [5]:
# Keep, for every item, only the similarities towards its k best neighbours.
k=200
for item_idx in tqdm(range(n_items)):
    # Dense copy of this item's similarity row.
    similarity_row = IxI_sim.getrow(item_idx).toarray()[0]
    # argpartition places the k largest values in the last k positions
    # (in no particular order), which is all that is needed here.
    neighbour_idx = similarity_row.argpartition(-k)[-k:]
    IxI_sim_knn[item_idx, neighbour_idx] = IxI_sim[item_idx, neighbour_idx]




In [6]:
# Predictions of the similarity-based model: the ratings matrix times the
# transposed kNN similarity matrix, stored as LIL for the row edits that follow.
UxI_pred_CBS = UxI.dot(IxI_sim_knn.transpose()).tolil()

In [7]:
'''content based without similarities'''
# User profiles in feature space: ratings projected onto normalised item features.
UxF = UxI.dot(IxF_normalized)
# IDF-weighted item features, transposed to feature x item.
FxI = IxF_normalized.multiply(IDF).transpose()

In [8]:
# Predictions of the plain content-based model: user profiles times the
# weighted item features, stored as LIL for the row edits that follow.
UxI_pred_CB = (
    UxF.dot(FxI)
    .tolil()
)

In [9]:
# Zero the scores of items each target user already has in the training data,
# in both prediction matrices.
for user in tqdm(test_users):
    seen_items = grouped_rates_dic[user]
    UxI_pred_CB[user, seen_items] = 0
    UxI_pred_CBS[user, seen_items] = 0




In [43]:
# Min-max rescale every target user's content-based scores to the range [0, 100].
for user in tqdm(test_users):
    # Use this user's own row for the range; the previous code always read
    # row 4, so every user was scaled by user 4's min/max.
    row=UxI_pred_CB[user,:].toarray()[0]
    OldMin=row.min()
    OldMax=row.max()
    # A constant row (e.g. all zeros) has no range to stretch; leave it as is
    # rather than dividing by zero.
    if OldMax > OldMin:
        UxI_pred_CB[user,:]=((row - OldMin) * (100 - 0)) / (OldMax - OldMin) #+ 0




In [47]:
# Min-max rescale every target user's similarity-based scores to the range [0, 100].
for user in tqdm(test_users):
    # Use this user's own CBS row; the previous code took the range from row 4
    # and rescaled the CB scores, overwriting the CBS predictions with them.
    row=UxI_pred_CBS[user,:].toarray()[0]
    OldMin=row.min()
    OldMax=row.max()
    # A constant row (e.g. all zeros) has no range to stretch; leave it as is
    # rather than dividing by zero.
    if OldMax > OldMin:
        UxI_pred_CBS[user,:]=((row - OldMin) * (100 - 0)) / (OldMax - OldMin)




In [48]:
# Switch the content-based predictions to CSR for the row reads done below.
UxI_pred_CB = UxI_pred_CB.asformat("csr")

In [49]:
# Convert the similarity-based predictions to CSR in their own variable.
# The previous assignment target was UxI_pred_CB, which replaced the CB
# predictions with the CBS ones and left UxI_pred_CBS unconverted.
UxI_pred_CBS=UxI_pred_CBS.tocsr()

In [50]:
def medRank(user,rank1,rank2,top_n=5):
    """Merge two score vectors into one recommendation list (MedRank-style).

    Both vectors are walked in parallel from their highest score downwards.
    An item is appended to the result as soon as it has been met in both
    vectors. If fewer than ``top_n`` items qualify that way, the result is
    padded with the items met in only one vector, in the order they were met.

    Parameters
    ----------
    user : key into the module-level ``grouped_rates_dic``; the items listed
        there are never recommended.
    rank1, rank2 : 1-D numpy arrays of scores indexed by item id. Only scores
        greater than 0 are considered. The arrays are copied, so the caller's
        data is left untouched. ``rank2`` is expected to be as long as ``rank1``.
    top_n : maximum length of the returned list (default 5).

    Returns
    -------
    list of item indices (as returned by ``argmax``), at most ``top_n`` long.
    """
    nr_ranks = 2
    # Set gives O(1) membership tests instead of scanning the list every time.
    already_voted = set(grouped_rates_dic[user])
    # Private copies: consumed entries are overwritten with -9 below.
    rankings = [rank1.copy(), rank2.copy()]
    # A ranking is dropped once its best remaining score is not positive.
    active = [True] * nr_ranks

    top = []
    seen_count = defaultdict(int)   # item -> number of rankings it was met in
    by_count = defaultdict(list)    # count -> items met that many times, in order

    def _register(item):
        """Record one more sighting of ``item`` and promote it when complete."""
        previous = seen_count[item]
        seen_count[item] = previous + 1
        if previous > 0:
            by_count[previous].remove(item)
        if previous + 1 >= nr_ranks:
            top.append(item)
        else:
            by_count[previous + 1].append(item)

    full = False
    for _ in range(len(rank1)):
        for idx, scores in enumerate(rankings):
            if not active[idx]:
                continue
            # Take the highest remaining score of this ranking.
            best = scores.argmax()
            if scores[best] > 0.0:
                # Mark as consumed so the next argmax moves on.
                scores[best] = -9
                if best not in already_voted:
                    _register(best)
                if len(top) >= top_n:
                    full = True
                    break
            else:
                # Nothing positive left: stop considering this ranking.
                active[idx] = False
        if full or not any(active):
            break

    # Pad with partially-seen items, highest sighting count first.
    for count in sorted(by_count, reverse=True):
        for item in by_count[count]:
            if len(top) >= top_n:
                return top
            top.append(item)
    return top

In [None]:
# Write the MedRank hybrid (CB + CBS) submission: five items per target user.
# `with` guarantees the file is closed even if a user fails midway.
with open('submission_Hyb_MedRank_CB-CBS.csv', 'wt') as f:
    writer = csv.writer(f)
    writer.writerow(('userId','RecommendedItemIds'))
    for user in tqdm(test_users):

        top5=medRank(user,UxI_pred_CB.getrow(user).toarray()[0],UxI_pred_CBS.getrow(user).toarray()[0])

        # If the hybrid produced fewer than five items, pad with the best
        # items of item_ratings_mean that the user has not already rated and
        # that are not already in the list.
        if len(top5) < 5:
            already_voted = set(grouped_rates_dic[user])
            for candidate in item_ratings_mean:
                if len(top5) >= 5:
                    break
                if candidate in already_voted or candidate in top5:
                    continue
                top5 += [candidate]

        writer.writerow((user, '{0} {1} {2} {3} {4}'.format(*top5)))


In [None]:
def bordaAggr(rank1,rank2,depth=150,weight1=3,weight2=1):
    """Combine two score vectors with a weighted Borda count.

    The ``depth`` best entries of each vector are visited in decreasing score
    order. The entry visited at position ``i`` (0-based) receives
    ``(depth - i) * weight`` points, but only if its score is greater than 0.

    Parameters
    ----------
    rank1, rank2 : 1-D numpy arrays of scores indexed by item id. They are
        copied, so the caller's arrays are left untouched.
    depth : number of positions visited in each vector (default 150).
    weight1, weight2 : multipliers for the points awarded by ``rank1`` and
        ``rank2`` (defaults 3 and 1).

    Returns
    -------
    A 1 x n ``csr_matrix`` of Borda points, where n is the number of columns
    of the module-level ``UxI`` matrix.
    """
    nrItems=UxI.shape[1]
    result=[0]*nrItems
    # The two vectors are independent, so each is processed in its own pass.
    for scores, weight in ((rank1.copy(), weight1), (rank2.copy(), weight2)):
        for position in range(depth):
            best = scores.argmax()
            if scores[best] > 0.0:
                result[best] += (depth - position) * weight
            # Mark as consumed so the next argmax moves on.
            scores[best] = -9

    return sm.csr_matrix(result)

In [None]:
# Write the Borda-count hybrid (CB weighted 3, CBS weighted 1) submission.
# `with` guarantees the file is closed even if a user fails midway.
with open('submission_Borda_CB-CBS_3-1.csv', 'wt') as f:
    writer = csv.writer(f)
    writer.writerow(('userId','RecommendedItemIds'))
    for user in tqdm(test_users):
        top=[0,0,0,0,0]

        user_predictions=bordaAggr(UxI_pred_CB.getrow(user).toarray()[0],UxI_pred_CBS.getrow(user).toarray()[0])
        # Position in the fallback list item_ratings_mean.
        iterator = 0
        for i in range(5):
            prediction = user_predictions.argmax()
            # Skip items the user already rated by pushing their score to -9.
            while prediction in grouped_rates_dic[user] and prediction != 0:
                user_predictions[0,prediction]=-9
                prediction=user_predictions.argmax()
            # An argmax of 0 is treated as "no scored item left": take the next
            # item of item_ratings_mean that is neither rated nor already chosen.
            if prediction == 0:
                prediction = item_ratings_mean[iterator]
                while prediction in grouped_rates_dic[user] or prediction in top:
                    iterator += 1
                    prediction = item_ratings_mean[iterator]
                iterator += 1
            else:
                # Consume the chosen item so it is not picked again.
                user_predictions[0,prediction]=-9
            top[i]=prediction
        writer.writerow((user, '{0} {1} {2} {3} {4}'.format(top[0], top[1], top[2], top[3], top[4])))

In [None]:
# Linear blend of the two models: CB scores weighted 7, CBS scores weighted 5.
UxI_pred = (
    UxI_pred_CB.multiply(7)
    + UxI_pred_CBS.multiply(5)
)

In [None]:
# Write a submission ranked directly by the scores in UxI_pred_CB.
# `with` guarantees the file is closed even if a user fails midway.
with open('submission_sum_CB-CBS_1-0_rescaled.csv', 'wt') as f:
    writer = csv.writer(f)
    writer.writerow(('userId','RecommendedItemIds'))

    for user in tqdm(test_users):
        top=[0,0,0,0,0]

        # getrow returns a separate matrix, so the -9 markers written below
        # do not touch UxI_pred_CB itself.
        user_predictions=UxI_pred_CB.getrow(user)
        # Position in the fallback list item_ratings_mean.
        iterator = 0
        for i in range(5):
            prediction = user_predictions.argmax()
            # Skip items the user already rated by pushing their score to -9.
            while prediction in grouped_rates_dic[user] and prediction != 0:
                user_predictions[0,prediction]=-9
                prediction=user_predictions.argmax()
            # An argmax of 0 is treated as "no scored item left": take the next
            # item of item_ratings_mean that is neither rated nor already chosen.
            if prediction == 0:
                prediction = item_ratings_mean[iterator]
                while prediction in grouped_rates_dic[user] or prediction in top:
                    iterator += 1
                    prediction = item_ratings_mean[iterator]
                iterator += 1
            else:
                # Consume the chosen item so it is not picked again.
                user_predictions[0,prediction]=-9
            top[i]=prediction
        writer.writerow((user, '{0} {1} {2} {3} {4}'.format(top[0], top[1], top[2], top[3], top[4])))

100.0

10.668410134509033

array([  0.        ,   7.33609565,  27.93504428, ...,   8.23920036,
        27.93504428,   0.        ])