In [1]:
from pyspark import SparkContext
from scipy import sparse as sm
from sklearn.preprocessing import normalize
import numpy as np
import csv
from tqdm import tqdm_notebook as tqdm
from sklearn.linear_model import ElasticNet
import heapq

# Reuse the already-running SparkContext if there is one, otherwise create it.
# Every RDD in this notebook is created from this handle.
sc = SparkContext.getOrCreate()

In [2]:
# Load the three CSV inputs as RDDs of raw text lines.
train_rdd = sc.textFile("data/train.csv")
icm_rdd = sc.textFile("data/icm_fede.csv")
test_rdd = sc.textFile("data/target_users.csv")

# The first line of every file is treated as its header and filtered out below.
train_header = train_rdd.first()
icm_header = icm_rdd.first()
test_header = test_rdd.first()

# Training interactions parsed as (user, item, rating) triples.
train_clean_data = (
    train_rdd
    .filter(lambda x: x != train_header)
    .map(lambda line: line.split(','))
    .map(lambda x: (int(x[0]), int(x[1]), float(x[2])))
)
# Item content matrix parsed as (item, feature) pairs.
icm_clean_data = (
    icm_rdd
    .filter(lambda x: x != icm_header)
    .map(lambda line: line.split(','))
    .map(lambda x: (int(x[0]), int(x[1])))
)
# Target users, still as lists of string fields.
test_clean_data = (
    test_rdd
    .filter(lambda x: x != test_header)
    .map(lambda line: line.split(','))
)

# Ordered list of target user ids (the order is reused when recommending).
test_users = test_clean_data.map(lambda x: int(x[0])).collect()
# Set copy used only for membership tests: the filter below runs once per
# training row, and a list lookup there would be linear in the number of users.
test_users_set = set(test_users)

# user -> list of items that user interacted with, restricted to target users.
grouped_rates = (
    train_clean_data
    .filter(lambda x: x[0] in test_users_set)
    .map(lambda x: (x[0], x[1]))
    .groupByKey()
    .map(lambda x: (x[0], list(x[1])))
    .collect()
)
grouped_rates_dic = dict(grouped_rates)


def _add_rating(acc, rating):
    """Fold one rating into a (rating_sum, rating_count) accumulator."""
    return (acc[0] + rating, acc[1] + 1)


def _merge_accumulators(left, right):
    """Combine two (rating_sum, rating_count) accumulators."""
    return (left[0] + right[0], left[1] + right[1])


# NOTE: despite its name, `item_ratings` is keyed by *user* (x[0]); it holds
# (rating_sum, rating_count) per user and feeds the per-user mean below.
item_ratings = train_clean_data.map(lambda x: (x[0], x[2])).aggregateByKey(
    (0, 0), _add_rating, _merge_accumulators)
user_ratings_mean = item_ratings.mapValues(lambda x: (x[0] / (x[1]))).collect()
user_ratings_mean_dic = dict(user_ratings_mean)

# (rating_sum, rating_count) per item, used to rank items by popularity.
item_ratings_forTop = train_clean_data.map(lambda x: (x[1], x[2])).aggregateByKey(
    (0, 0), _add_rating, _merge_accumulators)

# Adding a constant to the denominator shrinks the mean of items with few
# ratings towards zero, so rarely rated items do not dominate the ranking.
shrinkage_factor = 5
# Item ids sorted by shrunk mean rating, best first (only the ids are kept).
item_ratings_mean = (
    item_ratings_forTop
    .mapValues(lambda x: (x[0] / (x[1] + shrinkage_factor)))
    .sortBy(lambda x: x[1], ascending=False)
    .map(lambda x: x[0])
    .collect()
)
    


In [3]:
# Bring the training triples to the driver once and unzip them locally.
# A single collect avoids re-running the parsing pipeline for every column and
# guarantees that users / items / ratings stay aligned element by element.
train_triples = train_clean_data.collect()
users = [triple[0] for triple in train_triples]
items = [triple[1] for triple in train_triples]
ratings = [triple[2] for triple in train_triples]
# Ratings with the per-user mean rating subtracted.
ratings_unbiased = [rating - user_ratings_mean_dic[user]
                    for user, rating in zip(users, ratings)]

# Ids are used directly as matrix indices, hence max id + 1 in each dimension.
shape = (max(users) + 1, max(items) + 1)

# User x item rating matrix.
UxI = sm.csc_matrix((ratings, (users, items)), shape=shape)

# Item x feature pairs, collected once and unzipped locally.
icm_pairs = icm_clean_data.collect()
items_for_features = [pair[0] for pair in icm_pairs]
features = [pair[1] for pair in icm_pairs]
# NOTE(review): this appends an extra (item 37142, feature 0) entry, which
# makes the matrix below have at least 37143 rows; presumably 37142 is the
# largest item id in the dataset. It also stores a value at (37142, 0) that is
# not in the ICM file - confirm this is acceptable.
items_for_features.append(37142)
features.append(0)

# Every (item, feature) pair contributes a 1.
unos = [1] * len(items_for_features)

# Item x feature matrix; its shape is inferred from the largest indices.
matrixinteractionsSparse = sm.csr_matrix((unos, (items_for_features, features)))

In [9]:
# Scale every item row to unit L2 norm, so the product of the normalised
# matrix with its own transpose holds the cosine similarity of each item pair.
matrixinteractionsSparseNorm = normalize(matrixinteractionsSparse, axis=1, norm='l2')
matrixSimilarity = matrixinteractionsSparseNorm.dot(matrixinteractionsSparseNorm.transpose())

# Items are the rows at this point; record their count before flipping.
n_items = matrixinteractionsSparse.shape[0]
# From here on the name refers to the feature x item matrix in CSC format.
# NOTE: this rebinding means the cell is not idempotent when re-run.
matrixinteractionsSparse = matrixinteractionsSparse.transpose().tocsc()





In [10]:
# For every item, keep the indices of (at most) its 400 most similar items.
top_k = 400
listTopSimilar = []
matrixSimilarity = matrixSimilarity.tocsc()
for i in tqdm(range(n_items)):
    # Slice the column once; each slice builds a new sparse matrix.
    column = matrixSimilarity[:, i]
    # Take the minimum between top_k and the number of similar items.
    minimum = min(top_k, column.nnz)
    if minimum == 0:
        # No stored similarities for this item: record an empty neighbour
        # list instead of calling argpartition on an empty array.
        listTopSimilar.append(column.indices[:0])
        continue
    # Positions (within the column's stored values) of the `minimum` largest
    # similarities; argpartition does not sort them.
    top_k_idx = column.data.argpartition(-minimum)[-minimum:]
    # Translate those positions into item (row) indices.
    listTopSimilar.append(column.indices[top_k_idx])





In [12]:
# Regularisation settings for the per-item ElasticNet regressions.
l1_penalty, l2_penalty = 0.1, 0.1
positive_only = True
# Share of the total penalty assigned to the L1 term (0.5 with equal penalties).
l1_ratio = l1_penalty / (l1_penalty + l2_penalty)

# One estimator instance, refit for every item in the SLIM loop.
elastic_net_options = dict(
    alpha=1.0,
    l1_ratio=l1_ratio,
    positive=positive_only,   # constrain coefficients to be non-negative
    fit_intercept=False,
    copy_X=False,
)
model = ElasticNet(**elastic_net_options)

In [13]:
# COO-style triplets of the learned item-item weights.
values, rows, cols = [], [], []

# Fit each item's weights sequentially (not in parallel).
for j in tqdm(range(n_items)):
    # Target column, sliced once.
    target_column = matrixinteractionsSparse[:, j]
    if target_column.nnz == 0:
        # Nothing to learn for a column without stored entries.
        continue
    y = target_column.toarray().ravel()

    # Temporarily zero column j in place so the item cannot be used to
    # predict itself; the original values are kept in `bak`.
    startptr = matrixinteractionsSparse.indptr[j]
    endptr = matrixinteractionsSparse.indptr[j + 1]
    bak = matrixinteractionsSparse.data[startptr:endptr].copy()
    matrixinteractionsSparse.data[startptr:endptr] = 0.0
    try:
        # Fit one ElasticNet model per column, using only the columns of the
        # items listed as most similar to j.
        model.fit(matrixinteractionsSparse[:, listTopSimilar[j]], y)
    finally:
        # Always put the original values back, even if the fit raises,
        # so the shared matrix is never left with a blanked column.
        matrixinteractionsSparse.data[startptr:endptr] = bak

    # model.coef_ holds one weight per candidate column; keep the positive ones.
    nnz_idx = model.coef_ > 0.0
    n_kept = int(nnz_idx.sum())
    if n_kept > 0:
        values.extend(model.coef_[nnz_idx])
        # Map positions in the candidate list back to item ids. flatten()
        # covers the case where the mask is 0-dimensional and indexing adds
        # an extra axis.
        rows.extend(listTopSimilar[j][nnz_idx].flatten())
        # Integer column ids (the index arrays of a sparse matrix are integral).
        cols.extend([j] * n_kept)

# Generate the sparse weight matrix. NOTE: this rebinds `matrixSimilarity`,
# replacing the cosine similarities with the learned weights.
matrixSimilarity = sm.csc_matrix((values, (rows, cols)), shape=(n_items, n_items), dtype=np.float32)
# Flip the content matrix again. NOTE: this rebinding makes the cell
# non-idempotent when re-run.
matrixinteractionsSparse = matrixinteractionsSparse.T.tocsc()
# Accumulators filled by the recommendation loop that follows.
listUser = []
listValue = []




In [None]:
# NOTE(review): `index` comes from `test_users` (user ids) but is used to
# index `matrixSimilarity` and `matrixinteractionsSparse`, which are built
# from item ids - confirm this is the intended lookup.
for index in tqdm(test_users):
    listSimilarity = matrixSimilarity[:, index]
    # Dense score vector: weights of `index` applied to the content matrix.
    scores = listSimilarity.T.dot(matrixinteractionsSparse).toarray()[0]

    # Positions already present in row `index`, computed once.
    already_set = matrixinteractionsSparse[index, :].astype(bool).toarray()[0]
    # Zero the score of everything already present. np.logical_not is used
    # because np.negative is not defined for boolean arrays in current NumPy.
    scores *= np.logical_not(already_set)

    sumTemp = np.sum(already_set)
    if sumTemp <= 2:
        # With very few known entries, treat near-zero scores as noise.
        low_values_indices = scores < 0.001
        scores[low_values_indices] = 0

    # Indices of the five highest scores, best first.
    topItems = heapq.nlargest(5, range(len(scores)), scores.take)
    # Only record a recommendation when the best score is positive.
    if scores[topItems[0]] > 0:
        listUser.append(index)
        listValue.append(" ".join(str(item) for item in topItems))


# One unit weight per (item, feature) pair.
unos = [1 for _ in items_for_features]

# User x item ratings; the shape is inferred from the largest indices.
# NOTE: this rebinds `UxI` as a CSR matrix.
rating_triplets = (ratings, (users, items))
UxI = sm.csr_matrix(rating_triplets)
UxI_coo = UxI.tocoo()

# Item x feature indicator matrix, plus a COO copy.
feature_triplets = (unos, (items_for_features, features))
IxF = sm.csr_matrix(feature_triplets)
IxF_coo = IxF.tocoo()

def fitOneColumn(colID, URM, l1_penalty=0.1, l2_penalty=0.1, positive_only=True):
    """Fit one ElasticNet regression that predicts column ``colID`` of ``URM``.

    The target column is blanked while fitting, so a column cannot be used to
    predict itself, and is restored afterwards; ``URM`` is left unchanged.

    NOTE(review): a later definition with the same name (taking a ``model``
    argument) rebinds ``fitOneColumn`` further down in this notebook.

    Parameters
    ----------
    colID : int
        Index of the column to predict.
    URM : scipy sparse matrix
        Data matrix. CSC input is modified temporarily in place; any other
        sparse format is converted to a private CSC copy first.
    l1_penalty, l2_penalty : float
        Only their ratio is used, as ElasticNet's ``l1_ratio``.
    positive_only : bool
        Passed to ElasticNet as ``positive``.

    Returns
    -------
    list of (row, col, value) tuples
        One entry per strictly positive coefficient, with ``col == colID``.
    """
    l1_ratio = l1_penalty / (l1_penalty + l2_penalty)

    model = ElasticNet(alpha=1.0,
                       l1_ratio=l1_ratio,
                       positive=positive_only,
                       fit_intercept=False,
                       copy_X=False)

    # Blanking a column through indptr is only valid for CSC storage.
    if URM.format != "csc":
        URM = URM.tocsc()

    # Target column as a 1-D dense vector.
    y = URM[:, colID].toarray().ravel()

    # Zero the target column for the duration of the fit only.
    start, end = URM.indptr[colID], URM.indptr[colID + 1]
    backup = URM.data[start:end].copy()
    URM.data[start:end] = 0.0
    try:
        model.fit(URM, y)
    finally:
        # Without this, every fitted column would stay erased for later fits.
        URM.data[start:end] = backup

    # Keep only the strictly positive coefficients.
    coef = np.asarray(model.coef_).ravel()
    keep = coef > 0.0
    rows = np.flatnonzero(keep)

    return [(int(row), int(colID), float(value))
            for row, value in zip(rows, coef[keep])]

def fitSLIM(URM):
    """Fit the column-wise regressions with Spark and assemble the weight matrix.

    NOTE(review): a later definition named ``fitSLIM`` rebinds this name
    further down in the notebook, and ``fitOneColumn`` is likewise redefined
    there with a required ``model`` argument; the two-argument call below
    relies on the definition with default penalties.

    Parameters
    ----------
    URM : scipy sparse matrix
        Data matrix; one regression is fitted per column.

    Returns
    -------
    scipy.sparse.csc_matrix
        Square float32 matrix of learned weights, one row/column per column
        of ``URM``.
    """
    itemNumber = URM.shape[1]

    itemList = sc.parallelize(list(range(itemNumber)))

    # Fit the columns in parallel and collect the (row, col, value) triples
    # ONCE: the RDD is not cached, so every extra action on it would refit
    # all the models from scratch.
    triples = itemList.flatMap(lambda x: fitOneColumn(x, URM)).collect()

    rows = [int(triple[0]) for triple in triples]
    cols = [int(triple[1]) for triple in triples]
    values = [triple[2] for triple in triples]

    # Generate the sparse weight matrix (`sm` is this file's scipy.sparse alias).
    return sm.csc_matrix((values, (rows, cols)),
                         shape=(itemNumber, itemNumber),
                         dtype=np.float32)


def fitOneColumn(colID, URM, model):
    """Fit ``model`` to predict column ``colID`` of ``URM`` from the other columns.

    The target column is blanked while fitting, so a column cannot be used to
    predict itself, and is restored afterwards; ``URM`` is left unchanged.

    Parameters
    ----------
    colID : int
        Index of the column to predict.
    URM : scipy sparse matrix
        Data matrix. CSC input is modified temporarily in place; any other
        sparse format is converted to a private CSC copy first.
    model : estimator
        Object with ``fit(X, y)`` that exposes one coefficient per column of
        ``X`` as ``coef_`` after fitting.

    Returns
    -------
    list of (row, col, value) tuples
        One entry per strictly positive coefficient, with ``col == colID``.
    """
    # Blanking a column through indptr is only valid for CSC storage.
    if URM.format != "csc":
        URM = URM.tocsc()

    # Target column as a 1-D dense vector.
    y = URM[:, colID].toarray().ravel()

    # Zero the target column for the duration of the fit only.
    start, end = URM.indptr[colID], URM.indptr[colID + 1]
    backup = URM.data[start:end].copy()
    URM.data[start:end] = 0.0
    try:
        model.fit(URM, y)
    finally:
        # Without this, every fitted column would stay erased for later fits.
        URM.data[start:end] = backup

    # Keep only the strictly positive coefficients.
    coef = np.asarray(model.coef_).ravel()
    keep = coef > 0.0
    rows = np.flatnonzero(keep)

    return [(int(row), int(colID), float(value))
            for row, value in zip(rows, coef[keep])]

def fitSLIM(URM):
    """Fit one ElasticNet regression per column of ``URM``, sequentially.

    Parameters
    ----------
    URM : scipy sparse matrix
        Data matrix; it is converted to CSC format before fitting because
        ``fitOneColumn`` blanks columns through ``indptr``.

    Returns
    -------
    scipy.sparse.csc_matrix
        Square float32 matrix with one row/column per column of ``URM``,
        holding the positive coefficients returned by ``fitOneColumn``.
    """
    l1_penalty = 0.1
    l2_penalty = 0.1
    positive_only = True
    l1_ratio = l1_penalty / (l1_penalty + l2_penalty)

    # A single estimator instance is refit for every column.
    model = ElasticNet(alpha=1.0,
                       l1_ratio=l1_ratio,
                       positive=positive_only,
                       fit_intercept=False,
                       copy_X=False)

    # Column slicing via indptr requires CSC; this is a no-op for CSC input.
    URM = URM.tocsc()
    itemNumber = URM.shape[1]

    # Accumulate (row, col, value) triples, one column at a time.
    result = list()
    for item in tqdm(range(itemNumber)):
        result += fitOneColumn(item, URM, model)

    rows = [int(triple[0]) for triple in result]
    cols = [int(triple[1]) for triple in result]
    values = [triple[2] for triple in result]

    # Assemble and return the sparse weight matrix.
    return sm.csc_matrix((values, (rows, cols)),
                         shape=(itemNumber, itemNumber),
                         dtype=np.float32)

# Run the SLIM fit on the user x item rating matrix and keep its result.
# This executes one regression per column of UxI, so the cell is slow.
similaritySLIM = fitSLIM(UxI)

In [None]:
def sample_recommendation(model, data, i_f, user_ids,
                          output_path='submission_light_fede_150_30.csv'):
    """Write five recommended item ids per user to a CSV submission file.

    For each user the items are ranked by ``model.predict`` score, items the
    user already interacted with are skipped, and the first five are kept.
    If fewer than five remain, the list is completed from the module-level
    ``item_ratings_mean`` ranking, skipping items the user already has.

    Parameters
    ----------
    model : object
        Must expose ``predict(user_id, item_ids, item_features=...)`` and
        return one score per item id as a NumPy array.
    data : scipy sparse matrix
        Indexed by user id; ``data[user_id].indices`` are treated as the
        user's known positives.
    i_f : matrix
        Item feature matrix; its first dimension is the number of items.
    user_ids : iterable
        Users to write recommendations for.
    output_path : str, optional
        Destination CSV file. Defaults to the original hard-coded name.
    """
    n_recs = 5
    n_items, _ = i_f.shape
    all_items = np.arange(n_items)

    # `with` closes the file even if scoring fails part-way through;
    # newline='' is what the csv module requires for files it writes.
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(('userId', 'RecommendedItemIds'))

        for user_id in tqdm(user_ids):
            known_positives = set(data[user_id].indices)
            scores = model.predict(user_id, all_items, item_features=i_f)

            # Walk the items from best to worst score and stop as soon as
            # enough unseen ones have been found.
            top = []
            for item in np.argsort(-scores):
                if item not in known_positives:
                    top.append(item)
                    if len(top) == n_recs:
                        break

            # Fallback: complete the list from the global item ranking.
            if len(top) < n_recs:
                # Users with no entry in grouped_rates_dic have nothing to exclude.
                already_rated = set(grouped_rates_dic.get(user_id, ()))
                for candidate in item_ratings_mean:
                    if len(top) == n_recs:
                        break
                    if (candidate in known_positives
                            or candidate in already_rated
                            or candidate in top):
                        continue
                    top.append(candidate)

            writer.writerow((user_id, ' '.join(str(item) for item in top)))