In [1]:
from pyspark import SparkContext
from scipy import sparse as sm
from sklearn.preprocessing import normalize
import numpy as np
import csv
from tqdm import tqdm_notebook as tqdm
from lightfm import LightFM

# Reuse the SparkContext that is already running in this kernel, or start one
# if none exists, so re-running this cell does not fail on a duplicate context.
sc = SparkContext.getOrCreate()

In [2]:
# --- Load the raw CSV files as RDDs of text lines ---------------------------
train_rdd = sc.textFile("data/train.csv")
icm_rdd = sc.textFile("data/icm_fede.csv")
test_rdd = sc.textFile("data/target_users.csv")

# The first line of every file is its header; it is filtered out below.
train_header = train_rdd.first()
icm_header = icm_rdd.first()
test_header = test_rdd.first()

# Parsed records:
#   train_clean_data -> (int, int, float)  used below as (user, item, rating)
#   icm_clean_data   -> (int, int)         used below as (item, feature)
#   test_clean_data  -> list of string fields, first field is the user id
# train_clean_data feeds several independent Spark actions, so it is cached to
# avoid re-reading and re-parsing the text file for each of them.
train_clean_data = (
    train_rdd
    .filter(lambda x: x != train_header)
    .map(lambda line: line.split(','))
    .map(lambda x: (int(x[0]), int(x[1]), float(x[2])))
    .cache()
)
icm_clean_data = (
    icm_rdd
    .filter(lambda x: x != icm_header)
    .map(lambda line: line.split(','))
    .map(lambda x: (int(x[0]), int(x[1])))
)
test_clean_data = (
    test_rdd
    .filter(lambda x: x != test_header)
    .map(lambda line: line.split(','))
)

test_users = test_clean_data.map(lambda x: int(x[0])).collect()
# Set copy for O(1) membership tests inside the Spark filter below; testing
# against the list would scan it once per training record.
test_users_set = set(test_users)

# user -> list of items that user rated, restricted to the target users.
grouped_rates = (
    train_clean_data
    .filter(lambda x: x[0] in test_users_set)
    .map(lambda x: (x[0], x[1]))
    .groupByKey()
    .map(lambda x: (x[0], list(x[1])))
    .collect()
)
grouped_rates_dic = dict(grouped_rates)


def _add_rating(acc, rating):
    """Fold one rating into a (sum, count) accumulator."""
    return (acc[0] + rating, acc[1] + 1)


def _merge_sum_count(left, right):
    """Combine two (sum, count) accumulators coming from different partitions."""
    return (left[0] + right[0], left[1] + right[1])


# NOTE: despite its name, item_ratings is keyed by *user* (x[0]); it holds the
# per-user (sum of ratings, number of ratings). The name is kept unchanged in
# case other cells refer to it.
item_ratings = (
    train_clean_data
    .map(lambda x: (x[0], x[2]))
    .aggregateByKey((0, 0), _add_rating, _merge_sum_count)
)
user_ratings_mean = item_ratings.mapValues(lambda x: (x[0] / (x[1]))).collect()
user_ratings_mean_dic = dict(user_ratings_mean)

# Per-item (sum of ratings, number of ratings), used to build a popularity
# ranking of items.
item_ratings_forTop = (
    train_clean_data
    .map(lambda x: (x[1], x[2]))
    .aggregateByKey((0, 0), _add_rating, _merge_sum_count)
)
# Adding shrinkage_factor to the denominator pulls the mean of items with few
# ratings towards zero, so they do not dominate the ranking.
shrinkage_factor = 5
# Item ids ordered by shrunk mean rating, best first.
item_ratings_mean = (
    item_ratings_forTop
    .mapValues(lambda x: (x[0] / (x[1] + shrinkage_factor)))
    .sortBy(lambda x: x[1], ascending=False)
    .map(lambda x: x[0])
    .collect()
)

def evaluateRating(tuple):
    """Turn a rating record into a binary label.

    The third element of the record (index 2) is the rating: the result is 1
    when that rating is at least 8 and -1 otherwise.

    Note: the parameter name shadows the built-in ``tuple``; it is kept so
    existing callers are unaffected.
    """
    return 1 if tuple[2] >= 8 else -1
    
# Bring the parsed training records to the driver once and derive every column
# from that single list. This replaces four separate Spark jobs over the same
# RDD and keeps the columns aligned by construction.
train_rows = train_clean_data.collect()
users = [row[0] for row in train_rows]
items = [row[1] for row in train_rows]
ratings = [evaluateRating(row) for row in train_rows]
# Rating minus the mean rating of the user who gave it.
ratings_unbiased = [row[2] - user_ratings_mean_dic[row[0]] for row in train_rows]

icm_rows = icm_clean_data.collect()
items_for_features = [row[0] for row in icm_rows]
features = [row[1] for row in icm_rows]

# Every (item, feature) pair gets weight 1.
unos = [1] * len(items_for_features)

# The item axis must cover at least item id 37142 (presumably the highest item
# id in the catalogue -- TODO confirm). Earlier this was forced by appending a
# fake (37142, 0) pair to the feature lists, which also gave that item a
# spurious feature 0. Passing an explicit shape pads the matrices without
# inventing data. Consequently items_for_features / features / unos no longer
# contain that padding element.
LAST_ITEM_ID = 37142
n_users = max(users) + 1
n_items = max(items + items_for_features + [LAST_ITEM_ID]) + 1
n_features = max(features + [0]) + 1

# User x Item interaction matrix (+1 / -1 labels; duplicate pairs are summed by
# the COO -> CSR conversion) and Item x Feature indicator matrix. Both share the
# same item dimension so they can be passed together to LightFM.
UxI = sm.csr_matrix((ratings, (users, items)), shape=(n_users, n_items))
UxI_coo = UxI.tocoo()
IxF = sm.csr_matrix((unos, (items_for_features, features)), shape=(n_items, n_features))
IxF_coo = IxF.tocoo()

In [3]:
# 150-dimensional latent factors, trained with the logistic loss (the
# interaction matrix carries +1 / -1 labels). random_state pins the parameter
# initialisation so a Restart & Run All starts from the same model; note that
# multi-threaded fitting may still introduce small run-to-run differences.
model = LightFM(no_components=150, loss='logistic', random_state=42)

In [4]:
# Train on the user-item interactions with item side features: 30 passes over
# the data, using 2 worker threads.
model = model.fit(
    UxI_coo,
    item_features=IxF_coo,
    epochs=30,
    num_threads=2,
)

In [8]:
def sample_recommendation(model, data, i_f, user_ids,
                          output_path='submission_light_fede_150_30.csv',
                          n_recs=5):
    """Write the top recommendations for each user to a CSV submission file.

    For every user the model scores all items, items the user already has an
    entry for in ``data`` are dropped, and the ``n_recs`` best remaining items
    are written. If fewer than ``n_recs`` items remain, the list is topped up
    from the module-level popularity ranking ``item_ratings_mean``, skipping
    items the user already rated (module-level ``grouped_rates_dic``) and items
    already in the list.

    Parameters
    ----------
    model : fitted model exposing ``predict(user_id, item_ids, item_features=...)``.
    data : CSR user x item interaction matrix; ``data[user_id].indices`` gives
        the items the user already interacted with.
    i_f : item x feature matrix; its number of rows is the number of items scored.
    user_ids : iterable of user ids to produce recommendations for.
    output_path : destination CSV file (defaults to the original file name).
    n_recs : number of items to recommend per user (defaults to 5).

    The output has a header row ``userId,RecommendedItemIds`` followed by one
    row per user with the item ids joined by single spaces.
    """
    n_items = i_f.shape[0]
    # The candidate item ids are the same for every user: build them once.
    all_items = np.arange(n_items)

    # `with` guarantees the file is closed even if scoring raises midway.
    with open(output_path, 'wt') as f:
        writer = csv.writer(f)
        writer.writerow(('userId', 'RecommendedItemIds'))

        for user_id in tqdm(user_ids):
            # Column indices stored in this user's row, i.e. items already seen.
            seen_items = data[user_id].indices
            scores = model.predict(user_id, all_items, item_features=i_f)
            ranked = np.argsort(-scores)  # best score first
            unseen_ranked = ranked[np.isin(ranked, seen_items, invert=True)]
            top = unseen_ranked[:n_recs].tolist()

            if len(top) < n_recs:
                # Fallback: fill the remaining slots with the globally
                # best-rated items the user has not rated yet. Users without
                # any training rating simply have nothing to exclude.
                already_rated = set(grouped_rates_dic.get(user_id, ()))
                for candidate in item_ratings_mean:
                    if len(top) >= n_recs:
                        break
                    if candidate not in already_rated and candidate not in top:
                        top.append(candidate)

            writer.writerow((user_id, ' '.join(str(item) for item in top)))

In [9]:
# Produce the submission file for the target users using the fitted model.
sample_recommendation(model=model, data=UxI, i_f=IxF, user_ids=test_users)

Widget Javascript not detected.  It may not be installed or enabled properly.



