In [1]:
from pyspark import SparkContext
import csv
from scipy import sparse as sm
import numpy as np
from itertools import groupby
from functools import reduce
from operator import itemgetter
from scipy.sparse.linalg import svds
from collections import defaultdict
from tqdm import tqdm

sc = SparkContext.getOrCreate()

In [2]:

train_rdd = sc.textFile("data/train.csv")
icm_rdd = sc.textFile("data/icm_fede.csv")
test_rdd= sc.textFile("data/target_users.csv")

train_header = train_rdd.first()
icm_header = icm_rdd.first()
test_header= test_rdd.first()

train_clean_data = train_rdd.filter(lambda x: x != train_header).map(lambda line: line.split(',')).map(lambda x: (int(x[0]), int(x[1]), float(x[2])))
icm_clean_data = icm_rdd.filter(lambda x: x != icm_header).map(lambda line: line.split(',')).map(lambda x: (int(x[0]), int(x[1])))
test_clean_data= test_rdd.filter(lambda x: x != test_header).map(lambda line: line.split(',')).map(lambda x: int(x[0]))



In [3]:
users = train_clean_data.map(lambda x: x[0]).collect()
items = train_clean_data.map(lambda x: x[1]).collect()
ratings = train_clean_data.map(lambda x: x[2]).collect()

test_users = test_clean_data.collect()

items_for_features= icm_clean_data.map(lambda x:x[0]).collect()
features = icm_clean_data.map(lambda x:x[1]).collect()
items_for_features.append(37142)
features.append(0)


unos=[1]*len(items_for_features)

UxI= sm.csr_matrix((ratings, (users, items)))
IxF= sm.csr_matrix((unos, (items_for_features, features)))



In [4]:
grouped_rates = train_clean_data.filter(lambda x: x[0] in test_users).map(lambda x: (x[0],x[1])).groupByKey().map(lambda x: (x[0], list(x[1]))).collect()
grouped_rates_dic = dict(grouped_rates)

In [5]:
item_ratings_forTop = train_clean_data.map(lambda x: (x[1], x[2])).aggregateByKey((0,0), lambda x,y: (x[0] + y, x[1] + 1),lambda x,y: (x[0] + y[0], x[1] + y[1]))
shrinkage_factor = 5
item_ratings_mean = item_ratings_forTop.mapValues(lambda x: (x[0] / (x[1] + shrinkage_factor))).sortBy(lambda x: x[1], ascending = False).map(lambda x: x[0]).collect()



In [6]:
user_ratings_mean = np.mean(UxI, axis = 1)

In [7]:
UxI_demeaned = UxI - user_ratings_mean.reshape(-1, 1)

In [8]:
U, sigma, Vt = svds(UxI_demeaned, k = 5000)

In [9]:
sigma = np.diag(sigma)

In [10]:
all_user_predicted_ratings = sm.lil_matrix(np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1))

In [11]:
for user in tqdm(users):
    all_user_predicted_ratings[user, UxI.getrow(user).nonzero()[1]] = 0

100%|██████████| 170149/170149 [01:09<00:00, 2454.49it/s]


In [12]:
UxI_pred = sm.csr_matrix(all_user_predicted_ratings)

In [13]:
f = open('submission_svd_fede_k5000.csv', 'wt')
writer = csv.writer(f)
writer.writerow(('userId','RecommendedItemIds'))
for user in tqdm(test_users):
    top=[0,0,0,0,0]

    user_predictions=UxI_pred.getrow(user)
    for i in range(5):
        prediction = user_predictions.argmax()
        while prediction in grouped_rates_dic[user] and prediction != 0:
            user_predictions[0,prediction]=-9
            prediction=user_predictions.argmax()
        if prediction == 0:
            prediction = item_ratings_mean[iterator]
            while prediction in grouped_rates_dic[user] or prediction in top:
                iterator += 1
                prediction = item_ratings_mean[iterator]
            iterator += 1
        else:
            user_predictions[0,prediction]=-9
        top[i]=prediction    
    writer.writerow((user, '{0} {1} {2} {3} {4}'.format(top[0], top[1], top[2], top[3], top[4])))

f.close()

100%|██████████| 4196/4196 [00:41<00:00, 100.04it/s]
