In [13]:
from surprise import SVD, Dataset, Reader
from collections import defaultdict
from pyspark import SparkContext
import csv
sc = SparkContext.getOrCreate()

In [33]:
train_rdd = sc.textFile("data/train.csv")
icm_rdd = sc.textFile("data/icm_fede.csv")
test_rdd= sc.textFile("data/target_users.csv")

train_header = train_rdd.first()
icm_header = icm_rdd.first()
test_header= test_rdd.first()

train_clean_data = train_rdd.filter(lambda x: x != train_header).map(lambda line: line.split(',')).map(lambda x: (int(x[0]), int(x[1]), float(x[2]), ''))
icm_clean_data = icm_rdd.filter(lambda x: x != icm_header).map(lambda line: line.split(',')).map(lambda x: (int(x[0]), int(x[1])))
test_clean_data= test_rdd.filter(lambda x: x != test_header).map(lambda line: line.split(',')).map(lambda x: int(x[0])).collect()

train_test = train_clean_data.filter(lambda x: x[0] in test_clean_data).collect()


In [40]:
item_ratings = train_clean_data.map(lambda x: (x[1], x[2])).aggregateByKey((0,0), lambda x,y: (x[0] + y, x[1] + 1),lambda x,y: (x[0] + y[0], x[1] + y[1]))#.sortBy(lambda x: x[1][1], ascending=False)
#item_ratings.take(10)
shrinkage_factor = 5
item_ratings_mean = item_ratings.mapValues(lambda x: (x[0] / (x[1] + shrinkage_factor))).sortBy(lambda x: x[1], ascending = False).map(lambda x: x[0]).collect()

grouped_rates_dic = dict(train_clean_data.map(lambda x: (x[0],x[1])).groupByKey().map(lambda x: (x[0], list(x[1]))).collect())

In [14]:
def printTestUsers():
    f = open('data/test_data.csv', 'wt')
    
    writer = csv.writer(f)
    writer.writerow(('user','item','rating'))

    for t in train_test:
        writer.writerow((t[0],t[1],t[2]))
    f.close()

In [15]:
printTestUsers()

In [16]:
reader = Reader(line_format='user item rating', sep=',', rating_scale=(1,10), skip_lines=1)

In [17]:
data = Dataset.load_from_file('data/train.csv', reader=reader)

In [29]:
data2 = Dataset.load_from_file('data/test_data.csv', reader=reader)

In [19]:
algo = SVD(n_factors=870, n_epochs=40)

In [20]:
def get_top_n(predictions, n=5):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


In [22]:
trainset = data.build_full_trainset()

In [23]:
algo.train(trainset)

In [34]:
testset = data.construct_testset(train_test)

In [35]:
testset

[(11326, 2, 5.0),
 (3906, 3, 10.0),
 (7393, 3, 8.0),
 (5577, 4, 8.0),
 (6827, 4, 6.0),
 (8071, 4, 10.0),
 (8456, 4, 7.0),
 (8564, 4, 10.0),
 (10046, 4, 9.0),
 (13195, 4, 10.0),
 (7393, 6, 8.0),
 (5351, 10, 6.0),
 (1649, 12, 3.0),
 (1734, 12, 7.0),
 (2785, 12, 6.0),
 (3198, 12, 7.0),
 (3358, 12, 3.0),
 (4394, 12, 1.0),
 (5249, 12, 4.0),
 (6247, 12, 6.0),
 (6586, 12, 7.0),
 (7817, 12, 7.0),
 (8187, 12, 7.0),
 (11229, 12, 4.0),
 (11641, 12, 6.0),
 (11961, 12, 6.0),
 (12180, 12, 9.0),
 (12752, 12, 7.0),
 (13018, 12, 6.0),
 (13928, 12, 8.0),
 (14627, 12, 10.0),
 (1236, 13, 6.0),
 (3418, 15, 5.0),
 (6866, 18, 9.0),
 (14472, 18, 10.0),
 (12567, 24, 1.0),
 (152, 30, 5.0),
 (15116, 31, 7.0),
 (7393, 32, 8.0),
 (14154, 38, 7.0),
 (7965, 39, 8.0),
 (12132, 39, 7.0),
 (1875, 41, 7.0),
 (2345, 43, 5.0),
 (10098, 45, 10.0),
 (12369, 45, 6.0),
 (4386, 46, 7.0),
 (1263, 48, 4.0),
 (7571, 48, 1.0),
 (7886, 48, 8.0),
 (3003, 49, 6.0),
 (3873, 49, 8.0),
 (5001, 49, 9.0),
 (5417, 49, 4.0),
 (5448, 49, 1.0

In [36]:

predictions = algo.test(testset)

In [37]:

top_n = get_top_n(predictions, n=5)


11326 [2, 881, 8733, 9594, 11270]
3906 [3, 259, 347, 541, 667]
7393 [3, 6, 32, 90, 100]
5577 [4, 322, 2391, 4039, 4853]
6827 [4, 114, 146, 180, 238]
8071 [4, 753, 875, 918, 1647]
8456 [4, 52, 909, 1456, 2898]
8564 [4, 279, 612, 663, 1730]
10046 [4, 479, 21679, 29271]
13195 [4, 1144, 2187, 6502, 6587]
5351 [10, 32912, 34602]
1649 [12, 12342, 15452, 22197, 22341]
1734 [12, 1141, 7342, 8496, 10059]
2785 [12, 4421, 7280, 14714, 19127]
3198 [12, 2143, 3587, 5292, 6132]
3358 [12, 7903, 19419, 23507, 30087]
4394 [12, 16072, 20279]
5249 [12, 2902, 3431, 3457, 3626]
6247 [12, 8496, 9349, 15599, 16988]
6586 [12, 2050, 3302, 3474, 5364]
7817 [12, 784, 1442, 2050, 2134]
8187 [12, 2825, 5085, 6459, 7788]
11229 [12, 9242, 11717, 21199, 25624]
11641 [12, 551, 2143, 2623, 4605]
11961 [12, 598, 2073, 2300, 2777]
12180 [12, 12320, 16569, 19163, 29534]
12752 [12, 152, 194, 339, 345]
13018 [12, 5214, 10469, 14734, 24081]
13928 [12, 146, 366, 775, 1434]
14627 [12, 15514, 24479, 27409]
1236 [13, 52, 100, 22

2886 [1910, 3457, 8932, 26951]
11214 [1917, 4045, 6677, 7100, 8189]
14769 [1925, 6699, 21053, 25559, 35549]
3784 [1926, 3903, 3906, 4710, 7081]
13049 [1940, 2232, 35202]
2658 [1942, 18645, 33886]
4553 [1942, 15257, 17871, 19725]
8523 [1942, 1961, 2050, 8619, 11144]
12164 [1947]
418 [1953]
11355 [1957, 14342, 21206, 24289, 30496]
13490 [1961, 4475, 19907, 22341, 32019]
1112 [1963, 5717, 6516, 8013, 9242]
4134 [1976, 7370, 13707, 16653, 16766]
7512 [1977, 3436, 5016, 13130, 14480]
5992 [1981, 5868, 6839, 7314, 7903]
1606 [2003, 2294, 2578, 3305, 3377]
13567 [2003, 22338, 34942]
8573 [2012, 4732, 5350, 8117, 10170]
9491 [2018, 5867, 7133, 9435, 10244]
9823 [2031, 14117, 28720]
4354 [2036, 19642]
186 [2041, 9893, 10082, 13574]
2398 [2041, 31971, 33173]
3543 [2041, 2134, 9420, 18200, 19022]
4828 [2041, 9535, 31992, 33173]
5637 [2041, 6249, 15086]
7036 [2041, 3276, 4288, 4470, 6405]
8385 [2041, 7188, 10469, 13885, 18803]
9868 [2041, 17445, 29777, 30821, 34351]
11549 [2041, 9202, 10446, 16153

5831 [6254]
9921 [6254, 15662, 32038, 36109]
11103 [6255]
3932 [6261, 14873]
8995 [6263, 7903, 9833, 13130, 13977]
9683 [6263]
12293 [6263, 8717, 11366, 18099, 22002]
13425 [6263, 9713, 25677]
14183 [6263, 6798, 10881, 15626, 18006]
6111 [6291, 15585, 16951, 20566, 23372]
10924 [6299]
13961 [6299, 10452, 18662, 27563, 31992]
9106 [6301, 8589, 9626]
1157 [6314, 27612, 32161]
5229 [6314]
5281 [6314, 7923, 10685, 11693, 13752]
9595 [6314, 10446, 18359, 36811]
10633 [6314, 6405, 31220]
12689 [6314, 25150, 25920, 30087, 31768]
14648 [6314, 6821, 20279, 21907, 22904]
15090 [6328, 14707, 15037, 34201, 35129]
919 [6343, 19573]
2268 [6343, 8013, 8519, 35293]
11127 [6343, 10193, 24639, 25007, 26951]
10380 [6363, 26845, 29926, 35497]
12298 [6379, 28119]
5165 [6391]
4219 [6405, 14537]
5404 [6405]
6406 [6405, 6769, 7086, 8935, 12491]
10931 [6405, 7721, 20693, 30544, 30699]
11713 [6405, 6561, 16687, 17472, 24763]
12414 [6405, 6637, 12241, 18200, 21851]
15162 [6405, 9902]
1576 [6409, 12454, 16350, 35

29 [14322]
3828 [14322, 27686]
8784 [14322, 17411, 22720, 25624, 27398]
13983 [14322]
11347 [14336, 24639, 26674]
11350 [14340]
126 [14351, 19642, 21996, 22405, 28102]
8488 [14351, 21263, 31992]
12036 [14351]
13217 [14351, 28053]
14993 [14351, 23240]
968 [14372, 19798]
10227 [14376, 17813, 22760]
7202 [14381]
415 [14387]
3028 [14387]
10330 [14431]
191 [14455, 15636]
11235 [14458, 15599, 17315, 18645, 19127]
1017 [14480, 15142, 18238, 33804]
11166 [14480, 17149, 20279, 21403]
10222 [14505, 19659]
1110 [14525]
3992 [14532, 24488]
8895 [14533, 30360, 36163]
3989 [14535, 27132]
5725 [14535]
12174 [14537]
11404 [14553, 22069]
12502 [14556, 22762]
6676 [14570]
5562 [14572, 14709, 16569, 17287, 17644]
5568 [14572, 15274, 17644, 22618, 28688]
7776 [14572, 15971, 33173, 34595, 35300]
7359 [14573, 21661]
6106 [14579, 19907, 24519]
76 [14583, 19020, 21368]
7391 [14585, 21900, 25939, 31874, 34784]
8993 [14608]
4575 [14616, 19030]
9635 [14616, 19907, 25467]
6370 [14618, 19897]
14908 [14628]
8938 [1

13839 [32362]
11854 [32396]
2926 [32548]
792 [32571]
8085 [32578]
4888 [32604]
140 [32636]
14006 [32641, 36828]
886 [32674]
12738 [32674, 35105]
4300 [32731, 35300]
10846 [32785, 34773, 35129]
6155 [32800]
12229 [32879]
15048 [32879]
12454 [32915, 34411, 36434]
3001 [32916]
8578 [32916]
7143 [33010]
1235 [33033, 35578]
5087 [33050]
7298 [33053]
12949 [33084, 34260, 35628]
1952 [33173]
2058 [33173]
3326 [33173]
7145 [33173]
9383 [33173]
14570 [33187]
8766 [33194]
4726 [33266]
10030 [33266]
37 [33282, 36409]
10491 [33298]
9951 [33416, 35201]
5109 [33475]
2921 [33500]
4058 [33500]
5269 [33520]
6932 [33532]
10311 [33546]
2581 [33562]
3693 [33562]
3917 [33562]
7552 [33562]
3866 [33580]
4594 [33598]
14465 [33598]
12936 [33649, 36957]
8334 [33667]
14331 [33743, 35874]
8559 [33747]
2564 [33755]
11090 [33755]
5056 [33780]
2933 [33818]
2097 [33954]
6834 [33961]
5772 [33971]
15126 [33993, 36387]
15334 [34002, 35562]
8753 [34014]
2397 [34070]
14592 [34070]
9021 [34073]
10735 [34090]
7060 [34159]
1

In [45]:
f = open('submission_surprise_svd.csv', 'wt')

writer = csv.writer(f)
writer.writerow(('userId','RecommendedItemIds'))

for uid, user_ratings in top_n.items():
    predictions = [iid for iid,_ in user_ratings]
    iterator = 0
    already_voted = grouped_rates_dic[uid]
    for i in range(5 - len(predictions)):
        while (item_ratings_mean[iterator] in already_voted) or (item_ratings_mean[iterator] in predictions):
            iterator = iterator + 1
        predictions = predictions + [item_ratings_mean[iterator]]
    writer.writerow((uid, '{0} {1} {2} {3} {4}'.format(predictions[0], predictions[1], predictions[2], predictions[3], predictions[4])))

f.close()