In [3]:
# !pip install pyspark

In [4]:
import pyspark
from itertools import permutations

In [5]:
sc = pyspark.SparkContext('local[*]')

In [6]:
from google.colab import drive 
drive.mount('drive')

Mounted at drive


In [8]:
# Importing db.txt from Google drive

import os
import zipfile
import numpy as np

dir = "/content/drive/My Drive/Colab Notebooks/Spark"
files = os.listdir(dir)
files

['Assignment_1.ipynb', 'db.txt']

In [9]:
# Reading the file

lines = sc.textFile(dir + '/db.txt') 
lines.take(3)

['0\t1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94',
 '1\t0,5,20,135,2409,8715,8932,10623,12347,12846,13840,13845,14005,20075,21556,22939,23520,28193,29724,29791,29826,30691,31232,31435,32317,32489,34394,35589,35605,35606,35613,35633,35648,35678,38737,43447,44846,44887,49226,49985,623,629,4999,6156,13912,14248,15190,17636,19217,20074,27536,29481,29726,29767,30257,33060,34250,34280,34392,34406,34418,34420,34439,34450,34651,45054,49592',
 '2\t0,117,135,1220,2755,12453,24539,24714,41456,45046,49927,6893,13795,16659,32828,41878']

In [11]:
# Splitting the file based on the \t and then splitting the list of friends from a string to an array of characters

lines = lines.map(lambda line:line.split())
friends = lines.filter(lambda x:len(x)==2)
friends = friends.map(lambda x:(x[0], x[1].split(",")))
friends.take(3)

[('0',
  ['1',
   '2',
   '3',
   '4',
   '5',
   '6',
   '7',
   '8',
   '9',
   '10',
   '11',
   '12',
   '13',
   '14',
   '15',
   '16',
   '17',
   '18',
   '19',
   '20',
   '21',
   '22',
   '23',
   '24',
   '25',
   '26',
   '27',
   '28',
   '29',
   '30',
   '31',
   '32',
   '33',
   '34',
   '35',
   '36',
   '37',
   '38',
   '39',
   '40',
   '41',
   '42',
   '43',
   '44',
   '45',
   '46',
   '47',
   '48',
   '49',
   '50',
   '51',
   '52',
   '53',
   '54',
   '55',
   '56',
   '57',
   '58',
   '59',
   '60',
   '61',
   '62',
   '63',
   '64',
   '65',
   '66',
   '67',
   '68',
   '69',
   '70',
   '71',
   '72',
   '73',
   '74',
   '75',
   '76',
   '77',
   '78',
   '79',
   '80',
   '81',
   '82',
   '83',
   '84',
   '85',
   '86',
   '87',
   '88',
   '89',
   '90',
   '91',
   '92',
   '93',
   '94']),
 ('1',
  ['0',
   '5',
   '20',
   '135',
   '2409',
   '8715',
   '8932',
   '10623',
   '12347',
   '12846',
   '13840',
   '13845',
   '14005',
   '200

In [12]:
# For each user id making a key value pair with each seperate friend and 
# assigning a score of -10000 to each unique key value pair between user and friend

directFriends = friends.flatMap(lambda x:[((x[0],friend), -10000) for friend in x[1]])
directFriends.take(1)

[(('0', '1'), -10000)]

In [13]:
# Here we take all the possible pair permutations from the list of friends 
# for each user so we can see the number of times each friend appears with another friend

# For example
# User: List of Friends
# 1: 3, 4, 5
# 2: 4, 5, 6

# Here the pair 4,5 will be mapped twice as ((4,5),1)

mutualFriends = friends.flatMap(lambda data: [(pair, 1) for pair in permutations(data[1], 2)])
mutualFriends.take(1)

[(('1', '2'), 1)]

In [14]:
# We then take a union of all the pairs of mutual friends calculated above and all the direct friend pairs

fullList = directFriends.union(mutualFriends)
fullList.take(3)

[(('0', '1'), -10000), (('0', '2'), -10000), (('0', '3'), -10000)]

In [15]:
# We then reduce by key with the adding operation. This ensures that already 
# direct friend pairs when added up with mutual friends remain in the negative and mutual friends 
# which aren't already direct friends are excluded

fullList = fullList.reduceByKey(lambda x,y:x+y)
fullList.take(2)

[(('0', '54'), -10000), (('1', '14248'), -10000)]

In [16]:
# We exclude these by taking the count above 0 so we get the mutual friends 
# which aren't already direct friends

mutualCount = fullList.filter(lambda x: x[1] > 0)
mutualCount.take(2)

[(('90', '2'), 1), (('25195', '8703'), 1)]

In [17]:
# We then manipulate our  data from (user, friend), count to user, (count, friend)

mutualCount = mutualCount.map(lambda x: (x[0][0], (x[1], x[0][1]))).groupByKey().mapValues(lambda x: list(x))
mutualCount.take(2)

[('15232',
  [(1, '19046'),
   (1, '40240'),
   (2, '8380'),
   (1, '10622'),
   (1, '39821'),
   (1, '15245'),
   (1, '24300'),
   (1, '15270'),
   (1, '15266'),
   (1, '17212'),
   (1, '15240'),
   (1, '15247'),
   (1, '15262'),
   (1, '42025'),
   (1, '15271'),
   (1, '39829'),
   (1, '15255'),
   (1, '15272'),
   (2, '39963'),
   (1, '40776'),
   (1, '39781'),
   (2, '39782'),
   (1, '25733'),
   (1, '1974'),
   (2, '40418'),
   (1, '47824'),
   (1, '15236'),
   (1, '39835'),
   (1, '15260'),
   (1, '15237'),
   (1, '1406'),
   (2, '17767'),
   (2, '39819'),
   (1, '40770'),
   (1, '15246'),
   (1, '15273'),
   (1, '15276'),
   (1, '15243'),
   (1, '15239'),
   (1, '15254'),
   (1, '15253'),
   (1, '15268'),
   (1, '15275'),
   (1, '11132'),
   (1, '39430'),
   (1, '15235'),
   (1, '15251'),
   (1, '15263'),
   (1, '8798'),
   (1, '7018'),
   (1, '15376'),
   (1, '15264'),
   (1, '15248'),
   (1, '16350'),
   (1, '9983'),
   (1, '15259'),
   (1, '15242'),
   (1, '39881'),
   (1, '1

In [18]:
def sortList(inputList):
#     inputList.sort(key=lambda v: -v[0])
    inputList.sort(key=lambda i:i[0],reverse=True)
    return inputList

In [19]:
# We sort the list of each user according to the highest count

mutualCount = mutualCount.map(lambda x:(x[0], sortList(x[1])))
mutualCount.take(2)

[('17257',
  [(3, '17245'),
   (2, '40776'),
   (2, '17227'),
   (2, '39928'),
   (2, '17269'),
   (2, '17208'),
   (2, '17187'),
   (2, '17264'),
   (2, '17196'),
   (2, '39584'),
   (2, '17210'),
   (2, '39784'),
   (2, '17198'),
   (2, '17237'),
   (2, '17255'),
   (2, '17267'),
   (2, '17268'),
   (2, '17235'),
   (2, '17234'),
   (2, '17266'),
   (1, '1436'),
   (1, '40592'),
   (1, '9657'),
   (1, '17217'),
   (1, '14857'),
   (1, '14454'),
   (1, '17213'),
   (1, '11757'),
   (1, '39890'),
   (1, '17197'),
   (1, '26525'),
   (1, '33355'),
   (1, '17242'),
   (1, '39729'),
   (1, '17195'),
   (1, '25372'),
   (1, '40240'),
   (1, '39479'),
   (1, '23476'),
   (1, '17221'),
   (1, '12311'),
   (1, '17189'),
   (1, '17233'),
   (1, '17239'),
   (1, '17175'),
   (1, '35995'),
   (1, '12231'),
   (1, '2902'),
   (1, '17212'),
   (1, '17228'),
   (1, '17229'),
   (1, '1661'),
   (1, '17258'),
   (1, '17181'),
   (1, '2924'),
   (1, '9982'),
   (1, '17582'),
   (1, '31270'),
   (1, '3

In [20]:
# Finally we remove the count and we simply display the friends in order of highest count mutual friends

mutualCount = mutualCount.map(lambda x:(x[0],x[1][:10]))
mutualCount = mutualCount.map(lambda x:(x[0],[int(i[1]) for i in x[1]]))
mutualCount.take(2)

[('33015',
  [33031, 33036, 33026, 33037, 33098, 33052, 33048, 33013, 33029, 33033]),
 ('49878',
  [49876, 24866, 49864, 35596, 35722, 49862, 35593, 13661, 35580, 35671])]

In [21]:
# Here we can see the order is not the same as the one in the file but that is because 
# friends with same count have no specific order and we have capped them at 10

# For example
# User and Counts
# User : [4,3,3,3,3,2,2,2,2,2],2,2,2,2 
# Ones out of 10 cap also may be ignored.

mutualCount.filter(lambda x: x[0] == '11').take(1)

[('11', [27552, 32072, 33192, 27667, 27590, 27600, 27617, 7785, 27620, 27573])]

In [27]:
# Lastly we look at the list of all the recommendations for the given user

recommendedUsersList = ['1211', '9993','19978', '24211', '4774', '2017', '10709', '9206', '24435', '24444']
for l in recommendedUsersList:
    finalOutput = mutualCount.filter(lambda x: x[0] == l).take(1)
    print(finalOutput)

[('1211', [1200, 1214, 1197, 1189, 1184, 1207, 1194, 1170, 1173, 1177])]
[('9993', [9991, 34642, 13134, 34299, 34485, 13877, 13478, 37941])]
[('19978', [20073, 36559, 20078, 4782, 26808, 32892, 45546, 19358, 28722, 45545])]
[('24211', [13572, 24202, 343, 24224, 24213, 24229, 16796, 43249, 24244, 24245])]
[('4774', [367, 13596, 13577, 36559, 385, 12612, 28737, 22621, 14514, 656])]
[('2017', [29276, 4591, 3474, 3770, 3230, 30119, 7402, 42119, 43501, 11544])]
[('10709', [10732, 27312, 13654, 338, 11214, 9553, 30870, 45501, 45453, 12599])]
[('9206', [2036, 6303, 2659, 15843, 7242, 44651, 12344, 15710, 27866, 16643])]
[('24435', [23993, 22633, 40868, 18013, 24432, 23880, 40688, 24428, 7036, 37827])]
[('24444', [24446, 23883, 15843, 7242, 24453, 6545, 40868, 22542, 24433, 24258])]
