# Set up

In [241]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

openjdk-8-jdk-headless is already the newest version (8u242-b08-0ubuntu3~18.04).
0 upgraded, 0 newly installed, 0 to remove and 25 not upgraded.


In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql.types import *
import pandas as pd
from operator import add


# create the Spark Session
spark = SparkSession.builder.getOrCreate()

# create the Spark Context
sc = spark.sparkContext

In [0]:
id='1-kKDZ--SDPgClLDT3W6wjogxexzYnIMb'
downloaded = drive.CreateFile({'id': id}) 
downloaded.GetContentFile('soc-LiveJournal1Adj.txt')

In [0]:
data_file = sc.textFile("soc-LiveJournal1Adj.txt")

In [0]:
schema = StructType([
  StructField("user", StringType()),
  StructField("friends", StringType()),
])

In [0]:
 data = spark.read\
        .option('delimiter', '\t')\
        .csv(data_file, schema=schema)

In [248]:
data.show()

+----+--------------------+
|user|             friends|
+----+--------------------+
|   0|1,2,3,4,5,6,7,8,9...|
|   1|0,5,20,135,2409,8...|
|   2|0,117,135,1220,27...|
|   3|0,12,41,55,1532,1...|
|   4|0,8,14,15,18,27,7...|
|   5|0,1,20,2022,22939...|
|   6|0,21,98,2203,3238...|
|   7|0,31993,40218,404...|
|   8|0,4,38,46,72,85,2...|
|   9|  0,6085,18972,19269|
|  10|0,12,16,30,6027,1...|
|  11|0,1754,6027,7789,...|
|  12|0,3,10,16,29,38,4...|
|  13|0,12584,32064,27,...|
|  14|0,4,19,19079,4269...|
|  15|           0,4,27,80|
|  16|0,10,12,18,30,38,...|
|  17|0,19,26,28,95,128...|
|  18|0,4,16,30,89,2406...|
|  19|0,14,17,439,1100,...|
+----+--------------------+
only showing top 20 rows



# Find list recommendation


In [0]:
def find_recommendation(id_us):
  list_friend = data.filter("user = '{}'".format(id_us)).collect()[0]['friends'].split(',')
  friend_may_know_table = data.select('friends').filter(col('user').isin(list_friend))
  friend_may_know_ls = friend_may_know_table.rdd.flatMap(lambda x: x).flatMap(lambda x: x.split(','))
  remove_knew_already = friend_may_know_ls.filter(lambda x: not x in list_friend).filter(lambda x: x != str(id_us))
  set_relation = remove_knew_already.map(lambda x: (x, 1))
  count_relation = set_relation.reduceByKey(add)
  sort = count_relation.map(lambda x: (x[1], x[0])).takeOrdered(10, lambda x: (-x[0], x[1]))
  result = [i[1] for i in sort]
  return result

In [0]:
ls_quest = [924, 8941, 8942, 9019, 9020, 9021, 9022, 9990, 9992, 9993]
result_ls = dict()

for user in ls_quest:
  result_ls[user] = find_recommendation(user)


In [253]:
print(result_ls)

{924: ['11860', '15416', '2409', '43748', '439', '45881', '6995'], 8941: ['8943', '8944', '8940'], 8942: ['8939', '8940', '8943', '8944'], 9019: ['9022', '317', '9023'], 9020: ['9021', '9016', '9017', '9022', '317', '9023'], 9021: ['9020', '9016', '9017', '9022', '317', '9023'], 9022: ['9019', '9020', '9021', '317', '9016', '9017', '9023'], 9990: ['13134', '13478', '13877', '34299', '34485', '34642', '37941'], 9992: ['9987', '9989', '35667', '9991'], 9993: ['9991', '13134', '13478', '13877', '34299', '34485', '34642', '37941']}
