In [32]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [3]:
!hdfs dfs -mkdir data

In [4]:
# List the default HDFS directory (no path argument is given).
!hdfs dfs -ls

Found 2 items
drwxr-xr-x   - root hadoop          0 2022-05-27 15:27 .sparkStaging
drwxr-xr-x   - root hadoop          0 2022-05-27 15:29 data


In [11]:
!hdfs dfs -put home/asin_keywords.csv asin_keywords.csv

In [14]:
# List the default HDFS directory to confirm the uploaded files are present.
!hdfs dfs -ls 

Found 4 items
drwxr-xr-x   - root hadoop          0 2022-05-31 23:32 .sparkStaging
-rw-r--r--   2 root hadoop      87971 2022-05-31 23:38 asin_keywords.csv
-rw-r--r--   2 root hadoop      87971 2022-05-31 23:37 data
-rw-r--r--   2 root hadoop     338417 2022-05-31 23:39 reviewerID_keywords.csv


In [13]:
!hdfs dfs -put home/reviewerID_keywords.csv reviewerID_keywords.csv

In [4]:
# List the local `home` directory that holds the CSV files uploaded to HDFS.
!ls home

asin_keywords.csv  dataproc  reviewerID_keywords.csv


In [49]:
# Download the gzipped Amazon product metadata dump from the SNAP server
# (about 3.1 GB according to the wget log below).
!wget http://snap.stanford.edu/data/amazon/productGraph/metadata.json.gz

--2022-06-01 00:02:49--  http://snap.stanford.edu/data/amazon/productGraph/metadata.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3358565493 (3.1G) [application/x-gzip]
Saving to: ‘metadata.json.gz’


2022-06-01 00:04:56 (25.3 MB/s) - ‘metadata.json.gz’ saved [3358565493/3358565493]



In [51]:
# Decompress the downloaded archive; the next cell uploads the resulting metadata.json.
!gunzip metadata.json.gz

In [54]:
!hdfs dfs -put metadata.json metadata.json

In [55]:
# List the default HDFS directory to confirm metadata.json was uploaded.
!hdfs dfs -ls

Found 5 items
drwxr-xr-x   - root hadoop           0 2022-05-31 23:32 .sparkStaging
-rw-r--r--   2 root hadoop       87971 2022-05-31 23:38 asin_keywords.csv
-rw-r--r--   2 root hadoop       87971 2022-05-31 23:37 data
-rw-r--r--   2 root hadoop 10544467811 2022-06-01 00:10 metadata.json
-rw-r--r--   2 root hadoop      338417 2022-05-31 23:39 reviewerID_keywords.csv


In [23]:
from pyspark.sql import functions as F


In [65]:
# Load the ASIN keyword table. No header or schema options are passed, so the
# columns are named _c0, _c1, _c2 and are all strings (see the schema printed below).
df = spark.read.csv("asin_keywords.csv")

In [66]:
# Keep only the rows whose first column is non-blank once whitespace is trimmed.
df = df.where("trim(_c0) != ''")

In [67]:
# Inspect the column names and types of the loaded table.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



In [68]:
# Give the two payload columns meaningful names: _c1 -> id, _c2 -> kw.
df = (
    df.withColumnRenamed('_c1', 'id')
    .withColumnRenamed('_c2', 'kw')
)

In [69]:
# Drop the _c0 column and remove brackets, single quotes and spaces from the
# keyword string, leaving a plain comma-separated list of words.
df = df.select(
    df["id"],
    F.translate(df["kw"], "[]'' ", '').alias('kw'),
)

In [70]:
# Preview the first rows of the cleaned (id, kw) table.
df.show(5)

+----------+--------------------+
|        id|                  kw|
+----------+--------------------+
|0684871408|role,learning,app...|
|1597486310|creepy,quaint,pro...|
|1598692585|light,liked,readi...|
|B00EVDG7HW|enjoyed,talented,...|
|B00KHTMFDE|copy,free,receive...|
+----------+--------------------+
only showing top 5 rows



In [71]:
# Build an inverted index: split each keyword string on commas, emit one row
# per (id, keyword) pair, then gather the ids that share each keyword.
df = (
    df.select("id", F.explode(F.split(df.kw, ',')).alias("kw"))
    .groupBy("kw")
    .agg(F.collect_list("id").alias("id"))
)

In [76]:
# Spot-check the inverted index: show five keywords starting with "a" together
# with their full (untruncated) id lists.
df.filter(df.kw.startswith('a')).show(5, truncate=False)

+-------+------------------------------------------------------------+
|kw     |id                                                          |
+-------+------------------------------------------------------------+
|abandon|[0553370758]                                                |
|abba   |[1414318677]                                                |
|able   |[096798176X, 1583551972, B00HUACEIY, 0425183440, 0684871408]|
|abraham|[0934893543]                                                |
|access |[0073524581, 1934148091]                                    |
+-------+------------------------------------------------------------+
only showing top 5 rows



In [33]:
# Load the reviewer keyword table. No header or schema options are passed, so
# the columns get the default _c0, _c1, _c2 names used by the next cell.
df_user = spark.read.csv("reviewerID_keywords.csv")

In [35]:
# Apply the same clean-up as for the ASIN table: drop rows with a blank first
# column, rename the payload columns, and strip the list punctuation from the
# keyword string.
df_user = (
    df_user
    .filter("trim(_c0) != ''")
    .withColumnRenamed('_c1', 'id')
    .withColumnRenamed('_c2', 'kw')
    .select("id", F.translate(F.col("kw"), "[]'' ", '').alias('kw'))
)

In [36]:
# Preview the cleaned reviewer (id, kw) rows.
df_user.show(5)

+--------------+--------------------+
|            id|                  kw|
+--------------+--------------------+
|A142DRNE2KWF3N|john,grisham,man,...|
|A1AXNELEE02O08|expected,admit,li...|
|A1AYSS3R6K58VZ|box,example,think...|
+--------------+--------------------+
only showing top 3 rows



In [37]:
# Reviewer id whose keyword row is looked up in df_user in the next cell.
user_id = 'A142DRNE2KWF3N'

In [38]:
# Fetch the keyword string for `user_id` and split it into a list of words.
# first() returns None when no row matches, so an unknown reviewer fails with a
# clear message instead of an IndexError on an empty collect() result.
user_row = df_user.filter(df_user.id == user_id).select("kw").first()
if user_row is None:
    raise ValueError(f"No keyword row found for reviewer {user_id!r}")

# A null or empty keyword string yields [] rather than an AttributeError or [''];
# empty tokens are dropped so they are never used as lookup keys.
kw = [word for word in (user_row.kw or "").split(',') if word]

In [41]:
# Display the keyword list of the selected reviewer.
kw

['john',
 'grisham',
 'man',
 'read',
 'life',
 'love',
 'loved',
 'need',
 'price',
 'reading',
 'kindle',
 'really',
 'recommend',
 'say',
 'story',
 'thoughtful',
 'times',
 'worth',
 'knew',
 'available']

In [43]:
# Count, for every id in the inverted index `df`, how many of the reviewer's
# keywords it is listed under.
#
# The index is queried once for all keywords (a single Spark job) instead of
# running one filter + collect per keyword inside the Python loop.
matches = df.filter(df.kw.isin(kw)).select("kw", "id").collect() if kw else []
products_by_kw = {row.kw: row.id for row in matches}

# Walk the keywords in their original order so the dict insertion order, and
# with it the tie-breaking of any later stable sort, is the same as with the
# per-keyword lookups.
product_time = {}
for w in kw:
    for p in products_by_kw.get(w, []):
        product_time[p] = product_time.get(p, 0) + 1

0934893543
1472209621
B00GW6NLP2
B00BPYTNNK
0964668432
B0090GN6UG
B00DJDLZJK
0983234124
0890815372
0199734178
0803218842
B00DQ6PEA6
B00EG0GXJC
0471289280
0312050577
0911977066
193042972X
0312940777
0818406402
B00IXCM4YW
B0076Q8R4C
0786028432
0875168086
0768422531
0736919074
0060283181
1400065062
B00GW6NLP2
1616380268
174059200X
B00768D6DC
0749009918
B00D06YI68
B00EMMW6EU
B00D6NDMLC
1490585907
B0096FR6ES
0735652163
1414318677
1420100009
B00DVRB81O
0373790562
B00K2RCVXK
1599559439
B00C50N1J0
1589230760
157560468X
0916941817
0865717087
1411637682
1468191241
0440378648
0671743813
1440407991
B00EQBAMCK
156512281X
0077346238
B0053PV5H4
0553370758
1843583615
0345499077
0553570951
B00CFQY0JY
0142413046
B00BB002II
0399254617
B008LMLMRY
B000JMKZEA
B003OICBM0
0789326582
B00E4WTJS4
0615825842
0670060682
0786158077
B00KCT1OYU
1500240532
0071786198
080914610X
B007SIGFIS
0425183440
1936120224
0345484827
098337757X
B00KX6FJES
B008AE2QNM
0073193534
1604596910
B0060ZFRQG
B004O0U3P4
B004UJJ2O2
1430325127

In [44]:
# Rank the ids by their match count, highest first. sorted() is stable (also
# with reverse=True), so ids with equal counts keep their insertion order.
product = sorted(product_time, key=product_time.get, reverse=True)

In [46]:
# Display the per-id match counts.
product_time

{'0934893543': 1,
 '1472209621': 1,
 'B00GW6NLP2': 4,
 'B00BPYTNNK': 4,
 '0964668432': 3,
 'B0090GN6UG': 4,
 'B00DJDLZJK': 3,
 '0983234124': 3,
 '0890815372': 3,
 '0199734178': 3,
 '0803218842': 3,
 'B00DQ6PEA6': 1,
 'B00EG0GXJC': 5,
 '0471289280': 4,
 '0312050577': 1,
 '0911977066': 1,
 '193042972X': 3,
 '0312940777': 1,
 '0818406402': 3,
 'B00IXCM4YW': 2,
 'B0076Q8R4C': 3,
 '0786028432': 2,
 '0875168086': 2,
 '0768422531': 2,
 '0736919074': 2,
 '0060283181': 4,
 '1400065062': 1,
 '1616380268': 2,
 '174059200X': 1,
 'B00768D6DC': 4,
 '0749009918': 1,
 'B00D06YI68': 4,
 'B00EMMW6EU': 4,
 'B00D6NDMLC': 2,
 '1490585907': 1,
 'B0096FR6ES': 2,
 '0735652163': 2,
 '1414318677': 1,
 '1420100009': 3,
 'B00DVRB81O': 4,
 '0373790562': 4,
 'B00K2RCVXK': 3,
 '1599559439': 3,
 'B00C50N1J0': 2,
 '1589230760': 4,
 '157560468X': 2,
 '0916941817': 4,
 '0865717087': 1,
 '1411637682': 1,
 '1468191241': 2,
 '0440378648': 4,
 '0671743813': 1,
 '1440407991': 4,
 'B00EQBAMCK': 2,
 '156512281X': 1,
 '00773462

In [47]:
# Show the ten highest-ranked ids.
product[:10]

['B00EG0GXJC',
 'B00GW6NLP2',
 'B00BPYTNNK',
 'B0090GN6UG',
 '0471289280',
 '0060283181',
 'B00768D6DC',
 'B00D06YI68',
 'B00EMMW6EU',
 'B00DVRB81O']

In [57]:
# Load the product metadata uploaded to HDFS. No schema is passed here; the
# preview below shows the resulting columns, including _corrupt_record.
df_products = spark.read.json("metadata.json")

22/06/01 00:11:22 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [58]:
# Preview two rows of the product metadata.
df_products.show(2)

+---------------+----------+-----+--------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+
|_corrupt_record|      asin|brand|          categories|         description|               imUrl|price|             related|           salesRank|               title|
+---------------+----------+-----+--------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+
|           null|0001048791| null|           [[Books]]|                null|http://ecx.images...| null|                null|{null, null, null...|The Crucible: Per...|
|           null|0000143561| null|[[Movies & TV, Mo...|3Pack DVD set - I...|http://g-ecx.imag...|12.99|{null, [B0036FO6S...|{null, null, null...|Everyday Italian ...|
+---------------+----------+-----+--------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------

In [62]:
# Look up the "related" struct of the top-ranked id instead of a hard-coded
# ASIN, so this cell stays in sync with `product` when `user_id` changes.
# For the reviewer analysed above, product[0] is 'B00EG0GXJC'.
top_asin = product[0]
df_products.filter(df_products.asin == top_asin).select("related").collect()

                                                                                

[Row(related=Row(also_bought=['B00L1AWIWG', 'B00GVKXZXW', 'B00KLGZQWK', 'B00LK0NCO0', 'B00HMT7XV6', 'B00BTFYHWC', 'B00FV4I0SE', 'B00L3KTBBU', 'B007OPIK06', 'B00G2HUWPS', 'B00838YM8C', 'B00FVJLZE0', 'B00F01WMK0', 'B00GEEB15Y', 'B00E251RHY', 'B00DRIT346', 'B00CW6FNXO', 'B00C0B7BYK', 'B00AZZDKFW'], also_viewed=None, bought_together=None, buy_after_viewing=['B00838YM8C', 'B0096IQEZM', 'B00BTFYHWC']))]