In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
spark = SparkSession.builder.enableHiveSupport().appName('AmazonReviewRcommender').getOrCreate()
sc = spark.sparkContext

### Load kcore_5.json

In [4]:
# Load kcore_5.json
df = spark.read.json("/user/ywang27/data_final/kore_5.json")

In [5]:
df.show(2)

+----------+-------+-------+--------------------+-----------+--------------+------------+--------------------+--------------+
|      asin|helpful|overall|          reviewText| reviewTime|    reviewerID|reviewerName|             summary|unixReviewTime|
+----------+-------+-------+--------------------+-----------+--------------+------------+--------------------+--------------+
|0000013714| [0, 0]|    4.0|We use this type ...| 12 3, 2013| ACNGUPJ3A3TM9|         GCM|         Nice Hymnal|    1386028800|
|0000013714| [2, 3]|    5.0|I bought this for...|09 13, 2009|A2SUAM1J3GNN3B| J. McDonald|Heavenly Highway ...|    1252800000|
+----------+-------+-------+--------------------+-----------+--------------+------------+--------------------+--------------+
only showing top 2 rows



In [6]:
df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



### Write to Hive

In [15]:
from pyspark.sql import HiveContext
hive_context = HiveContext(sc)

In [22]:
hive_context.sql("use ywang27")
hive_context.sql("show tables").show()
hive_context.sql("drop table if exists kcore_5")

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| ywang27|      chicago_crimes|      false|
| ywang27|  chicago_crimes_orc|      false|
| ywang27|chicago_crimes_sm...|      false|
| ywang27|chicago_crimes_sm...|      false|
| ywang27|             kcore_5|      false|
+--------+--------------------+-----------+



DataFrame[]

In [23]:
df.write.mode('overwrite').saveAsTable('kcore_5')

In [25]:
hive_context.sql("select count(*) from kcore_5").show()

+--------+
|count(1)|
+--------+
|41135700|
+--------+



### Load metadata.json

In [32]:
df1 = spark.read.json("/user/ywang27/data_final/metadata/metadata.json")

In [33]:
df1.show(2)

+---------------+----------+-----+--------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+
|_corrupt_record|      asin|brand|          categories|         description|               imUrl|price|             related|           salesRank|               title|
+---------------+----------+-----+--------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+
|           null|0001048791| null|           [[Books]]|                null|http://ecx.images...| null|                null|[,,,,, 6334800,,,...|The Crucible: Per...|
|           null|0000143561| null|[[Movies & TV, Mo...|3Pack DVD set - I...|http://g-ecx.imag...|12.99|[, [B0036FO6SI, B...|[,,,,,,,,,,,,,,,,...|Everyday Italian ...|
+---------------+----------+-----+--------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------

In [34]:
df1.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- description: string (nullable = true)
 |-- imUrl: string (nullable = true)
 |-- price: double (nullable = true)
 |-- related: struct (nullable = true)
 |    |-- also_bought: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- also_viewed: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- bought_together: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- buy_after_viewing: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- salesRank: struct (nullable = true)
 |    |-- Appliances: long (nullable = true)
 |    |-- Arts, Crafts & Sewing: long (nullable = true)
 |    |-- Automotive: long

In [35]:
hive_context.sql("drop table if exists metadata")

DataFrame[]

In [41]:
df1.rdd.getNumPartitions()

32

In [38]:
df1 = df1.repartition(32)

### Join metadata with kcore_5

In [44]:
df2 = df.join(df1, df.asin == df1.asin, "inner")

In [45]:
df2.show(5)

+----------+-------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+---------------+----------+-----+----------+-----------+--------------------+-----+--------------------+--------------------+---------------+
|      asin|helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|_corrupt_record|      asin|brand|categories|description|               imUrl|price|             related|           salesRank|          title|
+----------+-------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+---------------+----------+-----+----------+-----------+--------------------+-----+--------------------+--------------------+---------------+
|0002216973| [1, 1]|    5.0|And to me, there'...| 09 3, 2012| AESMLAZX4PI6L|      Annetta Ribken|This is a love st...|    1346630400|           null|0002216973| null| [[Books]

In [46]:
df2.count()

33282097