# Cluster product titles into product categories

In [8]:
import pandas as pd
import matplotlib.pyplot as plt

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Word2Vec
from pyspark.ml.clustering import KMeans

# Obtain the Spark session for this notebook (an existing one is reused if present).
session_builder = SparkSession.builder.appName('cluster')
spark = session_builder.getOrCreate()

## Read in the 12 datasets

In [2]:
# Columns of the review TSV files, in file order, as (name, Spark type).
# Every column is declared nullable.
_review_columns = [
    ("marketplace", StringType()),
    ("customer_id", StringType()),
    ("review_id", StringType()),
    ("product_id", StringType()),
    ("product_parent", StringType()),
    ("product_title", StringType()),
    ("product_category", StringType()),
    ("star_rating", IntegerType()),
    ("helpful_votes", IntegerType()),
    ("total_votes", IntegerType()),
    ("vine", StringType()),
    ("verified_purchase", StringType()),
    ("review_headline", StringType()),
    ("review_body", StringType()),
    ("review_date", TimestampType()),
]
schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _review_columns]
)

# One TSV file per product category (12 files in total).
path = [
    'archive/amazon_reviews_us_Apparel_v1_00.tsv',
    'archive/amazon_reviews_us_Automotive_v1_00.tsv',
    'archive/amazon_reviews_us_Baby_v1_00.tsv',
    'archive/amazon_reviews_us_Beauty_v1_00.tsv',
    'archive/amazon_reviews_us_Books_v1_02.tsv',
    'archive/amazon_reviews_us_Camera_v1_00.tsv',
    'archive/amazon_reviews_us_Electronics_v1_00.tsv',
    'archive/amazon_reviews_us_Furniture_v1_00.tsv',
    'archive/amazon_reviews_us_Sports_v1_00.tsv',
    'archive/amazon_reviews_us_Grocery_v1_00.tsv',
    'archive/amazon_reviews_us_Personal_Care_Appliances_v1_00.tsv',
    'archive/amazon_reviews_us_Music_v1_00.tsv',
]

# Read all files as a single tab-separated DataFrame with the explicit schema.
# mode='DROPMALFORMED' asks the CSV reader to discard rows it cannot parse.
data = spark.read.csv(
    path,
    sep='\t',
    header=True,
    schema=schema,
    mode='DROPMALFORMED',
)

## Filter out rows with a malformed product category

In [3]:
# The twelve category labels that are accepted as valid values of `product_category`.
product_category = [
    'Sports', 'Baby', 'Apparel', 'Grocery', 'Electronics', 'Automotive',
    'Books', 'Music', 'Furniture', 'Personal_Care_Appliances', 'Camera', 'Beauty',
]
# Keep only the rows whose category is one of the accepted labels.
has_known_category = data['product_category'].isin(product_category)
data_filter = data.where(has_known_category)

data_filter.count()

37172391

## Select only title and category columns

In [4]:
# Project down to the two columns the clustering experiment needs.
title = data_filter.select(['product_title', 'product_category'])

In [5]:
# Preview the first 20 rows, with long strings truncated.
title.show(n=20, truncate=True)

+--------------------+----------------+
|       product_title|product_category|
+--------------------+----------------+
|Easy Tool Stainle...|         Apparel|
|V28 Women Cowl Ne...|         Apparel|
|James Fiallo Men'...|         Apparel|
|Belfry Gangster 1...|         Apparel|
|JAEDEN Women's Be...|         Apparel|
|Levi's Boys' 514 ...|         Apparel|
|Minimalist Wallet...|         Apparel|
|Harriton Men's Ba...|         Apparel|
|Jockey Women's Un...|         Apparel|
|Alexander Del Ros...|         Apparel|
|Jockey Women's Un...|         Apparel|
|Bali Passion For ...|         Apparel|
|Lilyette Women's ...|         Apparel|
|Vanity Fair Women...|         Apparel|
|Warner's Women's ...|         Apparel|
|Columbia Women's ...|         Apparel|
|Robes King RK Cla...|         Apparel|
|VIV Collection Be...|         Apparel|
|Fruit of the Loom...|         Apparel|
|Jockey Scrubs Wom...|         Apparel|
+--------------------+----------------+
only showing top 20 rows



## Tokenize title column

In [6]:
# Turn each product title into a list of tokens stored in a new `words` column.
tokenizer = Tokenizer().setInputCol("product_title").setOutputCol("words")
wordsData = tokenizer.transform(title)
wordsData.show(n=20, truncate=True)

+--------------------+----------------+--------------------+
|       product_title|product_category|               words|
+--------------------+----------------+--------------------+
|Easy Tool Stainle...|         Apparel|[easy, tool, stai...|
|V28 Women Cowl Ne...|         Apparel|[v28, women, cowl...|
|James Fiallo Men'...|         Apparel|[james, fiallo, m...|
|Belfry Gangster 1...|         Apparel|[belfry, gangster...|
|JAEDEN Women's Be...|         Apparel|[jaeden, women's,...|
|Levi's Boys' 514 ...|         Apparel|[levi's, boys', 5...|
|Minimalist Wallet...|         Apparel|[minimalist, wall...|
|Harriton Men's Ba...|         Apparel|[harriton, men's,...|
|Jockey Women's Un...|         Apparel|[jockey, women's,...|
|Alexander Del Ros...|         Apparel|[alexander, del, ...|
|Jockey Women's Un...|         Apparel|[jockey, women's,...|
|Bali Passion For ...|         Apparel|[bali, passion, f...|
|Lilyette Women's ...|         Apparel|[lilyette, women'...|
|Vanity Fair Women...|  

## Learn a mapping from words to Vectors.

In [None]:
# Fit a Word2Vec model on the tokenized titles. Each word is embedded in
# 3 dimensions, and minCount=0 keeps every token regardless of frequency.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="words", outputCol="features")
model = word2Vec.fit(wordsData)

# Add a `features` vector column to every row.
result = model.transform(wordsData)

# Persist the result so later sessions can reload it instead of refitting.
# Overwrite mode is used because the default mode raises when the path
# already exists, which would make this cell fail on every re-run.
result.write.mode("overwrite").save("word2vec.parquet")

In [3]:
# Reload the saved Word2Vec output from disk, so the following cells can run
# in a session where the Word2Vec cell above was not executed. No format is
# passed, so the session's default data source format is used.
result = spark.read.load("word2vec.parquet")

In [3]:
# Preview the first 20 reloaded rows, with long values truncated.
result.show(n=20, truncate=True)

+--------------------+----------------+--------------------+--------------------+
|       product_title|product_category|               words|            features|
+--------------------+----------------+--------------------+--------------------+
|DC Sports Muffler...|      Automotive|[dc, sports, muff...|[0.32148125022649...|
|Thrush 17713 Turb...|      Automotive|[thrush, 17713, t...|[0.18454607890453...|
|Max Reflector Jum...|      Automotive|[max, reflector, ...|[-0.0210069790482...|
|40 Inch LED Light...|      Automotive|[40, inch, led, l...|[0.11790505796670...|
|9 MOON Matt Multi...|      Automotive|[9, moon, matt, m...|[0.03611409006749...|
|Rampage Jeep 7689...|      Automotive|[rampage, jeep, 7...|[-0.1092054396867...|
|Ameritree Mazda C...|      Automotive|[ameritree, mazda...|[0.11809038201018...|
|Dorman 425-176 In...|      Automotive|[dorman, 425-176,...|[0.47730810344219...|
|Dupli-Color Ceram...|      Automotive|[dupli-color, cer...|[-0.1952677858727...|
|4-1/2" Dent Pul

In [4]:
# Sanity check: the reloaded frame should have the same number of rows as the
# filtered input (data_filter.count() earlier in the notebook).
result.count()

37172391

## Train a k-means model

In [4]:
# Configure k-means on the `features` vectors with 12 clusters, then fit it.
# NOTE: `model` is rebound here; it previously held the Word2Vec model.
kmeans = KMeans().setFeaturesCol('features').setK(12)
model = kmeans.fit(result)

## Evaluate clustering by computing Within Set Sum of Squared Errors

In [5]:
# Training cost reported by the fitted k-means model's summary.
wssse = model.summary.trainingCost
print(wssse)
print(f"Within Set Sum of Squared Errors = {wssse}")

4398708.45345352
Within Set Sum of Squared Errors = 4398708.45345352


## Show the center of each cluster

In [6]:
# Print a header line followed by one cluster center per line.
centers = model.clusterCenters()
print("Cluster Centers: ", *centers, sep="\n")

Cluster Centers: 
[-0.32765864  0.74853989 -0.39707047]
[0.54868088 0.48256734 0.71603177]
[0.40245698 0.10310091 0.06635891]
[-0.12326375 -0.05573605 -0.19242188]
[ 0.36637293 -0.07801183 -0.55128742]
[-0.43530421  0.04072187  0.22672879]
[-0.29452914  0.39293028 -0.18468668]
[-0.20184139  0.33606862 -0.68807266]
[ 0.97958788 -0.16640928  0.18653521]
[-0.31947199 -0.30975264 -0.45372551]
[-0.45849956  0.68213419  0.25643467]
[ 0.20469939 -0.67662189 -0.17404804]


## Get the clustering prediction

In [7]:
# Assign every row to a cluster, then keep only the `prediction` column.
assigned = model.transform(result)
pred = assigned.select('prediction')
pred.show(n=20, truncate=True)

+----------+
|prediction|
+----------+
|        11|
|        11|
|         3|
|         2|
|         3|
|         3|
|        11|
|        11|
|         3|
|         3|
|         3|
|         9|
|        11|
|        11|
|         4|
|         3|
|         3|
|         6|
|        11|
|        11|
+----------+
only showing top 20 rows



In [7]:
# Sanity check: there should be one prediction per row of `result`.
pred.count()

37172391

## Calculate the clustering error within each group

In [12]:
# Attach the cluster label to each row by running the k-means model on the
# rows themselves. The previous approach generated monotonically_increasing_id()
# independently on `result` and on `pred` and joined on those ids. The ids are
# non-deterministic and depend on partitioning, so that join is not guaranteed
# to pair a row with its own prediction.
#
# `id1` and `id2` are kept only so that the saved schema matches earlier
# versions of cluster.parquet; both hold the same row id.
df1 = result.withColumn("id1", monotonically_increasing_id())
df = model.transform(df1).withColumn("id2", col("id1"))

# Save the labelled rows to disk. Overwrite mode is used because the default
# mode raises when the path already exists, which would break re-runs.
df.write.mode("overwrite").save("cluster.parquet")

In [2]:
# Reload the saved clustering result from disk, so the following cells can run
# in a session where the clustering cells above were not executed. No format
# is passed, so the session's default data source format is used.
df = spark.read.load("cluster.parquet")

In [3]:
# Count the rows in each (true category, predicted cluster) pair.
by_category_and_cluster = df.groupBy('product_category', 'prediction')
count_df = by_category_and_cluster.count()
count_df.show(n=20, truncate=True)

+--------------------+----------+-------+
|    product_category|prediction|  count|
+--------------------+----------+-------+
|               Books|         3| 125667|
|                Baby|         5| 211694|
|             Apparel|         6| 459202|
|             Apparel|        11| 133057|
|              Camera|         3| 164032|
|              Beauty|         8|    141|
|              Sports|         9| 923045|
|Personal_Care_App...|         5|  17327|
|              Camera|         8| 302538|
|           Furniture|        11|  38229|
|          Automotive|         1|    897|
|           Furniture|         8|    264|
|              Beauty|         3| 256088|
|           Furniture|         2|   7821|
|               Music|         6|1544974|
|             Apparel|         0|   6511|
|          Automotive|         7|  41123|
|          Automotive|         6|  40291|
|Personal_Care_App...|         2|   8240|
|          Automotive|        11|1782192|
+--------------------+----------+-

In [4]:
# Order the contingency counts by category and then by cluster id, and print
# every row without truncating the cell contents.
sort_count_df = count_df.orderBy('product_category', 'prediction')
n_pairs = sort_count_df.count()
sort_count_df.show(n=n_pairs, truncate=False)

+------------------------+----------+-------+
|product_category        |prediction|count  |
+------------------------+----------+-------+
|Apparel                 |0         |6511   |
|Apparel                 |1         |27     |
|Apparel                 |2         |6138   |
|Apparel                 |3         |1796500|
|Apparel                 |4         |73920  |
|Apparel                 |5         |286827 |
|Apparel                 |6         |459202 |
|Apparel                 |7         |281188 |
|Apparel                 |8         |234    |
|Apparel                 |9         |2859932|
|Apparel                 |10        |2786   |
|Apparel                 |11        |133057 |
|Automotive              |0         |502    |
|Automotive              |1         |897    |
|Automotive              |2         |298749 |
|Automotive              |3         |786269 |
|Automotive              |4         |165832 |
|Automotive              |5         |72668  |
|Automotive              |6       

In [13]:
# For every category, find the size of its largest cluster.
max_count = sort_count_df.groupBy('product_category').max('count')
mc = max_count.withColumnRenamed('max(count)', 'max_c')

# Look up which cluster achieves that maximum for each category.
# - The count column is referenced by name: `sort_count_df.count` is the
#   DataFrame.count method, not the `count` column, so using it in the join
#   condition raises an error.
# - The category is part of the condition so that a maximum is only matched
#   against clusters of the same category.
# - Both sides derive from the same DataFrame, so they are aliased to keep the
#   column references unambiguous.
top = mc.alias('top')
counts = sort_count_df.alias('counts')
top.join(
    counts,
    (col('top.product_category') == col('counts.product_category'))
    & (col('top.max_c') == col('counts.count')),
    'inner',
).show()

In [20]:
# Show the largest per-category cluster size (first 20 rows, truncated cells).
mc.show(n=20, truncate=True)

+--------------------+-------+
|    product_category|  max_c|
+--------------------+-------+
|              Sports|1501059|
|                Baby| 521556|
|             Apparel|2859932|
|             Grocery| 846514|
|         Electronics|1565742|
|          Automotive|1782192|
|               Books|1178054|
|               Music|1544974|
|           Furniture| 386240|
|Personal_Care_App...|  22766|
|              Camera|1049634|
|              Beauty|2316860|
+--------------------+-------+



In [22]:
# For each category, show the cluster whose count equals that category's
# maximum. The category is part of the join condition: matching on the count
# alone could pair a category's maximum with a cluster of another category
# that happens to have the same count. Both sides derive from the same
# DataFrame, so they are aliased to keep the column references unambiguous.
mc.alias('top').join(
    sort_count_df.alias('counts'),
    (col('top.product_category') == col('counts.product_category'))
    & (col('top.max_c') == col('counts.count')),
).show()

+--------------------+-------+--------------------+----------+-------+
|    product_category|  max_c|    product_category|prediction|  count|
+--------------------+-------+--------------------+----------+-------+
|                Baby| 521556|                Baby|         3| 521556|
|              Sports|1501059|              Sports|         4|1501059|
|               Books|1178054|               Books|         0|1178054|
|              Camera|1049634|              Camera|         2|1049634|
|               Music|1544974|               Music|         6|1544974|
|Personal_Care_App...|  22766|Personal_Care_App...|         3|  22766|
|              Beauty|2316860|              Beauty|        10|2316860|
|         Electronics|1565742|         Electronics|         2|1565742|
|             Apparel|2859932|             Apparel|         9|2859932|
|          Automotive|1782192|          Automotive|        11|1782192|
|           Furniture| 386240|           Furniture|         3| 386240|
|     