In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Word2Vec
from pyspark.ml.clustering import KMeans

# Create the Spark session (or reuse one that is already running) that every
# later cell reads from.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [2]:
# Column layout of the review TSV files, in file order. Every column is
# declared nullable when the schema is assembled below.
review_columns = [
    ("marketplace",       StringType()),
    ("customer_id",       StringType()),
    ("review_id",         StringType()),
    ("product_id",        StringType()),
    ("product_parent",    StringType()),
    ("product_title",     StringType()),
    ("product_category",  StringType()),
    ("star_rating",       IntegerType()),
    ("helpful_votes",     IntegerType()),
    ("total_votes",       IntegerType()),
    ("vine",              StringType()),
    ("verified_purchase", StringType()),
    ("review_headline",   StringType()),
    ("review_body",       StringType()),
    ("review_date",       TimestampType()),
]
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in review_columns
])

# One TSV file per product category, all read into a single DataFrame.
path = [
    'archive/amazon_reviews_us_Apparel_v1_00.tsv',
    'archive/amazon_reviews_us_Automotive_v1_00.tsv',
    'archive/amazon_reviews_us_Baby_v1_00.tsv',
    'archive/amazon_reviews_us_Beauty_v1_00.tsv',
    'archive/amazon_reviews_us_Books_v1_02.tsv',
    'archive/amazon_reviews_us_Camera_v1_00.tsv',
    'archive/amazon_reviews_us_Electronics_v1_00.tsv',
    'archive/amazon_reviews_us_Furniture_v1_00.tsv',
    'archive/amazon_reviews_us_Sports_v1_00.tsv',
    'archive/amazon_reviews_us_Grocery_v1_00.tsv',
    'archive/amazon_reviews_us_Personal_Care_Appliances_v1_00.tsv',
    'archive/amazon_reviews_us_Music_v1_00.tsv',
]

# Tab-separated with a header row; mode='DROPMALFORMED' discards rows that do
# not fit the schema instead of failing the read.
data = spark.read.csv(
    path,
    sep='\t',
    header=True,
    schema=schema,
    mode='DROPMALFORMED',
)

# Filter product categories

In [3]:
# Categories to keep; rows whose product_category is not in this list are dropped.
product_category = [
    'Sports', 'Baby', 'Apparel', 'Grocery', 'Electronics', 'Automotive',
    'Books', 'Music', 'Furniture', 'Personal_Care_Appliances', 'Camera', 'Beauty',
]
data_filter = data.where(col('product_category').isin(product_category))

# Number of rows that survive the category filter.
data_filter.count()

182530

# Select product info columns

In [22]:
# Keep only the columns that describe the product itself.
# NOTE(review): the input appears to hold one row per review, so a product with
# several reviews would show up several times here — confirm whether the rows
# should be de-duplicated before the similarity steps.
product_columns = ['product_id', 'product_parent', 'product_title', 'product_category']
product_info = data_filter.select(*product_columns)
product_info.show(n=5, truncate=False)

+----------+--------------+---------------------------------------------------------------------------+----------------+
|product_id|product_parent|product_title                                                              |product_category|
+----------+--------------+---------------------------------------------------------------------------+----------------+
|B00OZU7BOY|457541908     |Ladies Knee High 3 Pack Fairisle Design Thermal Socks Size 4-7             |Apparel         |
|B00OZU73D8|452184423     |Texere Women's Bamboo Yoga Capri Pants (Atalanta) Luxury Workout Clothing  |Apparel         |
|B00OZU715I|452184423     |Texere Women's Bamboo Yoga Capri Pants (Atalanta) Luxury Workout Clothing  |Apparel         |
|B00OZU6YJC|452184423     |Texere Women's Bamboo Yoga Capri Pants (Atalanta) Luxury Workout Clothing  |Apparel         |
|B00OZU6VD6|138895673     |LOCOMO Men Women Winter Warm Corduroy Folding Ear Flap Warmer Cap FFH222BLK|Apparel         |
+----------+--------------+-----

# Tokenize each column

In [23]:
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover, MinHashLSH, NGram, RegexTokenizer
# One tokenizer per text column; both use the regex \W (non-word characters)
# as their pattern.
regexTokenizer_title = RegexTokenizer(
    pattern="\\W",
    inputCol='product_title',
    outputCol="token_title",
)
regexTokenizer_category = RegexTokenizer(
    pattern="\\W",
    inputCol='product_category',
    outputCol="token_category",
)

# Apply them one after the other so the final frame carries both token columns.
tokenized_title = regexTokenizer_title.transform(product_info)
tokenized_all = regexTokenizer_category.transform(tokenized_title)

# tokenized_all.show(5, truncate=False)

# Stop word removal

In [24]:
# Stop-word filters for the title tokens and the category tokens.
remove_stop_title = StopWordsRemover(
    inputCol='token_title',
    outputCol='nostop_title',
)
remove_stop_cat = StopWordsRemover(
    inputCol='token_category',
    outputCol='nostop_cat',
)

# Chain the two filters; the result holds both "nostop_*" columns.
stop_removed_title = remove_stop_title.transform(tokenized_all)
stop_removed_all = remove_stop_cat.transform(stop_removed_title)

# stop_removed_all.show(5,truncate=False)

# Split words into characters

In [25]:
# Join the filtered token arrays back into space-separated strings.
concatenated_title = stop_removed_all.withColumn(
    "concat_title",
    concat_ws(' ', col("nostop_title")),
)
concatenated_cat = concatenated_title.withColumn(
    "concat_cat",
    concat_ws(' ', col("nostop_cat")),
)

# "concat_all" is the text that gets compared downstream: the product_parent
# value followed by the title tokens and the category tokens.
concatenated_all = concatenated_cat.withColumn(
    "concat_all",
    concat_ws(' ', col('product_parent'), col('nostop_title'), col("nostop_cat")),
)

# An empty pattern is used to break "concat_all" into single-character tokens.
regexTokenizer = RegexTokenizer(
    pattern="",
    inputCol='concat_all',
    outputCol="char",
)
split_words = regexTokenizer.transform(concatenated_all)

# split_words.show(5)

# Ngrams (2)

In [26]:
# Build 2-grams over the character tokens in "char".
ngrams = NGram(
    inputCol='char',
    outputCol="ngram",
    n=2,
)
ngram_split = ngrams.transform(split_words)

# ngram_split.show(5)

# Hashing to create term-frequency vectors

In [27]:
# Hash each row's n-grams into a feature vector stored in "vector".
hashingTF = HashingTF(
    inputCol='ngram',
    outputCol="vector",
)
hashed = hashingTF.transform(ngram_split)

# MinHash to generate LSH

In [28]:
# MinHash LSH over the hashed n-gram vectors, using three hash tables.
mh = MinHashLSH(
    inputCol="vector",
    outputCol="lsh",
    numHashTables=3,
)
model = mh.fit(dataset=hashed)

# Transform data

In [29]:
# Run the fitted MinHash model over the hashed rows; its output goes to "lsh".
results = model.transform(dataset=hashed)
# results.show(truncate=False)

# Find Similarity

In [37]:
# Approximate self-join: pairs of rows whose Jaccard distance is below 0.05.
#
# Joining `results` with itself matches every row with itself (distance 0) and
# reports each real match twice, once as (A, B) and once as (B, A). Because the
# input can hold several rows for the same product_id, the same product pair
# may also be emitted more than once. The filter and de-duplication below keep
# exactly one row per unordered pair of *different* products.
df = (
    model.approxSimilarityJoin(results, results, 0.05, distCol="JaccardDistance")
    .select(
        col("datasetA.product_id").alias("idA"),
        col("datasetA.product_title").alias("productA"),
        col("datasetB.product_id").alias("idB"),
        col("datasetB.product_title").alias("productB"),
        col("JaccardDistance"),
    )
    # Strict "<" drops self-matches and keeps a single orientation of each pair.
    .filter(col("idA") < col("idB"))
    # Collapse repeats of the same product pair caused by duplicate input rows.
    .dropDuplicates(["idA", "idB"])
)

In [None]:
# Preview the first five similar-product pairs.
df.show(n=5)