# Product Recommendation Based on Product Similarity

We can help businesses understand their products better by computing product-to-product similarity (cosine similarity) from customer purchase data for each item. The resulting similarity scores can then be used to recommend similar products to users.

Data
* Customers X Products Matrix

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.types import *
from pyspark import SparkConf

In [2]:
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

In [3]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from itertools import chain

def _vector_to_list(vector):
    """Return the components of a vector as a plain Python list."""
    return vector.toArray().tolist()


# UDF that converts a vector column into an array<double> column (no null elements).
ToArray = udf(_vector_to_list, ArrayType(DoubleType(), containsNull=False))

In [4]:
# Build (or reuse) a Spark session with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("SuperMarket Analytic")
    .enableHiveSupport()
    .getOrCreate()
)

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

## Load Customer x Product data

In [5]:
# Read every column of the default.pivot_cust_prod_sum table.
cust_prod_query = """
SELECT *
FROM default.pivot_cust_prod_sum
"""
df = spark.sql(cust_prod_query)

In [6]:
# Column(s) that identify the row and must not be treated as features.
ignore = ['CUST_CODE']
feature_cols = [name for name in df.columns if name not in ignore]

# Pack every remaining column into a single vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(df).select('CUST_CODE', 'features').cache()

# Scale each row's feature vector to unit L2 norm (p=2.0), then convert the
# resulting vector column into a plain array column under the same name.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=2.0)
normalize_data = (
    normalizer.transform(data)
    .select("CUST_CODE", ToArray(col("normFeatures")).alias("normFeatures"))
    .cache()
)

In [7]:
# Preview the first five normalized rows.
normalize_data.show(n=5)

+--------------+--------------------+
|     CUST_CODE|        normFeatures|
+--------------+--------------------+
|CUST0000336458|[0.0, 0.0, 0.0, 0...|
|CUST0000063499|[0.0, 0.0, 0.0, 0...|
|CUST0000032037|[0.0, 0.0, 0.0, 0...|
|CUST0000895912|[0.0, 0.0, 0.0, 0...|
|CUST0000344205|[0.0, 0.0, 0.0, 0...|
+--------------+--------------------+
only showing top 5 rows



### Compute cosine similarities between columns (Product) of this matrix

In [8]:
# Each selected row carries a single field: the array of normalized values.
# Hand that array itself to RowMatrix (rather than the one-field Row converted
# to a list, which yields a nested [[...]] list) so every matrix row is a flat
# vector.
rowmatrix = RowMatrix(
    normalize_data.select('normFeatures').rdd.map(lambda row: row[0])
)

# Pairwise cosine similarities between the columns of the matrix; the result
# exposes its (i, j, value) cells through `.entries`.
entries = rowmatrix.columnSimilarities()

In [9]:
# Flatten each MatrixEntry into an (i, j, value) tuple and expose the result
# as a cached DataFrame.
similarity_schema = ['I', 'J', 'Value']
result_rdd = entries.entries.map(lambda cell: (cell.i, cell.j, cell.value))
result_df = spark.createDataFrame(result_rdd, schema=similarity_schema).cache()

In [10]:
# Preview ten (I, J, Value) similarity entries.
result_df.show(n=10)

+----+----+--------------------+
|   I|   J|               Value|
+----+----+--------------------+
| 372|2024|2.134202616595427...|
| 918|1768|4.496209982804470...|
|1221|4838|0.001437065595868...|
|4762|4841|0.006215728928976862|
|2286|3181|7.723494644030827E-4|
| 765|3556| 0.01909570156185527|
| 973|2118|5.565617584170745E-4|
|3243|4070|7.693018437447457E-4|
| 320| 788|0.002723796615577...|
| 999|4472|3.395537150058167...|
+----+----+--------------------+
only showing top 10 rows



In [11]:
# Non-ignored column names, keyed by their position in that list.
columns = [name for name in df.columns if name not in ignore]
dict_columns = dict(enumerate(columns))

# Literal map column: position -> column name (keys and values interleaved).
MapIndexWithName = create_map(
    [lit(item) for item in chain.from_iterable(dict_columns.items())]
)

# Replace the I / J indices with the corresponding column names.
Product_Similarity = result_df.select(
    MapIndexWithName.getItem(col("I")).alias("Product1"),
    MapIndexWithName.getItem(col("J")).alias("Product2"),
    col("Value"),
).cache()

### Top 15 most similar product pairs

In [12]:
# Show the 15 rows with the highest similarity value.
Product_Similarity.orderBy("Value", ascending=False).show(n=15)

+----------+----------+------------------+
|  Product1|  Product2|             Value|
+----------+----------+------------------+
|PRD0900831|PRD0902580| 0.983213862373871|
|PRD0901814|PRD0903406|0.9773287691165502|
|PRD0902237|PRD0903643|0.9484370041654808|
|PRD0900585|PRD0901208|0.9462486961404734|
|PRD0901517|PRD0904708|0.9455417399357372|
|PRD0900918|PRD0904576|0.9398997657902919|
|PRD0903843|PRD0904187|0.9356220652495086|
|PRD0901061|PRD0901960|0.9218063568108338|
|PRD0904677|PRD0904778|0.9216428116462088|
|PRD0900350|PRD0904765| 0.914648521705544|
|PRD0900216|PRD0901177|0.9144144765220199|
|PRD0900861|PRD0904694|0.9124786224521093|
|PRD0901593|PRD0903305|0.9104669706293007|
|PRD0901005|PRD0903807|0.9035063940391864|
|PRD0903946|PRD0904718|0.9034095106847959|
+----------+----------+------------------+
only showing top 15 rows

