#### Final Project: H&M Personalized Fashion Recommendations
#### Market Basket Analysis (MBA) Preprocessing
#### Will Jarrard (wej5ar) Abhi Dommalapati (ad4bu), Sebastian Ranasinghe (sar2jf)
<br>

In [34]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%cd /project/ds5559/h_and_m/
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import collect_set, col, count, row_number, lit
from pyspark.sql.window import Window
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import from_unixtime, unix_timestamp, year, month, col, date_format
from pyspark.ml.feature import Bucketizer
from pyspark.mllib.evaluation import RankingMetrics
import random
# spark = SparkSession.builder.getOrCreate()
spark = SparkSession.builder \
    .master('local[*]') \
    .config("spark.driver.memory", "100g") \
    .appName('my-cool-app') \
    .getOrCreate()

/project/ds5559/h_and_m


#### Create Article Lookup Table

In [2]:
articles = spark.read.csv('/project/ds5559/h_and_m/articles.csv',  inferSchema=True, header = True)

In [3]:
# Create distinct pairings of product types, and index

art_lookup = articles.select(col('product_type_name'), col('index_name')) \
                       .distinct() \
                       .sort(col('product_type_no'))


# Create ID for unique product, index types

w = Window().orderBy(lit('A'))
art_lookup = art_lookup.withColumn("prod_index_id", row_number().over(w))


# Rejoin with articles, extract relevant columns

art_lookup = articles.join(art_lookup, 
                           on = ['product_type_name', 'index_name'], 
                           how = 'inner')

art_lookup = art_lookup.select(col('article_id'),
                  col('product_type_name'), col('product_code'),
                  col('product_type_no'), 
                  col('index_name'), 
                  col('prod_index_id')).sort(col('prod_index_id'))

art_lookup_short = art_lookup.select(col('article_id'), 
                  col('prod_index_id')).sort(col('prod_index_id'))

<br>

#### Create demographic table

In [4]:
customers = spark.read.csv('/project/ds5559/h_and_m/customers.csv',  inferSchema=True, header = True)

customers.show(5)

+--------------------+----+------+------------------+----------------------+---+--------------------+
|         customer_id|  FN|Active|club_member_status|fashion_news_frequency|age|         postal_code|
+--------------------+----+------+------------------+----------------------+---+--------------------+
|00000dbacae5abe5e...|null|  null|            ACTIVE|                  NONE| 49|52043ee2162cf5aa7...|
|0000423b00ade9141...|null|  null|            ACTIVE|                  NONE| 25|2973abc54daa8a5f8...|
|000058a12d5b43e67...|null|  null|            ACTIVE|                  NONE| 24|64f17e6a330a85798...|
|00005ca1c9ed5f514...|null|  null|            ACTIVE|                  NONE| 54|5d36574f52495e81f...|
|00006413d8573cd20...| 1.0|   1.0|            ACTIVE|             Regularly| 52|25fa5ddee9aac01b3...|
+--------------------+----+------+------------------+----------------------+---+--------------------+
only showing top 5 rows



In [5]:
# Add in age buckets

bucketizer = Bucketizer(splits=[0, 18, 20, 25, 30, 40, 50, 120],inputCol="age", outputCol="buckets")
customers = bucketizer.setHandleInvalid("keep").transform(customers)

In [6]:
dem_short = customers.select(col('customer_id'), col('buckets'))

dem_short.show(5)

+--------------------+-------+
|         customer_id|buckets|
+--------------------+-------+
|00000dbacae5abe5e...|    5.0|
|0000423b00ade9141...|    3.0|
|000058a12d5b43e67...|    2.0|
|00005ca1c9ed5f514...|    6.0|
|00006413d8573cd20...|    6.0|
+--------------------+-------+
only showing top 5 rows



<br>

#### Find Top-n Items Per Demographic Group

In [7]:
parquetFile = spark.read.parquet("trans2020_par")
parquetFile.createOrReplaceTempView("trans_2020")

In [8]:
trans_2020 = spark.sql("SELECT * FROM trans_2020")

trans_2020 = trans_2020.join(dem_short, on = 'customer_id', how = 'inner')
trans_2020 = trans_2020.join(art_lookup_short, on = 'article_id', how = 'inner')

In [9]:
trans_2020.show(5)

+----------+--------------------+----------+--------------------+----------------+-------------------+----+-------+-------------+
|article_id|         customer_id|     t_dat|               price|sales_channel_id|               date|year|buckets|prod_index_id|
+----------+--------------------+----------+--------------------+----------------+-------------------+----+-------+-------------+
| 126589006|5d363f0446b54a831...|2020-06-27|0.001677966101694915|               1|2020-06-27 00:00:00|2020|    3.0|           66|
| 126589006|2c012ed3ee6496bb9...|2020-02-01|0.001677966101694915|               1|2020-02-01 00:00:00|2020|    2.0|           66|
| 126589006|e1a51bd06f36e5c70...|2020-02-15|0.001677966101694915|               1|2020-02-15 00:00:00|2020|    6.0|           66|
| 126589006|12b97d40968d85c6b...|2020-05-30|0.001677966101694915|               1|2020-05-30 00:00:00|2020|    1.0|           66|
| 126589006|8159c99366e1f966d...|2020-02-10|0.001677966101694915|               1|2020-02-

In [10]:
# Below gives the final Top-n purchase table

trans_2020_small = trans_2020.select(col('article_id'), col('buckets'), col('prod_index_id'))

top_n = trans_2020_small.groupBy('buckets', 'prod_index_id', 'article_id').count()
top_n = top_n.sort(col('buckets'), col('prod_index_id'), col('count').desc())

windowDept = Window.partitionBy(['buckets', 'prod_index_id']).orderBy(col("count").desc())

top_n = top_n.withColumn("row",row_number().over(windowDept))
top_n = top_n.filter(col('row') <= 3)

top_n = top_n.sort(col('buckets'), col('prod_index_id'), col('count').desc())

windowDept = Window.partitionBy(['buckets', 'prod_index_id']).orderBy(col("count").desc())

top_n = top_n.withColumn("row",row_number().over(windowDept))
top_n = top_n.filter(col('row') <= 3)

top_n = top_n.sort(col('buckets'), col('prod_index_id'), col('count').desc())

top_n.show()

+-------+-------------+----------+-----+---+
|buckets|prod_index_id|article_id|count|row|
+-------+-------------+----------+-----+---+
|   null|            2| 867969003|    3|  1|
|   null|            2| 724906019|    3|  3|
|   null|            2| 774113006|    3|  2|
|   null|            4| 852584001|   17|  1|
|   null|            4| 730683050|   13|  2|
|   null|            4| 815434001|    8|  3|
|   null|            5| 700701002|   21|  1|
|   null|            5| 855080001|   15|  2|
|   null|            5| 719957006|   14|  3|
|   null|            8| 858811003|    1|  1|
|   null|            9| 851780003|    5|  1|
|   null|            9| 851780001|    4|  2|
|   null|            9| 851780002|    3|  3|
|   null|           10| 856667004|    6|  1|
|   null|           10| 856667005|    3|  2|
|   null|           10| 895555001|    3|  3|
|   null|           12| 885584001|    1|  1|
|   null|           13| 811783002|   16|  1|
|   null|           13| 844409002|   13|  2|
|   null| 

#### Load Data and Merge

##### Read in data from train_par

In [11]:
parquetFile = spark.read.parquet("train_par")
parquetFile.createOrReplaceTempView("train")

In [12]:
train_final = spark.sql("SELECT * FROM train")

train_final = train_final.groupBy('customer_id') \
                      .agg(collect_set('prod_index_id') \
                      .alias('items'))

In [13]:
train_final.show(10)

+--------------------+--------------------+
|         customer_id|               items|
+--------------------+--------------------+
|037ecdde0c1ccf9aa...|[37, 103, 243, 35...|
|038107c7f10c14bac...|[299, 263, 306, 2...|
|03a250a3def5fdeb2...|[299, 304, 258, 265]|
|03b286afcf95433cc...|               [306]|
|03cbb6ef35c9d7f4d...|               [258]|
|03deb76b617aee9ed...|[215, 168, 37, 24...|
|03f4c3fe994d9ab76...|[335, 455, 238, 2...|
|03fadee866717dbb6...|[218, 212, 9, 456...|
|03fccd820834e598d...|     [230, 216, 304]|
|04068da583e91ef4c...|               [212]|
+--------------------+--------------------+
only showing top 10 rows



Read in data from testprior_par and testafter_par, then merge

In [14]:
parquetFile = spark.read.parquet("testprior_par")
parquetFile.createOrReplaceTempView("test_prior")

In [15]:
testprior_final = spark.sql("SELECT * FROM test_prior")

testprior_final = testprior_final.groupBy('customer_id') \
                      .agg(collect_set('prod_index_id') \
                      .alias('history'))

In [16]:
testprior_final.show(10)

+--------------------+--------------------+
|         customer_id|             history|
+--------------------+--------------------+
|05a45a5e4c53a72f1...|[230, 306, 285, 2...|
|05a8260d130f082b0...|[230, 430, 227, 454]|
|05aeca0642a1d1d14...|[354, 13, 304, 35...|
|05c2293d6ab5896be...|               [289]|
|05c7d4f73fcde0580...|[299, 306, 223, 2...|
|05caad573b928bb68...|     [350, 243, 381]|
|05e0a597465504e1e...|[299, 379, 227, 1...|
|05f65801b9a2d28a5...|[230, 299, 379, 3...|
|05fb9d8e4d5a03715...|                [97]|
|060d675c6b3339f5a...|[306, 241, 410, 2...|
+--------------------+--------------------+
only showing top 10 rows



In [17]:
parquetFile = spark.read.parquet("testafter_par")
parquetFile.createOrReplaceTempView("test_after")

In [18]:
testafter_final = spark.sql("SELECT * FROM test_after")

testafter_final = testafter_final.groupBy('customer_id') \
                      .agg(collect_set('article_id') \
                      .alias('labels'))

In [19]:
testafter_final.show(10)

+--------------------+--------------------+
|         customer_id|              labels|
+--------------------+--------------------+
|04daaa60957b280cb...|[794575001, 62448...|
|054324bf3c4451750...|         [806778001]|
|058306b9a1720d1a0...|[924250001, 89616...|
|05a45a5e4c53a72f1...|         [832481003]|
|05a8260d130f082b0...|[716672001, 27038...|
|05aeca0642a1d1d14...|         [825714001]|
|05c2293d6ab5896be...|[776237011, 59958...|
|05c7d4f73fcde0580...|[877769001, 86579...|
|05caad573b928bb68...|[892937003, 75268...|
|05e0a597465504e1e...|[837306010, 76284...|
+--------------------+--------------------+
only showing top 10 rows



In [20]:
test_full = testprior_final.join(testafter_final, on = 'customer_id', how = 'inner')

In [21]:
test_full.show(10)

+--------------------+--------------------+--------------------+
|         customer_id|             history|              labels|
+--------------------+--------------------+--------------------+
|05a45a5e4c53a72f1...|[230, 306, 285, 2...|         [832481003]|
|05a8260d130f082b0...|[230, 430, 227, 454]|[716672001, 27038...|
|05aeca0642a1d1d14...|[354, 13, 304, 35...|         [825714001]|
|05c2293d6ab5896be...|               [289]|[776237011, 59958...|
|05c7d4f73fcde0580...|[299, 306, 223, 2...|[877769001, 86579...|
|05caad573b928bb68...|     [350, 243, 381]|[892937003, 75268...|
|05e0a597465504e1e...|[299, 379, 227, 1...|[837306010, 76284...|
|05f65801b9a2d28a5...|[230, 299, 379, 3...|[815629005, 87575...|
|05fb9d8e4d5a03715...|                [97]|[794468003, 90741...|
|060d675c6b3339f5a...|[306, 241, 410, 2...|[808648001, 82397...|
+--------------------+--------------------+--------------------+
only showing top 10 rows



<br>

#### Market Basket Analysis

In [22]:
# Specify Model

fpGrowth = FPGrowth(itemsCol="items", minSupport=0.05, minConfidence=0.01)

In [23]:
model = fpGrowth.fit(train_final)

In [24]:
# Show top 10 association rules

rules = model.associationRules.sort(col('Confidence').desc())

rules.show(20)

+----------+----------+------------------+------------------+--------------------+
|antecedent|consequent|        confidence|              lift|             support|
+----------+----------+------------------+------------------+--------------------+
| [21, 454]|     [415]|0.9312902525508358| 4.468263972332483| 0.05855215416148192|
| [21, 334]|     [415]| 0.917356615986753|4.4014113814325615| 0.05545154473135255|
| [21, 304]|     [415]|0.9165929855585028| 4.397747537296556|  0.0636728231469707|
|      [21]|     [415]|0.9101162709740305| 4.366672724307146| 0.17842039532201523|
|[454, 415]|      [21]|0.8666621771776828| 4.420813817072974| 0.05855215416148192|
|[415, 334]|      [21]|0.8625619249823071| 4.399898572314249| 0.05545154473135255|
|[415, 304]|      [21]|0.8618672250277127| 4.396354931852181|  0.0636728231469707|
|     [415]|      [21]|0.8560482858733259| 4.366672724307145| 0.17842039532201523|
|[306, 334]|     [304]|0.7171623986002863| 2.338901067320764| 0.05128403447750295|
|[25

In [25]:
rules.count()

163

In [26]:
print("Antedents:")

art_lookup.filter(col('prod_index_id') == '21').select(col('prod_index_id'), col('product_type_name')).show(1)
art_lookup.filter(col('prod_index_id') == '454').select(col('prod_index_id'), col('product_type_name')).show(1)

print("")
print("Consequent:")

art_lookup.filter(col('prod_index_id') == '415').select(col('prod_index_id'), col('product_type_name')).show(1)

Antedents:
+-------------+-----------------+
|prod_index_id|product_type_name|
+-------------+-----------------+
|           21|  Swimwear bottom|
+-------------+-----------------+
only showing top 1 row

+-------------+-----------------+
|prod_index_id|product_type_name|
+-------------+-----------------+
|          454|              Bra|
+-------------+-----------------+
only showing top 1 row


Consequent:
+-------------+-----------------+
|prod_index_id|product_type_name|
+-------------+-----------------+
|          415|       Bikini top|
+-------------+-----------------+
only showing top 1 row



<br>

#### Make Predictions On Test Set

##### Create dataframe of all subsets of basket for each customer

In [27]:
import itertools
n_test = test_full.count()
#n_test = 1

cust_id = test_full.rdd.map(lambda x: x[0]).collect()
history = test_full.rdd.map(lambda x: x[1]).collect()

In [28]:
# Create list of all possible subsets of baskets of length 1, 2, by customer

train_antedecent = []

for i in range(n_test):
    
    combo1 = []
    combo2 = []
    combo3 = []
    combo_all = []
    
    cust = cust_id[i]
    hist = history[i]
    
    combo1 = hist
    combo2 = list(itertools.combinations(hist, 2))
    #combo3 = list(itertools.combinations(hist, 3))
    
    for item in combo1:
        combo_all.append(item)
    
    for item in combo2:
        combo_all.append(item)
    
    temp = list(itertools.product([cust], combo_all))
    
    for item in temp:
        train_antedecent.append(item)

print(len(train_antedecent))

4398581


In [29]:
customerid = [train_antedecent[i][0] for i in range(len(train_antedecent))]
combinations = [train_antedecent[i][1] for i in range(len(train_antedecent))]

sample_pred = pd.DataFrame(zip(customerid, combinations), columns = ['customer_id', 'basket'])

sample_pred['basket_clean'] = sample_pred['basket'].apply(lambda x: list(x) if (isinstance(x, tuple) == True) else [x])
sample_pred.basket_clean = sample_pred.basket_clean.apply(sorted)

sample_pred['basket_clean'] = sample_pred['basket_clean'].apply(lambda x: str(x[0]) if (len(x) == 1) else str(x[0]) + '-' + str(x[1]))

In [30]:
sample_pred.head(15)

Unnamed: 0,customer_id,basket,basket_clean
0,05a45a5e4c53a72f1fa1aaa812b0e907c91aa2b59c156a...,230,230
1,05a45a5e4c53a72f1fa1aaa812b0e907c91aa2b59c156a...,306,306
2,05a45a5e4c53a72f1fa1aaa812b0e907c91aa2b59c156a...,285,285
3,05a45a5e4c53a72f1fa1aaa812b0e907c91aa2b59c156a...,238,238
4,05a45a5e4c53a72f1fa1aaa812b0e907c91aa2b59c156a...,221,221
5,05a45a5e4c53a72f1fa1aaa812b0e907c91aa2b59c156a...,21,21
6,05a45a5e4c53a72f1fa1aaa812b0e907c91aa2b59c156a...,177,177
7,05a45a5e4c53a72f1fa1aaa812b0e907c91aa2b59c156a...,25,25
8,05a45a5e4c53a72f1fa1aaa812b0e907c91aa2b59c156a...,258,258
9,05a45a5e4c53a72f1fa1aaa812b0e907c91aa2b59c156a...,415,415


##### Process rules dataset

In [31]:
rules_pd = rules.toPandas()

rules_pd.antecedent = rules_pd.antecedent.apply(sorted)
rules_pd['antecedent'] = rules_pd['antecedent'].apply(lambda x: str(x[0]) if (len(x) == 1) else str(x[0]) + '-' + str(x[1]))
rules_pd['consequent'] = rules_pd['consequent'].apply(lambda x: x[0])

rules_pd.head(10)

Unnamed: 0,antecedent,consequent,confidence,lift,support
0,21-454,415,0.93129,4.468264,0.058552
1,21-334,415,0.917357,4.401411,0.055452
2,21-304,415,0.916593,4.397748,0.063673
3,21,415,0.910116,4.366673,0.17842
4,415-454,21,0.866662,4.420814,0.058552
5,334-415,21,0.862562,4.399899,0.055452
6,304-415,21,0.861867,4.396355,0.063673
7,415,21,0.856048,4.366673,0.17842
8,306-334,304,0.717162,2.338901,0.051284
9,230-258,304,0.645283,2.104478,0.056885


##### Merge the sample_pred dataframe on the rules dataframe using basket_clean = antecedent. We then know that for each customer, the "consequent" shows which item they would likely buy. 

##### We take the top 4 rules for each customer based on the confidence of the rule

In [32]:
# Merge sample_pred with rules

item_pred = pd.merge(sample_pred, rules_pd, left_on = 'basket_clean', right_on = 'antecedent', how = 'inner')

# Format table and take top 4 rules

item_pred = item_pred.sort_values(by = ['customer_id', 'confidence'], ascending = [True, False])
item_pred = item_pred[['customer_id', 'consequent', 'confidence']]
item_pred = item_pred.groupby('customer_id').head(4)
item_pred = item_pred.drop_duplicates(subset = ['customer_id', 'consequent'], keep = 'first')

# Get age grouping

dem_short_pd = dem_short.toPandas()
item_pred = pd.merge(item_pred, dem_short_pd, on = 'customer_id', how = 'inner')

item_pred.head(10)

Unnamed: 0,customer_id,consequent,confidence,buckets
0,04ba60f45066f9491d85f1db5f200318b23745ce3043ac...,304,0.717162,3.0
1,04ba60f45066f9491d85f1db5f200318b23745ce3043ac...,334,0.506106,3.0
2,04ba60f45066f9491d85f1db5f200318b23745ce3043ac...,230,0.497208,3.0
3,04ba912d6da150012aba2f4d0c295c94882c1ca78d3439...,415,0.917357,3.0
4,04ba912d6da150012aba2f4d0c295c94882c1ca78d3439...,21,0.862562,3.0
5,04ba9a16a49027a298374a243243ed7f25f13550fa0340...,415,0.917357,6.0
6,04ba9a16a49027a298374a243243ed7f25f13550fa0340...,21,0.862562,6.0
7,04bacf040099b9f157d45bf310959f86fdb8ffeb2ca396...,454,0.587031,2.0
8,04bacf040099b9f157d45bf310959f86fdb8ffeb2ca396...,304,0.535383,2.0
9,04bacf040099b9f157d45bf310959f86fdb8ffeb2ca396...,334,0.506428,2.0


##### Finally join to top_n to make predictions of specific products

In [35]:
top_n_pd = top_n.toPandas()

pred_final = pd.merge(item_pred, top_n_pd, left_on = ['consequent', 'buckets'], 
                      right_on = ['prod_index_id', 'buckets'], 
                      how = 'inner')

pred_final = pred_final[['customer_id', 'article_id']]
pred_final = pred_final.groupby('customer_id', sort=False)['article_id'].apply(list).to_frame()
pred_final.reset_index(inplace=True)

pred_final['article_sample'] = pred_final['article_id'].apply(lambda line: random.sample(line, min(12, len(line))))

pred_final.head(10)

Unnamed: 0,customer_id,article_id,article_sample
0,04ba60f45066f9491d85f1db5f200318b23745ce3043ac...,"[883033002, 841434001, 817353008, 751471001, 7...","[832361002, 883033002, 751471001, 841434001, 8..."
1,04bead3ad988400817bc3953fba650102a05318dec051e...,"[883033002, 841434001, 817353008, 751471001, 7...","[810557001, 817353008, 883033002, 562245046, 7..."
2,04bf2365923c8a34cc62010c91c6d4bbbd01beba0577c2...,"[883033002, 841434001, 817353008, 751471001, 7...","[562245046, 753737001, 841434001, 817353008, 7..."
3,04bf83f5656b666ef6d093d9d05609ea993492afed1e40...,"[883033002, 841434001, 817353008, 751471001, 7...","[562245046, 841434001, 753737001, 883033002, 7..."
4,04c09031ed511c14ab85d9e82809d17da88e919414862d...,"[883033002, 841434001, 817353008, 751471001, 7...","[562245046, 753737001, 751471001, 841434001, 8..."
5,04c17e7e3d62d81de574116a8ec2bebac0e76719d57ade...,"[883033002, 841434001, 817353008, 751471001, 7...","[753737001, 883033002, 850259001, 841434001, 8..."
6,04c2ef7fbc1aead545caaec0ce03c2b9433129bfbfa082...,"[883033002, 841434001, 817353008, 751471001, 7...","[841434001, 562245046, 817353008, 464297007, 8..."
7,04c8230c64ab0f2394421312c431cab7fdf4f44b8e1222...,"[883033002, 841434001, 817353008]","[841434001, 817353008, 883033002]"
8,04c9b2cab2c8f7f8084d4b31d82d01641a2e4cd5765cfb...,"[883033002, 841434001, 817353008, 751471001, 7...","[599580055, 751471001, 817353008, 753737001, 5..."
9,04ca80476733408a907e3e171cacd382b378d05e7f09d0...,"[883033002, 841434001, 817353008, 751471001, 7...","[883033002, 599580038, 841434001, 562245046, 5..."


<br>

#### Evaluate Predictions

In [36]:
pred_sc = spark.createDataFrame(pred_final) 

eval_df = test_full.join(pred_sc, on = 'customer_id', how = 'inner')
eval_df = eval_df.select(col('article_id'), col('labels'))
eval_df = eval_df.withColumnRenamed('article_id', 'predictions')

eval_df.show(10)

+--------------------+--------------------+
|         predictions|              labels|
+--------------------+--------------------+
|[859136002, 86300...|         [832481003]|
|[856840001, 81736...|[716672001, 27038...|
|[859136002, 86300...|         [825714001]|
|[883033002, 84143...|[877769001, 86579...|
|[883033002, 81735...|[837306010, 76284...|
|[751471001, 75373...|[815629005, 87575...|
|[859136002, 86300...|[808648001, 82397...|
|[883033002, 84143...|[772234002, 86507...|
|[678942001, 66407...|         [711239006]|
|[841434001, 84143...|[739362008, 82316...|
+--------------------+--------------------+
only showing top 10 rows



In [37]:
pred_labels = eval_df.select(col("predictions"), col("labels")).rdd
pred_labels = pred_labels.map(lambda x: (x[0], x[1]))

In [38]:
metrics = RankingMetrics(pred_labels)

In [39]:
metrics.meanAveragePrecisionAt(12)

0.0015986417381309223