# Yocuda Discount Shopper Analysis

#### Importing Yocuda data from GCS

In [1]:
# CSV reader configured to take column names from the header row and to infer column types.
csv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
# The trailing wildcard loads every file matching the pattern into a single DataFrame.
data = csv_reader.load('gs://westfield-tom/datalab/Yocuda_clean_data_Nov15_Nov17_20171214_v01_*.csv')

In [23]:
from pyspark.sql.functions import *
import pandas as pd

#### Creating base data

In [14]:
# Expose the raw frame to Spark SQL under the name "data".
# createOrReplaceTempView replaces the deprecated registerTempTable.
data.createOrReplaceTempView("data")

base_data = (
    data
    # Restrict to Argos transactions. retailer_name is not carried through the
    # projection below, so this filter has to run before the selectExpr.
    .filter(col("retailer_name") == "Argos")
    .selectExpr('identifier',
                'brandName as brand',
                'cat1 AS categoryL1',
                'cat2 AS categoryL2',
                'item_total_txn AS transaction_amount',
                'item_product_code as product_id',
                'item_total',
                'item_discounts_amount',
                'timestamp',
                'item_quantity as quantity',
                'transaction_id')
    # Flip the sign of item_discounts_amount so a discount becomes a positive number.
    .withColumn('item_discount', col('item_discounts_amount') * -1)
    # NOTE(review): this divides transaction_amount (item_total_txn) by quantity, not
    # item_total. In the sample output the result equals the transaction amount rather
    # than the item price; confirm this is the intended definition of unit price.
    .withColumn('item_unit_price', col('transaction_amount') / col('quantity'))
    .filter(
        # Keep dates strictly after 2016-11-30 and strictly before 2017-12-01.
        (to_date(col("timestamp")) > '2016-11-30')
        & (to_date(col("timestamp")) < '2017-12-01')
        & col("identifier").isNotNull()
        # Only rows with a matched level-1 category.
        & col("categoryL1").isNotNull()
        # Drop non-positive item totals and negative discounts.
        & (col("item_total") > 0)
        & (col("item_discount") >= 0)
    )
)
base_data.show(5)

+--------------------+------------+--------------------+--------------------+------------------+----------+----------+---------------------+--------------------+--------+--------------------+-------------------+------------------+
|          identifier|       brand|          categoryL1|          categoryL2|transaction_amount|product_id|item_total|item_discounts_amount|           timestamp|quantity|      transaction_id|      item_discount|   item_unit_price|
+--------------------+------------+--------------------+--------------------+------------------+----------+----------+---------------------+--------------------+--------+--------------------+-------------------+------------------+
|joanna.hodgetts@g...|      disney|       Toys & Games | Dolls & Accessor...|             42.96|  461/2092|      9.99|  -2.3230935754189943|2017-11-04 09:47:...|       1|20171104 / 825 / ...| 2.3230935754189943|             42.96|
|elizabethkcook@bt...|   energizer|     Home & Kitchen | Household Supplies 

In [15]:
# Inspect the (column name, type) pairs of base_data to check which types the columns ended up with.
base_data.dtypes

[('identifier', 'string'),
 ('brand', 'string'),
 ('categoryL1', 'string'),
 ('categoryL2', 'string'),
 ('transaction_amount', 'double'),
 ('product_id', 'string'),
 ('item_total', 'double'),
 ('item_discounts_amount', 'string'),
 ('timestamp', 'string'),
 ('quantity', 'string'),
 ('transaction_id', 'string'),
 ('item_discount', 'double'),
 ('item_unit_price', 'double')]

#### Calculating general discount metrics on user cross category L1/L2 level

In [16]:
# Totals per shopper and category pair.
# NOTE: `sum` here is pyspark.sql.functions.sum (brought in by the star import),
# not the Python builtin.
df1 = base_data.groupBy("identifier", "categoryL1", "categoryL2").agg(
    sum("item_total").alias("total_spend"),
    sum("item_discount").alias("total_discount"),
    sum("transaction_amount").alias("total_transaction_amount"),
    # Distinct transactions, keyed on transaction_id concatenated with the date.
    countDistinct(concat("transaction_id", to_date(col("timestamp")))).alias("total_txns"),
    sum("quantity").alias("total_products"),
)

# Quantity bought on lines that carried a strictly positive discount.
df2 = (
    base_data.filter(col("item_discount") > 0)
    .groupBy("identifier", "categoryL1", "categoryL2")
    .agg(sum("quantity").alias("discount_products"))
)

df = (
    df1.join(df2, ["identifier", "categoryL1", "categoryL2"], "left")
    # The left join yields null for shoppers with no discounted lines. Store 0 instead,
    # so the ratio below is 0 rather than null and averages over it do not silently
    # skip those shoppers (Spark's avg ignores nulls).
    .fillna(0, subset=["discount_products"])
    .withColumn("percent_discount_amount", col("total_discount") / col("total_spend"))
    .withColumn("percent_discount_products", col("discount_products") / col("total_products"))
)
df.show(5)

+--------------------+---------------+--------------------+-----------+------------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+
|          identifier|     categoryL1|          categoryL2|total_spend|    total_discount|total_transaction_amount|total_txns|total_products|discount_products|percent_discount_amount|percent_discount_products|
+--------------------+---------------+--------------------+-----------+------------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+
|          .@mail.com|   Electronics |     Camera & Photo |      15.96|              7.96|                   15.96|         2|           4.0|              4.0|    0.49874686716791977|                      1.0|
|0107tracey@gmail.com|   Electronics | Accessories & Su...|      20.98|1.4953510028653294|                  111.68|         1|           2.0|              2.0| 

#### Assigning percentile ranks for percent_discount_amount & percent_discount_products

In [17]:
from pyspark.sql.window import Window

# Both ranks are computed within the same (categoryL1, categoryL2) partition;
# only the ordering column differs.
category_window = Window.partitionBy("categoryL1", "categoryL2")
amount_rank = percent_rank().over(category_window.orderBy('percent_discount_amount'))
products_rank = percent_rank().over(category_window.orderBy('percent_discount_products'))

percentile_df = (
    df.withColumn("percentile_amount", amount_rank)
      .withColumn("percentile_products", products_rank)
)
percentile_df.show(5)

+--------------------+--------------------+-------------+-----------+--------------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+--------------------+-------------------+
|          identifier|          categoryL1|   categoryL2|total_spend|      total_discount|total_transaction_amount|total_txns|total_products|discount_products|percent_discount_amount|percent_discount_products|   percentile_amount|percentile_products|
+--------------------+--------------------+-------------+-----------+--------------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+--------------------+-------------------+
|jaclynness@ymail.com|Health & Personal...| Bath & Body |      19.99|7.433161045625256E-4|                  268.93|         1|           1.0|              1.0|   3.718439742683970...|                      1.0|                 0.0|                0

#### Generating quantile buckets by using overall_rank (calculated using score)

In [18]:
# Blend the two percentile ranks into one score: 70% weight on the discount-amount
# rank and 30% on the discounted-products rank.
score = percentile_df.withColumn(
    "score",
    (col("percentile_amount") * 0.7) + (col("percentile_products") * 0.3))

# Rank the blended score within each (categoryL1, categoryL2) pair.
overall_rank = score.withColumn(
    "overall_rank",
    percent_rank().over(Window.partitionBy("categoryL1", "categoryL2").orderBy('score')))

# Label each row with a bucket. The branches are evaluated in order, so each
# threshold only catches rows not matched by an earlier one.
quantiles = overall_rank.withColumn(
    'quantiles',
    when(col('overall_rank') <= 0.01, "First percentile")
    .when(col('overall_rank') <= 0.02, "Second percentile")
    .when(col('overall_rank') <= 0.03, "Third percentile")
    .when(col('overall_rank') <= 0.10, "4 to 10 percentile")
    .when(col('overall_rank') <= 0.20, "10 to 20 percentile")
    .when(col('overall_rank') <= 0.40, "20 to 40 percentile")
    .when(col('overall_rank') <= 0.60, "40 to 60 percentile")
    .when(col('overall_rank') <= 0.80, "60 to 80 percentile")
    .when(col('overall_rank') <= 0.90, "80 to 90 percentile")
    # Everything above 0.90 belongs to the top bucket. A test of `>= 1` here would
    # leave every rank strictly between 0.90 and 1 with a null label.
    .otherwise("90 to 100 percentile"))
quantiles.show(5)

+--------------------+--------------------+-------------+-----------+--------------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+--------------------+-------------------+--------------------+--------------------+----------------+
|          identifier|          categoryL1|   categoryL2|total_spend|      total_discount|total_transaction_amount|total_txns|total_products|discount_products|percent_discount_amount|percent_discount_products|   percentile_amount|percentile_products|               score|        overall_rank|       quantiles|
+--------------------+--------------------+-------------+-----------+--------------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+--------------------+-------------------+--------------------+--------------------+----------------+
|jaclynness@ymail.com|Health & Personal...| Bath & Body |      19.99|7

#### Percentile bins

In [19]:
# Number of rows in each (categoryL1, categoryL2) pair.
xx1 = quantiles.groupBy("categoryL1", "categoryL2").agg(count("*").alias("count"))

# Position of each row inside its category pair, counted from the highest score down.
score_desc_window = Window.partitionBy("categoryL1", "categoryL2").orderBy(desc("score"))
row_position = row_number().over(score_desc_window)

# Scale the position to tenths of the group size and invert it, so a higher score
# maps to a higher bin number (bins run from 1 up to at most 10).
xx = (
    quantiles.join(xx1, ["categoryL1", "categoryL2"], "left")
    .withColumn("percentile_bin", 11 - ceil(10 * (row_position / col('count'))))
)
xx.show(5)

+--------------------+-------------+--------------------+-----------+-----------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+------------------+-------------------+------------------+------------------+--------------------+-----+--------------+
|          categoryL1|   categoryL2|          identifier|total_spend|   total_discount|total_transaction_amount|total_txns|total_products|discount_products|percent_discount_amount|percent_discount_products| percentile_amount|percentile_products|             score|      overall_rank|           quantiles|count|percentile_bin|
+--------------------+-------------+--------------------+-----------+-----------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+------------------+-------------------+------------------+------------------+--------------------+-----+--------------+
|Health & Personal...|

In [31]:
# Total number of rows in xx. Printed explicitly: a notebook only echoes a cell's
# last expression, so a bare xx.count() here would run the job and drop the result.
print(xx.count())

# Number of distinct shoppers (identifier values) across all rows.
sum1 = xx.agg(countDistinct("identifier"))
sum1.show()

+--------------------------+
|count(DISTINCT identifier)|
+--------------------------+
|                   1567554|
+--------------------------+



#### Stats on a category(L1,L2) cross percentile bin

In [20]:
# One output row per category pair and percentile bin.
bin_keys = ["categoryL1", "categoryL2", "percentile_bin"]

# Totals are summed over the rows in each group; the two discount ratios are averaged.
bin_metrics = [
    countDistinct("identifier").alias("shoppers"),
    sum("total_txns").alias("total_txns"),
    sum("total_spend").alias("total_spend"),
    sum("total_products").alias("total_products"),
    sum("total_transaction_amount").alias("transaction_amount_pre_discount"),
    sum("total_discount").alias("total_discount_availed"),
    avg("percent_discount_amount").alias("percent_discount_amount"),
    avg("percent_discount_products").alias("percent_discount_products"),
]

a = xx.groupBy(*bin_keys).agg(*bin_metrics)
a.show(5)

+--------------------+-------------+--------------+--------+----------+------------------+--------------+-------------------------------+----------------------+-----------------------+-------------------------+
|          categoryL1|   categoryL2|percentile_bin|shoppers|total_txns|       total_spend|total_products|transaction_amount_pre_discount|total_discount_availed|percent_discount_amount|percent_discount_products|
+--------------------+-------------+--------------+--------+----------+------------------+--------------+-------------------------------+----------------------+-----------------------+-------------------------+
|Health & Personal...| Bath & Body |            10|     726|       730|12973.919999999864|         807.0|              36305.65000000037|     4365.282477122211|    0.33566470573677265|                      1.0|
|Health & Personal...| Bath & Body |             9|     726|       732|11853.189999999882|         792.0|             32377.230000000058|    3695.9391162805

#### Stats on a category(L1,L2) cross overall rank

In [21]:
# One output row per category pair and distinct overall_rank value.
rank_keys = ["categoryL1", "categoryL2", "overall_rank"]

# Totals are summed over the rows in each group; the two discount ratios are averaged.
rank_metrics = [
    countDistinct("identifier").alias("shoppers"),
    sum("total_txns").alias("total_txns"),
    sum("total_spend").alias("total_spend"),
    sum("total_products").alias("total_products"),
    sum("total_transaction_amount").alias("transaction_amount_pre_discount"),
    sum("total_discount").alias("total_discount_availed"),
    avg("percent_discount_amount").alias("percent_discount_amount"),
    avg("percent_discount_products").alias("percent_discount_products"),
]

b = xx.groupBy(*rank_keys).agg(*rank_metrics)
b.show(5)

+--------------------+-------------+--------------------+--------+----------+-----------+--------------+-------------------------------+----------------------+-----------------------+-------------------------+
|          categoryL1|   categoryL2|        overall_rank|shoppers|total_txns|total_spend|total_products|transaction_amount_pre_discount|total_discount_availed|percent_discount_amount|percent_discount_products|
+--------------------+-------------+--------------------+--------+----------+-----------+--------------+-------------------------------+----------------------+-----------------------+-------------------------+
|Health & Personal...| Bath & Body |                 0.0|       1|         1|      19.99|           1.0|                         268.93|  7.433161045625256E-4|   3.718439742683970...|                      1.0|
|Health & Personal...| Bath & Body |1.377410468319559E-4|       1|         1|      19.99|           1.0|             126.40999999999998|  0.001581362234000...| 