# Yocuda Discount Shopper Analysis

#### Importing youcuda data from GCS

In [3]:
data = (spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("gs://ds-mlengine/praneeth/2017_Yocuda_Dummy_Data_V01_20180129.csv"))

In [4]:
from pyspark.sql.functions import *

#### Creating base data

In [7]:
data.registerTempTable("data")
base_data =data.selectExpr('identifier',
                           'brandName as brand', 
                           'cat1 AS categoryL1',
                           'cat2 AS categoryL2',
                           'item_total_txn AS transaction_amount',
                           'item_product_code as product_id', 
                           'item_total', 
                           'item_discounts_amount',
                           'timestamp',
                           'item_quantity as quantity', 
                           'transaction_id')  \
    .withColumn('item_discount', (col('item_discounts_amount')* -1) ) \
    .withColumn('item_unit_price', col('transaction_amount') / col('quantity')) \
  .filter((to_date(col("timestamp"))>'2016-11-30') & \
                  (to_date(col("timestamp"))<'2017-12-01') & \
                  (col("identifier").isNotNull()) & \
                  (col("item_total")>0) & \
                  (col("item_discount")>=0)) 
base_data.show(5)

+----------+-----+----------+----------+------------------+----------+----------+---------------------+--------------------+--------+--------------------+-------------+---------------+
|identifier|brand|categoryL1|categoryL2|transaction_amount|product_id|item_total|item_discounts_amount|           timestamp|quantity|      transaction_id|item_discount|item_unit_price|
+----------+-----+----------+----------+------------------+----------+----------+---------------------+--------------------+--------+--------------------+-------------+---------------+
| xsa@c.com|   CG|    Media |     Books|            115.97|   5004040|       2.0|                  0.0|2017-07-05 13:40:...|       2|2017-07-05 - G123...|         -0.0|         57.985|
| inn@e.com|   PP|    Media |     Books|            144.75|   5036515|    138.75|               -27.75|2017-08-22 00:28:...|       5|2017-08-22 - D188...|        27.75|          28.95|
| amd@c.com|   VL|    Media |     Books|             48.45|   5035216|     

In [8]:
base_data.dtypes

[('identifier', 'string'),
 ('brand', 'string'),
 ('categoryL1', 'string'),
 ('categoryL2', 'string'),
 ('transaction_amount', 'double'),
 ('product_id', 'int'),
 ('item_total', 'double'),
 ('item_discounts_amount', 'double'),
 ('timestamp', 'string'),
 ('quantity', 'int'),
 ('transaction_id', 'string'),
 ('item_discount', 'double'),
 ('item_unit_price', 'double')]

#### Calculating general discount metrics on user cross category L1/L2 level

In [10]:
df1=base_data.groupBy("identifier","categoryL1","categoryL2").agg(sum("item_total").alias("total_spend"),
                                                sum("item_discount").alias("total_discount"),
                                                sum("transaction_amount").alias("total_transaction_amount"),
                                                countDistinct(concat("transaction_id",to_date(col("timestamp")))).alias("total_txns"),
                                                sum("quantity").alias("total_products"))

df2=base_data.filter(col("item_discount")>0) \
             .groupBy("identifier","categoryL1","categoryL2") \
             .agg(sum("quantity").alias("discount_products"))                                                     

df=df1.join(df2,["identifier","categoryL1","categoryL2"],"left")\
      .withColumn("percent_discount_amount",col("total_discount")/col("total_spend"))\
      .withColumn("percent_discount_products",col("discount_products")/col("total_products"))
df.show(5)

+----------+--------------------+--------------------+-----------+--------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+
|identifier|          categoryL1|          categoryL2|total_spend|total_discount|total_transaction_amount|total_txns|total_products|discount_products|percent_discount_amount|percent_discount_products|
+----------+--------------------+--------------------+-----------+--------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+
| nsb@d.com|     Home & Kitchen | Kitchen & Table ...|      33.99|           0.0|                  209.28|         1|             1|             null|                    0.0|                     null|
| tzu@f.com|Computers & Acces...| Touch Screen Tab...|      10.45|         3.135|                    95.7|         1|             1|                1|                    0.3|                      

#### Assigning percentile ranks for percent_discount_amount & percent_discount_products

In [11]:
from pyspark.sql.window import Window
percentile_df = df.withColumn("percentile_amount", percent_rank().over(Window.partitionBy("categoryL1","categoryL2") \
                                                                       .orderBy('percent_discount_amount'))) \
                  .withColumn("percentile_products", percent_rank().over(Window.partitionBy("categoryL1","categoryL2") \
                                                                       .orderBy('percent_discount_products'))) 
percentile_df.show(5)

+----------+--------------------+-------------+-----------+--------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+-----------------+-------------------+
|identifier|          categoryL1|   categoryL2|total_spend|total_discount|total_transaction_amount|total_txns|total_products|discount_products|percent_discount_amount|percent_discount_products|percentile_amount|percentile_products|
+----------+--------------------+-------------+-----------+--------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+-----------------+-------------------+
| zba@a.com|Health & Personal...| Bath & Body |      15.98|           0.0|                   15.98|         1|             2|             null|                    0.0|                     null|              0.0|                0.0|
| bzn@f.com|Health & Personal...| Bath & Body |      44.99|           0.

#### Generating quantile buckets by usinf overall_rank(calculated using score)

In [12]:
score=percentile_df.withColumn("score",(col("percentile_amount")*0.7)+(col("percentile_products")*0.3))
overall_rank = score.withColumn("overall_rank", percent_rank().over(Window.partitionBy("categoryL1","categoryL2") \
                                                                       .orderBy('score'))) 
quantiles = overall_rank.withColumn('quantiles',  when(col('overall_rank') <= 0.01,"First percentile") \
                                                 .when(col('overall_rank') <= 0.02,"Second percentile") \
                                                 .when(col('overall_rank') <= 0.03,"Third percentile")\
                                                 .when(col('overall_rank') <= 0.10,"4 to 10 percentile") \
                                                 .when(col('overall_rank') <= 0.20,"10 to 20 percentile") \
                                                 .when(col('overall_rank') <= 0.40,"20 to 40 percentile") \
                                                 .when(col('overall_rank') <= 0.60,"40 to 60 percentile") \
                                                 .when(col('overall_rank') <= 0.80,"60 to 80 percentile") \
                                                 .when(col('overall_rank') <= 0.90,"80 to 90 percentile") \
                                                 .when(col('overall_rank') >= 1,"90 to 100 percentile") )      
quantiles.show(5)

+----------+--------------------+-------------+-----------+--------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+-----------------+-------------------+-----+------------+----------------+
|identifier|          categoryL1|   categoryL2|total_spend|total_discount|total_transaction_amount|total_txns|total_products|discount_products|percent_discount_amount|percent_discount_products|percentile_amount|percentile_products|score|overall_rank|       quantiles|
+----------+--------------------+-------------+-----------+--------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+-----------------+-------------------+-----+------------+----------------+
| ndh@f.com|Health & Personal...| Bath & Body |      118.0|           0.0|                   118.0|         1|             2|             null|                    0.0|                     null|   

#### Percentile bins

In [13]:
xx1=quantiles.groupBy("categoryL1","categoryL2") \
             .agg(count("*").alias("count"))        
  
xx=quantiles.join(xx1,["categoryL1","categoryL2"],"left")\
      .withColumn("percentile_bin", 11-ceil(10*((row_number().over(Window.partitionBy("categoryL1","categoryL2")\
                                                                   .orderBy(desc("score"))))/col('count'))))
xx.show(5)

+--------------------+-------------+----------+-----------+--------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+------------------+-------------------+------------------+------------------+---------+-----+--------------+
|          categoryL1|   categoryL2|identifier|total_spend|total_discount|total_transaction_amount|total_txns|total_products|discount_products|percent_discount_amount|percent_discount_products| percentile_amount|percentile_products|             score|      overall_rank|quantiles|count|percentile_bin|
+--------------------+-------------+----------+-----------+--------------+------------------------+----------+--------------+-----------------+-----------------------+-------------------------+------------------+-------------------+------------------+------------------+---------+-----+--------------+
|Health & Personal...| Bath & Body | dqu@a.com|       7.45|         2.235|                    

#### Stats on a category(L1,L2) cross percentile bin

In [14]:
a=xx.groupBy("categoryL1","categoryL2","percentile_bin").agg(countDistinct("identifier").alias("shoppers"),
                                                             sum("total_txns").alias("total_txns"),
                                                             sum("total_spend").alias("total_spend"),
                                                             sum("total_products").alias("total_products"),
                                                             sum("total_transaction_amount").alias("transaction_amount_pre_discount"),
                                                             sum("total_discount").alias("total_discount_availed"),
                                                             avg("percent_discount_amount").alias("percent_discount_amount"),
                                                             avg("percent_discount_products").alias("percent_discount_products"))
a.show(5)

+--------------------+-------------+--------------+--------+----------+------------------+--------------+-------------------------------+----------------------+-----------------------+-------------------------+
|          categoryL1|   categoryL2|percentile_bin|shoppers|total_txns|       total_spend|total_products|transaction_amount_pre_discount|total_discount_availed|percent_discount_amount|percent_discount_products|
+--------------------+-------------+--------------+--------+----------+------------------+--------------+-------------------------------+----------------------+-----------------------+-------------------------+
|Health & Personal...| Bath & Body |            10|       5|         5|             224.7|             9|                          376.7|                 67.41|                    0.3|                      1.0|
|Health & Personal...| Bath & Body |             9|       6|         6|             713.2|            16|                         964.14|                 88

#### Stats on a category(L1,L2) cross overall rank

In [15]:
b=xx.groupBy("categoryL1","categoryL2","overall_rank").agg(countDistinct("identifier").alias("shoppers"),
                                                             sum("total_txns").alias("total_txns"),
                                                             sum("total_spend").alias("total_spend"),
                                                             sum("total_products").alias("total_products"),
                                                             sum("total_transaction_amount").alias("transaction_amount_pre_discount"),
                                                             sum("total_discount").alias("total_discount_availed"),
                                                             avg("percent_discount_amount").alias("percent_discount_amount"),
                                                             avg("percent_discount_products").alias("percent_discount_products")) 
b.show(5)

+--------------------+-------------+------------------+--------+----------+-----------+--------------+-------------------------------+----------------------+-----------------------+-------------------------+
|          categoryL1|   categoryL2|      overall_rank|shoppers|total_txns|total_spend|total_products|transaction_amount_pre_discount|total_discount_availed|percent_discount_amount|percent_discount_products|
+--------------------+-------------+------------------+--------+----------+-----------+--------------+-------------------------------+----------------------+-----------------------+-------------------------+
|Health & Personal...| Bath & Body |               0.0|      47|        47|    1654.72|            62|              4083.119999999998|                   0.0|                    0.0|                     null|
|Health & Personal...| Bath & Body |0.8245614035087719|       1|         1|       19.5|             2|                           19.5|                  1.95|    0.09999