# UK Yocuda Luxury Analysis - 31st Jan,2018

#### Importing youcuda data from GCS

In [34]:
data = (spark.read.format("csv").option("header", "true").load("gs://ds-mlengine/praneeth/2017_Yocuda_Dummy_Data_V01_20180129.csv"))

In [44]:
from pyspark.sql.functions import *

 #### Creating the data on a brand cross category level

In [50]:
data.registerTempTable("data")
base_data =data.selectExpr('identifier','brandName as brand', 'cat1 AS category','item_total_txn AS transaction_amount', 'item_product_code as product_id', 'item_total','item_discounts_amount', 'timestamp','item_quantity as quantity', 'transaction_id')  \
    .withColumn('item_unit_price', col('transaction_amount') / col('quantity')) \
  .filter((to_date(col("timestamp"))>'2016-11-30') & \
                  (to_date(col("timestamp"))<'2017-12-01') & \
                  (col("identifier").isNotNull()) & \
                  (col("item_total")>0)) 
base_data.show(5)  

+----------+-----+--------+------------------+----------+----------+---------------------+--------------------+--------+--------------------+---------------+
|identifier|brand|category|transaction_amount|product_id|item_total|item_discounts_amount|           timestamp|quantity|      transaction_id|item_unit_price|
+----------+-----+--------+------------------+----------+----------+---------------------+--------------------+--------+--------------------+---------------+
| xsa@c.com|   CG|  Media |            115.97|   5004040|         2|                    0|2017-07-05 13:40:...|       2|2017-07-05 - G123...|         57.985|
| inn@e.com|   PP|  Media |            144.75|   5036515|    138.75|               -27.75|2017-08-22 00:28:...|       5|2017-08-22 - D188...|          28.95|
| amd@c.com|   VL|  Media |             48.45|   5035216|     28.45|                -5.69|2017-09-05 02:55:...|       1|2017-09-05 - D112...|          48.45|
| uls@a.com|   SR|  Media |            166.05|   503

#### Exporting brand cross category level data file (csv format) into GCS

In [91]:
(base_data.coalesce(1).write.option("header", "true").csv("gs://ds-mlengine/praneeth/yocuda_luxury_analysis_files/yocuda_brand_category_base_data"))

#### Calculating the brand's average price per product, min price, max price, 25th, 75th percentile

In [61]:
from pyspark.sql.window import Window 
# Creating brand level information that would be used for bucketing brands into luxury buckets
brand_prices_percentile = base_data.select('brand','category', 'product_id', 'item_unit_price').distinct() \
                                    .withColumn('in_brand_percentile', percent_rank().over(Window.partitionBy(col('brand')) \
                                    .orderBy(col('item_unit_price')))) 
min_mean_max = brand_prices_percentile.groupBy('brand','category').agg(min('item_unit_price').alias('value_min'), \
                                                            max('item_unit_price').alias('value_max'), \
                                                            avg('item_unit_price').alias('value_mean')) 
quartile_25 = brand_prices_percentile.filter(col('in_brand_percentile') <= 0.25) \
                                     .groupBy('brand','category').agg(max('item_unit_price').alias('value_25')) 
quartile_50 = brand_prices_percentile.filter(col('in_brand_percentile') <= 0.5) \
                                     .groupBy('brand','category').agg(max('item_unit_price').alias('value_50')) 
quartile_75 = brand_prices_percentile.filter(col('in_brand_percentile') <= 0.75) \
                                     .groupBy('brand','category').agg(max('item_unit_price').alias('value_75')) 
brand_metrics = min_mean_max.join(quartile_25, ['brand','category'], 'left').join(quartile_50, ['brand','category'], 'left') \
                            .join(quartile_75, ['brand','category'], 'left')

#### Seeing the data

In [62]:
brand_metrics.show(5)

+-----+--------------------+---------+---------+------------------+--------+--------+--------+
|brand|            category|value_min|value_max|        value_mean|value_25|value_50|value_75|
+-----+--------------------+---------+---------+------------------+--------+--------+--------+
|   VK|       Toys & Games |    34.67|   399.99|            145.55|    null|   53.99|   53.99|
|   WG|Clothing & Access...|    -0.01|    -0.01|             -0.01|   -0.01|   -0.01|   -0.01|
|   UD|     Home & Kitchen |     3.99|   263.98| 72.71928571428572|    10.5|  33.035|  33.035|
|   IL|        Electronics |      7.0|   211.43|104.61250000000001|     7.0|   34.05|   34.05|
|   BO|Clothing & Access...|   103.45|   103.45|            103.45|    null|    null|  103.45|
+-----+--------------------+---------+---------+------------------+--------+--------+--------+
only showing top 5 rows



#### Exporting brand metrics data file (csv format) into GCS

In [92]:
(brand_metrics.coalesce(1).write.option("header", "true").csv("gs://ds-mlengine/praneeth/yocuda_luxury_analysis_files/yocuda_brand_metrics_data"))

#### Calcualting scores

In [58]:
luxury_scores = brand_metrics \
.withColumn('rank1', percent_rank().over(Window.orderBy('value_min'))) \
.withColumn('rank2', percent_rank().over(Window.orderBy('value_25'))) \
.withColumn('rank3', percent_rank().over(Window.orderBy('value_50'))) \
.withColumn('rank4', percent_rank().over(Window.orderBy('value_mean'))) \
.withColumn('rank5', percent_rank().over(Window.orderBy('value_75'))) \
.withColumn('rank6', percent_rank().over(Window.orderBy('value_max'))) \
.withColumn('final_score', (col('rank1') + col('rank2') + col('rank3') + col('rank4') + col('rank5') + col('rank6'))/6) \
.withColumn('percentile_rank', percent_rank().over(Window.orderBy('final_score')))

#### Seeing the data

In [59]:
luxury_scores.show(5)

+-----+--------------------+---------+---------+----------+--------+--------+--------+--------------------+------------------+------------------+--------------------+-------------------+--------------------+-------------------+--------------------+
|brand|            category|value_min|value_max|value_mean|value_25|value_50|value_75|               rank1|             rank2|             rank3|               rank4|              rank5|               rank6|        final_score|     percentile_rank|
+-----+--------------------+---------+---------+----------+--------+--------+--------+--------------------+------------------+------------------+--------------------+-------------------+--------------------+-------------------+--------------------+
|   PI|Health & Personal...|    26.45|    26.45|     26.45|    null|    null|   26.45| 0.41552393272962485|               0.0|               0.0| 0.12095730918499353| 0.2575679172056921| 0.09598965071151358|0.14833980163863733|                 0.0|
|   

#### Exporting luxury scores data file (csv format) into GCS

In [93]:
(luxury_scores.coalesce(1).write.option("header", "true").csv("gs://ds-mlengine/praneeth/yocuda_luxury_analysis_files/yocuda_luxury_scores_data"))

In [66]:
df =luxury_scores.selectExpr('brand', 'category','percentile_rank')  
df.show(5)  

+-----+--------------------+--------------------+
|brand|            category|     percentile_rank|
+-----+--------------------+--------------------+
|   PI|Health & Personal...|                 0.0|
|   ES|  Sports & Outdoors | 1.29366106080207E-4|
|   QX|Tools & Home Impr...| 2.58732212160414E-4|
|   QA|Computers & Acces...|3.880983182406209...|
|   DJ|Industrial & Scie...| 5.17464424320828E-4|
+-----+--------------------+--------------------+
only showing top 5 rows



#### Caculating user's respective luxury score

In [114]:
user_level_luxury_score = base_data.join(luxury_scores \
                          .withColumn('bucket_value', when(col('percentile_rank') >= 0.8,5).when(col('percentile_rank') >= 0.6,4) \
                          .when(col('percentile_rank') >= 0.4,3).when(col('percentile_rank') >= 0.2,2) \
                          .otherwise(1)).select('brand', 'category','bucket_value'), ['brand','category'], 'inner') \
        .select('identifier','category', 'quantity', 'bucket_value') \
        .withColumn('total_bucket_value', col('quantity') * col('bucket_value')) \
        .groupBy('identifier','category').agg(sum(col('quantity')).alias('net_quantity'), sum(col('total_bucket_value')).alias('sum_buckets')) \
        .withColumn('luxury_score', col('sum_buckets') / col('net_quantity')) \
        .select('identifier','category', 'luxury_score','net_quantity')

#### Seeing the data

In [115]:
user_level_luxury_score.show(5)

+----------+--------------------+------------+------------+
|identifier|            category|luxury_score|net_quantity|
+----------+--------------------+------------+------------+
| ssu@e.com|Health & Personal...|         1.0|         1.0|
| czj@e.com|  Sports & Outdoors |         1.0|         1.0|
| qgo@e.com|Tools & Home Impr...|         1.0|         1.0|
| sdb@a.com|Computers & Acces...|         1.0|         1.0|
| aod@e.com|Industrial & Scie...|         1.0|         1.0|
+----------+--------------------+------------+------------+
only showing top 5 rows



#### Exporting user level luxury scores data file (csv format) into GCS

In [94]:
(user_level_luxury_score.coalesce(1).write.option("header", "true").csv("gs://ds-mlengine/praneeth/yocuda_luxury_analysis_files/yocuda_user_level_luxury_score_data"))

#### Number of shoppers with different luxury scores for every category

In [95]:
frequency = user_level_luxury_score.withColumn('luxury_score', floor('luxury_score')) \
                       .groupBy('category','luxury_score').agg(countDistinct('identifier').alias('frequency')) \
                       .orderBy('category',desc('luxury_score'))
frequency.show(10)    

+--------------------+------------+---------+
|            category|luxury_score|frequency|
+--------------------+------------+---------+
|Arts, Crafts & Se...|           5|       57|
|Arts, Crafts & Se...|           4|       24|
|Arts, Crafts & Se...|           3|       34|
|Arts, Crafts & Se...|           2|       42|
|Arts, Crafts & Se...|           1|       68|
|         Automotive |           5|       89|
|         Automotive |           4|       60|
|         Automotive |           3|       43|
|         Automotive |           2|       67|
|         Automotive |           1|       77|
+--------------------+------------+---------+
only showing top 10 rows



#### Exporting category cross luxury scores frequency data file (csv format) into GCS

In [96]:
(frequency.coalesce(1).write.option("header", "true").csv("gs://ds-mlengine/praneeth/yocuda_luxury_analysis_files/yocuda_category_luxury_score_frequency_data"))

#### Categorizing a shopper as Luxury / Non-luxury shopper based on the score

In [87]:
yocuda_luxury_score = base_data.join(user_level_luxury_score.withColumn('luxury_ind', when(col('luxury_score') > 4,"lux_shopper") \
                    .otherwise("non_lux_shopper")).select('identifier','category','luxury_score','luxury_ind'), \
                                     ['identifier','category'], 'left')\
                    .select('identifier','brand', 'category','transaction_amount', \
                            'product_id', 'item_total','item_discounts_amount', 'timestamp',\
                            'quantity', 'transaction_id','item_unit_price','luxury_score', 'luxury_ind') 
yocuda_luxury_score.show(5)

+----------+-----+---------------+------------------+----------+----------+---------------------+--------------------+--------+--------------------+---------------+------------+---------------+
|identifier|brand|       category|transaction_amount|product_id|item_total|item_discounts_amount|           timestamp|quantity|      transaction_id|item_unit_price|luxury_score|     luxury_ind|
+----------+-----+---------------+------------------+----------+----------+---------------------+--------------------+--------+--------------------+---------------+------------+---------------+
| abe@d.com|Other|   Electronics |             65.42|   5047710|     39.45|               -3.945|2017-08-07 21:37:...|       1|2017-08-07 - G130...|          65.42|         5.0|    lux_shopper|
| afh@e.com|   XI|   Electronics |             57.99|   5042937|     57.99|                    0|2017-10-16 18:49:...|       1|2017-10-16 - B180...|          57.99|         5.0|    lux_shopper|
| akv@a.com|   SZ|       Jewel

#### Exporting category wise user segmentation data file (csv format) into GCS

In [97]:
(yocuda_luxury_score.coalesce(1).write.option("header", "true").csv("gs://ds-mlengine/praneeth/yocuda_luxury_analysis_files/yocuda_category_user_segmentation_data"))

#### Generic metrics on a category cross luxury indicator level

In [103]:
df =yocuda_luxury_score.groupBy("category","luxury_ind").agg(countDistinct('identifier').alias('unique_shoppers'),\
                                                        sum('item_total').alias('spend'),\
                                                       countDistinct('transaction_id').alias('num_of_txns'),\
                                                     sum('quantity').alias('quantity'))\
                                                     .withColumn('avg_price_purchased_prods',col('spend')/col('quantity'))
df.show(5)  

+--------------------+---------------+---------------+------------------+-----------+--------+-------------------------+
|            category|     luxury_ind|unique_shoppers|             spend|num_of_txns|quantity|avg_price_purchased_prods|
+--------------------+---------------+---------------+------------------+-----------+--------+-------------------------+
|              Shoes |non_lux_shopper|            335| 16909.94999999999|        336|   542.0|         31.1991697416974|
|         Automotive |    lux_shopper|             89| 4894.769999999997|         89|   141.0|       34.714680851063804|
|              Media |non_lux_shopper|             39|1523.0600000000002|         39|    65.0|        23.43169230769231|
|Industrial & Scie...|    lux_shopper|             19|1025.0800000000002|         19|    27.0|        37.96592592592593|
|       Pet Supplies |non_lux_shopper|            216| 10196.60999999999|        216|   349.0|       29.216647564469884|
+--------------------+----------

#### Exporting category cross luxury indicator level data file (csv format) into GCS

In [104]:
(df.coalesce(1).write.option("header", "true").csv("gs://ds-mlengine/praneeth/yocuda_luxury_analysis_files/yocuda_category_luxury_ind_data"))

#### Top 5 categories by total spend

In [110]:
xx1 =base_data.groupBy("category").agg(sum('item_total').alias('spend')).orderBy(desc("spend"))
xx1.show(5)  

+--------------------+------------------+
|            category|             spend|
+--------------------+------------------+
|     Home & Kitchen | 256787.0400000004|
|       Toys & Games |127602.55000000003|
|Tools & Home Impr...| 113431.2100000001|
|  Sports & Outdoors | 96857.21999999987|
|        Electronics | 91625.14999999992|
+--------------------+------------------+
only showing top 5 rows



#### User level scores on an overall level

In [116]:
user_level_luxury_score_overall = user_level_luxury_score \
          .select('identifier','net_quantity','luxury_score') \
          .withColumn('total_luxury_value', col('net_quantity') * col('luxury_score')) \
          .groupBy('identifier').agg(sum(col('net_quantity')).alias('net_quantity'), sum(col('total_luxury_value')).alias('sum_score')) \
          .withColumn('overall_luxury_score', col('sum_score') / col('net_quantity')) 
user_level_luxury_score_overall.show(5)        

+----------+------------+---------+--------------------+
|identifier|net_quantity|sum_score|overall_luxury_score|
+----------+------------+---------+--------------------+
| ssu@e.com|         1.0|      1.0|                 1.0|
| czj@e.com|         2.0|      4.0|                 2.0|
| qgo@e.com|         1.0|      1.0|                 1.0|
| sdb@a.com|         2.0|      2.0|                 1.0|
| aod@e.com|         1.0|      1.0|                 1.0|
+----------+------------+---------+--------------------+
only showing top 5 rows



#### Exporting overall luxury score on a user-level data file (csv format) into GCS

In [117]:
(user_level_luxury_score_overall.coalesce(1).write.option("header", "true").csv("gs://ds-mlengine/praneeth/yocuda_luxury_analysis_files/yocuda_overall_luxury_score_user_data"))