# UK Yocuda Luxury Analysis - 31st Jan,2018

#### Importing youcuda data from GCS

In [1]:
data = (spark.read.format("csv").option("header", "true").load('gs://westfield-tom/datalab/Yocuda_clean_data_Nov15_Nov17_20171214_v01_*.csv'))

In [2]:
from pyspark.sql.functions import *

 #### Creating the data on a brand cross category level

In [3]:
data.registerTempTable("data")
base_data =data.selectExpr('identifier','brandName as brand', 'cat1 AS category','item_total_txn AS transaction_amount', 'item_product_code as product_id', 'item_total','item_discounts_amount', 'timestamp','item_quantity as quantity', 'transaction_id')  \
    .withColumn('item_unit_price', col('item_total') / col('quantity')) \
  .filter((to_date(col("timestamp"))>'2016-11-30') & \
                  (to_date(col("timestamp"))<'2017-12-01') & \
                  (col("identifier").isNotNull()) & \
                  (col("retailer_name")=='Argos') & \
                  (col("item_total")>0)) 
base_data.show(5)  

+--------------------+------------+--------------------+------------------+----------+----------+---------------------+--------------------+--------+--------------------+---------------+
|          identifier|       brand|            category|transaction_amount|product_id|item_total|item_discounts_amount|           timestamp|quantity|      transaction_id|item_unit_price|
+--------------------+------------+--------------------+------------------+----------+----------+---------------------+--------------------+--------+--------------------+---------------+
|ginagordon13@hotm...|Other Brands|              Shoes |             49.01|  686/5483|      5.99|                 null|2017-09-11 12:52:...|       1|20170911 / 413 / ...|          49.01|
|kirsty.reddem96@g...|      silver|            Jewelry |             19.97|  392/2374|     11.99|                 null|2017-05-06 11:49:...|       1|20170506 / 4221 /...|          19.97|
|andrearobinson2@o...|Other Brands|     Home & Kitchen |         

In [21]:
test1 = base_data.agg(countDistinct('identifier'))
test1.show()

+--------------------------+
|count(DISTINCT identifier)|
+--------------------------+
|                  12103974|
+--------------------------+



#### Exporting brand cross category level data file (csv format) into GCS

In [91]:
(base_data.coalesce(1).write.option("header", "true").csv("gs://westfield-tom/datalab/brand_category_base_data"))

#### Calculating the brand's average price per product, min price, max price, 25th, 75th percentile

In [4]:
from pyspark.sql.window import Window 
# Creating brand level information that would be used for bucketing brands into luxury buckets
brand_prices_percentile = base_data.select('brand','category', 'product_id', 'item_unit_price').distinct() \
                                    .withColumn('in_brand_percentile', percent_rank().over(Window.partitionBy(col('brand')) \
                                    .orderBy(col('item_unit_price')))) 
min_mean_max = brand_prices_percentile.groupBy('brand','category').agg(min('item_unit_price').alias('value_min'), \
                                                            max('item_unit_price').alias('value_max'), \
                                                            avg('item_unit_price').alias('value_mean')) 
quartile_25 = brand_prices_percentile.filter(col('in_brand_percentile') <= 0.25) \
                                     .groupBy('brand','category').agg(max('item_unit_price').alias('value_25')) 
quartile_50 = brand_prices_percentile.filter(col('in_brand_percentile') <= 0.5) \
                                     .groupBy('brand','category').agg(max('item_unit_price').alias('value_50')) 
quartile_75 = brand_prices_percentile.filter(col('in_brand_percentile') <= 0.75) \
                                     .groupBy('brand','category').agg(max('item_unit_price').alias('value_75')) 
brand_metrics = min_mean_max.join(quartile_25, ['brand','category'], 'left').join(quartile_50, ['brand','category'], 'left') \
                            .join(quartile_75, ['brand','category'], 'left')

#### Seeing the data

In [5]:
brand_metrics.show(5)

+---------------+--------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+
|          brand|            category|          value_min|         value_max|        value_mean|         value_25|         value_50|          value_75|
+---------------+--------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+
|       2k Games|        Video Games |            -173.01|           1098.98| 180.3300977843792|            77.47|           124.97|230.01000000000002|
|        Muskoka|          Furniture |            1649.99|           1874.98|1793.9666666666665|          1649.99|          1856.93|           1856.93|
|    donna karan|            Jewelry |-60.000000000000014| 563.1800000000001|134.35552380952367|40.04999999999999|94.97999999999999|            149.99|
|         garmin|Computers & Acces...|              -10.0|1124.7800000000002|157.2676086

#### Exporting brand metrics data file (csv format) into GCS

In [92]:
(brand_metrics.coalesce(1).write.option("header", "true").csv("gs://westfield-tom/datalab/yocuda_brand_metrics_data"))

#### Calcualting scores

In [6]:
luxury_scores = brand_metrics \
.withColumn('rank1', percent_rank().over(Window.orderBy('value_min'))) \
.withColumn('rank2', percent_rank().over(Window.orderBy('value_25'))) \
.withColumn('rank3', percent_rank().over(Window.orderBy('value_50'))) \
.withColumn('rank4', percent_rank().over(Window.orderBy('value_mean'))) \
.withColumn('rank5', percent_rank().over(Window.orderBy('value_75'))) \
.withColumn('rank6', percent_rank().over(Window.orderBy('value_max'))) \
.withColumn('final_score', (col('rank1') + col('rank2') + col('rank3') + col('rank4') + col('rank5') + col('rank6'))/6) \
.withColumn('percentile_rank', percent_rank().over(Window.orderBy('final_score')))

#### Seeing the data

In [47]:
brand_metrics_test = luxury_scores\
              .groupBy('category').agg(min('percentile_rank').alias('min_perc_rank'), \
                max('percentile_rank').alias('max_perc_rank'))
brand_metrics_test.show(25)

+--------------------+--------------------+-------------------+
|            category|       min_perc_rank|      max_perc_rank|
+--------------------+--------------------+-------------------+
|Health & Personal...|                 0.0| 0.9968012794882047|
|        Electronics |                 0.0| 0.9860055977608957|
|     Home & Kitchen |7.996801279488205E-4|  0.998000799680128|
|          Furniture |0.001199520191923...| 0.9992003198720512|
|  Sports & Outdoors |0.001599360255897641| 0.9996001599360256|
|      Baby Products |0.001999200319872051| 0.9568172730907637|
|Clothing & Access...|0.002798880447820...| 0.9936025589764095|
|    Office Products |0.003598560575769692|  0.995201919232307|
|            Jewelry |0.003998400639744102| 0.9856057576969213|
|       Toys & Games |0.009196321471411436|  0.973610555777689|
|Patio, Lawn & Gar...|0.009596161535385846|                1.0|
|Computers & Acces...|0.011195521791283487| 0.9924030387844862|
|         Automotive |0.0127948820471811

In [7]:
luxury_scores.show(5)

+---------------+--------------------+---------+---------+------------------+--------+--------+--------+------------------+-------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+
|          brand|            category|value_min|value_max|        value_mean|value_25|value_50|value_75|             rank1|              rank2|              rank3|               rank4|               rank5|               rank6|        final_score|     percentile_rank|
+---------------+--------------------+---------+---------+------------------+--------+--------+--------+------------------+-------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+
| Bang & Olufsen|        Electronics |      0.0|      0.0|               0.0|     0.0|     0.0|     0.0|0.4962015193922431|0.07716913234706117|0.04798080767692923|                 0.0|0.0267892842

#### Exporting luxury scores data file (csv format) into GCS

In [93]:
(luxury_scores.coalesce(1).write.option("header", "true").csv("gs://westfield-tom/datalab/yocuda_luxury_scores_data"))

In [8]:
df =luxury_scores.selectExpr('brand', 'category','percentile_rank')  
df.show(5)  

+---------------+--------------------+--------------------+
|          brand|            category|     percentile_rank|
+---------------+--------------------+--------------------+
|          Clean|Health & Personal...|                 0.0|
| Bang & Olufsen|        Electronics |                 0.0|
|         sakura|     Home & Kitchen |7.996801279488205E-4|
|flash furniture|          Furniture |0.001199520191923...|
|          Arena|  Sports & Outdoors |0.001599360255897641|
+---------------+--------------------+--------------------+
only showing top 5 rows



#### Caculating user's respective luxury score

In [9]:
user_level_luxury_score = base_data.join(luxury_scores \
                          .withColumn('bucket_value', when(col('percentile_rank') >= 0.8,5).when(col('percentile_rank') >= 0.6,4) \
                          .when(col('percentile_rank') >= 0.4,3).when(col('percentile_rank') >= 0.2,2) \
                          .otherwise(1)).select('brand', 'category','bucket_value'), ['brand','category'], 'inner') \
        .select('identifier','category', 'quantity', 'bucket_value') \
        .withColumn('total_bucket_value', col('quantity') * col('bucket_value')) \
        .groupBy('identifier','category').agg(sum(col('quantity')).alias('net_quantity'), sum(col('total_bucket_value')).alias('sum_buckets')) \
        .withColumn('luxury_score', col('sum_buckets') / col('net_quantity')) \
        .select('identifier','category', 'luxury_score','net_quantity')

#### Seeing the data

In [10]:
user_level_luxury_score.show(5)

+--------------------+---------------+------------+------------+
|          identifier|       category|luxury_score|net_quantity|
+--------------------+---------------+------------+------------+
|miranda-elle@hotm...|  Toys & Games |         4.0|         3.0|
|se_evans@hotmail....|  Toys & Games |         4.0|         1.0|
|andrea.harwood@ym...|  Toys & Games |         3.5|         8.0|
|shazmalik17@hotma...|  Toys & Games |         4.0|         4.0|
|    amcmahon@sky.com|Home & Kitchen |         3.0|         1.0|
+--------------------+---------------+------------+------------+
only showing top 5 rows



In [25]:
test2 = user_level_luxury_score.agg(countDistinct('identifier'))
test2.show()

+--------------------------+
|count(DISTINCT identifier)|
+--------------------------+
|                  11224605|
+--------------------------+



In [18]:
user_level_luxury_score.count()

18760140

## Exporting Shopper-Category level luxury scores data file (csv format) into GCS

In [28]:
(user_level_luxury_score.coalesce(1).write.option("header", "true").csv("gs://westfield-tom/datalab/luxury_score_data_v1"))

#### Number of shoppers with different luxury scores for every category

In [11]:
frequency = user_level_luxury_score.withColumn('luxury_score', floor('luxury_score')) \
                       .groupBy('category','luxury_score').agg(countDistinct('identifier').alias('frequency')) \
                       .orderBy('category',desc('luxury_score'))
frequency.show(10)    

+--------------------+------------+---------+
|            category|luxury_score|frequency|
+--------------------+------------+---------+
|Arts, Crafts & Se...|           5|    16159|
|Arts, Crafts & Se...|           4|   117564|
|Arts, Crafts & Se...|           3|     5805|
|Arts, Crafts & Se...|           2|    22589|
|Arts, Crafts & Se...|           1|       36|
|         Automotive |           5|     4261|
|         Automotive |           4|   216191|
|         Automotive |           3|    14312|
|         Automotive |           2|     3105|
|         Automotive |           1|      258|
+--------------------+------------+---------+
only showing top 10 rows



#### Exporting category cross luxury scores frequency data file (csv format) into GCS

In [96]:
(frequency.coalesce(1).write.option("header", "true").csv("gs://westfield-tom/datalab/yocuda_category_luxury_score_frequency_data"))

#### Categorizing a shopper as Luxury / Non-luxury shopper based on the score

In [12]:
yocuda_luxury_score = base_data.join(user_level_luxury_score.withColumn('luxury_ind', when(col('luxury_score') > 4,"lux_shopper") \
                    .otherwise("non_lux_shopper")).select('identifier','category','luxury_score','luxury_ind'), \
                                     ['identifier','category'], 'left')\
                    .select('identifier','brand', 'category','transaction_amount', \
                            'product_id', 'item_total','item_discounts_amount', 'timestamp',\
                            'quantity', 'transaction_id','item_unit_price','luxury_score', 'luxury_ind') 
yocuda_luxury_score.show(5)

+--------------------+------------+------------------+------------------+----------+----------+---------------------+--------------------+--------+--------------------+---------------+------------------+---------------+
|          identifier|       brand|          category|transaction_amount|product_id|item_total|item_discounts_amount|           timestamp|quantity|      transaction_id|item_unit_price|      luxury_score|     luxury_ind|
+--------------------+------------+------------------+------------------+----------+----------+---------------------+--------------------+--------+--------------------+---------------+------------------+---------------+
|"mike""camilleri@...|Other Brands|Sports & Outdoors |             99.99|  620/9537|     99.99|                 null|2017-03-23 19:35:...|       1|20170323 / 4120 /...|          99.99|               4.0|non_lux_shopper|
|+holliekelly93@ho...|Other Brands|      Electronics |             51.98|  503/3498|     29.99|                 null|201

#### Distribution of luxury score

In [26]:
sum1 =yocuda_luxury_score.withColumn('luxury_score', floor('luxury_score')).groupBy("luxury_score", "category").agg(countDistinct('identifier').alias('unique_shoppers'),\
                                                        sum('item_total').alias('spend'),\
                                                       countDistinct('transaction_id').alias('num_of_txns'),\
                                                     sum('quantity').alias('quantity'))\
                                                     .withColumn('avg_price_purchased_prods',col('spend')/col('quantity'))
sum1.show(6)

+------------+--------------------+---------------+--------------------+-----------+--------+-------------------------+
|luxury_score|            category|unique_shoppers|               spend|num_of_txns|quantity|avg_price_purchased_prods|
+------------+--------------------+---------------+--------------------+-----------+--------+-------------------------+
|           3|          Furniture |          14530|   808811.7499999984|      17342| 20745.0|        38.98827428295967|
|           3|Health & Personal...|         475710|1.5072848660000069E7|     559985|649588.0|       23.203705517959182|
|           5|           Software |           5673|   319696.6500000002|       5747|  5792.0|       55.196244820442026|
|           3|        Electronics |         219516|   7103918.140000129|     261553|329833.0|       21.537924161621575|
|           3|      Baby Products |         273558|1.0463161069999909E7|     300833|335417.0|       31.194486475044226|
|           1|  Sports & Outdoors |     

In [27]:
(sum1.coalesce(1).write.option("header", "true").csv("gs://westfield-tom/datalab/luxury_ind_category_distribution"))

#### Exporting category wise user segmentation data file (csv format) into GCS

In [97]:
(yocuda_luxury_score.coalesce(1).write.option("header", "true").csv("gs://westfield-tom/datalab/yocuda_category_user_segmentation_data"))

#### Generic metrics on a category cross luxury indicator level

In [29]:
df =yocuda_luxury_score.groupBy("category","luxury_ind").agg(countDistinct('identifier').alias('unique_shoppers'),\
                                                        sum('item_total').alias('spend'),\
                                                       countDistinct('transaction_id').alias('num_of_txns'),\
                                                     sum('quantity').alias('quantity'))\
                                                     .withColumn('avg_price_purchased_prods',col('spend')/col('quantity'))
df.show(5)  

+--------------------+---------------+---------------+--------------------+-----------+---------+-------------------------+
|            category|     luxury_ind|unique_shoppers|               spend|num_of_txns| quantity|avg_price_purchased_prods|
+--------------------+---------------+---------------+--------------------+-----------+---------+-------------------------+
|              Shoes |non_lux_shopper|         115670|  3193716.5399999795|     119279| 126097.0|        25.32745854381928|
|         Automotive |    lux_shopper|           4393|           342101.34|       4612|   4699.0|        72.80300915088317|
|              Media |non_lux_shopper|           4405|   42147.39000000003|       4458|   5431.0|        7.760521082673546|
|Industrial & Scie...|    lux_shopper|            639|   37435.25000000002|        665|    675.0|        55.45962962962966|
|                null|           null|        3205143|2.8235315817995405E8|    4011795|4879417.0|        57.86616683508584|
+-------

#### Exporting category cross luxury indicator level data file (csv format) into GCS

In [104]:
(df.coalesce(1).write.option("header", "true").csv("gs://westfield-tom/datalab/yocuda_category_luxury_ind_data"))

#### Top 5 categories by total spend

In [30]:
xx1 =base_data.groupBy("category").agg(sum('item_total').alias('spend')).orderBy(desc("spend"))
xx1.show(5)  

+--------------------+--------------------+
|            category|               spend|
+--------------------+--------------------+
|        Electronics | 2.840659656999931E8|
|                null|2.8235315817999303E8|
|     Home & Kitchen | 2.141566713099843E8|
|       Toys & Games | 8.614666659000352E7|
|Computers & Acces...| 8.348656677000155E7|
+--------------------+--------------------+
only showing top 5 rows



#### User level scores on an overall level

In [31]:
user_level_luxury_score_overall = user_level_luxury_score \
          .select('identifier','net_quantity','luxury_score') \
          .withColumn('total_luxury_value', col('net_quantity') * col('luxury_score')) \
          .groupBy('identifier').agg(sum(col('net_quantity')).alias('net_quantity'), sum(col('total_luxury_value')).alias('sum_score')) \
          .withColumn('overall_luxury_score', col('sum_score') / col('net_quantity')) 
user_level_luxury_score_overall.show(5)        

+--------------------+------------+---------+--------------------+
|          identifier|net_quantity|sum_score|overall_luxury_score|
+--------------------+------------+---------+--------------------+
|dianebatty77@gmai...|         1.0|      2.0|                 2.0|
|karen_terry2000@y...|        55.0|    160.0|   2.909090909090909|
|awhetstone@hotmai...|         1.0|      4.0|                 4.0|
|evangelis77@gmail...|         1.0|      4.0|                 4.0|
|giles.burden@mpsa...|         3.0|     12.0|                 4.0|
+--------------------+------------+---------+--------------------+
only showing top 5 rows



## Exporting overall luxury score on a user-level data file (csv format) into GCS

In [32]:
(user_level_luxury_score_overall.coalesce(1).write.option("header", "true").csv("gs://westfield-tom/datalab/yocuda_overall_luxury_score_user_data"))

#### Overall distribution of Luxury Scores

In [39]:
yocuda_luxury_score_tot = base_data.join(user_level_luxury_score_overall.withColumn('overall_luxury_ind', when(col('overall_luxury_score') > 4,"overall_lux_shopper") \
                    .otherwise("overall_non_lux_shopper")).select('identifier','overall_luxury_score','overall_luxury_ind'), \
                                     ['identifier'], 'left')\
                    .select('identifier','brand', 'category','transaction_amount', \
                            'product_id', 'item_total','item_discounts_amount', 'timestamp',\
                            'quantity', 'transaction_id','item_unit_price','overall_luxury_score', 'overall_luxury_ind') 
yocuda_luxury_score_tot.show(5)

+--------------------+------------+------------------+-------------------+----------+----------+---------------------+--------------------+--------+--------------------+-------------------+--------------------+--------------------+
|          identifier|       brand|          category| transaction_amount|product_id|item_total|item_discounts_amount|           timestamp|quantity|      transaction_id|    item_unit_price|overall_luxury_score|  overall_luxury_ind|
+--------------------+------------+------------------+-------------------+----------+----------+---------------------+--------------------+--------+--------------------+-------------------+--------------------+--------------------+
|   -.tmh24@gmail.com|Other Brands|      Electronics |              13.99|  631/5706|     13.99|                 null|2017-05-31 08:09:...|       1|20170531 / 461 / ...|              13.99|                 4.0|overall_non_lux_s...|
|-indiaismial78672...|Other Brands|   Home & Kitchen |              73.9

In [40]:
sum2 =yocuda_luxury_score_tot.withColumn('overall_luxury_score', floor('overall_luxury_score')).groupBy("overall_luxury_score").agg(countDistinct('identifier').alias('unique_shoppers'),\
                  sum('item_total').alias('spend'),\
                 countDistinct('transaction_id').alias('num_of_txns'),\
                 sum('quantity').alias('quantity'))\
                .withColumn('avg_price_purchased_prods',col('spend')/col('quantity'))
sum2.show(6)

+--------------------+---------------+--------------------+-----------+-----------+-------------------------+
|overall_luxury_score|unique_shoppers|               spend|num_of_txns|   quantity|avg_price_purchased_prods|
+--------------------+---------------+--------------------+-----------+-----------+-------------------------+
|                null|         879369| 9.221364122999571E7|     916617|  1078129.0|        85.53117598171991|
|                   5|        1202310|1.8932506316998821E8|    1344884|  1577653.0|       120.00424882403685|
|                   1|          24674|   638062.8799999978|      27649|    41454.0|       15.392070246538278|
|                   3|        4874789| 5.704587440808098E8|   10150848|1.9197759E7|       29.714861202331473|
|                   2|         516608|2.5633598500002168E7|     756059|  1386273.0|       18.491017642269718|
|                   4|        4606224| 5.485284309703792E8|    6910098|1.0189338E7|        53.83356906703647|
+---------

### Investigating brands in an example category

In [44]:
brand_level_luxury_score = base_data.join(luxury_scores \
                          .withColumn('bucket_value', when(col('percentile_rank') >= 0.8,5).when(col('percentile_rank') >= 0.6,4) \
                          .when(col('percentile_rank') >= 0.4,3).when(col('percentile_rank') >= 0.2,2) \
                          .otherwise(1)).select('brand', 'category','bucket_value'), ['brand','category'], 'inner') \
        .select('brand','category', 'quantity', 'bucket_value', 'item_total') \
        .withColumn('total_bucket_value', col('quantity') * col('bucket_value')) \
        .groupBy('brand','category').agg(sum(col('quantity')).alias('net_quantity'), sum(col('total_bucket_value')).alias('sum_buckets'), sum(col('item_total')).alias('spend')) \
        .withColumn('luxury_score', col('sum_buckets') / col('net_quantity')) \
        .select('brand','category', 'luxury_score','net_quantity','spend')

In [45]:
brand_level_luxury_score.show(10)

+---------------+--------------------+------------+------------+------------------+
|          brand|            category|luxury_score|net_quantity|             spend|
+---------------+--------------------+------------+------------+------------------+
|       2k Games|        Video Games |         5.0|      7575.0| 297116.3099999689|
|        Muskoka|          Furniture |         5.0|         3.0|           5099.97|
|    donna karan|            Jewelry |         4.0|       177.0|15995.229999999967|
|         garmin|Computers & Acces...|         5.0|       262.0| 23251.38000000008|
|rubie's costume|    Office Products |         1.0|         7.0|148.92999999999998|
|           swan|Tools & Home Impr...|         4.0|        95.0|26141.160000000033|
|         Abacus|    Office Products |         1.0|       227.0|  2669.72999999999|
|       Kingsley|     Home & Kitchen |         3.0|      4223.0|41443.320000000625|
|       Sabatier|     Home & Kitchen |         4.0|      1489.0|  47218.1099

In [46]:
(brand_level_luxury_score.coalesce(1).write.option("header", "true").csv("gs://westfield-tom/datalab/brand_level_luxury_score"))