# UK Yocuda Shopper Attributes

#### Importing Yocuda data from GCS

In [1]:
data = (spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("gs://ds-mlengine/praneeth/2017_Yocuda_Dummy_Data_V01_20180129.csv"))

In [2]:
from pyspark.sql.functions import *

#### Recency

In [22]:
data.registerTempTable("data")
df1 =data.selectExpr("identifier","cat1","cat2","timestamp", "1 as ind","item_total","transaction_id","brandName as brand","item_quantity")  \
    .filter((to_date(col("timestamp"))>'2016-11-30') & \
                  (to_date(col("timestamp"))<'2017-12-01') & \
                  (col("identifier").isNotNull()) & \
                  (col("item_total")>0))  \

df2 =data.selectExpr("cat1","cat2","timestamp ", "1 as ind").filter((to_date(col("timestamp"))>'2016-11-30') & \
                  (to_date(col("timestamp"))<'2017-12-01') & \
                  (col("identifier").isNotNull()) & \
                  (col("item_total")>0))\
.groupBy(col("cat1"),col("cat2"),col("ind")).agg(max("timestamp").alias("timestamp_mx_data"))

df3 = df1.join(df2,["cat1","cat2","ind"],"left")

df4 =data.selectExpr("identifier","cat1","cat2","timestamp").filter((to_date(col("timestamp"))>'2016-11-30') & \
                  (to_date(col("timestamp"))<'2017-12-01') & \
                  (col("identifier").isNotNull()) & \
                  (col("item_total")>0))\
.groupBy("identifier","cat1","cat2").agg(max("timestamp").alias("timestamp_mx_user"))

df5 = df3.join(df4,["identifier","cat1","cat2"],"left")

df6 = df5.selectExpr("identifier","cat1","cat2","timestamp_mx_data","timestamp_mx_user") \
         .withColumn('recency',datediff('timestamp_mx_data','timestamp_mx_user'))

df6.show(5)

+----------+------+------+--------------------+--------------------+-------+
|identifier|  cat1|  cat2|   timestamp_mx_data|   timestamp_mx_user|recency|
+----------+------+------+--------------------+--------------------+-------+
| xsa@c.com|Media | Books|2017-10-26 15:16:...|2017-07-05 13:40:...|    113|
| inn@e.com|Media | Books|2017-10-26 15:16:...|2017-08-22 00:28:...|     65|
| amd@c.com|Media | Books|2017-10-26 15:16:...|2017-09-05 02:55:...|     51|
| uls@a.com|Media | Books|2017-10-26 15:16:...|2017-07-13 20:43:...|    105|
| ioq@a.com|Media | Books|2017-10-26 15:16:...|2017-01-29 14:27:...|    270|
+----------+------+------+--------------------+--------------------+-------+
only showing top 5 rows



#### Calculating metrics

In [16]:
total_spend = df5.selectExpr("identifier","cat1","cat2", "item_total")\
.groupBy(col("identifier"),col("cat1"),col("cat2")).agg(sum("item_total").alias("total_spend"))

frequency = df5.selectExpr("identifier","cat1","cat2", "transaction_id","timestamp")\
.groupBy(col("identifier"),col("cat1"),col("cat2")).agg(countDistinct(to_date("timestamp")).alias("frequency"))

total_spend.show(5)
frequency.show(5)

+----------+--------------------+--------------------+-----------+
|identifier|                cat1|                cat2|total_spend|
+----------+--------------------+--------------------+-----------+
| ilu@c.com|          Furniture |     Kids Furniture |       7.05|
| ajg@c.com|       Toys & Games |            Hobbies |        9.0|
| zjs@e.com|          Furniture | Kitchen & Dining...|      36.45|
| cok@b.com|     Home & Kitchen |   Small Appliances |       22.0|
| xdu@a.com|Arts, Crafts & Se...|     Craft Supplies |       18.0|
+----------+--------------------+--------------------+-----------+
only showing top 5 rows

+----------+--------------------+----------------+---------+
|identifier|                cat1|            cat2|frequency|
+----------+--------------------+----------------+---------+
| gsc@e.com|            Jewelry |          Women |        1|
| gem@a.com|Patio, Lawn & Gar...|   Farm & Ranch |        1|
| eck@d.com|     Home & Kitchen |     Home D̩cor |        1|
| ijn@

#### Combining data

In [19]:
overall = data.filter((to_date(col("timestamp"))>'2016-11-30') & (to_date(col("timestamp"))<'2017-12-01')).select("identifier","cat1","cat2").distinct()

combined = overall.join(total_spend,["identifier","cat1","cat2"],"left") \
                        .join(frequency,["identifier","cat1","cat2"],"left") 
  
final_combined = combined.selectExpr("identifier","cat1","cat2","total_spend","frequency") 
                         
final_combined.show(5)

+----------+--------------------+-------------+-----------+---------+
|identifier|                cat1|         cat2|total_spend|frequency|
+----------+--------------------+-------------+-----------+---------+
|      null|Health & Personal...| Bath & Body |       null|     null|
| ajg@c.com|       Toys & Games |     Hobbies |        9.0|        1|
| als@b.com|  Sports & Outdoors |     Cycling |       25.0|        1|
| anx@f.com|     Home & Kitchen |  Home D̩cor |        2.0|        1|
| awv@e.com|Health & Personal...| Health Care |       24.0|        1|
+----------+--------------------+-------------+-----------+---------+
only showing top 5 rows



#### Percentile bins - Spend & Frequency Decile Buckets

In [20]:
from pyspark.sql.window import Window
xx1=final_combined.groupBy("cat1","cat2") \
             .agg(count("*").alias("count"))  

xx=final_combined.join(xx1,["cat1","cat2"],"left")\
                 .withColumn("spend_bin", 11-ceil(10*((row_number().over(Window.partitionBy("cat1","cat2")\
                                                                   .orderBy(desc("total_spend"))))/col('count')))) \
                 .withColumn("frequency", 11-ceil(10*((row_number().over(Window.partitionBy("cat1","cat2")\
                                                                   .orderBy(desc("frequency"))))/col('count')))) 
                 
xx.show(5)

+--------------------+-------------+----------+-----------+---------+-----+---------+
|                cat1|         cat2|identifier|total_spend|frequency|count|spend_bin|
+--------------------+-------------+----------+-----------+---------+-----+---------+
|Health & Personal...| Bath & Body | mqm@e.com|      554.5|       10|   63|       10|
|Health & Personal...| Bath & Body | ndh@f.com|      118.0|       10|   63|       10|
|Health & Personal...| Bath & Body | fgd@c.com|     109.98|       10|   63|       10|
|Health & Personal...| Bath & Body | qel@a.com|     109.95|       10|   63|       10|
|Health & Personal...| Bath & Body | rzg@c.com|     101.97|       10|   63|       10|
+--------------------+-------------+----------+-----------+---------+-----+---------+
only showing top 5 rows



#### Top preferred brand

In [43]:
total_spend_by_brand = df5.selectExpr("identifier","cat1","cat2","brand", "item_total")\
.groupBy(col("identifier"),col("cat1"),col("cat2"),col("brand")).agg(sum("item_total").alias("total_spend")).orderBy("identifier","brand")

row_num = total_spend_by_brand.withColumn("rownum",row_number().over(Window.partitionBy("identifier","cat1","cat2")\
                                                                   .orderBy(desc("total_spend"))))
top_preferred_brand = row_num.filter(col("rownum")==1).selectExpr("identifier","cat1","cat2","total_spend",
                                                                  "brand as top_preferred_brand")
top_preferred_brand.show(5)

+----------+--------------------+--------------------+-----------+-------------------+
|identifier|                cat1|                cat2|total_spend|top_preferred_brand|
+----------+--------------------+--------------------+-----------+-------------------+
| ajg@c.com|       Toys & Games |            Hobbies |        9.0|                 GS|
| als@b.com|  Sports & Outdoors |            Cycling |       25.0|                 IZ|
| anx@f.com|     Home & Kitchen |         Home D̩cor |        2.0|                 PX|
| awv@e.com|Health & Personal...|        Health Care |       24.0|                 OZ|
| ayt@a.com|        Electronics | Cell Phones & Ac...|       27.0|                 LZ|
+----------+--------------------+--------------------+-----------+-------------------+
only showing top 5 rows



#### Category source of volume - Sub category level spend

In [44]:
total_spend = df5.selectExpr("identifier","cat1","cat2", "item_total")\
.groupBy(col("identifier"),col("cat1"),col("cat2")).agg(sum("item_total").alias("total_spend"))

total_spend.show(5)

+----------+--------------------+--------------------+-----------+
|identifier|                cat1|                cat2|total_spend|
+----------+--------------------+--------------------+-----------+
| ilu@c.com|          Furniture |     Kids Furniture |       7.05|
| ajg@c.com|       Toys & Games |            Hobbies |        9.0|
| zjs@e.com|          Furniture | Kitchen & Dining...|      36.45|
| cok@b.com|     Home & Kitchen |   Small Appliances |       22.0|
| xdu@a.com|Arts, Crafts & Se...|     Craft Supplies |       18.0|
+----------+--------------------+--------------------+-----------+
only showing top 5 rows



#### Shopper - day of week spend

In [80]:
xx=df5.selectExpr("identifier","to_date(timestamp) as date","item_total")\
.groupBy("identifier","date").agg(sum("item_total").alias("spend"))
yy=xx.withColumn("Day_of_week",date_format("date","E")).groupBy("identifier","Day_of_week").agg(sum("spend").alias("spend"))
abc=yy.withColumn("rownum",row_number().over(Window.partitionBy("identifier","Day_of_week")\
                                                                   .orderBy(desc("spend"))))
top_preferred_day = abc.filter(col("rownum")==1).selectExpr("identifier","spend",
                                                                  "Day_of_week as top_preferred_day")
top_preferred_day.show(5)

+----------+------+-----------------+
|identifier| spend|top_preferred_day|
+----------+------+-----------------+
| aco@a.com|  12.0|              Wed|
| acp@c.com|  9.99|              Thu|
| afl@d.com|  47.0|              Sun|
| auw@f.com|105.99|              Mon|
| bsh@c.com| 11.99|              Sun|
+----------+------+-----------------+
only showing top 5 rows



#### Time of day - Shopper spends

In [79]:
xx=df5.selectExpr("identifier","timestamp","item_total").withColumn("Timestamp",to_timestamp("timestamp", 'yyyy-MM-dd HH:mm:ss'))\
.groupBy("identifier","Timestamp").agg(sum("item_total").alias("spend"))

yy=xx.withColumn("Hour_of_day",hour("Timestamp")).groupBy("identifier","Hour_of_day").agg(sum("spend").alias("spend"))

zz=yy.withColumn("Hour_bucket", when(((col("Hour_of_day")>=8) & (col("Hour_of_day")<12)),"8AM-12 noon")\
                                .when(((col("Hour_of_day")>=12) & (col("Hour_of_day")<17)),"12 noon-4PM")\
                                .when(((col("Hour_of_day")>=17) & (col("Hour_of_day")<22)),"5PM-10PM"))\
.groupBy("identifier","Hour_bucket").agg(sum("spend").alias("spend"))
abc=zz.withColumn("rownum",row_number().over(Window.partitionBy("identifier","Hour_bucket")\
                                                                   .orderBy(desc("spend"))))
top_preferred_hour_bucket = abc.filter(col("rownum")==1).selectExpr("identifier","spend",
                                                                  "Hour_bucket as top_preferred_hour_bucket")
top_preferred_hour_bucket.show(5)

+----------+-----------------+-------------------------+
|identifier|            spend|top_preferred_hour_bucket|
+----------+-----------------+-------------------------+
| are@e.com|             33.0|              12 noon-4PM|
| aru@e.com|86.99000000000001|                 5PM-10PM|
| bak@f.com|           153.99|              12 noon-4PM|
| cia@a.com|             50.5|                     null|
| dcy@a.com|            30.45|                     null|
+----------+-----------------+-------------------------+
only showing top 5 rows

