# UK Holiday Shopper Analysis

#### Importing Yocuda data from GCS

In [62]:
# Import the column helpers in this cell so it runs on a fresh kernel; the
# notebook-wide pyspark import cell sits below this one.
from pyspark.sql.functions import col, to_date

# Load the raw CSV extract from GCS, letting Spark infer the column types.
raw_transactions = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("gs://ds-mlengine/praneeth/2017_Yocuda_Dummy_Data_V01_20180129.csv")
)

# Keep rows dated strictly after 2016-11-30 and strictly before 2017-12-01,
# then retain only the columns used downstream, with the timestamp reduced
# to a calendar date.
transaction_date = to_date(col("timestamp"))
data = (
    raw_transactions
    .filter((transaction_date > '2016-11-30') & (transaction_date < '2017-12-01'))
    .selectExpr("identifier", "item_total", "transaction_id", "to_date(timestamp) as date")
)

In [38]:
from pyspark.sql.functions import *

In [63]:
# Inspect the column names and types of the filtered transaction frame.
data.dtypes

[('identifier', 'string'),
 ('item_total', 'double'),
 ('transaction_id', 'string'),
 ('date', 'date')]

#### Importing UK Calendar

In [40]:
from pandas.tseries.holiday import (
    AbstractHolidayCalendar, DateOffset, EasterMonday,
    GoodFriday, Holiday, MO,
    next_monday, next_monday_or_tuesday)
class EnglandAndWalesHolidayCalendar(AbstractHolidayCalendar):
    """Holiday calendar built from the eight rules listed in ``rules``.

    Fixed-date holidays (New Year's Day, Christmas Day, Boxing Day) carry an
    ``observance`` function that decides the observed date. The three Monday
    bank holidays are anchored to a calendar day and moved onto a Monday with
    a ``DateOffset``. Good Friday and Easter Monday reuse the rules that
    pandas ships.
    """

    # MO(1): first Monday on or after the anchor day.
    _first_monday = DateOffset(weekday=MO(1))
    # MO(-1): last Monday on or before the anchor day.
    _last_monday = DateOffset(weekday=MO(-1))

    rules = [
        Holiday(name='New Years Day', month=1, day=1, observance=next_monday),
        GoodFriday,
        EasterMonday,
        Holiday(
            name='Early May bank holiday',
            month=5,
            day=1,
            offset=_first_monday,
        ),
        Holiday(
            name='Spring bank holiday',
            month=5,
            day=31,
            offset=_last_monday,
        ),
        Holiday(
            name='Summer bank holiday',
            month=8,
            day=31,
            offset=_last_monday,
        ),
        Holiday(name='Christmas Day', month=12, day=25, observance=next_monday),
        # Boxing Day directly follows Christmas Day, so it uses the
        # Monday-or-Tuesday observance rule rather than plain next_monday.
        Holiday(
            name='Boxing Day',
            month=12,
            day=26,
            observance=next_monday_or_tuesday,
        ),
    ]

#### Treating the window from two days before to one day after each holiday as the holiday period

In [41]:
from pandas import DatetimeIndex, datetime
from pandas.tseries.holiday import get_calendar
from pyspark.sql.functions import to_date
from datetime import timedelta
from datetime import date
# Observed holidays from the calendar class between 2016-11-30 and 2017-12-01,
# the same bounds used to filter the transaction data.
holidays = EnglandAndWalesHolidayCalendar().holidays(
    start=date(2016, 11, 30), end=date(2017, 12, 1)
)

# One row per holiday holding the four calendar days of its holiday period:
# two days before, one day before, the holiday itself and one day after.
# Python ``date`` objects are handed straight to Spark, so the columns are
# created as dates without a string round-trip through ``to_date``.
# ``spark`` is the same session object the data-loading cell uses.
list_of_holidays = spark.createDataFrame(
    [
        (
            (holiday - timedelta(days=2)).date(),
            (holiday - timedelta(days=1)).date(),
            holiday.date(),
            (holiday + timedelta(days=1)).date(),
        )
        for holiday in holidays
    ],
    ['date_sub_2', 'date_sub_1', 'date', 'date_add_1'],
)

#### Consolidating holiday dates

In [42]:
from pyspark.sql.functions import lit
# Stack the four per-holiday date columns into a single 'date' column, drop
# duplicate dates, and flag every remaining date as part of a holiday period.
period_columns = ['date_sub_2', 'date_sub_1', 'date', 'date_add_1']
stacked_dates = list_of_holidays.selectExpr(period_columns[0] + ' as date')
for column_name in period_columns[1:]:
    stacked_dates = stacked_dates.union(
        list_of_holidays.selectExpr(column_name + ' as date')
    )
list_of_holidays_v2 = stacked_dates.distinct().withColumn('holiday_indicator', lit(1))
list_of_holidays_v2.show(5)

+----------+-----------------+
|      date|holiday_indicator|
+----------+-----------------+
|2017-04-29|                1|
|2017-04-12|                1|
|2017-05-27|                1|
|2017-01-01|                1|
|2017-04-18|                1|
+----------+-----------------+
only showing top 5 rows



In [43]:
# Inspect the column types of the holiday-period frame before joining on 'date'.
list_of_holidays_v2.dtypes

[('date', 'date'), ('holiday_indicator', 'int')]

#### Joining the holiday dates

In [65]:
# Expose the transactions to Spark SQL as the "data" view.
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0; the view name is unchanged.
data.createOrReplaceTempView("data")

# Left join keeps every transaction; holiday_indicator is 1 for dates inside
# a holiday period and null for all other dates.
base_data = data.join(list_of_holidays_v2, ["date"], "left")
base_data.show(5)

+----------+----------+----------+--------------------+-----------------+
|      date|identifier|item_total|      transaction_id|holiday_indicator|
+----------+----------+----------+--------------------+-----------------+
|2017-04-29| suv@a.com|     148.0|2017-04-29 - E169...|                1|
|2017-04-29|      null|      60.0|2017-04-29 - B121...|                1|
|2017-04-29| oeh@f.com|    115.98|2017-04-29 - B175...|                1|
|2017-04-29| nlw@d.com|      44.0|2017-04-29 - F116...|                1|
|2017-04-29| uzy@b.com|     48.99|2017-04-29 - B162...|                1|
+----------+----------+----------+--------------------+-----------------+
only showing top 5 rows



#### Calculating metrics of interest

In [70]:
# Rows that count towards the metrics: a non-null identifier and a positive item total.
valid_items = base_data.filter(
    col("identifier").isNotNull() & (col("item_total") > 0)
)

# Spend and distinct-transaction count per identifier over all dates.
df1 = valid_items.groupBy("identifier").agg(
    sum("item_total").alias("total_spend"),
    countDistinct("transaction_id").alias("total_txns"),
)

# The same two metrics restricted to rows flagged as holiday-period dates.
df2 = (
    valid_items
    .filter(col("holiday_indicator").isNotNull())
    .groupBy("identifier")
    .agg(
        sum("item_total").alias("holiday_spends"),
        countDistinct("transaction_id").alias("holiday_txns"),
    )
)

# Left join keeps identifiers with no holiday rows; their holiday columns are null.
df = df1.join(df2, "identifier", "left")
df.show(5)

+----------+-----------+----------+--------------+------------+
|identifier|total_spend|total_txns|holiday_spends|holiday_txns|
+----------+-----------+----------+--------------+------------+
| ajb@b.com|        6.0|         1|          null|        null|
| bee@f.com|       32.0|         1|          null|        null|
| bnx@a.com|     679.98|         1|          null|        null|
| buv@e.com|     329.22|         1|          null|        null|
| ccj@e.com|       5.05|         1|          null|        null|
+----------+-----------+----------+--------------+------------+
only showing top 5 rows



#### Segmenting shoppers: at least 60% of both spend and transactions in holiday periods marks a holiday shopper

In [75]:
# Share of each identifier's transactions and spend that fall in holiday periods.
holiday_txn_share = col('holiday_txns') / col('total_txns')
holiday_spend_share = col('holiday_spends') / col('total_spend')

# Both shares must reach 0.6. Null shares (no holiday rows) do not satisfy
# the condition and therefore take the otherwise() label.
shopper_type = when(
    (holiday_txn_share >= 0.6) & (holiday_spend_share >= 0.6),
    "Holiday Shopper",
).otherwise("Not a holiday shopper")

df_final = df.withColumn('Type of shopper', shopper_type)
df_final.show(5)

+----------+-----------+----------+--------------+------------+--------------------+
|identifier|total_spend|total_txns|holiday_spends|holiday_txns|     Type of shopper|
+----------+-----------+----------+--------------+------------+--------------------+
| ajb@b.com|        6.0|         1|          null|        null|Not a holiday sho...|
| bee@f.com|       32.0|         1|          null|        null|Not a holiday sho...|
| bnx@a.com|     679.98|         1|          null|        null|Not a holiday sho...|
| buv@e.com|     329.22|         1|          null|        null|Not a holiday sho...|
| ccj@e.com|       5.05|         1|          null|        null|Not a holiday sho...|
+----------+-----------+----------+--------------+------------+--------------------+
only showing top 5 rows



In [79]:
# Count identifiers per segment. truncate=False prints the full segment
# labels instead of cutting them off at 20 characters.
df_final.groupBy("Type of shopper").agg(count("Type of shopper")).show(truncate=False)

+--------------------+----------------------+
|     Type of shopper|count(Type of shopper)|
+--------------------+----------------------+
|Not a holiday sho...|                 13045|
|     Holiday Shopper|                  1160|
+--------------------+----------------------+

