# UK Holiday Shopper Analysis

#### Importing Yocuda transaction data from GCS

In [13]:
from pyspark.sql.functions import *

In [14]:
# Read every Yocuda extract matching the GCS glob, with a header row and inferred column types.
raw_transactions = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load('gs://westfield-tom/datalab/Yocuda_clean_data_Nov15_Nov17_20171214_v01_*.csv')
)

# Keep rows dated strictly after 2016-11-30 and strictly before 2017-12-01,
# then project the five columns used by the rest of the notebook.
transaction_date = to_date(col("timestamp"))
data = (
    raw_transactions
    .filter((transaction_date > '2016-11-30') & (transaction_date < '2017-12-01'))
    .selectExpr(
        "identifier",
        "item_total",
        "transaction_id",
        "to_date(timestamp) as date",
        "retailer_name",
    )
)

In [16]:
# Inspect the column names and Spark types of the loaded, filtered frame.
data.dtypes

[('identifier', 'string'),
 ('item_total', 'double'),
 ('transaction_id', 'string'),
 ('date', 'date'),
 ('retailer_name', 'string')]

In [15]:
# Preview the first five rows.
# NOTE(review): `identifier` holds shopper e-mail addresses (visible in the recorded
# output) -- clear or redact this cell's output before sharing the notebook.
data.show(5)

+--------------------+----------+--------------------+----------+-------------+
|          identifier|item_total|      transaction_id|      date|retailer_name|
+--------------------+----------+--------------------+----------+-------------+
|                null|    -16.99|20170803 / 4329 /...|2017-08-03|        Argos|
|                null|     69.99|20161208 / 4309 /...|2016-12-08|        Argos|
|ginagordon13@hotm...|      5.99|20170911 / 413 / ...|2017-09-11|        Argos|
|kirsty.reddem96@g...|     11.99|20170506 / 4221 /...|2017-05-06|        Argos|
|                null|     28.99|20171106 / 4479 /...|2017-11-06|        Argos|
+--------------------+----------+--------------------+----------+-------------+
only showing top 5 rows



#### Defining the England and Wales bank-holiday calendar

In [5]:
from pandas.tseries.holiday import (
    AbstractHolidayCalendar, DateOffset, EasterMonday,
    GoodFriday, Holiday, MO,
    next_monday, next_monday_or_tuesday)
class EnglandAndWalesHolidayCalendar(AbstractHolidayCalendar):
    """Bank-holiday rules for England and Wales as pandas ``Holiday`` rules.

    The fixed-date holidays (New Year's Day, Christmas Day, Boxing Day) use the
    pandas observance functions ``next_monday`` / ``next_monday_or_tuesday`` to
    move to a following weekday. The May and August bank holidays are anchored
    to a Monday through a ``DateOffset``.
    """

    # MO(1) resolves to the first Monday on or after the anchor date,
    # MO(-1) to the last Monday on or before it.
    _first_monday = DateOffset(weekday=MO(1))
    _last_monday = DateOffset(weekday=MO(-1))

    rules = [
        Holiday('New Years Day', month=1, day=1, observance=next_monday),
        GoodFriday,
        EasterMonday,
        # First Monday of May.
        Holiday('Early May bank holiday', month=5, day=1, offset=_first_monday),
        # Last Monday of May.
        Holiday('Spring bank holiday', month=5, day=31, offset=_last_monday),
        # Last Monday of August.
        Holiday('Summer bank holiday', month=8, day=31, offset=_last_monday),
        Holiday('Christmas Day', month=12, day=25, observance=next_monday),
        Holiday('Boxing Day', month=12, day=26, observance=next_monday_or_tuesday),
    ]

#### Treating the window from 2 days before to 1 day after each bank holiday as the holiday period

In [6]:
from pandas import DatetimeIndex, datetime
from pandas.tseries.holiday import get_calendar
from pyspark.sql.functions import to_date
from datetime import timedelta
from datetime import date
# Bank holidays between 2016-11-30 and 2017-12-01, as a pandas DatetimeIndex.
holidays = EnglandAndWalesHolidayCalendar().holidays(start=date(2016, 11, 30), end=date(2017, 12, 1))

# One row per holiday carrying the four days of its holiday period:
# two days before, one day before, the holiday itself, and one day after.
# The timestamps travel to Spark as strings and are turned into dates by to_date().
# The frame is built through `spark`, the same session object used to read the CSVs.
list_of_holidays = spark.createDataFrame(
    [
        (
            str(x - timedelta(days=2)),
            str(x - timedelta(days=1)),
            str(x),
            str(x + timedelta(days=1)),
        )
        for x in holidays
    ],
    ['date_sub_2', 'date_sub_1', 'date', 'date_add_1'],
).selectExpr(
    'to_date(date_sub_2) as date_sub_2',
    'to_date(date_sub_1) as date_sub_1',
    'to_date(date) as date',
    'to_date(date_add_1) as date_add_1',
)

#### Consolidating holiday dates

In [8]:
from pyspark.sql.functions import lit

# Stack the four day-columns of list_of_holidays into a single `date` column,
# drop the duplicates produced by overlapping holiday periods, and tag every
# remaining date with holiday_indicator = 1.
stacked_dates = None
for window_column in ('date_sub_2', 'date_sub_1', 'date', 'date_add_1'):
    one_column = list_of_holidays.selectExpr('{} as date'.format(window_column))
    stacked_dates = one_column if stacked_dates is None else stacked_dates.union(one_column)

list_of_holidays_v2 = stacked_dates.distinct().withColumn('holiday_indicator', lit(1))
list_of_holidays_v2.show(5)
list_of_holidays_v2.count()

+----------+-----------------+
|      date|holiday_indicator|
+----------+-----------------+
|2017-04-29|                1|
|2017-04-12|                1|
|2017-05-27|                1|
|2017-01-01|                1|
|2017-04-18|                1|
+----------+-----------------+
only showing top 5 rows



28

In [9]:
# Inspect the column types of the holiday lookup; the join below matches on `date`.
list_of_holidays_v2.dtypes

[('date', 'date'), ('holiday_indicator', 'int')]

#### Joining the holiday dates

In [17]:
# Expose the transactions to Spark SQL under the view name "data".
# createOrReplaceTempView is the non-deprecated equivalent of registerTempTable.
data.createOrReplaceTempView("data")

# The left join keeps every transaction. holiday_indicator is 1 when the
# transaction date falls inside a holiday period and null otherwise.
base_data = data.join(list_of_holidays_v2, ["date"], "left")
base_data.show(5)

+----------+--------------------+----------+------------------+-------------+-----------------+
|      date|          identifier|item_total|    transaction_id|retailer_name|holiday_indicator|
+----------+--------------------+----------+------------------+-------------+-----------------+
|2017-08-11|gm.webster@hotmai...|      60.0|097752162734110817|     Halfords|             null|
|2017-08-11|                null|       4.0|072955061411110817|     Halfords|             null|
|2017-08-11|evans372@hotmail.com|     99.99|041452173483110817|     Halfords|             null|
|2017-08-11|                null|       1.5|051752125006110817|     Halfords|             null|
|2017-08-11|                null|     330.0|052053084342110817|     Halfords|             null|
+----------+--------------------+----------+------------------+-------------+-----------------+
only showing top 5 rows



#### Calculating metrics of interest

In [18]:
# Line items with a known shopper, a positive value, and (added restriction) sold by Argos.
argos_purchases = base_data.filter(
    col("identifier").isNotNull()
    & (col("retailer_name") == "Argos")
    & (col("item_total") > 0)
)

# Per-shopper totals over the whole period.
# `sum` here is pyspark.sql.functions.sum (brought in by the star import), not the builtin.
df1 = (
    argos_purchases
    .groupBy("identifier")
    .agg(
        sum("item_total").alias("total_spend"),
        countDistinct("transaction_id").alias("total_txns"),
    )
)

# Per-shopper totals restricted to dates inside a holiday period.
df2 = (
    argos_purchases
    .filter(col("holiday_indicator").isNotNull())
    .groupBy("identifier")
    .agg(
        sum("item_total").alias("holiday_spends"),
        countDistinct("transaction_id").alias("holiday_txns"),
    )
)

# Left join: shoppers with no holiday purchases keep null holiday_spends / holiday_txns.
df = df1.join(df2, "identifier", "left")
df.show(5)
df.count()

+--------------------+------------------+----------+--------------+------------+
|          identifier|       total_spend|total_txns|holiday_spends|holiday_txns|
+--------------------+------------------+----------+--------------+------------+
|   -.tmh24@gmail.com|             13.99|         1|          null|        null|
|-indiaismial78672...|             73.97|         1|          null|        null|
|.j.douglas@hotmai...|             14.99|         1|          null|        null|
|    00673@uk.mcd.com|             71.48|         1|          null|        null|
|00blackswan7@yaho...|29.479999999999997|         1|          null|        null|
+--------------------+------------------+----------+--------------+------------+
only showing top 5 rows



12105105

#### Segmenting shoppers by the share of their spend and transactions made in holiday periods (at least 60% of both = holiday shopper)

In [19]:
# Share of each shopper's transactions and spend that falls inside holiday periods.
holiday_txn_share = col('holiday_txns') / col('total_txns')
holiday_spend_share = col('holiday_spends') / col('total_spend')

# A shopper is a "Holiday Shopper" only when both shares reach 0.6.
# Shoppers with null holiday metrics fall through to the otherwise() label.
meets_both_thresholds = (holiday_txn_share >= 0.6) & (holiday_spend_share >= 0.6)
df_final = df.withColumn(
    'Type of shopper',
    when(meets_both_thresholds, "Holiday Shopper").otherwise("Not a holiday shopper"),
)
df_final.show(5)

+--------------------+------------------+----------+--------------+------------+--------------------+
|          identifier|       total_spend|total_txns|holiday_spends|holiday_txns|     Type of shopper|
+--------------------+------------------+----------+--------------+------------+--------------------+
|   -.tmh24@gmail.com|             13.99|         1|          null|        null|Not a holiday sho...|
|-indiaismial78672...|             73.97|         1|          null|        null|Not a holiday sho...|
|.j.douglas@hotmai...|             14.99|         1|          null|        null|Not a holiday sho...|
|    00673@uk.mcd.com|             71.48|         1|          null|        null|Not a holiday sho...|
|00blackswan7@yaho...|29.479999999999997|         1|          null|        null|Not a holiday sho...|
+--------------------+------------------+----------+--------------+------------+--------------------+
only showing top 5 rows



In [20]:
# Number of shoppers carrying each segment label.
shopper_type_counts = df_final.groupBy("Type of shopper").agg(count("Type of shopper"))
shopper_type_counts.show()

+--------------------+----------------------+
|     Type of shopper|count(Type of shopper)|
+--------------------+----------------------+
|Not a holiday sho...|              11324749|
|     Holiday Shopper|                780356|
+--------------------+----------------------+



In [24]:
# Bucket shoppers by floor(10 * holiday share of transactions).
# Shoppers with null holiday_txns land in a null bucket.
score_bucket = floor((col('holiday_txns') / col('total_txns')) * 10)

# Shopper count plus overall and holiday spend / transaction totals per bucket.
hol_sum1 = (
    df.withColumn('holiday_score_group', score_bucket)
    .groupBy("holiday_score_group")
    .agg(
        countDistinct('identifier').alias('unique_shoppers'),
        sum('total_spend').alias('spend'),
        sum('total_txns').alias('num_of_txns'),
        sum('holiday_spends').alias('hol_spend'),
        sum('holiday_txns').alias('hol_num_of_txns'),
    )
)
hol_sum1.show()

+-------------------+---------------+--------------------+-----------+--------------------+---------------+
|holiday_score_group|unique_shoppers|               spend|num_of_txns|           hol_spend|hol_num_of_txns|
+-------------------+---------------+--------------------+-----------+--------------------+---------------+
|                  0|          21012|3.1920018769999992E7|     418752|   1972758.739999998|          27890|
|                  7|           3181|  1102608.1800000009|      13189|   838284.5499999997|           9866|
|               null|       10617978|1.1330786129308734E9|   16068080|                null|           null|
|                  6|          27397|           7045084.5|      89527|   4720962.689999998|          58791|
|                  9|              3|   6111.039999999999|         35|             5899.58|             32|
|                  5|         276794| 4.280748829000031E7|     602513|2.1778674120000217E7|         301516|
|                  1|       