In [None]:
import pandas as pd
import numpy as np
import re
from matplotlib.ticker import MaxNLocator
import matplotlib.pyplot as plt
import seaborn as sns

# Loading the data

Let's start by loading the data in pandas dataframes.

## Campaign desc

In [None]:
# Read the campaign description CSV into a DataFrame.
df_campaign_desc = pd.read_csv(filepath_or_buffer='dunnhumby/campaign_desc.csv')

In [None]:
# Preview the first five rows of the campaign description table.
df_campaign_desc.head(n=5)

In [None]:
# Report how many campaign rows the table holds and which description
# labels occur in it.
print('Number of campaign: ', df_campaign_desc['CAMPAIGN'].size)
print('Unique description: ', df_campaign_desc['DESCRIPTION'].unique())

Let's add a column with the duration of each campaign

In [None]:
# Campaign length: difference between the END_DAY and START_DAY columns.
df_campaign_desc['DURATION'] = df_campaign_desc['END_DAY'].sub(df_campaign_desc['START_DAY'])

# Show the campaigns ordered by description, then by duration within each
# description (the sorted frame is only displayed, not stored).
df_campaign_desc.sort_values(['DESCRIPTION', 'DURATION'])

Let's have some statistics for each type of campaign.

In [None]:
# Duration statistics per campaign type.
# Grouping once on DESCRIPTION replaces the hard-coded type list and the four
# repeated boolean filters per iteration: every type present in the data is
# reported (in sorted order), and a type absent from the data can no longer
# produce a NaN that makes the '%d' formatting raise.
for camp, durations in df_campaign_desc.groupby('DESCRIPTION')['DURATION']:
    print('Campaigns of %s range between %d and %d days'
          % (camp, durations.min(), durations.max()))
    print('with a mean duration of %.2f days and a median duration of %.2f days.'
          % (durations.mean(), durations.median()))

We observe that these campaigns do not all last the same number of days. Notice in particular that one campaign of type C lasts 161 days. We will look at the number of coupons distributed during this particular campaign to see what happened.

## Campaign table

In [None]:
# Read the campaign table CSV into a DataFrame.
df_campaign_table = pd.read_csv(filepath_or_buffer='dunnhumby/campaign_table.csv')

In [None]:
# Preview the first five rows of the campaign table.
df_campaign_table.head(n=5)

Let's study the distribution of the number of campaigns each household has benefited from.

In [None]:
# Number of CAMPAIGN entries recorded for each household (computed once and
# reused for both the bin edges and the plot).
campaigns_per_household = df_campaign_table.groupby('household_key')['CAMPAIGN'].count()

# The counts are integers, so use unit-width bins centred on each integer
# value. Passing only a bin *count* spreads the edges evenly over [min, max],
# which puts them at non-integer positions and shifts the bars relative to
# the values they represent.
bin_edges = np.arange(campaigns_per_household.min(),
                      campaigns_per_household.max() + 2) - 0.5

fig, ax = plt.subplots()
campaigns_per_household.hist(bins=bin_edges, ax=ax)
ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # integer tick labels only
ax.set_title('Distribution of the number of campaigns per household')
ax.set_xlabel('Number of campaigns')
ax.set_ylabel('Number of households')
plt.show()

In [None]:
# Number of CAMPAIGN entries per household, computed once instead of being
# re-grouped for every printed statistic.
campaigns_per_household = df_campaign_table.groupby('household_key')['CAMPAIGN'].count()

print('Mean number of campaigns:', campaigns_per_household.mean())
print('Median number of campaigns:', campaigns_per_household.median())
print('Max number of campaigns:', campaigns_per_household.max())

# Same sentence for each quantile of interest; the label and the quantile
# level are kept side by side so they cannot drift apart.
for label, level in (('25%', 0.25), ('75%', 0.75), ('95%', 0.95)):
    print(label + ' of the households have benefited from',
          campaigns_per_household.quantile(q = level),
          'campaigns or less.')

## Causal data

In [None]:
# Read the causal data CSV into a DataFrame.
df_causal = pd.read_csv(filepath_or_buffer='dunnhumby/causal_data.csv')

In [None]:
# Preview the first five rows of the causal data.
df_causal.head(n=5)

Let's define two dictionaries that map the display and mailer codes to their meanings.

In [None]:
# Human-readable meaning of each in-store display code.
# NOTE(review): keys mix ints (0-9) and the string 'A'; whether they match the
# values pandas actually parsed from the CSV depends on the column dtype —
# confirm before using these dicts with .map().
display_dict = {0: 'Not on Display', 1: 'Store Front', 2: 'Store Rear', 3: 'Front End Cap',
                4: 'Mid-Aisle End Cap', 5: 'Rear End Cap', 6: 'Side-Aisle End Cap', 7: 'In-Aisle',
                9: 'Secondary Location Display', 'A': 'In-Shelf'}

# Human-readable meaning of each mailer (ad placement) code.
mailer_dict = {0: 'Not on ad', 'A': 'Interior page feature', 'C': 'Interior page line item',
               'D': 'Front page feature', 'F': 'Back page feature', 'H': 'Wrap front feature',
               'J': 'Wrap interior coupon', 'L': 'Wrap back feature', 'P': 'Interior page coupon',
               'X': 'Free on interior page', 'Z': 'Free on front page, back page or wrap'}

## Coupon redemption

In [None]:
# Read the coupon redemption CSV into a DataFrame.
df_coupon_redempt = pd.read_csv(filepath_or_buffer='dunnhumby/coupon_redempt.csv')

In [None]:
# Preview the first five rows of the coupon redemption table.
df_coupon_redempt.head(n=5)

## Coupon

In [None]:
# Read the coupon CSV into a DataFrame.
df_coupon = pd.read_csv(filepath_or_buffer='dunnhumby/coupon.csv')

In [None]:
# Preview the first five rows of the coupon table.
df_coupon.head(n=5)

Let's study how many coupons were distributed for each campaign.

In [None]:
# Count *distinct* coupon UPCs per campaign.
# count() tallies rows, so a coupon listed on several rows would be counted
# once per row and inflate the "number of coupons".
# NOTE(review): presumably coupon.csv has one row per (coupon, product) pair —
# verify against the dataset's user guide.
coupons_per_campaign = df_coupon.groupby('CAMPAIGN')['COUPON_UPC'].nunique()

fig, ax = plt.subplots()
# Log scale on y: per-campaign counts differ by orders of magnitude.
coupons_per_campaign.plot.bar(logy = True, ax = ax)
ax.set_xlabel('Campaign')
ax.set_ylabel('Number of coupons')
ax.set_title('Number of coupons per campaign')
plt.show()

We notice that some campaigns definitely have far more coupons than others. Is this related to the type of campaign?

## Demographics

In [None]:
# Read the household demographics CSV into a DataFrame.
df_demo = pd.read_csv(filepath_or_buffer='dunnhumby/hh_demographic.csv')

In [None]:
# Preview the first five rows of the demographics table.
df_demo.head(n=5)

## Products

In [None]:
# Read the product CSV into a DataFrame.
df_products = pd.read_csv(filepath_or_buffer='dunnhumby/product.csv')

In [None]:
# Preview the first five rows of the product table.
df_products.head(n=5)

## Transaction data

In [None]:
# Read the transaction CSV into a DataFrame.
df_transaction = pd.read_csv(filepath_or_buffer='dunnhumby/transaction_data.csv')

In [None]:
# Preview the first five rows of the transaction table.
df_transaction.head(n=5)