## Description: Predictive Model, All Impressions
Each record is a unique user and campaign combination. Use only the first survey response per user and campaign, and remove all impressions that occur after it. Convert the remaining impressions into separate columns, one column per cut, where each value is the frequency of that cut.

- Site/Channel/Device/Medium
- Baseline (Need to revisit this, basic comp)
- Month/Year/Weekday/Daytime
- Creative Size
- Recency
- Cum Sum of Weekly Impression Viewings


Notes:
- Product differences.
- Side by side comparisons of weighted vs. non-weighted data
- Frequency is very important.

Note: Need to correct the differences in sites, such as GDN vs. Google Display Network.

In [341]:
import pandas as pd, numpy as np, os 
from datetime import timedelta, datetime
from functools import reduce
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
%matplotlib inline

### Data Cleaning

In [342]:
# Load the raw impression-level export that every later cell reads as `df`.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
df = pd.read_csv(r'C:\Users\william.raikes\Programming\Python\abacus\v1_model\data\raw\chromebook.csv')

In [343]:
def freq_counts(df, label):
    """Count rows per (user_id, campaign_id) for each value of `label`, in wide form.

    Returns one row per user/campaign pair with one column per distinct value
    of `label`, named ``<label>_<value>``. Combinations that never occur are
    filled with 0. The two key columns keep their original names.
    """
    key_cols = ('user_id', 'campaign_id')

    counts = df.groupby(['user_id', 'campaign_id', label]).size()
    wide = counts.unstack().reset_index().fillna(0)

    # Prefix every value column with the label; leave the key columns alone.
    wide.columns = [
        name if name in key_cols else label + '_' + str(name)
        for name in wide.columns
    ]

    return wide

In [344]:
def get_sample(df, start, end):
    """Return the number of rows with grp == 'CON' whose date lies in [start, end].

    Both window bounds are inclusive.
    """
    is_control = df.grp == 'CON'
    in_window = (df.date >= start) & (df.date <= end)

    return df[is_control & in_window].shape[0]

In [345]:
def get_rate(df, start, end):
    """Return the mean of answer_desired_Aided_awareness over control rows in [start, end].

    Only rows with grp == 'CON' and a date inside the inclusive window are
    averaged. An empty selection yields NaN.
    """
    is_control = df.grp == 'CON'
    in_window = (df.date >= start) & (df.date <= end)
    answers = df.loc[is_control & in_window, 'answer_desired_Aided_awareness']

    return answers.mean()

In [346]:
def update_baseline(group, df, sample, count):
    """Recompute the baseline for every row of `group` whose baseline_n is below `sample`.

    For each under-sampled row, the window runs from ``7 * count`` days before
    the row's baseline_month to one month plus ``7 * count`` days after it.
    baseline_n and baseline are then replaced by get_sample / get_rate over
    that window. Rows already at or above `sample` are left untouched.

    Returns a modified copy; `group` itself is not changed.
    """
    widen_days = 7 * count
    updated = group.copy()
    short_rows = group.index[group.baseline_n < sample]

    for row in short_rows:
        month_start = pd.to_datetime(group.loc[row, 'baseline_month'])
        window_start = month_start - pd.DateOffset(days=widen_days)
        window_end = month_start + pd.DateOffset(months=1, days=widen_days)

        updated.loc[row, 'baseline_n'] = get_sample(df, window_start, window_end)
        updated.loc[row, 'baseline'] = get_rate(df, window_start, window_end)

    return updated

In [347]:
def run_updates(base, sample):
    """Widen the baseline windows one week at a time until every row has `sample` rows.

    Calls update_baseline with count = 1, 2, 3, ... against the notebook-level
    `df` (read as a global, not passed in) until no row of `base` has
    baseline_n < sample.

    The loop is bounded. Once the widening exceeds the date span of `df` plus
    one month, every window already covers all of the data and further passes
    cannot raise baseline_n. In that case a warning is issued and the best
    available result is returned instead of looping forever.

    Returns the updated baseline frame.
    """
    import warnings

    # Largest count that can still enlarge a window: 7 * count days on each
    # side has to exceed the full data span plus the one-month base window.
    span_days = (df.date.max() - df.date.min()).days
    max_count = (span_days + 31) // 7 + 1

    count = 1

    while any(base.baseline_n < sample):
        if count > max_count:
            warnings.warn(
                'run_updates: windows already span all available data but some '
                'baseline_n values are still below the requested sample of '
                + str(sample) + '; returning the widest-window baseline.'
            )
            break

        base = update_baseline(base, df, sample, count)
        count += 1

    return base

In [348]:
# Dates
# Parse the impression and survey timestamps.
df['date'] = pd.to_datetime(df['date'])
df['survey_date'] = pd.to_datetime(df['Created_Aided_awareness_date'])

# Whole days between the impression and the survey response.
df['recency'] = (df['survey_date'] - df['date']).dt.days

# Calendar parts of the impression timestamp.
df['current_month'] = df['date'].dt.to_period('M')
df['year'] = df['date'].dt.year
df['weekday'] = df['date'].dt.weekday
df['hour'] = df['date'].dt.hour
df['month'] = df['date'].dt.month

# Bucket the hour into a part of day. The buckets are mutually exclusive,
# so their order does not matter.
day_parts = [
    ((df.hour >= 6) & (df.hour < 12), 'AM'),             # 6am-11am
    ((df.hour >= 12) & (df.hour < 18), 'Noon'),          # 12pm-5pm
    ((df.hour >= 18) & (df.hour < 23), 'PM'),            # 6pm-10pm
    ((df.hour >= 23) | (df.hour < 6), 'Late Night'),     # 11pm-5am
]
# A row that matches no bucket (missing hour) gets the string 'nan'.
df['day_time'] = np.select(
    [matches for matches, _ in day_parts],
    [label for _, label in day_parts],
    default='nan',
)

In [349]:
# Pull a one- or two-digit length out of the ad name, written either as ':NN'
# or 'NNs', then strip the ':' / 's' marker so only the digits remain.
# regex=True is passed explicitly: 's|:' is an alternation, and pandas >= 2.0
# treats the pattern as a literal string by default, which would strip nothing.
df['vid_size'] = (
    df.dcm_ad.str.extract(r'(:[\d\d]{1,2}|[\d\d]{1,2}s)')[0]
    .str.replace('s|:', '', regex=True)
)

# Rows with a '0x0' pixel size use the extracted length instead
# (presumably video creatives - TODO confirm).
df['creative_size'] = np.where(df.creative_pixel_size == '0x0',
                               df.vid_size,
                               df.creative_pixel_size)

In [350]:
# Earliest impression date of each tracked campaign, broadcast back to its rows.
# Rows from any other campaign get NaT.
# A per-campaign lookup replaces the nested np.where so the column stays
# datetime64 (no object-dtype mix of Timestamp and NaN) and the frame is
# filtered once rather than once per campaign.
tracked_campaigns = [20334613, 20570795, 20920617]
campaign_start = (
    df[df.campaign_id.isin(tracked_campaigns)]
    .groupby('campaign_id')['date']
    .min()
)
df['start_date'] = df.campaign_id.map(campaign_start)

In [351]:
# Whole weeks between the campaign's start_date and the impression date.
days_since_start = (df['date'] - df['start_date']).dt.days
df['weeks_elapsed'] = (days_since_start // 7).round()

In [352]:
# Trailing 7-day impression count per user and campaign.
# Each row contributes 1, so the rolling sum counts the impressions in the
# 7 days up to and including that row's timestamp.
df['rolling_imps'] = 1
imps = (
    df.sort_values('date')
    .set_index('date')
    .groupby(['user_id', 'campaign_id'])['rolling_imps']
    .rolling('7D')
    .sum()
    .reset_index()
    # Impressions sharing the same timestamp would otherwise give several rows
    # for one merge key and multiply rows in the join below. Keep the largest
    # count, which includes every simultaneous impression.
    .groupby(['user_id', 'campaign_id', 'date'], as_index=False)['rolling_imps']
    .max()
)
df = df.drop(columns='rolling_imps')
# validate guards the "one count per key" assumption the join relies on.
df = pd.merge(df, imps, how='left', on=['user_id', 'campaign_id', 'date'],
              validate='many_to_one')

In [353]:
# Monthly baseline: row count and mean answer per impression month.
# NOTE(review): these initial per-month figures use every row, whereas the
# widening step (get_sample / get_rate) recomputes from grp == 'CON' rows only.
# Confirm that the mix is intended.
base_sample = df.groupby('current_month').size().reset_index()
base_sample.columns = ['baseline_month', 'baseline_n']

# Select the answer column before aggregating. Averaging the whole frame
# raises on non-numeric columns in pandas >= 2.0 and does needless work.
base = df.groupby('current_month')['answer_desired_Aided_awareness'].mean().reset_index()
base.columns = ['baseline_month', 'baseline']

base = pd.merge(base, base_sample, on='baseline_month')
base.baseline_month = base.baseline_month.astype(str)

# Insert any month missing between the first and last observed month, with
# baseline and baseline_n set to 0 so it is picked up by the widening below.
base.index = base.baseline_month
idx = pd.period_range(base.baseline_month.min(), base.baseline_month.max(), freq='M')
base = base.reindex(idx.astype(str), fill_value=0)
base.baseline_month = base.index
base.index = range(base.shape[0])

# Widen the window around each month until it holds at least 200 control rows.
base = run_updates(base, 200)
base.columns = ['current_month', 'baseline', 'baseline_n']

In [354]:
# Attach the monthly baseline to every impression. The period column is cast
# to str first so it matches the string month keys in `base`.
df['current_month'] = df['current_month'].astype(str)
df = df.merge(base, how='left', on='current_month')

In [355]:
# Collapse values of 8 or more into a single '8 plus' bucket for both columns.
for capped_col in ('rolling_imps', 'recency'):
    df[capped_col] = np.where(df[capped_col] >= 8, '8 plus', df[capped_col])

In [322]:
# One row per (user_id, campaign_id) for each user-level field.
user_keys = ['user_id', 'campaign_id']
per_user = df.groupby(user_keys)

answer = per_user['answer_desired_Aided_awareness'].max().reset_index()
grp = per_user['grp'].first().reset_index()
frequency = per_user['frequency'].first().reset_index()

# The baseline is taken from the user's earliest impression, so sort by date first.
chronological = df.sort_values(by=['user_id', 'campaign_id', 'date'])
baseline = chronological.groupby(user_keys)['baseline'].first().reset_index()

In [323]:
# One wide frequency table per cut, built in the order the labels are listed.
(
    channel_name,
    device_name,
    medium_name,
    prst,
    rolling_imps,
    weeks,
    day_time,
    creative_size,
    recency,
) = [
    freq_counts(df, label)
    for label in (
        'channel_name',
        'device_name',
        'medium_name',
        'prst',
        'rolling_imps',
        'weeks_elapsed',
        'day_time',
        'creative_size',
        'recency',
    )
]

In [324]:
# Every per-user table, joined left to right on the user/campaign key.
dfs = [
    answer, grp, frequency, baseline,
    channel_name, device_name, medium_name, prst,
    rolling_imps, weeks, day_time, creative_size, recency,
]

# Inner joins: a user/campaign pair missing from any table is dropped.
df_final = dfs[0]
for frame in dfs[1:]:
    df_final = pd.merge(df_final, frame, how='inner', on=['user_id', 'campaign_id'])

# One indicator column per group value.
df_final = pd.get_dummies(df_final, columns=['grp'])

# Total impressions: display plus video counts.
df_final['total'] = df_final['channel_name_Display'].add(df_final['channel_name_Video'])

In [325]:
# Drop the identifiers and the control-group indicator; grp_EXP remains.
df_final = df_final.drop(columns=['user_id', 'campaign_id', 'grp_CON'])

In [326]:
# Replace spaces in column names with underscores.
cols = [name.replace(' ', '_') for name in df_final.columns]
df_final.columns = cols

#### This file is the base file, to be used for tree models.

In [327]:
# Write the base frame to its own file. The original target,
# chromebook_all_imps_wo_ints.csv, is written again further down with the
# collinear columns removed, so using it here meant the base file never
# survived a full run. The later file is left exactly as it was.
df_final.to_csv(r'C:\Users\william.raikes\Programming\Python\abacus\v1_model\data\clean\explanatory\chromebook_all_imps_base.csv', index=False)

In [335]:
# Columns to leave out of the modelling frame.
cols = [
    'channel_name_Display',
    'device_name_Desktop',
    'medium_name_Web',
    'prst_OPEN_EXCHANGE_ADX',
    'rolling_imps_8_plus',
    'day_time_AM',
    'creative_size_30',
    'recency_8_plus',
    'weeks_elapsed_0',
    'total',
    'creative_size_300x250',
    'prst_YOUTUBE',
]

# drop returns a new frame, so df_final keeps the full column set.
df_clean = df_final.drop(cols, axis='columns')

#### This file removes the collinear features.

In [336]:
# Write df_clean (the frame with the listed collinear columns dropped) to CSV, without the index.
df_clean.to_csv(r'C:\Users\william.raikes\Programming\Python\abacus\v1_model\data\clean\explanatory\chromebook_all_imps_wo_ints.csv', index=False)

In [337]:
# Add an exposure interaction for every feature column: a column qualifies
# when its name contains one of these substrings.
feature_markers = (
    'baseline', 'prst', 'channel', 'medium', 'device',
    'rolling', 'year', 'month', 'day_time',
    'creative', 'recency', 'total', 'frequency', 'elapsed',
)
cols = [
    name for name in df_clean.columns
    if any(marker in name for marker in feature_markers)
]

# <feature>_int is the feature value for exposed users and 0 for everyone else.
for col in cols:
    df_clean[col + '_int'] = df_clean[col] * df_clean['grp_EXP']

#### This file includes the interaction terms for all features; used for full model builds.

In [338]:
# Write df_clean, which at this point also carries the `_int` interaction columns, to CSV without the index.
df_clean.to_csv(r'C:\Users\william.raikes\Programming\Python\abacus\v1_model\data\clean\explanatory\chromebook_all_imps_w_ints.csv', index=False)

In [339]:
# Keep only the interaction columns, the survey answer and the exposure flag.
keep_markers = ('_int', 'desired', 'grp_EXP')
cols = [
    name for name in df_clean.columns
    if any(marker in name for marker in keep_markers)
]

df_int_only = df_clean[cols]

#### This file includes only the interaction terms; used for partial stepwise model builds.

In [340]:
# Write the interaction-only frame (df_int_only) to CSV without the index.
df_int_only.to_csv(r'C:\Users\william.raikes\Programming\Python\abacus\v1_model\data\clean\explanatory\chromebook_all_imps_ints_only.csv', index=False)

In [356]:
# Mean answer and row count per weeks_elapsed, exposed group only.
df.loc[df.grp == 'EXP'].groupby('weeks_elapsed')['answer_desired_Aided_awareness'].agg(['mean', 'size'])

Unnamed: 0_level_0,mean,size
weeks_elapsed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.475822,517
1,0.551364,1246
2,0.597954,1271
3,0.637568,1661
4,0.574599,811
5,0.625922,2847
6,0.604907,4280
7,0.524983,2922
8,0.533679,1930
9,0.546535,1515


In [243]:
# Mean answer and row count per rolling 7-day impression bucket, exposed group only.
df.loc[df.grp == 'EXP'].groupby('rolling_imps')['answer_desired_Aided_awareness'].agg(['mean', 'size'])

Unnamed: 0_level_0,mean,size
rolling_imps,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.591741,6175
2.0,0.58351,3784
3.0,0.58439,2524
4.0,0.572387,1789
5.0,0.575507,1331
6.0,0.57388,1049
7.0,0.580882,816
8 plus,0.576711,4165


In [246]:
# Mean baseline and row count per impression year and month.
df.groupby(['year', 'month'])['baseline'].agg(['mean', 'size'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,size
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1
2017,10,0.49027,3443
2017,11,0.535844,16488
2017,12,0.574086,9084
2018,1,0.526,500
2018,2,0.700884,5884
2018,3,0.687625,1495
2018,5,0.629213,623
2018,6,0.604588,5885
