## Description: Predictive Model, All Impressions
Each record is a unique user and campaign combination. Use only the first survey response per user & campaign.  And remove all impressions afterwards.  Convert the remaining impressions into separate columns with one cut per column, and the value signifying the frequency.

- Site/Format/Device
- Baseline
- Month
- Creative Size

Note: Need to correct the differences in sites, such as GDN vs. Google Display Network.

In [52]:
import pandas as pd, numpy as np, os 
from datetime import timedelta, datetime
from functools import reduce
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
%matplotlib inline

### Data Cleaning

In [53]:
df = pd.read_csv(r'C:\Users\william.raikes\Programming\Python\abacus\v1_model\data\raw\chromebook.csv')

In [54]:
df.head()

Unnamed: 0,map_plfh,map_grp,imp_imp_event_time,map_dcm_creative,map_dcm_ad,map_olive_property,map_dcm_placement_id,campaign_id,user_id,map_prst_site,imp_table_match_creative_pixel_size,map_clicks,map_dcm_ad_id,imp_table_match_rendering_id,map_systm,date,answer_desired_Aided_awareness,answer_desired_Consideration,answer_desired_Purchase_intent,Preview_Aided_awareness,Preview_Consideration,Preview_Purchase_intent,map_olive_media_plan,frequency
0,VID,CON,1510255555059404,Control_Charity_St Jude - Finding Progress - H...,IA_CTR_DBM-VID-Chromebooks_DBM_StJude__StJude_:30,Open Exchange NonAdX,208648192,20334613,CAESEEcfm4NTukPl_qHookpLsbU,OPEN EXCHANGE NONADX,0x0,1880.91,408165170,94431032,MBL,2017-11-09 19:25:55.059404 UTC,True,False,False,False,False,False,Chromebooks NA Q4 2017 United States - Brand & DR,2
1,VID,CON,1510255513615560,Control_Charity_St Jude - Finding Progress - H...,IA_CTR_DBM-VID-Chromebooks_DBM_StJude__StJude_:30,Open Exchange NonAdX,208648192,20334613,CAESEMJMCvmAklnp1tsU4bHlOgg,OPEN EXCHANGE NONADX,0x0,1880.91,408165170,94431032,MBL,2017-11-09 19:25:13.61556 UTC,False,False,False,False,False,False,Chromebooks NA Q4 2017 United States - Brand & DR,3
2,VID,CON,1510462556235243,Control_Charity_St Jude - Finding Progress - H...,IA_CTR_DBM-VID-Chromebooks_DBM_StJude__StJude_:30,Open Exchange NonAdX,208648192,20334613,CAESECAhT4_9Y465CvsZmm8hTOg,OPEN EXCHANGE NONADX,0x0,1880.91,408165170,94431032,MBL,2017-11-12 04:55:56.235243 UTC,False,False,False,False,False,False,Chromebooks NA Q4 2017 United States - Brand & DR,1
3,VID,EXP,1510462923588202,Google_Chromebooks_Product_HP_EN_30s_0x0,IA_Exp1_DBM-VID-Chromebooks_DBM_Chromebook :30...,Open Exchange NonAdX,208648192,20334613,CAESEPmTQrkhAbC3em5OLPI9tqM,OPEN EXCHANGE NONADX,0x0,153.99,408079934,94423160,MBL,2017-11-12 05:02:03.588202 UTC,False,False,False,False,False,False,Chromebooks NA Q4 2017 United States - Brand & DR,1
4,VID,EXP,1510461580903162,Google_Chromebooks_Product_HP_EN_30s_0x0,IA_Exp1_DBM-VID-Chromebooks_DBM_Chromebook :30...,Open Exchange NonAdX,208648192,20334613,CAESEKMY4QTHAx8_xmFB_T-GKac,OPEN EXCHANGE NONADX,0x0,153.99,408079934,94423160,MBL,2017-11-12 04:39:40.903162 UTC,False,False,False,False,False,False,Chromebooks NA Q4 2017 United States - Brand & DR,4


In [55]:
df = df.rename(
    columns = {
        'map_systm': 'systm',
        'map_plfh': 'plfh',
        'map_grp': 'group',
        'imp_imp_event_time': 'event_time',
        'map_dcm_placement_id': 'placement_id',
        'map_dcm_ad_id': 'ad_id',
        'imp_table_match_rendering_id': 'rendering_id',
        'map_prst_site': 'site'
    }
)

df['date'] = pd.to_datetime(df.date)
df['month'] = df.date.dt.month

In [56]:
df['cuts'] = df.systm.str.cat([df.plfh, df.site], sep='_')
df['vid_size'] = df.map_dcm_ad.str.extract(r'(:[\d\d]{1,2}|[\d\d]{1,2}s)')[0].str.replace('s|:', '')
df['creative_size'] = np.where(df.imp_table_match_creative_pixel_size == '0x0', 
                               df.vid_size, 
                               df.imp_table_match_creative_pixel_size)

In [57]:
def start_date_in_previous_month(dt, count):
    _days = 7 * count
    tmp = dt.replace(day=1) - timedelta(days=1)
    return tmp.replace(day=1) - timedelta(days=_days)

In [58]:
def end_date_in_previous_month(dt, count):
    _days = 7 * count
    return dt.replace(day=1) - timedelta(days=1) + timedelta(days=_days)

In [59]:
def get_baseline(df, row, label, count):
    #logic for calculating baseline

    start = start_date_in_previous_month(row['date'], count)
    end = end_date_in_previous_month(row['date'], count)
    sample = get_sampleSize(df, start, end)
        
    if (sample >= 200):
        baseline = get_baseline_helper(df, start, end, label)
        return baseline
    
    else:
        count += 1
        return get_baseline(df, row, label, count+1)

In [60]:
def get_sampleSize(df, start, end):
    size = df[
        (df.group == 'CON') &
        (df.date >= start) &
        (df.date <= end)
    ].shape[0]
    
    return size

In [61]:
def get_baseline_helper(df, start, end, label):
    avg = df.loc[
        (df.group == 'CON') &
        (df.date >= start) &
        (df.date <= end)
    ][label].mean()
     
    return avg

In [63]:
df['baseline'] = df.apply(lambda x: get_baseline(df, x, 'answer_desired_Aided_awareness', 0), axis=1)

In [64]:
df_answer = df.groupby(['user_id', 'campaign_id'])['answer_desired_Aided_awareness'].max().reset_index()
df_grp = df.groupby(['user_id', 'campaign_id'])['group'].first().reset_index()
df_freq = df.groupby(['user_id', 'campaign_id'])['frequency'].first().reset_index()
df_base = df.groupby(['user_id', 'campaign_id'])['baseline'].mean().reset_index()

In [65]:
def freq_counts(df, label):
    new_df = df.groupby(['user_id', 'campaign_id', label]).size().unstack().reset_index().fillna(0)  
    new_df.columns = [label+x for x in new_df.columns if x != 'user_id' or x != 'campaign_id']
    
    return new_df

In [66]:
df_creative_size = freq_counts(df, 'creative_size')
df_cuts = freq_counts(df, 'cuts')
df_month = freq_counts(df, 'month')

TypeError: must be str, not int

In [None]:
dfs = [df_answer, df_grp, df_freq, df_base, df_creative_size, df_cuts, df_month]

df_final = reduce(lambda left, right: pd.merge(left, right, how = 'inner', on=['user_id', 'campaign_id']), dfs)

In [None]:
df_final = pd.get_dummies(df_final, columns=['group'])

In [None]:
df_final.shape

In [None]:
df_final.head()

In [42]:
df_final.to_csv(r'C:\Users\william.raikes\Programming\Python\abacus\v1_model\data\clean\chromebook_draft.csv', index=False)

In [24]:
from sklearn.model_selection import train_test_split

X = df_final.drop(axis=1, columns=['user_id', 'campaign_id', 'answer_desired_Aided_awareness'])
y = df_final.answer_desired_Aided_awareness

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [40]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=4)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=4,
            verbose=0, warm_start=False)

In [41]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])

0.5637815589099653

# QC:
- max the cuts' frequencies.
- compare my freqs vs. data's

In [527]:
df_final.max(axis=0)

user_id                           CAESEPznyPsyqQd1S72aQJCtv38
campaign_id                                          20920617
creative_size                                          970x66
answer_desired_Aided_awareness                           True
frequency                                                  97
DSK_BNR_CONDE NAST                                          1
DSK_BNR_FUSION MEDIA GROUP                                 31
MBL_BNR_CNET                                               25
MBL_BNR_CONDE NAST                                          3
MBL_BNR_DBM                                                 1
MBL_BNR_ESPN                                                1
MBL_BNR_FUSION MEDIA GROUP                                 36
MBL_BNR_FUSION.NET                                         77
MBL_BNR_GDN                                                70
MBL_BNR_NYTIMES                                            19
MBL_BNR_OPEN EXCHANGE ADX                                  22
MBL_BNR_

In [528]:
df_final['sum_freq'] = df_final[[x for x in list(df_final.columns) if x.startswith('MBL') or x.startswith('DSK')]].sum(axis=1)

In [529]:
df_final[['user_id', 'sum_freq', 'frequency']]

Unnamed: 0,user_id,sum_freq,frequency
0,CAESEA075WX-Zar5QV3wMC3-ZDc,9.0,34
1,CAESEA0eyv8LxeuFLIxgfYjTM0w,1.0,3
2,CAESEA0scZsis0RkaAPWK1_75-Y,11.0,15
3,CAESEA15rhYSfY5NAmv8c7_k9rs,5.0,9
4,CAESEA1ReHnNfeAU8RAiwK8LpXI,2.0,2
5,CAESEA1VTVc1fS-1RK0aN30q118,1.0,4
6,CAESEA1nVtaH7ruFD7LtZzxoisw,11.0,15
7,CAESEA30wnq9VDKjDVRtz22yZcg,21.0,86
8,CAESEA3EoLew0uGCHJ8TWdRC4Vs,1.0,1
9,CAESEA3hUB17LbzS-iFnp-ErcK8,3.0,7


In [595]:
for col in df.columns:
    if df[col].dtype == 'O' and 'user_id' not in col:
        print(df[col].value_counts(), end='\n\n')

BNR       11809
VID        3283
HGHIMP      324
Name: map_plfh, dtype: int64

EXP    8196
CON    7220
Name: map_grp, dtype: int64

Control_Charity_RedCross_en_300x250_static                               3704
Control_Charity_RedCross_en_320x50_static                                1602
Chromebook_Generic_320x50_HTML5                                          1006
Control_Charity_St Jude - Finding Progress - Hulu_en_30_mp4               754
Control_Charity_St Jude - Finding Progress - Hulu_en_15_mp4               659
Chromebook_Generic_300x250                                                467
Chromebook_Offer-Netflix_Pixelbook_EN_320x50_HTML                         183
Chromebook_Offer-Netflix_Samsung-Pro_EN_320x50_HTML                       180
Control_Charity_RedCross_en_300x600_static                                126
Google_Chromebooks_Product_Acer_EN_30s_0x0                                102
ChromebookUS_Control_RedCross_336x280.jpg                                 102
Chromebook_

In [74]:
def a_day_in_previous_month(dt):
   return dt.replace(day=1) - timedelta(days=1)

px['base_year'] = px['date'].apply(lambda x: a_day_in_previous_month(x)).dt.year
px['base_month'] = px['date'].apply(lambda x: a_day_in_previous_month(x)).dt.month

In [75]:
px['year'] = px['date'].dt.year
px['month'] = px['date'].dt.month

In [76]:
baseline_n = px[px.group=='CON'].groupby(['year', 'month']).size().reset_index()
baseline_n.columns = ['base_year', 'base_month', 'base_n']

baseline = px[px.group=='CON'].groupby(['year', 'month']).mean()['answer_desired_Aided_awareness'].reset_index()
baseline.columns = ['base_year', 'base_month', 'baseline']

In [81]:
px['avg_awareness'] = px[px.group=='CON'].answer_desired_Aided_awareness.mean()
px['first_baseline'] = baseline.baseline[0]

In [77]:
px = pd.merge(px, baseline, how='left', on=['base_year', 'base_month'])
px = pd.merge(px, baseline_n, how='left', on=['base_year', 'base_month'])