In [1]:
# Description: Predictive Model, All Impressions
# Each record is a unique user and campaign combination. Use only the first survey response per user & campaign. 
# And remove all impressions afterwards.  Convert the remaining impressions into separate columns with one cut per column, 
# and the value signifying the frequency.
import configparser
import pandas as pd, numpy as np, datetime, os, sys
from functools import reduce
from google.cloud import bigquery


def df_rename(df):

    df = df.rename(
        columns = {
            'map_systm': 'systm',
            'map_plfh': 'plfh',
            'map_grp': 'group',
            'Answer_desired': 'answer',
            'imp_imp_event_time': 'event_time',
            'map_dcm_placement_id': 'placement_id',
            'map_dcm_ad_id': 'ad_id',
            'imp_table_match_rendering_id': 'rendering_id',
            'map_prst_site': 'site'
        }
    )

    return df

# Only keep users who have answered the first two questions:
def screen_surveys(df):
    grp = df.groupby('user_id').apply(lambda x: 1 if 1 in x.Question.values else 0).reset_index()
    idx = grp[grp[0]==1].user_id.values

    new_df = df[
        (df['user_id'].isin(idx)) &
        (df.Question==0)
    ]
    
    return new_df

# Only keep impressions within 7 days of the first scrutineer survey taken.
def date_filter(df):
    df['date'] = pd.to_datetime(df['date'])
    df['Created'] = pd.to_datetime(df['Created'])
    
    filters = df.sort_values(by=['user_id', 'campaign_id', 'Created'])
    filters = df.groupby(['user_id', 'campaign_id']).first().reset_index()[['user_id', 'campaign_id', 'Created']]
    new_df = pd.merge(df, filters, how='inner', on=['user_id', 'campaign_id', 'Created'])
    
    new_df['recency'] = pd.to_timedelta(new_df['Created'] - new_df['date'])

    new_df = new_df[
        (new_df.recency <= '7 days') & 
        (new_df.recency >= '0 days')
    ]
    
    return new_df

# Create dummy columns for different cuts.
def cuts(df):
    df['cuts'] = df.systm.str.cat([df.plfh, df.site], sep='_')
    df = pd.get_dummies(df, columns=['group'])

    return df

# Create supporting dfs to create a new df filtered by user_id and campaign_id.  
def supporting_dfs(df):
    df_answer = df.groupby(['user_id', 'campaign_id'])['answer'].max().reset_index()
    df_grp = df.groupby(['user_id', 'campaign_id'])[['group_CON', 'group_EXP']].first().reset_index()
    df_freq = df.groupby(['user_id', 'campaign_id'])['frequency'].first().reset_index()

    df_cuts = df.groupby(['user_id', 
                         'campaign_id',
                         'event_time', 
                         'placement_id', 
                         'ad_id', 
                         'rendering_id', 
                         'cuts']).size().reset_index()

    df_cuts = df_cuts.groupby(['user_id', 'campaign_id', 'cuts']).size().unstack().reset_index().fillna(0)

    return [df_answer, df_grp, df_freq, df_cuts]

# Combine supporting dfs to one final df
def combine_dfs(dfs_list):
    df = reduce(lambda left, right: pd.merge(left, right, how='inner', on=['user_id', 'campaign_id']), dfs_list)

    return df





In [2]:
pd.__version__

'0.23.4'

In [3]:
config = configparser.ConfigParser()
config.read('..\Resource\liftconfig.ini')



[]

In [4]:
keyFile=outputFile=r'C:\Users\william.raikes\Programming\Python\abacus\mvp_model\essence-analytics-dwh-627924bbda04.json'
client=bigquery.Client.from_service_account_json(keyFile)
job_config = bigquery.QueryJobConfig()
job_config.allow_large_results=True
project = 'essence-analytics-dwh'


In [5]:
cb_sql = 'SELECT * FROM Project_Abacus_Brand_Lift.Chrome_Final_noMOAT_FullScrutineer_Frequencies_flat_Filtered'
cb_df = client.query(cb_sql).to_dataframe()

In [6]:
cb_dfx = cb_df.copy()
cb_dfx['date'] = pd.to_datetime(cb_df['date'])
cb_dfx['Created'] = pd.to_datetime(cb_dfx['Created'])
filters = cb_dfx.sort_values(by=['user_id', 'campaign_id', 'Created'])
filters = cb_dfx.groupby(['user_id', 'campaign_id']).first().reset_index()[['user_id', 'campaign_id', 'Created']]

NameError: name 'cb_dfx' is not defined

In [34]:
pd.merge(cb_df, filters, how='inner', on=['user_id', 'campaign_id', 'Created'])

TypeError: Argument 'values' has incorrect type (expected numpy.ndarray, got Index)

In [None]:




new_df = pd.merge(df, filters, how='inner', on=['user_id', 'campaign_id', 'Created'])

new_df['recency'] = pd.to_timedelta(new_df['Created'] - new_df['date'])

new_df = new_df[
    (new_df.recency <= '7 days') & 
    (new_df.recency >= '0 days')
]



In [18]:
cb_dfx = df_rename(cb_df)
cb_dfx = screen_surveys(cb_dfx)
cb_dfx = date_filter(cb_dfx)
#cb_dfx = cuts(cb_dfx)
#list_dfs = supporting_dfs(cb_dfx)
#final_df = combine_dfs(list_dfs)
#final_df.to_csv(save, index=False)

TypeError: Argument 'values' has incorrect type (expected numpy.ndarray, got Index)

In [17]:
cb_dfx.head()

Unnamed: 0,plfh,group,answer,event_time,Created,Question,map_dcm_creative,map_dcm_ad,placement_id,campaign_id,user_id,frequency,site,imp_table_match_creative_pixel_size,map_clicks,ad_id,rendering_id,systm,date,Answer_value
603,VID,EXP,False,1529721705898025,2018-06-08 11:54:05.495650+00:00,0.0,Google_Chromebook_Productivity_Offline_Asus C3...,Dsk_Exp1_VID-CB_DBM_Hulu_Creative-specific LPs...,223168651,20920617,CAESEKfNNSTz02ILoGLnJim63dY,28,HULU,0x0,40.24,421498570,101862590,MBL,2018-06-23 02:41:45.898025+00:00,MacBook
605,VID,EXP,False,1529726713663630,2018-06-08 11:54:05.495650+00:00,0.0,Google_Chromebook_Productivity_Creativity Apps...,Dsk_Exp1_VID-CB_DBM_Hulu_Creative-specific LPs...,223309326,20920617,CAESEKfNNSTz02ILoGLnJim63dY,28,HULU,0x0,98.76,421498573,101810321,MBL,2018-06-23 04:05:13.663630+00:00,MacBook
608,BNR,EXP,False,1511624476342629,2018-06-25 16:10:31.599137+00:00,0.0,Chromebook_Pillar01_M1_AsusC101_728x90,Dsk_Exp2_BNR-Chromebooks_SambaTV_Chromebook Q4...,208154048,20334613,CAESELOXCNVdZ_qia4J1r6ZSTxU,5,SAMBA,728x90,10.0,407911638,94574174,DSK,2017-11-25 15:41:16.342629+00:00,MacBook
609,VID,EXP,False,1528710362215712,2018-06-21 17:50:08.529141+00:00,0.0,Google_Chromebook_Productivity_Creativity Apps...,Dsk_Exp1_VID-CB_DBM_Open Exchange_Creative-spe...,221897308,20920617,CAESEJh6CmoyfGbv2PFgjSDUots,23,TELARIA,0x0,1.51,419871801,101870187,DSK,2018-06-11 09:46:02.215712+00:00,MacBook
618,VID,EXP,False,1530161088726056,2018-06-21 17:53:23.935778+00:00,0.0,Google_Chromebook_Productivity Offline_Pixelbo...,Dsk_Exp1_VID-CB_DBM_Hulu_Creative-specific LPs...,223309326,20920617,CAESEJHB6Tyn2vp_2FCLpJNvUw0,14,HULU,0x0,106.94,421498573,101871411,MBL,2018-06-28 04:44:48.726056+00:00,MacBook


In [7]:
px_sql = 'SELECT * FROM Project_Abacus_Brand_Lift.Pixel_Final_noMOAT_FullScrutineer_Frequency_Flat_3_Filtered'
px_df = client.query(px_sql).to_dataframe()

KeyboardInterrupt: 

In [None]:
for df, save in zip([px_df], [px_save]):
    df = df_rename(df)
    df = screen_surveys(df)
    df = date_filter(df)
    df = cuts(df)
    list_dfs = supporting_dfs(df)
    final_df = combine_dfs(list_dfs)
    #final_df.to_csv(save, index=False)