# BSSP Mini Data Case Study

The goal of this case study is to evaluate the effectiveness of the campaign and to optimize its spend.

In [None]:
import pandas as pd
import numpy as np

In [None]:
## Each sheet of the workbook is expected to be saved as its own CSV file:
## - "Main Data" tab -> sample_data_main.csv
## - "ProgDSP" tab   -> sample_data_progdsp.csv
## Read both files (in that order) into dataframes.
main_data, progdsp = map(pd.read_csv, ('sample_data_main.csv', 'sample_data_progdsp.csv'))

Peek at what we’re working with 

In [None]:
## Show the first rows of main_data (head() with its default row count).
## Observation from the output: this appears to be data at the
## daily + placement/creative level.
main_data.head()

In [None]:
## Show the first rows of progdsp (head() with its default row count).
## Observation from the output: this appears to be data from ProgDSP placements,
## and it includes their spend (which is not found in main_data).
progdsp.head()

## Task 1. 
### Evaluate performance of campaign

The campaign KPIs weren’t specified, so I’ll look at
CTR, CPC, CPM, CPA, and Conversion Rate (CR), and provide a summary.

In [None]:
## Create some functions that can be re-used
def get_cpm(cost, impressions):
    """Return the cost per thousand impressions (CPM) as a display string.

    Both arguments are converted with float(). The result is formatted with
    two decimal places and thousands commas (e.g. '1,234.50'). If
    impressions is zero, the integer 0 is returned instead of a string.
    """
    try:
        per_thousand = (float(cost) * 1000) / float(impressions)
    except ZeroDivisionError:
        return 0
    ## two decimal places, with thousands commas
    return '{0:,.2f}'.format(per_thousand)

def get_cpc(cost, clicks):
    """Return the cost per click (CPC) as a display string.

    Both arguments are converted with float(). The result is formatted with
    two decimal places and thousands commas (e.g. '1,500.00'). If clicks is
    zero, the integer 0 is returned instead of a string.
    """
    try:
        ## CPC = spend / clicks
        cpc = float(cost) / float(clicks)
        cpc = '{0:,.2f}'.format(cpc)
    except ZeroDivisionError:
        cpc = 0
    return cpc

def get_ctr(clicks, impressions):
    """Return the click-through rate (CTR) in percent as a display string.

    Both arguments are converted with float(). The percentage is formatted
    with four decimal places and thousands commas (e.g. '0.5000'). If
    impressions is zero, the integer 0 is returned instead of a string.
    """
    try:
        percent = (float(clicks) / float(impressions)) * 100
    except ZeroDivisionError:
        result = 0
    else:
        result = '{0:,.4f}'.format(percent)
    return result

def get_cpa(cost, conversions):
    """Return the cost per conversion (CPA) as a display string.

    Both arguments are converted with float(). The result is formatted with
    two decimal places and thousands commas (e.g. '2,500.50'). If
    conversions is zero, the integer 0 is returned instead of a string.
    """
    try:
        per_conversion = float(cost) / float(conversions)
    except ZeroDivisionError:
        return 0
    return '{0:,.2f}'.format(per_conversion)

def get_cr(conversions, impressions):
    """Return the conversion rate (CR) in percent as a display string.

    The rate is conversions divided by impressions, times 100. Both
    arguments are converted with float(). The percentage is formatted with
    four decimal places and thousands commas (e.g. '0.2500'). If
    impressions is zero, the integer 0 is returned instead of a string.
    """
    try:
        percent = (float(conversions) / float(impressions)) * 100
    except ZeroDivisionError:
        result = 0
    else:
        result = '{0:,.4f}'.format(percent)
    return result

## The get_* helpers return strings such as '1,234.56' (or the int 0), and
## float() rejects the thousands commas, so strip them before converting.
def _metric_to_float(value):
    return float(str(value).replace(',', ''))

## and if we have a dataframe, we can wrap these up for convenience
def get_stats(df,cost=True):
    """Sum each group's delivery metrics and append derived rate/cost columns.

    df   -- a pandas GroupBy object; the callers in this notebook pass the
            result of DataFrame.groupby(...). Its summed columns must
            include 'Impressions', 'Clicks', 'Total Conversions',
            'Click-through Conversions' and 'View-through Conversions',
            plus 'Media Cost' when cost is True.
    cost -- when True (the default) also add the CPM, CPC and CPA columns.

    Returns a DataFrame with one row per group: the summed columns, a
    'Placement Count' column holding the group sizes, and CTR, View_CR,
    Click_CR and Total_CR (plus CPM, CPC and CPA when cost is True) as floats.
    """
    stats = df.agg('sum').join(pd.DataFrame(df.size(), columns=['Placement Count']))

    ## (new column, helper, numerator column, denominator column), in output order
    metric_columns = [
        ('CTR', get_ctr, 'Clicks', 'Impressions'),
        ('View_CR', get_cr, 'View-through Conversions', 'Impressions'),
        ('Click_CR', get_cr, 'Click-through Conversions', 'Impressions'),
        ('Total_CR', get_cr, 'Total Conversions', 'Impressions'),
    ]
    if cost:
        metric_columns += [
            ('CPM', get_cpm, 'Media Cost', 'Impressions'),
            ('CPC', get_cpc, 'Media Cost', 'Clicks'),
            ('CPA', get_cpa, 'Media Cost', 'Total Conversions'),
        ]

    for column, metric, numerator, denominator in metric_columns:
        stats[column] = stats.apply(
            lambda row: _metric_to_float(metric(row[numerator], row[denominator])),
            axis=1)
    return stats

## Calculate these metrics for the entire campaign

For the sake of this case study, I’ll use “Media Cost” as a proxy for “Spend”.

Since ProgDSP placements’ costs aren’t in main_data, we can grab them from progdsp

In [None]:
## Campaign-wide totals. ProgDSP spend lives in the progdsp dataframe, so it is
## added to the media cost recorded in main_data.
total_clicks = sum(main_data['Clicks'])
total_impressions = sum(main_data['Impressions'])
total_cost = sum(main_data['Media Cost']) + sum(progdsp['Spend'])
total_conversions = sum(main_data['Total Conversions'])

## print(...) with a single argument behaves the same on Python 2 and Python 3,
## whereas the bare print statement is a SyntaxError on Python 3.
print('Total Cost: \n${:,.2f}\n'.format(total_cost))
print('Total Impressions: \n{:,}\n'.format(total_impressions))
print('Total Clicks: \n{:,}\n'.format(total_clicks))
print('Total Conversions: \n{:,}\n'.format(total_conversions))


## Derived campaign-level metrics (each helper returns a formatted string)
total_cpm = get_cpm(total_cost,total_impressions)
print('Overall campaign CPM:\n${0}\n'.format(total_cpm))

total_cpc = get_cpc(total_cost,total_clicks)
print('Overall campaign CPC:\n${0}\n'.format(total_cpc))

total_cpa = get_cpa(total_cost,total_conversions)
print('Overall campaign CPA:\n${0}\n'.format(total_cpa))

total_ctr = get_ctr(total_clicks,total_impressions)
print('Overall campaign CTR:\n{0}%\n'.format(total_ctr))

total_cr = get_cr(total_conversions,total_impressions)
print('Overall campaign CR:\n{0}%'.format(total_cr))

## 2. We can break this down to the daily placement level

In [None]:
## Keys that identify one placement on one day, plus the other categorical kept alongside
group_col = ['Date','Placement','Site (DCM)']
## Delivery metrics that get summed within each key
sum_col = ['Impressions','Clicks','Total Conversions','Click-through Conversions'
           ,'View-through Conversions','Media Cost']

## Work on a copy of main_data: keep only the key + metric columns, sum the
## metrics per key, then order the result by Date and Placement
p_daily = (
    main_data.copy()[group_col + sum_col]
    .groupby(group_col, as_index=False)
    .sum()
    .sort_values(['Date','Placement'])
)

The media costs for ProgDSP placements aren’t in this dataframe. 

We can grab them from the ProgDSP dataframe.

In [None]:
## Create a copy of progdsp that we can adjust (while preserving the original dataframe)
temp = progdsp.copy()

## Since we're using Spend as a proxy for media cost, rename the column
temp = temp.rename(columns={'Spend':'Media Cost'})

## Keep only the merge keys and the cost
relevant_columns = ['Date','Media Cost','Placement']
temp = temp[relevant_columns]

## Change the "0" values of ProgDSP costs to NaN so we can use df.fillna below.
## np.nan is used because the np.NaN alias was removed in NumPy 2.0.
p_daily.loc[p_daily['Site (DCM)'] == 'ProgDSP','Media Cost'] = np.nan

## Left-merge the two dataframes to pull in the ProgDSP costs; both frames have
## a 'Media Cost' column, so pandas suffixes them with _x (p_daily) and _y (temp).
## NOTE(review): assumes progdsp has at most one row per Placement/Date; duplicate
## keys would duplicate p_daily rows here -- confirm against the data.
p_daily = p_daily.merge(temp,on=['Placement','Date'],how='left')

## Replace the NaN values with their costs
p_daily['Media Cost_x'] = p_daily['Media Cost_x'].fillna(p_daily['Media Cost_y'])

## Clean up the columns
p_daily = p_daily.rename(columns={'Media Cost_x':'Media Cost'})
del p_daily['Media Cost_y']

## Parse the dates and sort while they are real datetimes, so the order is
## chronological even across a year boundary (sorting 'mm/dd/yy' strings is not).
p_daily['Date'] = pd.to_datetime(p_daily['Date'], format='%m/%d/%y')
p_daily = p_daily.sort_values(['Date'])

## Convert back to 'mm/dd/yy' strings for display
p_daily['Date'] = p_daily['Date'].apply(lambda date: date.strftime('%m/%d/%y'))

p_daily

In [None]:
## Apply our calculation functions over the observations/rows.
## The get_* helpers return formatted strings with thousands commas
## (e.g. '1,234.56') or the int 0; float() cannot parse the commas, so they
## are stripped before converting back to a number.
## Each entry is (new column, helper, numerator column, denominator column).
row_metrics = [
    ('CPM', get_cpm, 'Media Cost', 'Impressions'),
    ('CPC', get_cpc, 'Media Cost', 'Clicks'),
    ('CTR', get_ctr, 'Clicks', 'Impressions'),
    ('CPA', get_cpa, 'Media Cost', 'Total Conversions'),
    ('Total_CR', get_cr, 'Total Conversions', 'Impressions'),
    ('View_CR', get_cr, 'View-through Conversions', 'Impressions'),
    ('Click_CR', get_cr, 'Click-through Conversions', 'Impressions'),
]
for col, metric, numerator, denominator in row_metrics:
    p_daily[col] = p_daily.apply(
        lambda row: float(str(metric(row[numerator], row[denominator])).replace(',', '')),
        axis=1)

With this data, we can look at some summary statistics.
These can be compared against the campaign KPIs to determine effectiveness.

In [None]:
## Summary statistics for the columns describe() reports on
summary = p_daily.describe()

## Format the columns: counts as whole numbers, dollar metrics as currency.
## CPA is a dollar amount as well, so it is formatted together with
## Media Cost / CPM / CPC.
for col in ['Impressions','Clicks','Total Conversions','Click-through Conversions','View-through Conversions']:
    summary[col] = summary[col].astype(int)
for col in ['Media Cost','CPM','CPC','CPA']:
    summary[col] = summary[col].map('${:,.2f}'.format)

## Drop the first row of the describe() output
summary.drop(summary.index[0],inplace=True)

summary

## Task 2.
### Optimizing Spend

First, we can see how each placement performed over the entire campaign flight (or rather, over the date range covered by the dataset).

Then, we can compare performance of individual placement features.

In [None]:
## Create a copy of our placements from the previous step
placements = p_daily.copy()

## Remove irrelevant columns from our copied dataframe
relevant_columns = ['Placement','Impressions','Clicks','Total Conversions','Click-through Conversions'
                   ,'View-through Conversions','Media Cost']
placements = placements[relevant_columns]

## Organize and summarize the dataframe: one row per placement
placements = placements.groupby(['Placement'])
placements = get_stats(placements)

## Make output more readable: floats are displayed with a trailing percent sign
pd.options.display.float_format = '{:,.4f}%'.format
organized_columns = ['Placement Count','Media Cost','CPM','CPC','CPA','CTR','View_CR','Click_CR','Total_CR']
placements = placements[organized_columns]
placements = placements.sort_values(['CPA'])
## The group size is the number of daily rows for the placement
placements = placements.rename(columns={'Placement Count':'Flight Days'})

## Format our CPM/CPC/CPA columns to have dollar signs instead of percent signs.
## The formatting goes on a separate copy so that `placements` keeps the raw
## numeric statistics for further use.
placements_copy = placements.copy()
for col in ['Media Cost','CPM','CPC','CPA']:
    placements_copy[col] = placements_copy[col].map('${:,.2f}'.format)

placements_copy

The spend and KPIs in the table above can be compared against the delivery goals of the campaign.
They can be used to allocate future placements, given a campaign goal and budget.

Further analysis can be performed on other features of placements, such as creative copy or ad-size.

### Starting with creative copy

In [None]:
## Because it's unclear how spend was allocated among ProgDSP placements' different creative,
## we will not calculate CPM/CPC here, so no cost column is carried along.
relevant_columns = ['Creative','Impressions','Clicks','Total Conversions','Click-through Conversions'
                   ,'View-through Conversions']

## Take an independent copy of just the creative name and delivery metrics from main_data
creative = main_data[relevant_columns].copy()

## The actual creative copy is the second '_'-separated token of the 'Creative' field
creative['Creative'] = creative.apply(lambda record: record['Creative'].split('_')[1], axis=1)

## Summarize per creative, without the cost-based metrics
creative = get_stats(creative.groupby(['Creative']), cost=False)

## Make output more readable: floats are displayed with a trailing percent sign
pd.options.display.float_format = '{:,.4f}%'.format
organized_columns = ['Placement Count','CTR','View_CR','Click_CR','Total_CR']

## Highest CTR first
creative = creative[organized_columns].sort_values(['CTR'], ascending=False)

creative

Because the data doesn’t associate costs with individual creatives, CTR is examined here instead of CPA.

### Next, we’ll do the same thing for ad-size

In [None]:
## Start from an independent copy of the daily placement rows (including their costs),
## keeping only the placement name and the metrics to be summed
relevant_columns = ['Placement','Impressions','Clicks','Total Conversions','Click-through Conversions'
                   ,'View-through Conversions','Media Cost']
ad_size = p_daily[relevant_columns].copy()

## The Ad-Size is the third '_'-separated token of the 'Placement' field;
## once it is extracted, the placement name itself is dropped
ad_size['Ad-Size'] = ad_size.apply(lambda record: record['Placement'].split('_')[2], axis=1)
ad_size = ad_size.drop('Placement', axis=1)

## Summarize per ad size
ad_size = get_stats(ad_size.groupby(['Ad-Size']))

## Make output more readable: pick the column order and sort by CPA
organized_columns = ['Placement Count','Media Cost','CPM','CPC','CPA','CTR','View_CR','Click_CR','Total_CR']
ad_size = ad_size[organized_columns].sort_values(['CPA'])

## Format the dollar columns with dollar signs on a separate copy, so that
## ad_size keeps the raw statistics
ad_size_copy = ad_size.copy()
for dollar_col in ['Media Cost', 'CPM', 'CPC', 'CPA']:
    ad_size_copy[dollar_col] = ad_size_copy[dollar_col].map('${:,.2f}'.format)

ad_size_copy