In [1]:
# import libraries
import numpy as np
import pandas as pd
from datetime import datetime


pd.set_option('max_columns', 50)
pd.options.display.float_format = '{:.2f}'.format

In [2]:
# import csv file
df_tabA = pd.read_csv('table_A_conversions.csv')
df_tabA.head()

Unnamed: 0,Conv_Date,Revenue,User_ID,Conv_ID
0,2017-03-06,47.0,5094298f068196c5349d43847de5afc9125cf989,881152bb20f9b73daafb99d77714f38ac702629c
1,2017-03-02,98.0,,faf5c1181ea84a32237dff45ca201d2c28f19d7b
2,2017-03-02,180.35,,b0e58a88459ece1b585ca22c93e633dc56273b83
3,2017-03-23,201.94,433fdf385e33176cf9b0d67ecf383aa928fa261c,f0e6b7de22332c7b18c024e550bb1d860130cdf1
4,2017-03-03,197.47,,966568c7c859480c79b212520d20a51e735fd735


In [3]:
# import csv file
df_tabB = pd.read_csv('table_B_attribution.csv')
df_tabB.head()

Unnamed: 0,Channel,IHC_Conv,Conv_ID
0,H,1.0,881152bb20f9b73daafb99d77714f38ac702629c
1,I,0.3,faf5c1181ea84a32237dff45ca201d2c28f19d7b
2,A,0.32,faf5c1181ea84a32237dff45ca201d2c28f19d7b
3,E,0.38,faf5c1181ea84a32237dff45ca201d2c28f19d7b
4,H,1.0,b0e58a88459ece1b585ca22c93e633dc56273b83


In [4]:
def info(x):
    '''
    return information about dataframe
    '''
    decoration = "-_-"
    print("df.info() \n") 
    x.info() 
    print("\n {} \n".format((decoration*20))) 
    print("df.describe() \n\n", x.describe().round(2))
    print("\n {} \n".format((decoration*20))) 
    print("df.shape: {}".format(x.shape))
    print("\n {} \n".format((decoration*20)))
    print("df.isna().sum()\n\n{}".format(x.isna().sum()))
    print("\n {} \n".format((decoration*20)))
    print("df.nunique()\n\n{}".format(x.nunique()))
    return

In [5]:
info(df_tabA)

df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79643 entries, 0 to 79642
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Conv_Date  79643 non-null  object 
 1   Revenue    79643 non-null  float64
 2   User_ID    77347 non-null  object 
 3   Conv_ID    79643 non-null  object 
dtypes: float64(1), object(3)
memory usage: 2.4+ MB

 -_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_- 

df.describe() 

        Revenue
count 79643.00
mean    181.70
std     109.24
min      20.00
25%     114.23
50%     158.47
75%     217.59
max    4596.48

 -_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_- 

df.shape: (79643, 4)

 -_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_- 

df.isna().sum()

Conv_Date       0
Revenue         0
User_ID      2296
Conv_ID         0
dtype: int64

 -_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_--_- 

df.nunique()

Conv_Date      389
Revenue      39368
U

In [6]:
# change Conv_Date column type to datetime 
df_tabA['Conv_Date'] = pd.to_datetime(df_tabA['Conv_Date'])

## Revenue by month

In [7]:
#df_tabA['Revenue_Month'] = df_tabA['Conv_Date'].map(lambda x: 100*x.year + x.month)
# Get year and month from Conv_Date to create column Revenue_Month
df_tabA['Revenue_Month'] = df_tabA['Conv_Date'].apply(lambda x: x.strftime('%Y-%m'))
# Groupby Revenue_Month and sum() values of Revenue in each group
df_revenue = df_tabA.groupby(['Revenue_Month'])['Revenue'].sum().reset_index()
df_revenue

Unnamed: 0,Revenue_Month,Revenue
0,2017-03,958340.3
1,2017-04,2052788.16
2,2017-05,766990.5
3,2017-06,963796.46
4,2017-07,1026004.76
5,2017-08,774130.97
6,2017-09,1230439.17
7,2017-10,1271118.75
8,2017-11,1417349.64
9,2017-12,876302.66


In [8]:
# export csv file
df_revenue.to_csv('Revenue_by_month.csv',index=False)

![Revenue_by_month](Assets/Tableau_Table_A/Revenue_by_month.png)

## Monthly Revenue Growth

In [9]:
# get percentage change of Revenue compared to previous month
df_revenue['MonthlyGrowth(%)'] = df_revenue['Revenue'].pct_change()*100
df_revenue

Unnamed: 0,Revenue_Month,Revenue,MonthlyGrowth(%)
0,2017-03,958340.3,
1,2017-04,2052788.16,114.2
2,2017-05,766990.5,-62.64
3,2017-06,963796.46,25.66
4,2017-07,1026004.76,6.45
5,2017-08,774130.97,-24.55
6,2017-09,1230439.17,58.94
7,2017-10,1271118.75,3.31
8,2017-11,1417349.64,11.5
9,2017-12,876302.66,-38.17


In [10]:
# export csv file
df_revenue.to_csv('Monthly_Revenue_Growth.csv',index=False)

![Monthly_Revenue_Growth](Assets/Tableau_Table_A/Monthly_Revenue_Growth.png)

## Monthly user count

In [11]:
# Groupby Revenue_Month and get number of unique User_ID in each group
df_user = df_tabA.groupby('Revenue_Month')['User_ID'].nunique().reset_index()
# Ranaming columns to give a contextually meaningful column name
df_user.columns = ['Revenue_Month','User_ID_Count']
df_user

Unnamed: 0,Revenue_Month,User_ID_Count
0,2017-03,4448
1,2017-04,8313
2,2017-05,4400
3,2017-06,5144
4,2017-07,5676
5,2017-08,4606
6,2017-09,6404
7,2017-10,6427
8,2017-11,7184
9,2017-12,4382


In [12]:
# export csv file
df_user.to_csv('Monthly_user_count.csv',index=False)

![Monthly_user_count](Assets/Tableau_Table_A/Monthly_user_count.png)

## Monthly User Growth

In [13]:
# get percentage change of User Count compared to previous month
df_user['MonthlyGrowth(%)'] = df_user['User_ID_Count'].pct_change()*100
df_user

Unnamed: 0,Revenue_Month,User_ID_Count,MonthlyGrowth(%)
0,2017-03,4448,
1,2017-04,8313,86.89
2,2017-05,4400,-47.07
3,2017-06,5144,16.91
4,2017-07,5676,10.34
5,2017-08,4606,-18.85
6,2017-09,6404,39.04
7,2017-10,6427,0.36
8,2017-11,7184,11.78
9,2017-12,4382,-39.0


In [14]:
# export csv file
df_user.to_csv('Monthly_User_Growth.csv',index=False)

![Monthly_User_Growth](Assets/Tableau_Table_A/Monthly_User_Growth.png)

## Monthly Conversion Count

In [15]:
# Groupby Revenue_Month and get number of unique Conv_ID in each group
df_conv = df_tabA.groupby('Revenue_Month')['Conv_ID'].nunique().reset_index()
# Renaming columns to give a contextually meaningful column name
df_conv.columns = ['Revenue_Month','Conv_ID_Count']
df_conv

Unnamed: 0,Revenue_Month,Conv_ID_Count
0,2017-03,4973
1,2017-04,9071
2,2017-05,4786
3,2017-06,5499
4,2017-07,6033
5,2017-08,4866
6,2017-09,6854
7,2017-10,6750
8,2017-11,7990
9,2017-12,4997


In [16]:
# export csv file
df_conv.to_csv('Monthly_Conversion_Count.csv',index=False)

![Monthly_Conversion_Count](Assets/Tableau_Table_A/Monthly_Conversion_Count.png)

## Monthly Conversion Growth

In [17]:
# get percentage change of Conversion Count compared to previous month
df_conv['MonthlyGrowth(%)'] = df_conv['Conv_ID_Count'].pct_change()*100
df_conv

Unnamed: 0,Revenue_Month,Conv_ID_Count,MonthlyGrowth(%)
0,2017-03,4973,
1,2017-04,9071,82.4
2,2017-05,4786,-47.24
3,2017-06,5499,14.9
4,2017-07,6033,9.71
5,2017-08,4866,-19.34
6,2017-09,6854,40.85
7,2017-10,6750,-1.52
8,2017-11,7990,18.37
9,2017-12,4997,-37.46


In [18]:
# export csv file
df_conv.to_csv('Monthly_Conversion_Growth.csv',index=False)

![Monthly_Conversion_Growth](Assets/Tableau_Table_A/Monthly_Conversion_Growth.png)

## Average Revenue Per User Per Month

In [19]:
temp0 =df_tabA.copy()
temp0.dropna(inplace=True)

# count user id per month
temp1 = temp0.groupby(['Revenue_Month'])['User_ID'].count().reset_index()

# sum revenue groupby revenue month
temp2 = temp0.groupby(['Revenue_Month'])['Revenue'].sum().reset_index()

df_avg_rev = pd.merge(temp1,temp2,on='Revenue_Month')
df_avg_rev.columns = ['Revenue_Month','User_ID_Count','Revenue']
df_avg_rev['Avg_Revenue'] = df_avg_rev['Revenue']/df_avg_rev['User_ID_Count']
df_avg_rev

Unnamed: 0,Revenue_Month,User_ID_Count,Revenue,Avg_Revenue
0,2017-03,4930,952220.74,193.15
1,2017-04,9058,2049221.53,226.23
2,2017-05,4786,766990.5,160.26
3,2017-06,5499,963796.46,175.27
4,2017-07,6032,1025810.66,170.06
5,2017-08,4865,773943.74,159.08
6,2017-09,6854,1230439.17,179.52
7,2017-10,6750,1271118.75,188.31
8,2017-11,7557,1339370.86,177.24
9,2017-12,4573,805684.26,176.18


In [20]:
# export csv file
df_avg_rev.to_csv('Average_Revenue_Per_User_Per_Month.csv',index=False)

![Average_Revenue_Per_User_Per_Month](Assets/Tableau_Table_A/Average_Revenue_Per_User_Per_Month.png)

## User Classification by Month

In [21]:
# Groupby User_ID and get min or first Conv_Date in each group
df_min_conv = df_tabA.groupby('User_ID').Conv_Date.min().reset_index()
# Renaming columns to give a contextually meaningful column name
df_min_conv.columns = ['User_ID','MinConvDate']
# Format datetime in MinConvDate as year and month
df_min_conv['MinConvDate'] = df_min_conv['MinConvDate'].apply(lambda x: x.strftime('%Y-%m'))
# merge dataframes on User_ID
df_tabA_mod = pd.merge(df_tabA, df_min_conv, on='User_ID')
# create a new column UserType and populate it with New
df_tabA_mod['UserType'] = 'New'
# Filter UserType column and assign UserType as Existing if min or first Conv_Date is before Revenue_Month
df_tabA_mod.loc[df_tabA_mod['Revenue_Month']>df_tabA_mod['MinConvDate'],'UserType'] = 'Existing'

In [22]:
# calculate total Revenue per month for each user type
df_user_type_revenue = df_tabA_mod.groupby(['Revenue_Month','UserType'])['Revenue'].sum().reset_index()
df_user_type_revenue

Unnamed: 0,Revenue_Month,UserType,Revenue
0,2017-03,New,952220.74
1,2017-04,Existing,295191.75
2,2017-04,New,1754029.78
3,2017-05,Existing,239488.67
4,2017-05,New,527501.83
5,2017-06,Existing,296838.17
6,2017-06,New,666958.3
7,2017-07,Existing,350725.65
8,2017-07,New,675085.01
9,2017-08,Existing,243607.86


In [23]:
# export csv file
df_user_type_revenue.to_csv('User_Classification_Revenue.csv',index=False)

![User_Classification_Revenue](Assets/Tableau_Table_A/User_Classification_Revenue.png)

In [24]:
# Count users per month for each user type
df_user_type_count = df_tabA_mod.groupby(['Revenue_Month','UserType'])['User_ID'].count().reset_index()
df_user_type_count

Unnamed: 0,Revenue_Month,UserType,User_ID
0,2017-03,New,4930
1,2017-04,Existing,1236
2,2017-04,New,7822
3,2017-05,Existing,1465
4,2017-05,New,3321
5,2017-06,Existing,1551
6,2017-06,New,3948
7,2017-07,Existing,1864
8,2017-07,New,4168
9,2017-08,Existing,1388


In [25]:
# export csv file
df_user_type_count.to_csv('User_Classification_UserIDs.csv',index=False)

![User_Classification_UserIDs](Assets/Tableau_Table_A/User_Classification_UserIDs.png)