# What kind of emails triggered payment during payment window (accept-pay)?

In [1]:
import psycopg2
import pandas as pd
import seaborn as sns
import pymc3 as pm
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline

  from pandas.core import datetools


### Import data
* blueshift record in harday starts from 2017-06-01. hence we only look at data from that time point
* only look at cohorts already started (i.e. payment is due). cohorts those still waiting for accept/pay is not considered here
* payment due day is cohort start date + 6

In [2]:
conn_hardy = psycopg2.connect("dbname='analytics' user='u_yidong' host='udacity-segment.c2zpsqalam7o.us-west-2.redshift.amazonaws.com' port = '5439' password='Lyd580809?!'")
sql = "select distinct a.id,a.applicant_id,a.nd_key,a.cohort_id,a.accepted_at\
             ,a.cohort_start_at,a.first_charge_created_at\
             ,case when a.first_charge_created_at is not null then 1 else 0 end as pay\
             ,b.received_at,b.campaign_name\
             ,case when b.campaign_name is null then 'no emails' else b.campaign_name end as campaign\
             ,row_number() over (partition by a.id order by b.received_at desc) as row_num\
             ,case when count(distinct c.term_nd_key) > 0 then 1 else 0 end as nd_enrolled\
       from analytics_tables.applications a\
       left join blueshift.email_delivered b\
         on a.applicant_id = b.user_id and a.accepted_at <= b.received_at\
          and coalesce(a.first_charge_created_at,a.cohort_start_at + interval '1' day * 6) >= b.received_at\
          and b.event = 'email_delivered'\
       left join analytics_tables.term_enrollments c\
         on a.applicant_id = c.user_id and c.cohort_start_at <= a.accepted_at\
       where a.email not like '%@udacity%' and a.cohort_start_at <= current_date\
         and a.accepted_at::date >= '2017-06-01'\
       group by a.id,a.applicant_id,a.nd_key,a.cohort_id,a.accepted_at,a.cohort_start_at,a.first_charge_created_at\
             ,b.received_at,b.campaign_name"
df = pd.read_sql(sql,conn_hardy)

In [3]:
df.shape

(32681, 13)

### There're 32,681 records, 7773 of them paid (24%). total 13,513 students

In [4]:
df.groupby(['pay'])['id'].count()

pay
0    24908
1     7773
Name: id, dtype: int64

In [5]:
len(df['id'].unique())

13513

### Categorize campaigns: we will have payment reminder, career, event, weekly info send-out, nd related updates, etc..

In [9]:
df['campaign'] = np.where(df['campaign_name'].str.lower().str.contains('career'),'career',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('promo'),'promo',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('upsell'),'promo',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('discoveryweek'),'promo',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('survey'),'survey',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('alumni'),'alumni',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('lastchance'),'lastchance',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('appsclos'),'lastchance',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('applicationsclos'),'lastchance',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('weekly'),'announcements-info',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('announcement'),'announcements-info',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('topenroll'),'announcements-info',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('nur'),'nurture',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('comingsoon'),'nurture',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('invite'),'nurture',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('notify'),'notify',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('accept'),'notify',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('uconnect')
                          &df['campaign'].str.lower().str.contains('confirm'),'notify',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('google'),'scholarship',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('scholarship'),'scholarship',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('mentor'),'mentorship',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('robond_udacityexplores'),'explore',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('freepreview'),'freepreview',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('grad'),'grad/end of term',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('endofterm'),'grad/end of term',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('end of term'),'end of term reminder',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('vrnd_winback'),'winback',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('aind_re-engage'),'winback',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('aws'),'ND-event',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('webcast'),'ND-event',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('podcast'),'ND-event',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('challenge'),'ND-event',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('bosch'),'ND-event',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('arkit_launch'),'ND-event',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('event'),'event',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('forum'),'event',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('udacitytalk'),'event',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('udacity talk'),'event',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('infosession'),'event',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('techcrunch'),'event',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('outreach'),'event',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('incomplete'),'incomplete reminder',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('application'),'incomplete reminder',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('late_regist'),'extension',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('extension'),'extension',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('reviewsapp'),'reviews-app',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('app confirm'),'app confirmation',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('admissions-submit'),'app confirmation',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('application received'),'app confirmation',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('application confirm'),'app confirmation',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('course confirm'),'course confirm',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('abandon'),'abandon',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('recommendation'),'recommendation',df['campaign'])

df['campaign'] = np.where(df['campaign'].str.lower().str.contains('project'),'ND-update',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('correction'),'ND-update',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('curriculum'),'ND-update',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('welcome'),'ND-update',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('first day'),'ND-update',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('enrolled'),'ND-update',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('classroom'),'ND-update',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('nd'),'ND-update',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('downloadable'),'ND-update',df['campaign'])
df['campaign'] = np.where(df['campaign'].str.lower().str.contains('d27'),'ND-update',df['campaign'])

df['campaign'] = np.where(df['campaign_name'].str.lower().str.contains('payment'),'payment_reminder',df['campaign'])


### In general, notify is the biggest portition during this time frame (accept-pay), then ND related notifications (which means these students are enrolled with other NDs at the time), then payment reminders.

In [24]:
df.groupby(['campaign'])['id'].count().reset_index().sort_values(by='id',ascending=False)

Unnamed: 0,campaign,id
13,notify,10876
0,ND-update,3855
15,payment_reminder,3211
12,no emails,2765
11,mentorship,2213
16,promo,2047
14,nurture,2040
10,lastchance,1426
9,freepreview,1286
6,event,1101


### If we look at the very last email our student received before they put down payment, top 1 they received is notify, then no emails, then payment reminders

In [25]:
df.query('row_num==1').groupby(['campaign'])['id'].count().reset_index().sort_values(by='id',ascending=False)

Unnamed: 0,campaign,id
12,notify,3794
11,no emails,2765
14,payment_reminder,2065
0,ND-update,1696
15,promo,1172
13,nurture,632
6,event,355
10,mentorship,318
8,freepreview,181
9,lastchance,102


### If we create a Logistics Regression to see what features are causing the payment, here are the results.
#### 2 models we have here:
* pay = campaign_category + row_num
* pay = campaign_category + row_num + nd_enrolled (if enrolled in other ND at the time)

In [16]:
# Logistics Regression : campaign_category + row_num = pay in general
lr1 = smf.glm('pay~C(campaign)+row_num',data=df,family=sm.families.Binomial(sm.families.links.logit)).fit()
lr1.summary()

0,1,2,3
Dep. Variable:,pay,No. Observations:,32681.0
Model:,GLM,Df Residuals:,32659.0
Model Family:,Binomial,Df Model:,21.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-17460.0
Date:,"Mon, 06 Nov 2017",Deviance:,34920.0
Time:,15:07:50,Pearson chi2:,32600.0
No. Iterations:,19,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.4674,0.041,-35.784,0.000,-1.548,-1.387
C(campaign)[T.alumni],1.1151,0.608,1.834,0.067,-0.076,2.307
C(campaign)[T.announcements-info],1.0305,0.155,6.640,0.000,0.726,1.335
C(campaign)[T.app confirmation],-0.3653,0.488,-0.749,0.454,-1.321,0.590
C(campaign)[T.career],0.6968,0.352,1.981,0.048,0.007,1.386
C(campaign)[T.course confirm],0.2297,0.184,1.251,0.211,-0.130,0.590
C(campaign)[T.event],-0.2826,0.091,-3.098,0.002,-0.461,-0.104
C(campaign)[T.explore],21.7285,1.77e+04,0.001,0.999,-3.47e+04,3.48e+04
C(campaign)[T.extension],-0.4146,0.196,-2.114,0.035,-0.799,-0.030


### Result of pay = campaign_category + row_num:
* What're significant: (p value: the small the more influencer)
    * free preview (p-value:0)
    * last chance (p-value:0)
    * announcements-info (0)
    * no emails (0)
    * notify (0)
    * promo (0)
    * event (0.002)
    * payment reminder (0.006)
    * review-app (0.007)
    * extension (0.035)
    * career (0.048)
* row_num is also significant

In [14]:
# Logistics Regression : campaign_category + row_num + nd_enrolled = pay
lr2 = smf.glm('pay~C(campaign)+row_num+nd_enrolled',data=df,family=sm.families.Binomial(sm.families.links.logit)).fit()
lr2.summary()

0,1,2,3
Dep. Variable:,pay,No. Observations:,32681.0
Model:,GLM,Df Residuals:,32658.0
Model Family:,Binomial,Df Model:,22.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-17326.0
Date:,"Mon, 06 Nov 2017",Deviance:,34652.0
Time:,15:06:04,Pearson chi2:,32700.0
No. Iterations:,19,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.5802,0.042,-37.644,0.000,-1.662,-1.498
C(campaign)[T.alumni],0.8060,0.611,1.320,0.187,-0.391,2.003
C(campaign)[T.announcements-info],0.8879,0.157,5.660,0.000,0.580,1.195
C(campaign)[T.app confirmation],-0.3658,0.490,-0.747,0.455,-1.325,0.594
C(campaign)[T.career],0.4707,0.354,1.329,0.184,-0.224,1.165
C(campaign)[T.course confirm],0.2980,0.184,1.617,0.106,-0.063,0.659
C(campaign)[T.event],-0.3630,0.092,-3.950,0.000,-0.543,-0.183
C(campaign)[T.explore],21.3506,1.77e+04,0.001,0.999,-3.47e+04,3.48e+04
C(campaign)[T.extension],-0.4225,0.197,-2.144,0.032,-0.809,-0.036


### Result of pay = campaign_category + row_num + nd_enrolled:
* What're significant: (p value: the small the more influencer)
    * free preview (p-value:0)
    * nd_enrolled (0)
    * no emails (0)
    * notify (0)
    * event (0)
    * announcements-info (0)
    * last chance (p-value:0.001)
    * promo (0.004)
    * mentorship (0.006)
    * payment reminder (0.014)
    * extension (0.032)
    * nurture (0.045)
* row_num is also significant

### What if we only look at the lastest 2 emails before payment?

In [26]:
lr3 = smf.glm('pay~C(campaign)+row_num+nd_enrolled',data=df.query('row_num <= 2'),family=sm.families.Binomial(sm.families.links.logit)).fit()
lr3.summary()

0,1,2,3
Dep. Variable:,pay,No. Observations:,20548.0
Model:,GLM,Df Residuals:,20526.0
Model Family:,Binomial,Df Model:,21.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-10995.0
Date:,"Mon, 06 Nov 2017",Deviance:,21991.0
Time:,15:39:08,Pearson chi2:,20800.0
No. Iterations:,20,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.1594,0.072,-16.025,0.000,-1.301,-1.018
C(campaign)[T.alumni],-20.5725,1.66e+04,-0.001,0.999,-3.25e+04,3.25e+04
C(campaign)[T.announcements-info],0.8766,0.189,4.628,0.000,0.505,1.248
C(campaign)[T.app confirmation],-0.3149,0.636,-0.495,0.621,-1.562,0.933
C(campaign)[T.career],-0.1639,0.587,-0.279,0.780,-1.314,0.986
C(campaign)[T.course confirm],0.5011,0.218,2.298,0.022,0.074,0.929
C(campaign)[T.event],-0.4874,0.128,-3.802,0.000,-0.739,-0.236
C(campaign)[T.extension],0.3593,0.224,1.605,0.108,-0.079,0.798
C(campaign)[T.freepreview],-0.1622,0.113,-1.431,0.152,-0.384,0.060


### Result of pay = campaign_category + row_num + nd_enroll for lastest 2 emails received:
* What're significant: (p value: the small the more influencer)
    * announcements-info (0)
    * event (0)
    * last chance (p-value:0)
    * no emails (0)
    * notify (0)
    * promo (0)
    * nurture (0.004)
    * course confirm (0.022)

## Summary:

* Common significant features:
    * notify
    * last chance
    * promo
    * nurture
    * announcements-info
    * event
    * no email
* Regardless of only focusing on lastest 2 emails received, more significant features we get are:
    * free preview
    * payment reminder
    * extension

* One thing to notice is that: students who **already enrolled in another ND** have higher possibility to pay for another ND
* No emails received is sig. causing payment looks odd. Could be data issue.
* Set aside notify and payment reminders, also set aside emails from other NDs students enrolled at the time, from these findings, **free preview**, **last chance** (still a reminder) and **promo**/**nurture** should be good appoaches to students encouraging them to put down payment