# Subscription and Donation data exploration

## load and aggregate data

- load subscription and donation data separately.
- group by email and calculate the frequency, recency and monetary value of subscriptions and donations
- combine processed data

In [1]:
import datetime

import pandas as pd

from subscription_donation_preprocess import combine_dat_sets, sub_don_process

# Show the whole content of each column (the hashed e-mail values are long).
# `None` means "no limit"; the old `-1` sentinel was deprecated in pandas 1.0
# and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# NOTE(review): absolute, machine-specific paths -- adjust them (or move them
# into a configurable data directory) when running on another machine.
subsciption_filename = '/home/centos/mojo/data/subscriptions.xlsx'
donation_filename = '/home/centos/mojo/data/donations_combined.xlsx'

# `sub_don_process` is a project helper; it hands back three objects that the
# rest of the notebook uses as the raw subscription table, the raw donation
# table and the per-email combined table.
subscription, donation, combined = sub_don_process(subsciption_filename,
                                                   donation_filename)

- save the processed data to disk for future use

In [2]:
# Persist the per-email aggregate so later sections can reload it without
# re-running the preprocessing step.
processed_pickle = 'subscription_donation_processed_data.pkl'
combined.to_pickle(processed_pickle)

### summarize the donation and subscription data separately

In [2]:
# Donors whose total donated amount exceeds this value are reported as
# "high-value". The same threshold (and the same strict comparison) is used
# for the example row, the count and the percentage below.
HIGH_VALUE_DONATION = 500


def _print_example(raw_records, emails, title):
    """Print `title`, then the raw rows of the first e-mail in `emails`.

    `raw_records` is a raw table with an 'EMAIL' column and `emails` is a
    Series of candidate e-mails. When no e-mail matched the filter, a short
    notice is printed instead of raising IndexError on `.iloc[0]`.
    """
    print(title)
    if emails.empty:
        print('(no matching email found)')
        return
    print(raw_records[raw_records['EMAIL'] == emails.iloc[0]])


print('#### #### #### #### #### #### #### ')
print('#### summary of subscriptions #### ')
subscriber_rows = combined.loc[combined['subs_freq'] > 0]
print(subscriber_rows[['subs_total', 'subs_freq', 'subs_recency']].describe())
print('\n')
# The filters are inclusive (>=), so the labels say "at least", not "more than".
for min_subs, label in [(2, 'twice'), (3, 'three times'), (5, 'five times')]:
    n_emails = combined.loc[combined['subs_freq'] >= min_subs, 'Email'].count()
    print('{} subscribed at least {}'.format(n_emails, label))

_print_example(subscription,
               combined.loc[combined['subs_freq'] >= 5, 'Email'],
               '#### an example of freq subscribers #### ')

print('\n')
print('#### #### #### #### #### #### #### ')
print('#### summary of donations #### ')
donor_rows = combined.loc[combined['don_freq'] > 0]
print(donor_rows[['don_total', 'don_freq', 'don_recency']].describe())
print('\n')
_print_example(donation,
               combined.loc[combined['don_freq'] >= 5, 'Email'],
               '#### an example of freq donors #### ')

high_value_emails = combined.loc[combined['don_total'] > HIGH_VALUE_DONATION, 'Email']
_print_example(donation,
               high_value_emails,
               '#### an example of high-value donors #### ')

print('\n')
n_high_value = high_value_emails.count()
# Denominator: every e-mail with a positive donated total.
n_with_donation = combined.loc[combined['don_total'] > 0, 'Email'].count()
print(' number of high-value donors:{}'.format(n_high_value))
print(' {}% of donnars are high-value donors'.format(round(100*n_high_value/n_with_donation, 2)))

#### #### #### #### #### #### #### 
#### summary of subscriptions #### 
         subs_total     subs_freq  subs_recency
count  51177.000000  51177.000000  51177.000000
mean   20.349169     1.262794      212.296168  
std    12.248219     0.627154      122.522390  
min    2.000000      1.000000      19.000000   
25%    12.000000     1.000000      104.000000  
50%    18.000000     1.000000      210.000000  
75%    24.000000     1.000000      329.000000  
max    990.000000    49.000000     423.000000  


11646 subscribed more than twice
1107 subscribed more than three
122 subscribed more than five times
#### an example of freq subscribers #### 
                                                                                                                                  EMAIL  \
44033  04fcd1ff223405668b4226410cfab7a2b5d553480927da9ce1ebb8ac0a1dc84428682d3d8c80b93c863de44c325bbd57fbbf54c4ad07bfa0f0879856c719014e   
44034  04fcd1ff223405668b4226410cfab7a2b5d553480927da9ce1ebb8ac0a1dc84428

### summarize how often subscribers donate and how often donors are subscribers

In [3]:
# Boolean flags: does the e-mail have at least one subscription / donation?
is_subscriber = combined['subs_freq'].astype('bool')
is_donor = combined['don_freq'].astype('bool')

# 2x2 contingency table plus the 'All' margins (third row / third column).
combined_cf = pd.crosstab(is_subscriber,
                          is_donor,
                          rownames=['subscription'],
                          colnames=['donation'],
                          margins=True)

print('confusion matrix of subscription and donation')
print('\n')
print(combined_cf)
print('\n')

# Positional lookups: row/column 0 = False, 1 = True, 2 = All.
n_both = combined_cf.iloc[1, 1]
n_subscribers = combined_cf.iloc[1, 2]
n_donors = combined_cf.iloc[2, 1]
print('{}% of subscribers also donated'.format(round(100*n_both/n_subscribers)))
print('{}% of donors are subscribers'.format(round(100*n_both/n_donors)))


confusion matrix of subscription and donation


donation      False   True    All
subscription                     
False         0      27326  27326
True          42030  9147   51177
All           42030  36473  78503


18.0% of subscribers also donated
25.0% of donors are subscribers


## Combine subscription & donation data with newsletter click data
- load processed newsletter click data, subscription data and donation data
- for newsletter click data, group by email and count the unique URLs and the unique topics that were clicked

In [4]:
# The click pickle is produced by
# https://github.com/datakind/mj_user_engagement/blob/tingting/newsletter_clicks_parser/mojo_newsletter_clicks_preprocess.py
click_data = pd.read_pickle('/home/centos/mojo/newsletter_processed_data.pkl')
sub_don_combined = pd.read_pickle('subscription_donation_processed_data.pkl')

# One row per e-mail: number of distinct URLs and distinct topics clicked.
distinct_counts = {'Url': pd.Series.nunique, 'topic': pd.Series.nunique}
nclicks_by_users = (
    click_data
    .groupby('Email')
    .agg(distinct_counts)
    .reset_index()
)

- use outer join to combine processed newsletter click data, subscription data and donation data based on emails

In [7]:
# Join the subscription/donation aggregate with the click counts on 'Email'
# (via the project helper), then use the e-mail as the row index.
sub_don_click_combined = combine_dat_sets(
    sub_don_combined,
    nclicks_by_users,
    'Email',
    'Email',
).set_index('Email')

In [8]:
# Peek at the first three rows of the combined table.
sub_don_click_combined.head(3)

Unnamed: 0_level_0,subs_total,subs_freq,subs_recency,don_total,don_freq,don_recency,Url,topic
Email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
00005cfbddbc503ae8837036a9b96cec672997922271a736c2253c88108a9c937c65648e3e7232ba8bbc4cab14fef463e94a1acb2287de698d1790af58c4d8d3,22.0,1.0,248.0,0.0,0.0,0.0,2.0,2.0
0000a8d23211fec8ed10561725547e244648f4dfd391c7d7262d324eac5a15c805d18b21c03df78da876048a1a7d9954c62535a678296c16b75432944d51e167,36.0,2.0,26.0,0.0,0.0,0.0,0.0,0.0
00020b15c1f68335def0ad9595c3c4924988436ef3c6a07f4777cd31e9cd9fc3f0db62e389c469210098fee366e1aee1e7533828ec5c43d3bba3ffabd5a2eb47,10.0,1.0,124.0,0.0,0.0,0.0,0.0,0.0


## summarize subscribers' click activities


In [9]:
# E-mails with at least one subscription.
sub = sub_don_click_combined.loc[sub_don_click_combined['subs_freq'] > 0]

# True for subscribers who clicked at least one newsletter URL.
sub_clicked = sub['Url'] > 0

print('{} out of {} subscribers clicked newsletters in Oct'.format(sub_clicked.sum(),
                                                                len(sub)))
print('{}% of subscribers clicked newsletters'.format(round(sub_clicked.mean()*100, 1)))
print('summary of subscribers click activities in Oct')
sub.loc[sub_clicked, ['Url', 'topic']].describe()

4194 out of 51177 subscribers clicked newsletters in Oct
8.2% of subscribers clicked newsletters
summary of subscribers click activities in Oct


Unnamed: 0,Url,topic
count,4194.0,4194.0
mean,5.125894,1.930138
std,8.36571,1.117957
min,1.0,1.0
25%,1.0,1.0
50%,3.0,2.0
75%,6.0,3.0
max,242.0,8.0


## summarize donors' click activities

In [10]:
# E-mails with at least one donation.
don = sub_don_click_combined.loc[sub_don_click_combined['don_freq'] > 0]

# True for donors who clicked at least one newsletter URL.
don_clicked = don['Url'] > 0

print('{} out of {} donars clicked newsletters in Oct'.format(don_clicked.sum(),
                                                           len(don)))
print('{}% of doners clicked newsletters'.format(round(don_clicked.mean()*100, 1)))
print('summary of doners click activities in Oct')
don.loc[don_clicked, ['Url', 'topic']].describe()

5410 out of 36473 donars clicked newsletters in Oct
14.8% of doners clicked newsletters
summary of doners click activities in Oct


Unnamed: 0,Url,topic
count,5410.0,5410.0
mean,6.458965,2.136969
std,9.973003,1.213371
min,1.0,1.0
25%,2.0,1.0
50%,3.0,2.0
75%,8.0,3.0
max,238.0,8.0


## summarize how often the emails that clicked the newsletters are donors or subscribers

In [11]:
# Restrict to e-mails that clicked at least one newsletter URL, then report
# which share of them also appears as a subscriber / as a donor.
clickers = sub_don_click_combined.loc[sub_don_click_combined['Url'] > 0]
subscriber_share = (clickers['subs_freq'] > 0).mean()
donor_share = (clickers['don_freq'] > 0).mean()

print('{}% of emails who clicked are subscribers '.format(round(subscriber_share*100, 1)))
print('{}% of emails who clicked are donars '.format(round(donor_share*100, 1)))

7.6% of emails who clicked are subscribers 
9.8% of emails who clicked are donars 
