# Subscription and Donation data exploration

## load and aggregate data

- load subscription and donation data separately.
- group by email and calculate the frequency, recency and monetary value of subscriptions and donations
- combine processed data

In [7]:
import pandas as pd
import datetime
from subscription_donation_preprocess import sub_don_process
from subscription_donation_preprocess import combine_dat_sets

# Show the whole content of each column instead of truncating long values
# (the hashed emails are long). None means "no limit"; the former -1 sentinel
# is deprecated since pandas 1.0 and may be rejected by later releases.
pd.set_option('display.max_colwidth', None)

# Directory holding the 2nd-batch (v2) user exports; change it here only.
USER_DATA_DIR = '/home/centos/mojo/data/user_data_v2'

subscription_filenames = [USER_DATA_DIR + '/MJM_datakind.csv']
donation_filenames = [USER_DATA_DIR + '/DON_datakind.csv',
                      USER_DATA_DIR + '/SDN_datakind.csv']

# sub_don_process is a project helper; it is unpacked here into the subscription
# rows, the donation rows and the combined per-email frame.
# NOTE(review): column_names presumably maps the helper's logical field names
# to the column headers used in these files - confirm against the helper.
subscription, donation, combined = sub_don_process(subscription_filenames,
                                                   donation_filenames,
                                                   column_names = { 'email':'Hashed Email',
                                                                    'amount':'AMT PAID',
                                                                    'date': 'ORD ENTR DT',
                                                                    'pubcode':'PUB'})

# code to process 1st batch data:
#subscription_filenames = ['/home/centos/mojo/data/user_data_v1/subscriptions.xlsx']
#donation_filenames = ['/home/centos/mojo/data/user_data_v1/donations_combined.xlsx']

#subscription, donation, combined = sub_don_process(subscription_filenames,
#                                                   donation_filenames,
#                                                   column_names = { 'email':'EMAIL',
#                                                                    'amount':'ORD REMT',
#                                                                    'date': 'ORD ENTR DT',
#                                                                    'pubcode':'ORD-PUB-CODE'})

In [4]:
# Peek at the first five rows of the combined per-email frame.
combined.head(n=5)

Unnamed: 0,Email,subs_total,subs_freq,subs_recency,subs_range,don_total,don_freq,don_recency,don_range,MJM,DON,SDN
0,00005c1f2524a64575752c470bdfa7d39f36d4a36d43836dd878f66d7991a0b12c8b4e64c29df52344bf54c71df239213aba5d20df72f34b904cb46b1e0b50b3,12.0,1.0,945.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0001719c06a46b0bcf726de6e5b136650e7ed6227d9f92a78a3c8d57f7f4f04d085dd6ef2db4fa9e094ad99a9c342405fd3c0c4f4babea78d452a3863c99fc82,12.0,2.0,581.0,257.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,00017b12b907aafc44713af066597cebb77a49daf61fce9496d7f3be05d8e55f37e130edbf44a8b0229e8057d554520309decbcb1eb2efb720854a8572224ae2,12.0,2.0,517.0,287.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0001a5b58db4e9ad3e9fedfa025c486ebc557f20032612e86227290b42db857566c1ef5edfe7c57ee5dea86cc3d7146036ac9a3758d70b838eff7d5f1cded92e,10.0,1.0,172.0,0.0,5.0,1.0,741.0,0.0,1.0,1.0,0.0
4,000246ace7348212b208427e04a0f7d78372db042b6de4203da2a880dda1381af6d5b50c79909efcc500d47443efc801e402c70efcfb59f493a4c1591a1980df,0.0,1.0,934.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


- save the processed data to disk for future use

In [8]:
# Persist the combined frame; a later cell reads this same file back.
processed_pickle_path = 'subscription_donation_processed_data_v2.pkl'
combined.to_pickle(processed_pickle_path)

In [11]:

# Rename the email and pub-code columns of both raw frames to 'EMAIL' and
# 'ORD-PUB-CODE' (the names the 1st-batch files used) and parse the
# order-entry date column into datetimes. Both frames are modified in place.
renamed_columns = {'Hashed Email': 'EMAIL', 'PUB':'ORD-PUB-CODE'}
for raw_frame in (subscription, donation):
    raw_frame.rename(columns=renamed_columns, inplace=True)
    raw_frame['ORD ENTR DT'] = pd.to_datetime(raw_frame['ORD ENTR DT'])


### summarize the donation and subscription data separately

In [12]:
# Summarise subscriptions and donations separately. Statistics come from the
# per-email frame `combined`; the printed examples are looked up in the raw
# `subscription` / `donation` frames through their 'EMAIL' column, so the
# rename cell above must have been run first.

# One cutoff and one comparison (strictly greater than) for everything labelled
# "high-value", so the example donor belongs to the same set that is counted.
HIGH_VALUE_DONATION_TOTAL = 500
high_value_donors = combined.loc[combined['don_total'] > HIGH_VALUE_DONATION_TOTAL]

print('#### #### #### #### #### #### #### ')
print('#### summary of subscriptions #### ')
print(combined.loc[combined['subs_freq'] > 0][['subs_total','subs_freq','subs_recency']].describe())
print('\n')
# The filters are inclusive (>=), so the labels read "at least", not "more than".
print('{} subscribed at least twice'.format(combined.loc[combined['subs_freq'] >=2]['Email'].count()))
print('{} subscribed at least three times'.format(combined.loc[combined['subs_freq'] >=3]['Email'].count()))
print('{} subscribed at least five times'.format(combined.loc[combined['subs_freq'] >=5]['Email'].count()))

print('#### an example of freq subscribers #### ')
# First email with subs_freq >= 5; .iloc[0] raises IndexError when there is none.
print(subscription[subscription['EMAIL']==combined.loc[combined['subs_freq'] >=5]['Email'].iloc[0]])

print('\n')
print('#### #### #### #### #### #### #### ')
print('#### summary of donations #### ')
print(combined.loc[combined['don_freq'] > 0][['don_total','don_freq','don_recency']].describe())
print('\n')
print('#### an example of freq donors #### ')
# First email with don_freq >= 5; .iloc[0] raises IndexError when there is none.
print(donation[donation['EMAIL']==combined.loc[combined['don_freq'] >=5]['Email'].iloc[0]])

print('#### an example of high-value donors #### ')
print(donation[donation['EMAIL']==high_value_donors['Email'].iloc[0]])

print('\n')
print(' number of high-value donors:{}'.format(high_value_donors['Email'].count()))
# Denominator: emails whose donation total is positive.
print(' {}% of donnars are high-value donors'.format(round(100*high_value_donors['Email'].count()/combined.loc[combined['don_total'] > 0]['Email'].count(),2)))

#### #### #### #### #### #### #### 
#### summary of subscriptions #### 
          subs_total      subs_freq   subs_recency
count  120171.000000  120171.000000  120171.000000
mean   23.824416      2.490434       419.329031   
std    19.769585      2.904536       306.054325   
min    0.000000       1.000000       9.000000     
25%    12.000000      1.000000       149.000000   
50%    18.000000      2.000000       364.000000   
75%    34.950000      3.000000       688.000000   
max    1935.600000    131.000000     1108.000000  


73783 subscribed more than twice
37457 subscribed more than three
7513 subscribed more than five times
#### an example of freq subscribers #### 
                                                                                                                                 EMAIL  \
1222  00099c0b11c244ea19a249ad00844bf5f92f7bfd0cb572ab7f9da62556366785045435ca27d66f55740f1cdc42e308fa2335810c61c9d4cefca87ae3627044df   
1223  00099c0b11c244ea19a249ad00844bf5f92f7bfd

### summarize how often subscribers donate and how often donors are subscribers

In [13]:
# Flag every email as subscriber / donor: astype('bool') turns a frequency of 0
# into False and any other value into True.
is_subscriber = combined['subs_freq'].astype('bool')
is_donor = combined['don_freq'].astype('bool')

# Contingency table of the two flags, with row/column totals ('All').
combined_cf = pd.crosstab(is_subscriber,
                          is_donor,
                          rownames=['subscription'],
                          colnames=['donation'], margins=True)
print('confusion matrix of subscription and donation')
print('\n')
print(combined_cf)
print('\n')

# The percentages are computed from the flags rather than from fixed positions
# in the table: crosstab only lists values that occur, so if either flag had a
# single observed value the rows/columns would shift and positional lookups
# would silently read the wrong cells.
n_subscriber_and_donor = (is_subscriber & is_donor).sum()
print('{}% of subscribers also donated'.format(round(100*n_subscriber_and_donor/is_subscriber.sum())))
print('{}% of donors are subscribers'.format(round(100*n_subscriber_and_donor/is_donor.sum())))


confusion matrix of subscription and donation


donation      False   True     All
subscription                      
False         0      35348  35348 
True          85681  34490  120171
All           85681  69838  155519


29.0% of subscribers also donated
49.0% of donors are subscribers


## Combine subscription & donation data with newsletter click data
- load processed newsletter click data, subscription data and donation data
- for newsletter click data, group by email and count the unique urls and the unique topics that were clicked

In [14]:
# The click-data pickle is created by
# https://github.com/datakind/mj_user_engagement/blob/tingting/newsletter_clicks_parser/mojo_newsletter_clicks_preprocess.py
# The second pickle is the combined frame saved earlier in this notebook.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
click_data = pd.read_pickle('/home/centos/mojo/newsletter_processed_data_v2.pkl')
sub_don_combined = pd.read_pickle('subscription_donation_processed_data_v2.pkl')

# One row per email: number of distinct urls and distinct topics clicked.
distinct_counts = {'Url': pd.Series.nunique, 'topic': pd.Series.nunique}
nclicks_by_users = (
    click_data
    .groupby('Email')
    .agg(distinct_counts)
    .reset_index()
)

- use outer join to combine processed newsletter click data, subscription data and donation data based on emails

In [15]:
# Check the join key before merging. If the two frames share no Email value
# (for example because one side holds hashed and the other plain addresses),
# the merge still succeeds but no subscriber or donor gets any click data, and
# every statistic computed from the result is trivially zero.
n_shared_emails = sub_don_combined['Email'].isin(nclicks_by_users['Email']).sum()
if n_shared_emails == 0:
    print('WARNING: no Email value is shared between the subscription/donation '
          'data and the newsletter click data; check that both use the same email encoding')

# combine_dat_sets is a project helper; both frames are joined on 'Email'.
# NOTE(review): the notebook text describes this as an outer join - confirm in the helper.
sub_don_click_combined = combine_dat_sets(sub_don_combined, nclicks_by_users, 'Email','Email')
sub_don_click_combined = sub_don_click_combined.set_index('Email')

In [16]:
# First three rows of the merged frame (indexed by Email).
sub_don_click_combined.head(3)

Unnamed: 0_level_0,subs_total,subs_freq,subs_recency,subs_range,don_total,don_freq,don_recency,don_range,MJM,DON,SDN,Url,topic
Email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
00005c1f2524a64575752c470bdfa7d39f36d4a36d43836dd878f66d7991a0b12c8b4e64c29df52344bf54c71df239213aba5d20df72f34b904cb46b1e0b50b3,12.0,1.0,945.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
0001719c06a46b0bcf726de6e5b136650e7ed6227d9f92a78a3c8d57f7f4f04d085dd6ef2db4fa9e094ad99a9c342405fd3c0c4f4babea78d452a3863c99fc82,12.0,2.0,581.0,257.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
00017b12b907aafc44713af066597cebb77a49daf61fce9496d7f3be05d8e55f37e130edbf44a8b0229e8057d554520309decbcb1eb2efb720854a8572224ae2,12.0,2.0,517.0,287.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## summarize subscribers' click activities


In [17]:
# Subscribers: rows whose subs_freq is positive.
has_subscription = sub_don_click_combined['subs_freq'] > 0
sub = sub_don_click_combined[has_subscription]

# True for subscribers with at least one distinct url clicked.
sub_clicked = sub['Url'] > 0

print('{} out of {} subscribers clicked newsletters in Oct'.format(sub_clicked.sum(), len(sub)))
print('{}% of subscribers clicked newsletters'.format(round(sub_clicked.mean()*100,1)))
print('summary of subscribers click activities in Oct')
# Distribution of distinct urls / topics among the subscribers who clicked.
sub.loc[sub_clicked, ['Url','topic']].describe()

0 out of 120171 subscribers clicked newsletters in Oct
0.0% of subscribers clicked newsletters
summary of subscribers click activities in Oct


Unnamed: 0,Url,topic
count,0.0,0.0
mean,,
std,,
min,,
25%,,
50%,,
75%,,
max,,


## summarize donors' click activities

In [18]:
# Donors: rows whose don_freq is positive.
has_donation = sub_don_click_combined['don_freq'] > 0
don = sub_don_click_combined[has_donation]

# True for donors with at least one distinct url clicked.
don_clicked = don['Url'] > 0

print('{} out of {} donars clicked newsletters in Oct'.format(don_clicked.sum(), len(don)))
print('{}% of doners clicked newsletters'.format(round(don_clicked.mean()*100,1)))
print('summary of doners click activities in Oct')
# Distribution of distinct urls / topics among the donors who clicked.
don.loc[don_clicked, ['Url','topic']].describe()

0 out of 69838 donars clicked newsletters in Oct
0.0% of doners clicked newsletters
summary of doners click activities in Oct


Unnamed: 0,Url,topic
count,0.0,0.0
mean,,
std,,
min,,
25%,,
50%,,
75%,,
max,,


## summarize how often the emails that clicked the newsletters belong to donors or subscribers

In [19]:
# Emails with at least one distinct url clicked, selected once and reused.
clickers = sub_don_click_combined[sub_don_click_combined['Url'] > 0]

# Share of those emails that also have a positive subscription / donation count.
pct_clickers_subscribed = round((clickers['subs_freq'] > 0).mean()*100, 1)
pct_clickers_donated = round((clickers['don_freq'] > 0).mean()*100, 1)

print('{}% of emails who clicked are subscribers '.format(pct_clickers_subscribed))
print('{}% of emails who clicked are donars '.format(pct_clickers_donated))

0.0% of emails who clicked are subscribers 
0.0% of emails who clicked are donars 
