# User Exploration - Grouped Insights 1

#### This notebook takes a deep dive into donor, subscriber and newsletter click data to derive thorough insights into user behavior

#### Before proceeding, make sure to:
- run the separate data_preprocessing.ipynb to generate needed files and save them in '../data/':
- email_clean_2017_18.csv
- MJM_datakind_filtered.csv
- DON_datakind_filtered.csv
- SDN_datakind_filtered.csv

# 1. Set up environment

### 1.1 load python packages, custom-defined functions and set up environment

In [None]:
# load all the packages
import pandas as pd
import os
import urllib.parse
from ua_parser import user_agent_parser
import numpy as np
import datetime
from IPython.display import display
from subscription_donation_preprocess_functions import *
from MojoNewsletterClicksParser import *
from user_exploration_functions import *
import matplotlib.pyplot as plt
from textwrap import wrap
import itertools
import re
from time import strptime
import seaborn as sns
sns.set()


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# Show the whole content of every column instead of truncating long strings.
# `None` is the supported "no limit" value; the legacy `-1` sentinel was
# deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)


### 1.2 define path and file names
- these names need to be edited if your data files are named differently or stored in a different folder

In [None]:
# Every raw input and every cached intermediate lives in '../data',
# resolved against the directory the notebook is launched from.
data_folder = os.path.abspath(os.path.join(os.getcwd(), '../data'))


def _data_path(name):
    """Return the full path of the file called `name` inside `data_folder`."""
    return os.path.join(data_folder, name)


# ---- inputs created by data_preprocessing.ipynb ----
# cleaned newsletter click data
cleanfile = _data_path('email_clean_2017_18.csv')
# filtered subscription (MJM) and donation (DON, SDN) files
subscription_filenames = [_data_path('MJM_datakind_filtered.csv')]
donation_filenames = [_data_path(name) for name in ('DON_datakind_filtered.csv',
                                                    'SDN_datakind_filtered.csv')]

# LDA feature file; if it doesn't exist in '../data', manually move it there
lda_feature_filename = _data_path('lda20_results.csv')


# ---- files written by this notebook ----
# final processed click data
click_data_filename = _data_path('newsletter_processed_data_final.pkl')
# final processed subscription and donation data
sub_don_combined_filename = _data_path('subscription_donation_processed_data_final.pkl')
# count of clicks by LDA topics
nclicks_by_modeling_topics_filename = _data_path('nclicks_by_modeling_topics.pkl')
# count of clicks by MoJo topics
nclicks_by_various_topics_filename = _data_path('nclicks_by_various_topics.pkl')
# combined subscription, donation and click data
sub_don_click_combined_filename = _data_path('subscription_donation_click_combined_data_final.pkl')


# 2. Data Pre-process
- run data_preprocessing.ipynb before proceeding 


In [None]:
# The data folder should already contain every file produced by
# data_preprocessing.ipynb; all processed data is saved there as well.
# Check all four inputs (click data, MJM, DON, SDN), not just two of them,
# and name the ones that are missing so the problem is easy to fix.
required_inputs = [cleanfile] + subscription_filenames + donation_filenames
missing_inputs = [path for path in required_inputs if not os.path.exists(path)]

if not missing_inputs:
    print("you are ready to proceed!")
else:
    print("missing files; run data_preprocessing.ipynb before proceeding!")
    for path in missing_inputs:
        print("  - " + path)

## 2.1 parse click data to extract detailed information from Urls and save as python object


In [None]:
%%time
# Load the parsed click data from its pickle cache when present; otherwise
# parse the cleaned click CSV and write the cache for the next run.
print('get click data...')
if os.path.exists(click_data_filename):
    click_data = pd.read_pickle(click_data_filename)
else:
    
    # read the cleaned click file with explicit dtypes, then rename the
    # URL/EMAIL columns to the 'Url'/'Email' spelling used from here on
    dat = pd.read_csv(cleanfile, dtype={'MONTH':object,'YEAR': int,'EMAIL':object,'file_month':str,'URL': object})
    dat.rename(columns={'URL':'Url','EMAIL':'Email'}, inplace=True)

    # create an instance of the class MojoNewsletterClicksParser
    nl = MojoNewsletterClicksParser(url_df=dat)

    # Extend the parser's data frame with Url and domain-type information;
    # date (by_date) and user-agent (by_ua) parsing are switched off here
    full_url_df = nl.extend_url_df( nl.url_df,
                                    by_date=False,
                                    by_url=True,
                                    by_domain_type=True,
                                    by_ua=False)
    # re-attach file_month as a datetime column taken from the raw frame
    # NOTE(review): this aligns on the row index, so it assumes extend_url_df
    # keeps dat's index unchanged - TODO confirm
    full_url_df['file_month'] = pd.to_datetime(dat['file_month'])

    # keep only the selected columns; per the original note this step retains
    # only Urls with the www.motherjones.com domain (behaviour of
    # mojo_standard_parser is not visible here - verify in the parser module)
    click_data = nl.mojo_standard_parser(full_url_df,selected_cols=['Email',
                                                                    'Url',
                                                                    'domain',
                                                                    'domain_type',
                                                                    'topic',
                                                                    'title',
                                                                    'file_month'])
    # cache the result so the parsing above is skipped next time
    print('saving data...')
    click_data.to_pickle(click_data_filename)
    print('saved final processed click data as ' + click_data_filename)

## 2.2 load and combine all the data


- load subscription and donation data separately.
- group by email and calculate the frequency, recency, date range and money value of subscriptions and donations
- combine processed subscription and donation data
- takes around 5 min
- refer to section 1.2 for filename definitions

In [None]:
%%time
def _read_order_files(filenames):
    """Read raw order files and stack them into a single DataFrame.

    Args:
        filenames: iterable of paths ending in '.csv' or '.xlsx'.

    Returns:
        The row-wise concatenation (pd.concat) of all files, in the order given.

    Raises:
        ValueError: if a path has any other extension. The previous inline
            loops silently re-appended the frame left over from the prior
            iteration (or hit a NameError) in that situation.
    """
    frames = []
    for path in filenames:
        lowered = path.lower()
        if lowered.endswith('.csv'):
            # ISO-8859-1 maps every possible byte to a character, so this read
            # can never raise UnicodeDecodeError; the old utf-8 fallback for
            # subscription files was therefore unreachable and has been dropped.
            frames.append(pd.read_csv(path, encoding="ISO-8859-1"))
        elif lowered.endswith('.xlsx'):
            frames.append(pd.read_excel(path))
        else:
            raise ValueError('unsupported file type (expected .csv or .xlsx): ' + path)
    return pd.concat(frames)


# load data if its pickle exists otherwise reprocess the data
print('get combined subscription and donation data...')
if os.path.exists(sub_don_combined_filename):
    # the aggregated table comes from the cache; the raw order tables are
    # still read from the filtered files because later sections use them
    combined = pd.read_pickle(sub_don_combined_filename)
    subscription = _read_order_files(subscription_filenames)
    donation = _read_order_files(donation_filenames)
    print('load the combined processed subscription and donation data as data frame combined')
else:
    subscription, donation, combined = sub_don_process(subscription_filenames,
                                                       donation_filenames,
                                                       column_names = { 'email':'Hashed Email',
                                                                        'amount':'AMT PAID',
                                                                        'date': 'ORD ENTR DT',
                                                                        'pubcode':'PUB'})

    combined.to_pickle(sub_don_combined_filename)

# reformat some columns: common EMAIL / ORD-PUB-CODE names and a real
# datetime order date, applied identically to both raw tables
order_column_names = {'Hashed Email': 'EMAIL', 'PUB': 'ORD-PUB-CODE'}

subscription = subscription.rename(columns=order_column_names)
subscription['ORD ENTR DT'] = pd.to_datetime(subscription['ORD ENTR DT'])

donation = donation.rename(columns=order_column_names)
donation['ORD ENTR DT'] = pd.to_datetime(donation['ORD ENTR DT'])


- Then load pre-processed click data
- group by email and count the unique number of urls and topics (both MoJo topics and LDA topics) the email clicked.
- combine processed subscription and donation data with the aggregated click data to a single wide data frame.
- Take around 15min

In [None]:
%%time
# Build the wide per-email table (subscription + donation + click counts),
# or reload it from its pickle when a cached copy already exists.
print('get combined user and click data...')
if not os.path.exists(sub_don_click_combined_filename):

    # per email: number of distinct urls and distinct MoJo topics clicked
    nclicks_by_users = (click_data
                        .groupby('Email')
                        .agg({'Url': pd.Series.nunique, 'topic': pd.Series.nunique})
                        .reset_index())

    # per email: clicks in each MoJo topic, one column per topic
    # (the empty-string topic column is relabelled 'NA')
    nclicks_by_topics = reshape_data_to_wide(click_data,
                                             row='Email',
                                             col='topic',
                                             element='Url',
                                             cal='sum')
    nclicks_by_topics = nclicks_by_topics.reset_index().rename(columns={'': 'NA'})

    # per email: clicks per LDA topic (only the top 3 LDA topics of each post)
    nclicks_by_modeling_topics = cal_nclicks_by_modeling_topics_percentile(
        topic_filename=lda_feature_filename,
        raw_click_data=click_data,
        prefix='LDA_',
        topNtopics=3)

    # MoJo-topic counts joined with LDA-topic counts, gaps filled with 0
    nclicks_by_various_topics = nclicks_by_topics.merge(nclicks_by_modeling_topics,
                                                        left_on='Email',
                                                        right_on='Email',
                                                        how='left')
    nclicks_by_various_topics = nclicks_by_various_topics.fillna(0)

    # save as pickles for easy access
    nclicks_by_modeling_topics.to_pickle(nclicks_by_modeling_topics_filename)
    nclicks_by_various_topics.to_pickle(nclicks_by_various_topics_filename)

    # left-join the three click summaries onto the donation/subscription table
    sub_don_click_combined_temp = combine_dat_sets(combined,
                                                   nclicks_by_users,
                                                   'Email', 'Email', join_method='left')
    sub_don_click_combined_temp2 = combine_dat_sets(sub_don_click_combined_temp,
                                                    nclicks_by_topics,
                                                    'Email', 'Email', join_method='left')
    sub_don_click_combined = combine_dat_sets(sub_don_click_combined_temp2,
                                              nclicks_by_modeling_topics,
                                              'Email', 'Email', join_method='left').set_index('Email')

    # save as pickle for easy access
    sub_don_click_combined.to_pickle(sub_don_click_combined_filename)

else:
    sub_don_click_combined = pd.read_pickle(sub_don_click_combined_filename)
    print('load the combined processed subscription, donation and click data as data frame sub_don_click_combined')

### The final product of the aggregation process:
data frame 'sub_don_click_combined' with Email as index and columns explained as below:


|column name| description|
| --- | --- | 
|subs_total|total dollar paid for subscription|
|subs_freq| # subscriptions|
|subs_recency| # days since the latest subscription|
|subs_range| # days between the first and the last subscriptions|
|don_total|total dollar paid for donation|
|don_freq| # donations|
|don_recency| # days since the latest donation|
|don_range|# days between the first and the last donations|
|MJM| if the email has pub code MJM |
|DON| if the email has pub code DON |
|SDN| if the email has pub code SDN (recurring donor) |
|process_date| the date when the data is aggregated to calculate the date range |
|Url| # unique urls the email clicked | 
|topic|	# unique topic the email clicked |
|NA| # urls the email clicked for topic 'NA' |	
|DeclineFreeSubscription-Confirmed|	# urls the email clicked for topic 'DeclineFreeSubscription' |
|Russia|# urls the email clicked for topic 'Russia' |
|about|# urls the email clicked for topic 'about' |
|author|# urls the email clicked for topic 'author' |
|authors|# urls the email clicked for topic 'authors' |
|blue-marble|# urls the email clicked for topic 'blue-marble' |
|category|# urls the email clicked for topic 'category' |
|contributor|# urls the email clicked for topic 'contributor' |
|crime-justice|# urls the email clicked for topic 'crime-justice' |
|digitalsample|# urls the email clicked for topic 'digitalsample' |
|environment|# urls the email clicked for topic 'environment' |
|food|# urls the email clicked for topic 'food' |
|kevin-drum|# urls the email clicked for topic 'kevin-drum' |
|media|# urls the email clicked for topic 'media' |
|mediakit|# urls the email clicked for topic 'mediakit' |
|mixed-media|# urls the email clicked for topic 'mixed-media' |
|mojo|# urls the email clicked for topic 'mojo' |
|newsletters|# urls the email clicked for topic 'newsletters' |
|politics|# urls the email clicked for topic 'politics' |
|riff|# urls the email clicked for topic 'riff' |
|thank-you-for-sticking-with-us|# urls the email clicked for topic 'thank-you-for-sticking-with-us' |
|thanks-for-signing-up|# urls the email clicked for topic 'thanks-for-signing-up' |
|tom-philpott|# urls the email clicked for topic 'tom-philpott' |
|topics|# urls the email clicked for topic 'topics' |
|All| # urls the email clicked for all the topics |
|LDA_Environment|# urls the email clicked for this LDA topic |
|LDA_Misc1|# urls the email clicked for this LDA topic | 
|LDA_Healthcare|# urls the email clicked for this LDA topic | 
|LDA_Internet_and_Culture|# urls the email clicked for this LDA topic |
|LDA_Economics_and_Taxes|# urls the email clicked for this LDA topic | 
|LDA_Drugs|# urls the email clicked for this LDA topic | 
|LDA_Criminal_Justice|# urls the email clicked for this LDA topic |
|LDA_Corporations_and_Labor|# urls the email clicked for this LDA topic | 
|LDA_Misc2|# urls the email clicked for this LDA topic | 
|LDA_Trump_and_Russia|# urls the email clicked for this LDA topic |
|LDA_Police_and_Prison|# urls the email clicked for this LDA topic | 
|LDA_Trump_and_Natl_Politics|# urls the email clicked for this LDA topic |
|LDA_Obama_and_Dems|# urls the email clicked for this LDA topic | 
|LDA_Race_and_Racism|# urls the email clicked for this LDA topic | 
|LDA_Food|# urls the email clicked for this LDA topic | 
|LDA_Misc3|# urls the email clicked for this LDA topic |
|LDA_Republican_Party|# urls the email clicked for this LDA topic | 
|LDA_Military|# urls the email clicked for this LDA topic | 
|LDA_Education|# urls the email clicked for this LDA topic |
|LDA_National_Politics|# urls the email clicked for this LDA topic |


# 3. User Insights - Overall Summary of Donors, Subscribers and Online Readers

## 3.1 basic summary of the data (page 3)



In [None]:
%%time
# Summaries for the two raw order tables, in the same call order as before:
# first the unique-email counts by pub code, then the total amounts by pub code.
# basic_counts / basic_sum are project helpers brought in by the star imports.
subscription_counts, donation_counts = (basic_counts(orders) for orders in (subscription, donation))
subscription_sums, donation_sums = (basic_sum(orders) for orders in (subscription, donation))

### subscription data basic counts summary

In [None]:
# Report the headline numbers held in subscription_counts.
sub_first_day = subscription_counts['min_date'].date()
sub_last_day = subscription_counts['max_date'].date()

print('### SUBSCRIPTION ###')
print("Data date ranges from {} to {}".format(sub_first_day, sub_last_day))
print("count of unique emails: {}".format(subscription_counts['nemails']))
print("count of unique emails by pub code:")
# rich display for the per-pub-code table
display(subscription_counts['nemails_by_pub'])
print('count of overlap between pub codes: {}'.format(subscription_counts['noverlap']))

### donation data basic counts summary

In [None]:
# Report the headline numbers held in donation_counts: date range, unique
# donors overall and per pub code (as counts and as shares of all donors),
# and how many donors appear under both DON and SDN.
print('### DONATION ###')
print("Data date ranges from {} to {}".format(donation_counts['min_date'].date(),donation_counts['max_date'].date()))
print("count of unique emails: {}".format(str(donation_counts['nemails'])))
print("count of unique emails by pub code:") 
display(donation_counts['nemails_by_pub'])
# same table expressed as a fraction of all unique donor emails
print(donation_counts['nemails_by_pub']/donation_counts['nemails'])
print('\n')
print('count of emails having both DON and SDN: {}'.format(donation_counts['noverlap']))
# fixed message: the percent sign belongs after the number, and "donoars" was a typo
print('{}% donors have both DON and SDN'.format(round(donation_counts['noverlap']/donation_counts['nemails']*100,2)))

### Create a pie chart to present the amount distribution

In [None]:
import matplotlib.pyplot as plt

# square axes so the pie is drawn as a circle
fig, ax = plt.subplots(figsize=(8, 6), subplot_kw={"aspect": "equal"})

# pub codes in wedge order, with the legend text for each
recipe = ["MJM","SDN","DON"]
labels = ['MJM (subscriptions)','SDN (recurring donations)','DON (single donations)']

# MJM totals come from the subscription sums, SDN and DON from the donation sums;
# each total is cut to a whole dollar amount
sums_by_code = (subscription_sums, donation_sums, donation_sums)
data = [int(sums['sumamt_by_pub'].loc[code]) for code, sums in zip(recipe, sums_by_code)]

def func(pct, allvals):
    """Build the text shown on one pie wedge: the percentage and its dollar value.

    Args:
        pct: the wedge's share of the pie, in percent (as passed to `autopct`).
        allvals: all values plotted in the pie; their sum is the 100% amount.

    Returns:
        A two-line string such as "50.0%\n($1,000)".
    """
    # Round to the nearest dollar. Plain int() truncates, so float noise such
    # as 28.999999999999996 used to be reported one dollar low.
    absolute = int(round(pct / 100. * np.sum(allvals)))
    return "{:.1f}%\n(${:,})".format(pct, absolute)


# Draw the wedges; every wedge is labelled through func() with its share of
# `data` and the matching dollar amount, in white text.
wedges, texts, autotexts = ax.pie(
    data,
    autopct=lambda pct: func(pct, data),
    textprops={"color": "w"},
)

# legend placed to the right of the pie
ax.legend(
    wedges,
    labels,
    title="category",
    loc="center left",
    bbox_to_anchor=(1, 0, 0.5, 1),
)

# make the in-wedge labels larger and bold
plt.setp(autotexts, size=12, weight="bold")

ax.set_title("Donation and Subscription Percentages")

plt.show()

## 3.2 Create US State Maps for donation and subscription
- subset data to only include US data
- aggregate by state and count total amt paid and number of unique emails 

In [None]:
def _summarize_us_orders_by_state(orders, state_population):
    """Aggregate the US rows of an order table by state and index them by population.

    Args:
        orders: DataFrame with COUNTRY, STATE, 'AMT PAID' and EMAIL columns.
        state_population: DataFrame with a 'state' column to join on and a
            'pct' column holding each state's share of the total population.

    Returns:
        DataFrame with one row per STATE that also appears in
        `state_population`, holding:
          - 'AMT PAID': total amount paid in the state
          - 'EMAIL': number of unique emails in the state
          - 'amt_pct' / 'num_pct': the state's share of the two totals above
          - 'num_index': num_pct divided by the state's population share
        plus the columns of `state_population`.
    """
    by_state = (orders[orders.COUNTRY == 'UNITED STATES']
                .groupby('STATE')
                .agg({'AMT PAID': 'sum', 'EMAIL': 'nunique'})
                .reset_index())
    # shares are taken before the merge, so STATE values that have no match in
    # state_population still count towards the denominators
    by_state['amt_pct'] = by_state['AMT PAID'] / by_state['AMT PAID'].sum()
    by_state['num_pct'] = by_state['EMAIL'] / by_state['EMAIL'].sum()
    by_state = by_state.merge(state_population, left_on='STATE', right_on='state')
    by_state['num_index'] = by_state['num_pct'] / by_state['pct']
    return by_state


# load state abbr and their 18+ population, and each state's share of the total
state_population = pd.read_csv(os.path.join(data_folder, 'state_population.csv'))
state_population['pct'] = state_population['pop'] / state_population['pop'].sum()

# donation summary by state (US only); index = donor pct / state population pct
# NOTE: `df` is reassigned by the yearly-overlap cell in section 3.3, so re-run
# this cell before re-running any of the donation maps.
df = _summarize_us_orders_by_state(donation, state_population)

# subscription summary by state (US only); index = subscriber pct / state population pct
df2 = _summarize_us_orders_by_state(subscription, state_population)

- calculate the percentage of donation, donors and subscribers that are from US

In [None]:
# US-only slices of the two raw order tables
us_donations = donation[donation.COUNTRY == 'UNITED STATES']
us_subscriptions = subscription[subscription.COUNTRY == 'UNITED STATES']

# share of donated dollars that comes from the US
amt_US_pct = sum(us_donations['AMT PAID']) / sum(donation['AMT PAID'])
print("{}% donation are from US".format(round(amt_US_pct * 100, 2)))

# share of unique donor emails that are in the US
US_donors_pct = len(us_donations['EMAIL'].unique()) / len(donation['EMAIL'].unique())
print("{}% donors are from US".format(round(US_donors_pct * 100, 2)))

# share of unique subscriber emails that are in the US
US_subscribers_pct = len(us_subscriptions['EMAIL'].unique()) / len(subscription['EMAIL'].unique())
print("{}% subscribers are from US".format(round(US_subscribers_pct * 100, 2)))

### plot Donation Amount by State

In [None]:
# State map coloured by total donation dollars (df = per-state donation summary)
statemap_args = dict(
    map_title='Donation Amount by State (2016-2018)',
    bar_title='Donation $',
    states=df['STATE'],
    values=df['AMT PAID'],
)
fig = plot_statemap(**statemap_args)
iplot(fig, filename='d3-cloropleth-map')

### Plot the number of Donors by State

In [None]:
# State map coloured by the number of unique donor emails
statemap_args = dict(
    map_title='The number of Donors by State (2016-2018)',
    bar_title='# donors',
    states=df['STATE'],
    values=df['EMAIL'],
)
fig = plot_statemap(**statemap_args)
iplot(fig)

### Plot Donor Number Index by state

In [None]:
# State map coloured by donor index (state donor share / state population share)
statemap_args = dict(
    map_title='Donor Number Index by state (weighted by state 18+ population)(2016-2018)',
    bar_title='donor # index',
    states=df['STATE'],
    values=df['num_index'],
)
fig = plot_statemap(**statemap_args)
iplot(fig)

### Plot the number of Subscribers by State

In [None]:
# State map coloured by the number of unique subscriber emails
# (df2 = per-state subscription summary)
statemap_args = dict(
    map_title='The number of Subscribers by State (2016-2018)',
    bar_title='# subscribers',
    states=df2['STATE'],
    values=df2['EMAIL'],
)
fig = plot_statemap(**statemap_args)
iplot(fig)

### plot Subscriber Number Index by state

In [None]:
# State map coloured by subscriber index
# (state subscriber share / state population share)
statemap_args = dict(
    map_title='Subscriber Number Index by state (weighted by state 18+ population)(2016-2018)',
    bar_title='Subscriber # index',
    states=df2['STATE'],
    values=df2['num_index'],
)
fig = plot_statemap(**statemap_args)
iplot(fig)

## 3.3 Summary of the overlap between donors and subscribers 

- Which donors are also subscribers and vice versa?
- create confusion matrix between donors and subscribers and find overlap

In [None]:
# Cross-tabulate "has any subscription" against "has any donation"
subscribed_flag = combined['subs_freq'].astype('bool')
donated_flag = combined['don_freq'].astype('bool')
combined_cf = pd.crosstab(subscribed_flag,
                          donated_flag,
                          rownames=['subscription'],
                          colnames=['donation'],
                          margins=True)
print('confusion matrix of magazine subscription and donation')
print('\n')
print(combined_cf)
print('\n')

# Cells are read by position: this assumes both False and True occur on each
# axis, so that position 1 is the True row/column and position 2 the 'All' margin.
n_subscribed_and_donated = combined_cf.iloc[1, 1]
n_subscribers = combined_cf.iloc[1, 2]
n_donors = combined_cf.iloc[2, 1]
# sum of the SDN column over emails that both subscribed and donated
n_subscribed_recurring = sum(combined[(combined['subs_freq'] > 0) & (combined['don_freq'] > 0)]['SDN'])

print('{}% of magazine subscribers also donated'.format(round(100 * n_subscribed_and_donated / n_subscribers)))
print('{}% of magazine subscribers also are recurring donors'.format(round(100 * n_subscribed_recurring / n_subscribers)))

print('{}% of donors are magazine subscribers'.format(round(100 * n_subscribed_and_donated / n_donors)))

# donors with at least one clicked newsletter url
don_clicked_url = sum((sub_don_click_combined['don_freq'] > 0) & (sub_don_click_combined['Url'] > 0))
total_donor = sum(sub_don_click_combined['don_freq'] > 0)

print('Also, {} donors clicked newsletter ({}%)'.format(don_clicked_url, round(don_clicked_url / total_donor * 100, 2)))
    

### analyze the donor & subscriber overlap change by year 

In [None]:
# Order year of every row, computed once instead of once per loop pass
sub_years = subscription['ORD ENTR DT'].dt.year
don_years = donation['ORD ENTR DT'].dt.year

# only years that occur in the subscription data are reported
unique_years = sub_years.unique()

overlap_by_year = {'year':[],'overlap':[],'nsub':[],'ndon':[]}

for single_year in unique_years:
    # unique emails that subscribed / donated in this calendar year
    single_sub = subscription[sub_years == single_year].EMAIL.unique()
    single_don = donation[don_years == single_year].EMAIL.unique()

    year_row = {'year': single_year,
                'overlap': len(np.intersect1d(single_sub, single_don)),
                'nsub': len(single_sub),
                'ndon': len(single_don)}
    for key, value in year_row.items():
        overlap_by_year[key].append(value)

df = pd.DataFrame.from_dict(overlap_by_year)
# overlap as a share of that year's subscribers / donors, shown as a percentage string
df['sub_pct'] = (df['overlap'] / df['nsub']).map('{:.1%}'.format)
df['don_pct'] = (df['overlap'] / df['ndon']).map('{:.1%}'.format)

# reorder the columns and give them readable headers
df = df[['year','ndon','nsub','overlap','sub_pct','don_pct']].rename(
    columns={'ndon':'# donors',
             'nsub':'# subscribers',
             'overlap':'# overlap',
             'sub_pct':'% subscribers who donated',
             'don_pct':'% donors who subscribed'})

# narrow header cells so the long column names wrap
df.style.set_table_styles([dict(selector="th",props=[('max-width', '100px')])])

## 3.4 transition between donors and subscribers
### If people both donated and subscribed, did they donate first or subscribe first? How many days between the two actions?

In [None]:
# Emails that both subscribed and donated. Filter first and copy only the
# matching rows; the previous version copied the whole wide table and then
# threw most of that copy away.
is_donor_and_subscriber = (sub_don_click_combined['subs_freq'] > 0) & (sub_don_click_combined['don_freq'] > 0)
both_donor_sub = sub_don_click_combined[is_donor_and_subscriber].copy()

# days between the first donation / subscription and the process date
# (range = days from first to last order, recency = days since the last order)
both_donor_sub['earliest_don'] = both_donor_sub['don_range'] + both_donor_sub['don_recency']
both_donor_sub['earliest_sub'] = both_donor_sub['subs_range'] + both_donor_sub['subs_recency']

# the same two moments expressed as calendar dates
both_donor_sub['earliest_don_date'] = both_donor_sub['process_date'] + [datetime.timedelta(days = -x) for x in both_donor_sub['earliest_don']]
both_donor_sub['earliest_sub_date'] = both_donor_sub['process_date'] + [datetime.timedelta(days = -x) for x in both_donor_sub['earliest_sub']]

# keep only the derived columns and turn the Email index back into a column
both_donor_sub = both_donor_sub[['earliest_don','earliest_sub','earliest_don_date','earliest_sub_date']]
both_donor_sub = both_donor_sub.reset_index()

In [None]:
def transition_type(earliest_don, earliest_sub):
    """Label which action came first for one person.

    Both arguments are "days before the processing date" of the first donation and
    the first subscription, so the LARGER value is the EARLIER event.

    Returns one of 'Donate first', 'Subscribe first',
    'Donate and subscribe on the same day', or None when the two values cannot be
    ordered (e.g. one of them is NaN).
    """
    outcomes = (
        (earliest_don > earliest_sub, 'Donate first'),
        (earliest_don < earliest_sub, 'Subscribe first'),
        (earliest_don == earliest_sub, 'Donate and subscribe on the same day'),
    )
    for holds, label in outcomes:
        if holds:
            return label
    return None

# Label each person by which action came first.
# The columns are addressed by name: the previous positional lookup (x[1], x[2]) silently depended
# on the column order left behind by reset_index() and is deprecated for labelled rows.
both_donor_sub['transition_type'] = [
    transition_type(don_days, sub_days)
    for don_days, sub_days in zip(both_donor_sub['earliest_don'], both_donor_sub['earliest_sub'])
]
# Absolute number of days between the first donation and the first subscription.
both_donor_sub['transition_period'] = (both_donor_sub['earliest_don'] - both_donor_sub['earliest_sub']).abs()

# Per transition type: number of e-mails and the MEDIAN gap in days.
# The column used to be labelled "average" although the aggregate is a median.
both_donor_sub_transition = (
    both_donor_sub
    .groupby('transition_type')
    .agg({'Email': 'count', 'transition_period': 'median'})
    .rename(columns={'Email': 'email count',
                     'transition_period': 'median days between transition'})
)
both_donor_sub_transition['email pct (%)'] = (
    both_donor_sub_transition['email count'] / both_donor_sub_transition['email count'].sum() * 100
).round(2)

# Last expression of the cell: show the share and the median gap for each transition type.
both_donor_sub_transition[['email pct (%)', 'median days between transition']]

## 3.5 magazine subscription renew rate
### check how many magazine subscribers renewed

In [None]:
# exclude users who subscribed the first time within a year
blacklist_emails = combined[(combined['subs_freq'] ==1) & (combined['subs_recency']<360)]
potential_renew_subscribers = combined[(combined['subs_freq'] >0 ) & (~combined['Email'].isin(blacklist_emails['Email']))]

renew_rate = ((potential_renew_subscribers['subs_freq']> 1) & (potential_renew_subscribers['subs_range']> 300)).mean()
# assume people only renew within 60 days of expiration
print("magazine renew rate is " + str(round(renew_rate*100,2)) + '%')

## 3.6 donation distribution

### analyze the relationship between cumulative donation and cumulative email freq for donation below $1000

In [None]:
# Cumulative share of donors (e-mails) and of donated dollars, by each donor's total donation.
# NOTE(review): assumes don_email_cumpct returns rows keyed by 'don_total' with cumulative
# percentage columns 'don_total_cumpct' and 'Email_cumpct' - confirm in user_exploration_functions.
cnts = don_email_cumpct(dat = combined[(combined['don_freq'] > 0)])

# limit the plot to donation totals below $1000 for better visualization
cnts_below1000 = cnts[cnts['don_total'] < 1000]
plt.rcParams['figure.figsize'] = [8, 6]
plt.plot('don_total', 'don_total_cumpct', data=cnts_below1000,
         marker='', markerfacecolor='green', markersize=12, color='green', linewidth=8)
plt.plot('don_total', 'Email_cumpct', data=cnts_below1000,
         marker='', color='skyblue', linewidth=8)
plt.legend(labels = ['cumulative % of total donations','cumulative % of emails'])
plt.grid(True)
plt.title("\n".join(wrap("Relationship between Cumulative Percentages of Donations and Donors",50)),
          fontweight='bold', color = 'purple', fontsize = 'large')
plt.xlabel("Donation Amount", fontweight='bold', color = 'darkblue', fontsize = 'large')
plt.ylabel("Cumulative Percentage (%)", fontweight='bold', color = 'darkblue', fontsize = 'large')
plt.xticks([0,500,1000], fontweight='bold', fontsize = 'large')
plt.yticks(np.arange(0, 101, step=20), fontweight='bold', fontsize = 'large')
# dashed marker at the $500 threshold reported below
plt.axvline(x=500, linestyle = '--', color='olive', linewidth=2)

# Read the cumulative shares at the $500 mark from the last row at or below $500.
# An exact `don_total == 500` match returns an empty (or multi-row) selection whenever the data
# does not contain exactly one such row, and int(round(...)) then fails.
upto_500 = cnts[cnts['don_total'] <= 500].sort_values('don_total')

ec = upto_500['Email_cumpct'].iloc[-1]
print(str(int(round(ec))) +'% donors donated $500 or less')

dc = upto_500['don_total_cumpct'].iloc[-1]
print('Their donations account for ' + str(int(round(dc))) +'% of the total donations')

## 3.7 compare donation distribution among subscribers vs non-subscribers 

In [None]:
# Split donors by whether they also hold a magazine subscription.
is_donor = combined['don_freq'] > 0
is_subscriber = combined['subs_freq'] > 0
is_nonsubscriber = combined['subs_freq'] == 0

donor_subscriber_cnts = don_email_cumpct(dat = combined[is_donor & is_subscriber])
donor_nonsubscriber_cnts = don_email_cumpct(dat = combined[is_donor & is_nonsubscriber])

# Same tables restricted to donation totals below $1000 (used for plotting).
donor_subscriber_cnts_below1000 = donor_subscriber_cnts[donor_subscriber_cnts['don_total'] < 1000]
donor_nonsubscriber_cnts_below1000 = donor_nonsubscriber_cnts[donor_nonsubscriber_cnts['don_total'] < 1000]

In [None]:
# plot donation distribution from subscribers vs non-subscribers                                                                                                                                                
plt.rcParams['figure.figsize'] = [10, 8]
plt.plot( 'don_total', 'don_total_cumpct', 
         data = donor_subscriber_cnts_below1000, 
         marker='', markerfacecolor='green', markersize=10, color='green', linewidth=6, linestyle = 'solid')

plt.plot( 'don_total', 'don_total_cumpct',
         data = donor_nonsubscriber_cnts_below1000, 
         marker='', color='red', linewidth=6, linestyle = 'dashed')

plt.plot( 'don_total', 'Email_cumpct',
         data = donor_subscriber_cnts_below1000, 
         marker='', color='skyblue', linewidth=6, linestyle = 'solid')

plt.plot( 'don_total', 'Email_cumpct',
         data = donor_nonsubscriber_cnts_below1000, 
         marker='', color='orange', linewidth=6, linestyle = 'dashed')

plt.legend(labels = ['cum % of total donations (subscribers)',
                     'cum % of total donations (non-subscribers)',
                     'cum % of donors (subscribers)',
                     'cum % of donors (non-subscribers)'
                    ])
plt.grid(True)
plt.title("\n".join(wrap("Relationship between Cumulative Percentages of Donations and Donors",50)),fontweight='bold', color = 'purple', fontsize = 'large')
plt.xlabel("Donation Amount",fontweight='bold', color = 'darkblue', fontsize = 'large')
plt.ylabel("Cumulative Percentage (%)",fontweight='bold', color = 'darkblue', fontsize = 'large')
plt.xticks([0,500,1000],fontweight='bold', fontsize = 'large')
plt.yticks(np.arange(0, 101, step=20),fontweight='bold', fontsize = 'large')
plt.axvline(x=500, linestyle = '--',color='olive', linewidth=2)


ds_ec = donor_subscriber_cnts_below1000['Email_cumpct'][donor_subscriber_cnts_below1000['don_total']==500]
print(str(int(round(ds_ec))) +'% subscriber donors donated $500 or less')
ds_dc = donor_subscriber_cnts_below1000['don_total_cumpct'][donor_subscriber_cnts_below1000['don_total']==500]
print('Their donations account for ' + str(int(round(ds_dc))) +'% of the total subscriber donations')

dns_ec = donor_nonsubscriber_cnts_below1000['Email_cumpct'][donor_nonsubscriber_cnts_below1000['don_total']==500]
print(str(int(round(dns_ec))) +'% non-subscriber donors donated $500 or less')
dns_dc = donor_nonsubscriber_cnts_below1000['don_total_cumpct'][donor_nonsubscriber_cnts_below1000['don_total']==500]
print('Their donations account for ' + str(int(round(dns_dc))) +'% of the total non-subscriber donations')

ds_ec2 = donor_subscriber_cnts['Email_cumpct'][donor_subscriber_cnts['don_total']==1000]
print(str(int(round(ds_ec2))) +'% subscriber donors donated $1000 or less')
ds_dc2 = donor_subscriber_cnts['don_total_cumpct'][donor_subscriber_cnts['don_total']==1000]
print('Their donations account for ' + str(int(round(ds_dc2))) +'% of the total subscriber donations')

dns_ec2 = donor_nonsubscriber_cnts['Email_cumpct'][donor_nonsubscriber_cnts['don_total']==1000]
print(str(int(round(dns_ec2))) +'% non-subscriber donors donated $1000 or less')
dns_dc2 = donor_nonsubscriber_cnts['don_total_cumpct'][donor_nonsubscriber_cnts['don_total']==1000]
print('Their donations account for ' + str(int(round(dns_dc2))) +'% of the total non-subscriber donations')

## 3.8 Campaign Effectiveness Analysis

### Campaign Effectiveness Analysis 
#### group donation data into spring, fall and December campaigns
- for each campaign, create three groups:
- donation from the two months prior to the campaign
- donation during the campaign
- donation from the two months after the campaign
- campaign months are: 5,6,9,10,12

In [None]:
# consider 5,6 as spring campaign; 9,10 as fall campaign; 12 as dec campaign
# divide donotions into pre, mid and post of each campaign groups
don_spring_camp = donation[donation['ORD ENTR DT'].dt.month.isin([5,6])].groupby('EMAIL').agg({'AMT PAID':'sum'})
don_fall_camp = donation[donation['ORD ENTR DT'].dt.month.isin([9,10])].groupby('EMAIL').agg({'AMT PAID':'sum'})
don_dec_camp = donation[donation['ORD ENTR DT'].dt.month.isin([12])].groupby('EMAIL').agg({'AMT PAID':'sum'})

don_pre_spring_camp = donation[donation['ORD ENTR DT'].dt.month.isin([3,4])].groupby('EMAIL').agg({'AMT PAID':'sum'})
don_pre_fall_camp = donation[donation['ORD ENTR DT'].dt.month.isin([7,8])].groupby('EMAIL').agg({'AMT PAID':'sum'})
don_pre_dec_camp = donation[donation['ORD ENTR DT'].dt.month.isin([11])].groupby('EMAIL').agg({'AMT PAID':'sum'})

don_post_spring_camp = donation[donation['ORD ENTR DT'].dt.month.isin([7,8])].groupby('EMAIL').agg({'AMT PAID':'sum'})
don_post_fall_camp = donation[donation['ORD ENTR DT'].dt.month.isin([11])].groupby('EMAIL').agg({'AMT PAID':'sum'})
don_post_dec_camp = donation[donation['ORD ENTR DT'].dt.month.isin([1])].groupby('EMAIL').agg({'AMT PAID':'sum'})


camp = donation[donation['ORD ENTR DT'].dt.month.isin([5,6,9,10,12])].groupby('EMAIL').agg({'AMT PAID':'sum'})
pre_camp = donation[donation['ORD ENTR DT'].dt.month.isin([3,4,7,8,11])].groupby('EMAIL').agg({'AMT PAID':'sum'})
post_camp = donation[donation['ORD ENTR DT'].dt.month.isin([7,8,11,1])].groupby('EMAIL').agg({'AMT PAID':'sum'})

In [None]:
# reformat the donation group into a data frame with campaign label 
campaign_dic = {'period':['pre-campaign']*(don_pre_spring_camp.shape[0] + don_pre_fall_camp.shape[0] + don_pre_dec_camp.shape[0]) + \
                      ['mid-campaign']*(don_spring_camp.shape[0] + don_fall_camp.shape[0] + don_dec_camp.shape[0]) + \
                      ['post-campaign']*(don_post_spring_camp.shape[0] + don_post_fall_camp.shape[0] + don_post_dec_camp.shape[0]),         
             'Donation/Person': pd.concat([don_pre_spring_camp,don_pre_fall_camp,don_pre_dec_camp,
                                           don_spring_camp,don_fall_camp,don_dec_camp,
                                           don_post_spring_camp,don_post_fall_camp,don_post_dec_camp])['AMT PAID'],
             'campaign':['Spring/Summer']*don_pre_spring_camp.shape[0] + ['Fall']*don_pre_fall_camp.shape[0] + ['Dec']*don_pre_dec_camp.shape[0] + \
                        ['Spring/Summer']*don_spring_camp.shape[0] + ['Fall']*don_fall_camp.shape[0] + ['Dec']*don_dec_camp.shape[0] + \
                        ['Spring/Summer']*don_post_spring_camp.shape[0] + ['Fall']*don_post_fall_camp.shape[0] + ['Dec']*don_post_dec_camp.shape[0]

            }
campaign_df = pd.DataFrame.from_dict(campaign_dic)

In [None]:
# summarize the total donation amounts pre, mid and post each campaign peirod. 
campaign_summary = campaign_df.groupby(['period', 'campaign']).agg({'Donation/Person':['count','sum']})
campaign_summary.columns = ["_".join(x) for x in campaign_summary.columns.ravel()]
campaign_summary = campaign_summary.reset_index()
campaign_summary.rename(columns = {'Donation/Person_sum':'Total Donation','Donation/Person_count':'Total Number of Donors'}, inplace=True)
campaign_summary = pd.concat([campaign_summary[campaign_summary.period == 'pre-campaign'],
                              campaign_summary[campaign_summary.period == 'mid-campaign'],
                              campaign_summary[campaign_summary.period == 'post-campaign']])
campaign_summary = campaign_summary.sort_values('campaign', ascending = False)
print(campaign_summary)


In [None]:
# visualize the total donation comparison using barchart
g = sns.catplot(x="campaign", 
                y="Total Donation", 
                data=campaign_summary,
                hue="period", 
                height=6,aspect=1.2, kind="bar", palette="muted")
g.despine(left=True)
g.set_ylabels("Total Donation")
plt.title('Campaign Effectiveness on Total Donation', fontsize = 'x-large')

In [None]:
# visualize the total donors comparison using barchart
g = sns.catplot(x="campaign", 
                y="Total Number of Donors", 
                data=campaign_summary,
                hue="period", 
                height=6,aspect=1.2, kind="bar", palette="muted")
g.despine(left=True)
g.set_ylabels("Total Number of Donors")
plt.title('Campaign Effectiveness on the Number of Donors', fontsize = 'x-large')

In [None]:
# visualize the donation per person comparison by box plot
sns.set(rc={'figure.figsize':(20,10)})
testPlot = sns.catplot(x='campaign', y='Donation/Person',
                       hue='period', data=campaign_df, palette="muted",
                       showfliers=False,
                       kind='box'
                       ,height=8, aspect=1.5
                      )

plt.title('Campaign Effectiveness on Donation per Person', fontsize = 'x-large')

print("Median values:")
print(campaign_df.groupby(['period', 'campaign'])['Donation/Person'].median())

## 3.9 donation and subscription trend analysis
- Aggregate subscription and donation data by month
- Only donations are considered for now (DON and SDN combined)

In [None]:
# get all the unique year numbers from subscription data
unique_years = subscription['ORD ENTR DT'].dt.year.unique()


# count of the number of donors by month
counts_by_year = {'year_month':[],'DON':[]}
for single_year, single_month in itertools.product(unique_years,range(1,13)):

    subset_donation = donation[(donation['ORD ENTR DT'].dt.year == single_year) & \
                               (donation['ORD ENTR DT'].dt.month == single_month)]
    if (not subset_donation.empty):
        bc_donation = basic_counts(subset_donation)
        counts_by_year['DON'].append(int(bc_donation['nemails']))        
        counts_by_year['year_month'].append(datetime.datetime(single_year,single_month,1))
    
counts_by_year_df = pd.DataFrame.from_dict(counts_by_year)

# count of the number of donations by month
events_by_year = {'year_month':[],'DON':[]}
for single_year, single_month in itertools.product(unique_years,range(1,13)):
    
    subset_donation = donation[(donation['ORD ENTR DT'].dt.year == single_year) & \
                               (donation['ORD ENTR DT'].dt.month == single_month)]
    if (not subset_donation.empty):
        events_by_year['DON'].append(subset_donation.shape[0])
        events_by_year['year_month'].append(datetime.datetime(single_year,single_month,1))
    
events_by_year_df = pd.DataFrame.from_dict(events_by_year)

# count of the donation amount by month
amounts_by_year = {'year_month':[],'DON':[]}
for single_year, single_month in itertools.product(unique_years,range(1,13)):
    subset_donation = donation[(donation['ORD ENTR DT'].dt.year == single_year) & \
                               (donation['ORD ENTR DT'].dt.month == single_month)]
                                        
    if (not subset_donation.empty):
        bc_donation = basic_sum(subset_donation)
        amounts_by_year['DON'].append(bc_donation['sumamt'])
        
        amounts_by_year['year_month'].append(datetime.datetime(single_year,single_month,1))

amounts_by_year_df = pd.DataFrame.from_dict(amounts_by_year)

# count of avg donation per person by month
amounts_unit_by_year = {'year_month':[],'DON':[]}
for single_year, single_month in itertools.product(unique_years,range(1,13)):

    subset_donation = donation[(donation['ORD ENTR DT'].dt.year == single_year) & \
                               (donation['ORD ENTR DT'].dt.month == single_month)]
                                        
    if not subset_donation.empty:
        total_amounts = basic_sum(subset_donation)
        total_emails = basic_counts(subset_donation)

        amounts_unit_by_year['DON'].append(int(total_amounts['sumamt_by_pub'].loc['DON'])/int(total_emails['nemails_by_pub'].loc['DON']))     
        amounts_unit_by_year['year_month'].append(datetime.datetime(single_year,single_month,1))

amounts_unit_by_year_df = pd.DataFrame.from_dict(amounts_unit_by_year)

### plot to show the number of subscribers /donors change by month

### plot the number of donors change by month

In [None]:
# Monthly number of donors.
make_solid_plot(
    df = counts_by_year_df,
    x = 'year_month',
    y = 'DON',
    xlab = "month",
    ylab = "number of donors",
    title = "The number of donors change by month",
    legends = ['# donors by month','trend line (5 month avg)','end of year'],
)

### plot Total donation change by month

In [None]:
# Monthly total donation amount.
make_solid_plot(
    df = amounts_by_year_df,
    x = 'year_month',
    y = 'DON',
    xlab = "month",
    ylab = "total amount",
    title = "Total donation change by month",
    legends = ['total donation by month','trend line (5 month avg)','end of year'],
)

In [None]:
# Monthly average donation per donor.
# The y axis shows a per-donor average, so it is labelled as such (it used to read "total amount",
# copied from the total-donation plot).
make_solid_plot(x = 'year_month', y = 'DON', df = amounts_unit_by_year_df,
                xlab = "month", ylab = "avg amount per donor", title = "Donors' avg contribution change by month",
                legends = ['Donor avg donation by month','trend line (5 month avg)','end of year'])

## 3.10 newsletter click trend analysis

### aggregate click data by month and make trend plot

### plot the number people who clicked newsletters by month

In [None]:
# Number of distinct e-mails that clicked at least once in each month.
nemails_by_month = click_data.groupby('file_month')['Email'].nunique().reset_index()

make_click_data_scatter_plot(
    df = nemails_by_month,
    x_colname = 'file_month',
    y_colname = 'Email',
    xlab = "month",
    ylab = "# emails",
    title = "# people who clicked newsletters by month",
    legends = ['# online readers by month','trend line (5 month avg)','end of year'],
)

### plot the # unique posts clicked by month

In [None]:
# Number of distinct URLs clicked in each month.
nurls_by_month = click_data.groupby('file_month')['Url'].nunique().reset_index()

make_click_data_scatter_plot(
    df = nurls_by_month,
    x_colname = 'file_month',
    y_colname = 'Url',
    xlab = "month",
    ylab = "# unique posts",
    title = "# unique posts clicked by month",
    legends = ['# unique posts clicked by month','trend line (5 month avg)','end of year'],
)

### plot total # clicks by month

In [None]:
# Total number of clicks (non-null URLs) in each month.
nclicks_by_month = click_data.groupby('file_month')['Url'].count().reset_index()

make_click_data_scatter_plot(
    df = nclicks_by_month,
    x_colname = 'file_month',
    y_colname = 'Url',
    xlab = "month",
    ylab = "# clicks",
    title = "total # clicks by month",
    legends = ['total # clicks by month','trend line (5 month avg)','end of year'],
)

### plot the number of readers who clicked a post for the first time in each month, i.e. # new readers by month

In [None]:
# First month in which each e-mail clicked a 'mojo_standard' link.
standard_clicks = click_data[click_data.domain_type == 'mojo_standard']
email_1st_click = standard_clicks.groupby('Email').agg({'file_month':'min'}).reset_index()
# Number of e-mails whose first click falls in each month.
email_1st_click_bymonth = email_1st_click.groupby('file_month').agg({'Email':'count'}).reset_index()

# One row per month present in the click data, so months without new readers show up as 0.
all_months = click_data.file_month.unique()
placeholder = {'file_month': all_months, 'count': [0]*len(all_months)}
placeholder_df = (
    pd.DataFrame.from_dict(placeholder)
    .merge(email_1st_click_bymonth, how = 'left', on = 'file_month')
    .fillna(0)
    .sort_values(by = 'file_month')
)
# Drop the two earliest months.
# NOTE(review): presumably because everyone looks "new" at the start of the data - confirm.
placeholder_df = placeholder_df.iloc[2:]

make_click_data_scatter_plot(
    df = placeholder_df,
    x_colname = 'file_month',
    y_colname = 'Email',
    xlab = "month",
    ylab = "# Email",
    title = "number of new readers by month",
    legends = ['# new readers by month','trend line (5 month avg)','end of year'],
)

### avg click per person by month 

In [None]:
# Per (month, e-mail): total clicks and number of distinct URLs clicked.
nclicks_by_personh = click_data.groupby(['file_month','Email']).agg({'Url':['count','nunique']})
# flatten the (column, aggregate) pairs into 'Url_count' / 'Url_nunique'
nclicks_by_personh.columns = ["_".join(col) for col in nclicks_by_personh.columns]
nclicks_by_personh = nclicks_by_personh.reset_index()

# Average of both per-person figures within each month.
nclicks_by_person_by_month = (
    nclicks_by_personh
    .groupby('file_month')
    .agg({'Url_count':'mean','Url_nunique':'mean'})
    .reset_index()
)

# NOTE(review): only 'Url_count' is plotted, yet the legends name both "unique urls" and "clicks"
# and differ from the [series, trend line, end of year] pattern of the other click plots - confirm
# against make_click_data_scatter_plot before relying on the legend.
make_click_data_scatter_plot(
    df = nclicks_by_person_by_month,
    x_colname = 'file_month',
    y_colname = 'Url_count',
    xlab = "month",
    ylab = "count",
    title = "avg clicks per person change by month",
    legends = ['avg # unique urls/person','avg # clicks/person'],
)

### explore the campaign effectiveness on readers' click rate

In [None]:
def _clicks_per_email(row_mask):
    """Number of clicks (non-null 'Url') per Email over the click_data rows selected by `row_mask`."""
    return click_data[row_mask].groupby('Email').agg({'Url': 'count'})


click_month = click_data['file_month']

# clicks during each campaign (Dec is pinned to December 2017)
click_spring_camp = _clicks_per_email(click_month.dt.month.isin([5,6]))
click_fall_camp = _clicks_per_email(click_month.dt.month.isin([9,10]))
click_dec_camp = _clicks_per_email(click_month == datetime.datetime(2017,12,1))

# clicks in the months before each campaign
click_pre_spring_camp = _clicks_per_email(click_month.dt.month.isin([3,4]))
click_pre_fall_camp = _clicks_per_email(click_month.dt.month.isin([7,8]))
click_pre_dec_camp = _clicks_per_email(click_month == datetime.datetime(2017,11,1))

# clicks in the months after each campaign
click_post_spring_camp = _clicks_per_email(click_month.dt.month.isin([7,8]))
click_post_fall_camp = _clicks_per_email(click_month.dt.month.isin([11]))
click_post_dec_camp = _clicks_per_email(click_month == datetime.datetime(2018,1,1))



In [None]:
# Stack the per-campaign click tables into one frame with a period label and a campaign label.
# Order matters: the labels below are generated in the same order the groups are concatenated.
click_groups = [
    ('pre-campaign', 'Spring/Summer', click_pre_spring_camp),
    ('pre-campaign', 'Fall', click_pre_fall_camp),
    ('pre-campaign', 'Dec', click_pre_dec_camp),
    ('mid-campaign', 'Spring/Summer', click_spring_camp),
    ('mid-campaign', 'Fall', click_fall_camp),
    ('mid-campaign', 'Dec', click_dec_camp),
    ('post-campaign', 'Spring/Summer', click_post_spring_camp),
    ('post-campaign', 'Fall', click_post_fall_camp),
    ('post-campaign', 'Dec', click_post_dec_camp),
]

# One label per row of each group.
click_period_labels, click_campaign_labels = [], []
for period, campaign, group in click_groups:
    click_period_labels += [period] * group.shape[0]
    click_campaign_labels += [campaign] * group.shape[0]

click_combo_dic = {
    'period': click_period_labels,
    '# clicks': pd.concat([group for _, _, group in click_groups])['Url'],
    'campaign': click_campaign_labels,
}
click_combo_df = pd.DataFrame.from_dict(click_combo_dic)

In [None]:
# Total clicks for every (period, campaign) pair.
click_summary = (
    click_combo_df
    .groupby(['period', 'campaign'])
    .agg({'# clicks':'sum'})
    .reset_index()
    .rename(columns = {'# clicks':'Total Clicks'})
)

# Put the periods in chronological order (pre -> mid -> post) ...
period_order = ['pre-campaign', 'mid-campaign', 'post-campaign']
click_summary = pd.concat([click_summary[click_summary.period == period] for period in period_order])
# ... then group the rows by campaign. The sort must be stable (mergesort), otherwise the period
# order established above is not guaranteed to survive; the default quicksort makes no such promise.
click_summary = click_summary.sort_values('campaign', ascending = False, kind = 'mergesort')

In [None]:
# compare total clicks before, mid and after campaigns with boxplot
g = sns.catplot(x="campaign", y="Total Clicks", hue="period", data=click_summary,
                height=6,aspect=1.2, kind="bar", palette="muted")
g.despine(left=True)
g.set_ylabels("Total Clicks")
plt.title('Campaign Effectiveness on Total Newsletter Clicks', fontsize = 'x-large')


In [None]:
# compare # clicks per person before, mid and after campaigns with boxplot
testPlot = sns.catplot(x='campaign', y='# clicks',
                       hue='period', data=click_combo_df, palette="muted",
                       showfliers=False,
                       kind='box',height=8, aspect=1.5)

plt.title('Campaign Effectiveness on Newsletter Clicks per Person', fontsize = 'x-large')

print("Median values:")
print(click_combo_df.groupby(['period', 'campaign'])['# clicks'].median())


## 3.11 Survival Analysis of newsletter engagement
### Survival Analysis of how many months people clicked newsletters 
- only consider readers who started from 2017 and their 12-month activities

In [None]:
# find readers' earliest click month
click_data_min = click_data.groupby(['Email']).agg({'file_month':'min'})
click_data_min = click_data_min.reset_index()

# only keep readers' data who first clicked in 2017
clicked_2017 = click_data_min[click_data_min['file_month'] < datetime.datetime(2017,12,1)]['Email']

filtered_click_data = click_data[click_data['Email'].isin(clicked_2017)]

# create new data frame for these 2017 readers with their click data and their first month indicator
filtered_click_data2 = filtered_click_data.merge(click_data_min, left_on = 'Email', right_on = 'Email')
filtered_click_data2.rename(columns = {'file_month_x':'file_month','file_month_y':'first_month'},inplace=True)

In [None]:
# create a new column 'latest_month' that defined as the 12th month after the readers' first clicked month
filtered_click_data2['latest_month'] = [ add_months(x,11) for x in filtered_click_data2['first_month']]

filtered_click_data2['latest_month'] = pd.to_datetime(filtered_click_data2['latest_month'])

# only keep readers' click data within 12-month window
filtered_click_data2 = filtered_click_data2[filtered_click_data2['file_month'] <= filtered_click_data2['latest_month']]


In [None]:
# aggregate by email for its number of click month, first month and last month.
filtered_click_data2_agg = filtered_click_data2.groupby(['Email']).agg({'file_month':['nunique','max'],
                                                                        'latest_month':'max',
                                                                        'first_month':'max'})
filtered_click_data2_agg.columns = ["_".join(x) for x in filtered_click_data2_agg.columns.ravel()]
filtered_click_data2_agg = filtered_click_data2_agg.reset_index()

# calculate the readers' active month range and if they are still active by the end of 12-month window
filtered_click_data2_agg['month_range'] = [diff_month(x,y)+1 for x, y in zip(filtered_click_data2_agg['file_month_max'],filtered_click_data2_agg['first_month_max'])]
filtered_click_data2_agg['is_censored'] = [int(x) for x in (filtered_click_data2_agg['file_month_max'] == filtered_click_data2_agg['latest_month_max'])]

In [None]:
# Share of readers still active at the end of their 12-month window, then the month_range summary.
censored_pct = round(100*filtered_click_data2_agg.is_censored.mean(),2)
print('{}% emails were censored cuz they are active users'.format(censored_pct))
month_range_summary = filtered_click_data2_agg['month_range'].describe()
print(month_range_summary)

### Survival rate for each month's new readers
- used to explain why click rate is going down

In [None]:
%%time
# Box plot of the active month range, one box per cohort of readers grouped by first click month.
filtered_click_data2_agg['first_month'] = filtered_click_data2_agg['first_month_max'].map(
    lambda first: first.strftime('%Y-%m'))
# sort so the cohorts appear in chronological order on the x axis
filtered_click_data2_agg = filtered_click_data2_agg.sort_values('first_month')

g = sns.catplot(data=filtered_click_data2_agg,
                x='first_month', y='month_range',
                kind='box', showfliers=False,
                palette="muted",
                height=8, aspect=1.5)
g.set_ylabels("# active months within 12-month window")
g.set_xlabels("readers' first month")

plt.title("Survival rate for each month's new readers", fontsize = 'x-large')

### Survival Analysis for 2017's readers' click activity within a year

In [None]:
# survival probability plot for 2017 readers within a year
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
sns.set(style="white")
plt.rcParams['figure.figsize'] = [6,6]

durations = filtered_click_data2_agg['month_range']
event_observed = 1-filtered_click_data2_agg['is_censored']

## create a kmf object
kmf = KaplanMeierFitter() 

## Fit the data into the model
kmf.fit(durations, event_observed,label='2017 readers survival rate within a year')

a1 = kmf.plot(ci_show=True)
a1.grid()
a1.set_xticks(range(0,max(filtered_click_data2_agg['month_range'])+1))
a1.set_ylabel("probability")


### Compare survival rate of newsletter engagement between donors and subscribers 

In [None]:
%%time
# get the last two years' donors and subscribers
# create three groups:
# people who are both donors and subscribers, people who are donors but not subscribers, people who are subscribers but not donors
# take around 5min
def intersection(lst1, lst2): 
    """Return the values of `lst1` that also occur in `lst2`, keeping `lst1`'s order and duplicates.

    Membership is tested against a set built from `lst2`, so the cost is linear instead of
    len(lst1) * len(lst2). If `lst2` holds unhashable values, plain list membership is used.
    """
    try:
        lookup = set(lst2)
    except TypeError:
        # unhashable values: fall back to the original list lookup
        lookup = lst2
    return [value for value in lst1 if value in lookup]

def leftjoin(lst1, lst2): 
    """Return the values of `lst1` that do NOT occur in `lst2`, keeping `lst1`'s order and duplicates.

    Membership is tested against a set built from `lst2`, so the cost is linear instead of
    len(lst1) * len(lst2). If `lst2` holds unhashable values, plain list membership is used.
    """
    try:
        lookup = set(lst2)
    except TypeError:
        # unhashable values: fall back to the original list lookup
        lookup = lst2
    return [value for value in lst1 if value not in lookup]


# Two-year window covering all of 2017 and 2018: start inclusive, end exclusive.
# The previous strict bounds (> 2017-01-01 and < 2018-12-31) dropped orders entered on
# 2017-01-01 and during the whole of 2018-12-31.
period_start = datetime.datetime(2017,1,1)
period_end = datetime.datetime(2019,1,1)

# unique e-mails that donated within the window
donors = donation[(donation['ORD ENTR DT'] >= period_start) & (donation['ORD ENTR DT'] < period_end)]['EMAIL'].unique().tolist()

# unique e-mails that subscribed within the window
subscribers = subscription[(subscription['ORD ENTR DT'] >= period_start) & (subscription['ORD ENTR DT'] < period_end)]['EMAIL'].unique().tolist()

# both donor and subscriber / donor only / subscriber only
don_y_sub_y = intersection(donors,subscribers)
don_y_sub_n = leftjoin(donors,subscribers)
don_n_sub_y = leftjoin(subscribers,donors)

In [None]:
# Survival statistics and Kaplan-Meier curves for the four donor / subscriber cohorts.
from lifelines import KaplanMeierFitter


def _cohort_survival(in_cohort, header, median_label):
    """Print the censoring share and median active months of one cohort.

    `in_cohort` is a boolean mask over filtered_click_data2_agg. Returns the pair
    (durations, event_observed) in the form expected by KaplanMeierFitter.fit.
    """
    print("\n#### " + header + " ####")
    cohort = filtered_click_data2_agg[in_cohort]
    durations = cohort['month_range']
    # the event (reader stopped clicking) is observed for everyone who is not censored
    event_observed = 1 - cohort['is_censored']
    print('{}% emails were censored cuz they are active users'.format(round(100*(1-event_observed).mean(),2)))
    print(median_label + str(durations.median()) + " months")
    return durations, event_observed


reader_emails = filtered_click_data2_agg['Email']
is_donor = reader_emails.isin(donors)
is_subscriber = reader_emails.isin(subscribers)
# anyone who donated and/or subscribed within the two-year window
is_donor_or_subscriber = reader_emails.isin(don_n_sub_y + don_y_sub_n + don_y_sub_y)

durations_donors, event_observed_donors = _cohort_survival(
    is_donor, "donors", "donors' median active months:")

durations_nondonors, event_observed_nondonors = _cohort_survival(
    ~is_donor, "non-donors", "nondonors' median active months:")

durations_subscribers, event_observed_subscribers = _cohort_survival(
    is_subscriber, "subscribers", "subscribers' median active months:")

durations_nonsubscribers, event_observed_nonsubscribers = _cohort_survival(
    ~is_subscriber, "non-subscribers", "non-subscribers' median active months:")

durations_don_y_sub_y, event_observed_don_y_sub_y = _cohort_survival(
    reader_emails.isin(don_y_sub_y), "donors & subscribers",
    "People who are both donors and subscribers' median active months:")

durations_don_y_sub_n, event_observed_don_y_sub_n = _cohort_survival(
    reader_emails.isin(don_y_sub_n), "donors & non-subscribers",
    "People who are donors but not subscribers' median active months:")

durations_don_n_sub_y, event_observed_don_n_sub_y = _cohort_survival(
    reader_emails.isin(don_n_sub_y), "non-donors & subscribers",
    "People who are subscribers but not donors' median active months:")

# Readers who are NEITHER donors NOR subscribers: the mask must be negated. Without the `~` this
# cohort contained exactly the opposite group (everyone who donated or subscribed).
durations_don_n_sub_n, event_observed_don_n_sub_n = _cohort_survival(
    ~is_donor_or_subscriber, "non-donors & non subscribers",
    "People who are not subscribers and not donors' median active months:")


## create a kmf object and draw the first cohort; the other cohorts reuse its axes
kmf1 = KaplanMeierFitter() 
kmf1.fit(durations_don_y_sub_y, event_observed_don_y_sub_y, label='Donor & Subsriber')
a1 = kmf1.plot()
a1.set_xticks(range(0,max(filtered_click_data2_agg['month_range'])+1))
a1.set_ylabel("probability")

## refit the same estimator for the 2nd, 3rd and 4th cohorts and overlay each curve
remaining_cohorts = [
    (durations_don_y_sub_n, event_observed_don_y_sub_n, 'Donor & Non-subsriber'),
    (durations_don_n_sub_y, event_observed_don_n_sub_y, 'Non-donor & Subsriber'),
    (durations_don_n_sub_n, event_observed_don_n_sub_n, 'Non-donors & Non-subsriber'),
]
for cohort_durations, cohort_events, cohort_label in remaining_cohorts:
    kmf1.fit(cohort_durations, cohort_events, label=cohort_label)
    kmf1.plot(ax=a1)

a1.grid()



# 4 User Insights - User Behavior Comparison

## 4.1 What do our print subscribers engage in vs our online readers in the newsletters?

The newsletter click data ranges from 2018/09/19 to 2018/10/30
The magazine subscription data ranges from 2017-10-01 to 2018-11-09

Limit magazine subscribers to 2017/09/20 - 2018/10/30 to get the overlap between newsletter subscribers and magazine subscribers

In [None]:
# Build the per-user click summaries only when they do not exist yet in this session.
if 'nclicks_by_users' not in locals():

    # number of distinct urls and distinct topics clicked by each email
    unique_counts = {'Url': pd.Series.nunique,'topic':pd.Series.nunique}
    nclicks_by_users = click_data.groupby('Email').agg(unique_counts).reset_index()

    # wide Email x topic table; the column named '' (empty topic) is renamed to 'NA'
    nclicks_by_topics = (
        reshape_data_to_wide(click_data,
                             row = 'Email',
                             col = 'topic',
                             element = 'Url',
                             cal = 'sum')
        .reset_index()
        .rename(columns = {'':'NA'})
    )

# pre-computed click tables are always re-loaded from their pickle files
nclicks_by_various_topics = pd.read_pickle(nclicks_by_various_topics_filename)
nclicks_by_modeling_topics = pd.read_pickle(nclicks_by_modeling_topics_filename)

In [None]:
# Build an ordered dictionary of three online reader groups:
#   - online readers who are magazine subscribers
#   - online readers who are not magazine subscribers
#   - online readers who are recent magazine subscribers

min_date = datetime.date(2016, 1, 1)
min_date2 = datetime.date(2017, 1, 1)
max_date = datetime.date(2018, 11, 30)

order_date = subscription['ORD ENTR DT']
ordered_before_end = order_date < max_date

# emails with a magazine order strictly between min_date and max_date
mag_sub = subscription[(order_date > min_date) & ordered_before_end]['EMAIL'].unique().tolist()
# emails with a magazine order strictly between min_date2 and max_date
recent_mag_sub = subscription[(order_date > min_date2) & ordered_before_end]['EMAIL'].unique().tolist()
# every email that appears in the newsletter click data
news_sub = click_data['Email'].unique().tolist()

# per-user topic clicks joined with per-user url/topic counts, flagged with magazine subscription
mag_news_sub_all = combine_dat_sets(nclicks_by_various_topics, nclicks_by_users, 'Email','Email', join_method = 'left')
mag_news_sub_all['MJM'] = mag_news_sub_all['Email'].isin(mag_sub)

is_mag_subscriber = mag_news_sub_all['MJM'] == True
is_recent_subscriber = mag_news_sub_all['Email'].isin(recent_mag_sub)

online_reader_groups = OrderedDict([
    ('magazine subscribers', mag_news_sub_all[is_mag_subscriber]),
    ('non-magazine subscribers', mag_news_sub_all[mag_news_sub_all['MJM'] == False]),
    ('recent magazine subscribers', mag_news_sub_all[is_mag_subscriber & is_recent_subscriber]),
])

### a. how many online readers have subscribed the magazine?

In [None]:
# share of newsletter clickers that are flagged as magazine subscribers
n_online_readers = mag_news_sub_all.shape[0]
n_mag_subscribers = sum(mag_news_sub_all['MJM'])
pct_mag_subscribers = round(n_mag_subscribers/n_online_readers*100,2)

print('out of {} clicked the newsletter, {} have subscribed magazine within a year ({}%)'.format(
    n_online_readers, n_mag_subscribers, pct_mag_subscribers))


### b. what's the click rate difference between online readers who subscribed to the magazine within a year, who subscribed to the magazine during the newsletter campaign, and who didn't subscribe to the magazine?

In [None]:
grps = ['magazine subscribers','recent magazine subscribers','non-magazine subscribers']

# long format: one row per (group, reader) holding that reader's 'Url' count
online_reader_groups_click = {'group':[],'num_clicks': []}

for g in grps:
    group_members = online_reader_groups[g]
    online_reader_groups_click['group'] += [g]*len(group_members)
    online_reader_groups_click['num_clicks'] += list(group_members['Url'])

online_reader_groups_click_df = pd.DataFrame.from_dict(online_reader_groups_click)

In [None]:
# boxplot of the number of clicks per reader for the three reader groups
max_num_clicks = int(max(online_reader_groups_click_df['num_clicks']))
click_ticks = range(-10, max_num_clicks, 10)

plot_group_boxplot(df = online_reader_groups_click_df,
                   x_colname = 'group',
                   y_colname = 'num_clicks',
                   xlab = "",
                   ylab = "# clicks",
                   xticklab = ['subscribers','recent subscribers','non-subscribers'],
                   yticklab = click_ticks,
                   title = "newsletter click rate: print subsribers vs non-subscribers")

In [None]:
# Compare # urls and # topics clicked by each online reader group.
# The groups here are the magazine-subscription groups held in online_reader_groups,
# not donor groups, so the printed captions name them accordingly.

online_reader_groups_click_summary = [
    check_click_distribution(df = dat, groupname = grpname)
    for grpname, dat in online_reader_groups.items()
]

print("compare # urls clicked by each online reader group ")
display(pd.concat(online_reader_groups_click_summary, axis=1))

online_reader_groups_topics_summary = [
    check_topics_distribution(df = dat, groupname = grpname)
    for grpname, dat in online_reader_groups.items()
]

print("compare # topics clicked by each online reader group ")
display(pd.concat(online_reader_groups_topics_summary, axis=1))

### c. what's the difference in the topics of interest between online readers who subscribed to the magazine within a year, who subscribed to the magazine during the newsletter campaign, and who didn't subscribe to the magazine?

### heatmap subscriber vs non-subscriber using MOJO topics

In [None]:
# compare topic click rate
topics = ['about','crime-justice', 'environment', 'food', 'kevin-drum',
       'media', 'mixed-media','politics']
# The group order defines the column order of `combo`, and it must line up with the
# x tick labels passed to the heatmap below (subscribers, recent subscribers,
# non-subscribers). The previous order put 'non-magazine subscribers' second, so two
# columns were shown under the wrong label.
comparison_groups = ['magazine subscribers',
                     'recent magazine subscribers',
                     'non-magazine subscribers'
                    ]
combo = pd.concat([sub_df_colsum(df = online_reader_groups[k],cols = topics, name = k) for k in comparison_groups],axis=1)

# convert to click percentage (relative to each group's total in the 'All' column)
for g in comparison_groups:
    combo[g] = round(combo[g]/sum(online_reader_groups[g]['All'])*100,2)

print('compare magazine subscribers, recent magazine subscribers and non subscribers of the topics they clicked')
print('all the numbers are the percentage of urls containing the certain topics')
print(combo)

plot_topic_heatmap(df = combo, xlab = "", ylab ="# clicks",
                   xticklab = ['subscribers','recent subscribers','non-subscribers'],
                   yticklab = [x.replace('LDA_','') for x in combo.index],
                   title = "% of clicks from topics",
                   fig_width = 14, fig_height = 8)

In [None]:

# calculate the overall topic click rate across all the readers
overal_dist =sub_df_colsum(df = mag_news_sub_all,cols = topics, name = 'all readers')/sum(mag_news_sub_all['All'])

# Fix the column order explicitly so it always lines up with the x tick labels
# below, whatever order `combo` was built in.
heatmap_group_order = ['magazine subscribers','recent magazine subscribers','non-magazine subscribers']
combo_index = combo[heatmap_group_order].copy()

# calculate the ratio between the topic click rate of the given group and the overall topic click rate across all the readers
# then the ratio minus 100% is how much more or less likely the group is to click a topic than average
for g in heatmap_group_order:
    combo_index[g] = 100*(combo[g]/100/overal_dist - 1)


plot_topic_heatmap(df = combo_index, xlab = "", ylab ="# clicks",
                   xticklab = ['subscribers','recent subscribers','non-subscribers'],
                   yticklab = combo.index,
                   title = "\n".join(wrap('How much more likely does one group click a certain topic than average online readers? (%)',40)),
                   fig_width = 14, fig_height = 8)
        


### define LDA topics for heatmaps

In [None]:
# LDA topic column names: every column of the LDA feature file except the first,
# each prefixed with 'LDA_'
lda_topics = (
    pd.read_csv(lda_feature_filename, encoding = "ISO-8859-1")
    .add_prefix('LDA_')
    .columns[1:]
)


### heatmap LDA topic subscribers vs non-subscribers

In [None]:
# LDA topic click share for the three online reader groups
comparison_groups = ['magazine subscribers',
                     'non-magazine subscribers',
                     'recent magazine subscribers'
                    ]
combo = pd.concat([sub_df_colsum(df = online_reader_groups[k],cols = lda_topics, name = k) for k in comparison_groups],axis=1)

# turn each group's column into a percentage of that group's total over all LDA topic columns
for g in comparison_groups:
    group_lda_total = sum(online_reader_groups[g][lda_topics].sum(axis =0))
    combo[g] = round(combo[g]/group_lda_total*100,2)

print('compare magazine subscribers, recent magazine subscribers and non subscribers of the topics they clicked')
print('all the numbers are the percentage of urls containing the certain topics')

# sort topics by name and put the columns in the same order as the x tick labels below
heatmap_columns = ['magazine subscribers','recent magazine subscribers','non-magazine subscribers']
combo = combo.sort_index()[heatmap_columns]

plot_topic_heatmap(df = combo,
                   xlab = "",
                   ylab ="# clicks",
                   xticklab = ['subscribers','recent subscribers','non-subscribers'],
                   yticklab = [x.replace('LDA_','') for x in combo.index],
                   title = "% of clicks from LDA topics",
                   fig_width = 14,
                   fig_height = 10)

In [None]:
# overall LDA topic click share across all online readers
overal_dist =sub_df_colsum(df = mag_news_sub_all,cols = lda_topics, name = 'all readers')/sum(mag_news_sub_all[lda_topics].sum(axis =0))
combo_index = combo.copy()

# index = (group share / overall share - 1) * 100, i.e. how much more or less likely
# the group is to click a topic than the average online reader
for g in comparison_groups:
    group_share = combo[g]/100
    combo_index[g] = 100*(group_share/overal_dist - 1)

index_title = "\n".join(wrap('How much more likely does one group click a certain topic than average online readers? (%)',40))
plot_topic_heatmap(df = combo_index,
                   xlab = "",
                   ylab ="# clicks",
                   xticklab = ['subscribers','recent subscribers','non-subscribers'],
                   yticklab = [x.replace('LDA_','') for x in combo_index.index],
                   title = index_title,
                   fig_width = 14,
                   fig_height = 10)
        


### d. what's the difference in the most popular titles between online readers who subscribed to the magazine within a year, who subscribed to the magazine during the newsletter campaign, and who didn't subscribe to the magazine?

In [None]:
# most popular titles among online readers flagged as magazine subscribers
mag_subscriber_emails = online_reader_groups['magazine subscribers']['Email']
print("Top 10 clicks from online readers who subcribed the magazines within a year")
find_most_popular_titles(df = click_data, emails = mag_subscriber_emails)

In [None]:
# most popular titles among online readers not flagged as magazine subscribers
non_subscriber_emails = online_reader_groups['non-magazine subscribers']['Email']
print("Top 10 clicks from online readers who didn't subcribe the magazines")
find_most_popular_titles(df = click_data, emails = non_subscriber_emails)

In [None]:
# most popular titles among online readers in the recent magazine subscriber group
recent_subscriber_emails = online_reader_groups['recent magazine subscribers']['Email']
print("Top 10 clicks from online readers who subscribed the magazines during the newsletter campaign")
find_most_popular_titles(df = click_data, emails = recent_subscriber_emails)

## 4.2  What differences and similarities can we find between loyal subscribers and new donors?

Assume: 
- loyal subscribers are people who subscribed magazines two years in a row (assume people renew within a month before the subscription expires)
- new donors are people who first donated during the campaigns.

### a.how many loyal subscribers and new donors clicked newsletter? how much did they donate?

In [None]:
# Build the two groups compared in this section: loyal subscribers and new donors.

# Loyal subscribers: rows whose 'subs_range' exceeds 365-60.
# NOTE(review): the stated assumption is renewal within a month of expiry, but the
# threshold subtracts 60 -- confirm which one is intended.
min_subs_range = 365-60
loyal_sub = sub_don_click_combined[sub_don_click_combined['subs_range'] > min_subs_range]

# New donors: emails in the donation data with no donation entered before min_date.
min_date = datetime.date(2017, 1, 1)
max_date = datetime.date(2018, 11, 30)
donated_before_min_date = donation['ORD ENTR DT'] < min_date
old_donors_emails = donation[donated_before_min_date]['EMAIL'].unique().tolist()
new_donors_emails = donation['EMAIL'][~donation['EMAIL'].isin(old_donors_emails)]

# sub_don_click_combined is matched to the new-donor emails through its index
new_donors = sub_don_click_combined[sub_don_click_combined.index.isin(new_donors_emails)]

interested_groups = OrderedDict([
    ('loyal subscribers', loyal_sub),
    ('new donors', new_donors),
])

In [None]:
# share of loyal subscribers who clicked, share who donated, and the mean total
# donation among those who donated
loyal_subscribers = interested_groups['loyal subscribers']
loyal_who_donated = loyal_subscribers[loyal_subscribers['don_freq'] > 0]

pct_loyal_clicked = round(100*((loyal_subscribers['Url'] > 0).mean()),2)
pct_loyal_donated = round(100*((loyal_subscribers['don_freq'] > 0).mean()),2)
avg_loyal_donation = round(loyal_who_donated['don_total'].mean(),2)

print("{} % loyal subscribers clicked and {}% donated ${} on average per person".format(
    pct_loyal_clicked, pct_loyal_donated, avg_loyal_donation))

In [None]:
# share of new donors who clicked and their mean total donation
new_donor_rows = interested_groups['new donors']
pct_new_donors_clicked = round(100*((new_donor_rows['Url'] > 0).mean()),2)
avg_new_donor_donation = round(new_donor_rows['don_total'].mean(),2)

print("{} % new donors clicked and donated ${} on average per person".format(
    pct_new_donors_clicked, avg_new_donor_donation))

### b. compare the number of urls clicked by loyal subscribers and new donors (page 27)

In [None]:
grps = ['loyal subscribers','new donors']

# long format: one row per (group, person) holding that person's 'Url' count
interested_groups_click = {'group':[],'num_clicks': []}

for g in grps:
    group_members = interested_groups[g]
    interested_groups_click['group'].extend([g]*len(group_members))
    interested_groups_click['num_clicks'].extend(group_members['Url'])
interested_groups_click_df = pd.DataFrame.from_dict(interested_groups_click)
# keep only people with a non-zero click count
interested_groups_click_df = interested_groups_click_df[interested_groups_click_df['num_clicks'] != 0]

# The y ticks are derived from the frame being plotted. They were previously taken
# from online_reader_groups_click_df, the frame of the section 4.1 boxplot, so the
# axis range did not follow this plot's data and the cell silently depended on 4.1.
max_num_clicks = int(max(interested_groups_click_df['num_clicks']))

plot_group_boxplot(x_colname = 'group', y_colname = 'num_clicks', df = interested_groups_click_df,
                      xlab = "", ylab = "# clicks",
                      xticklab = ['loyal subscribers','new donors'],
                      yticklab = range(-10,max_num_clicks,10),
                      title = "newsletter click rate: loyal subscribers vs new donors")



In [None]:
# url-count distribution per group, one column per group
interested_groups_click_summary = [
    check_click_distribution(df = dat, groupname = grpname)
    for grpname, dat in interested_groups.items()
]

print("compare # urls clicked by loyal subscribers vs new donors ")
display(pd.concat(interested_groups_click_summary, axis=1))

# topic-count distribution per group, one column per group
interested_groups_topics_summary = [
    check_topics_distribution(df = dat, groupname = grpname)
    for grpname, dat in interested_groups.items()
]

print("compare # topics clicked by loyal subscribers vs new donors")
display(pd.concat(interested_groups_topics_summary, axis=1))

### c. compare the topics clicked by loyal subscribers and new donors

### Heatmap with Mojo Topics: loyal subscribers vs new donors

In [None]:
# MoJo topic click share: loyal subscribers vs new donors
topics = ['about','crime-justice', 'environment', 'food', 'kevin-drum',
       'media', 'mixed-media','politics']
comparison_groups = ['loyal subscribers','new donors']

group_topic_sums = [sub_df_colsum(df = interested_groups[k],cols = topics, name = k) for k in comparison_groups]
combo = pd.concat(group_topic_sums, axis=1)

# convert to click percentage (relative to each group's total in the 'All' column)
for g in comparison_groups:
    group_total = sum(interested_groups[g]['All'])
    combo[g] = round(combo[g]/group_total*100,2)

print(combo)

plot_topic_heatmap(df = combo,
                   xlab = "",
                   ylab ="# clicks",
                   xticklab = combo.columns,
                   yticklab = combo.index,
                   title = "% of clicks from topics",
                   fig_width = 14,
                   fig_height = 10)
        


In [None]:
# overall MoJo topic click share across all online readers (the baseline)
overal_dist =sub_df_colsum(df = mag_news_sub_all,cols = topics, name = 'all readers')/sum(mag_news_sub_all['All'])
combo_index = combo.copy()

# index = (group share / overall share - 1) * 100
for g in comparison_groups:
    group_share = combo[g]/100
    combo_index[g] = 100*(group_share/overal_dist - 1)

index_title = "\n".join(wrap('How much more likely does one group click a certain topic than average online readers? (%)',40))
plot_topic_heatmap(df = combo_index,
                   xlab = "",
                   ylab ="# clicks",
                   xticklab = combo_index.columns,
                   yticklab = combo.index,
                   title = index_title,
                   fig_width = 14,
                   fig_height = 10)



### heatmap LDA topic: loyal subscribers vs new donors

In [None]:
# LDA topic click share: loyal subscribers vs new donors
comparison_groups = ['loyal subscribers','new donors']
group_lda_sums = [sub_df_colsum(df = interested_groups[k],cols = lda_topics, name = k) for k in comparison_groups]
combo = pd.concat(group_lda_sums, axis=1)

# turn each group's column into a percentage of that group's total over all LDA topic columns
for g in comparison_groups:
    group_lda_total = sum(interested_groups[g][lda_topics].sum(axis =0))
    combo[g] = round(combo[g]/group_lda_total*100,2)

print('all the numbers are the percentage of urls containing the certain topics')
combo = combo.sort_index()

plot_topic_heatmap(df = combo,
                   xlab = "",
                   ylab ="# clicks",
                   xticklab = combo.columns,
                   yticklab = [x.replace('LDA_','') for x in combo.index],
                   title ="% of clicks from LDA topics",
                   fig_width = 14,
                   fig_height = 10)



In [None]:
# overall LDA topic click share across all online readers (the baseline)
overal_dist =sub_df_colsum(df = mag_news_sub_all,cols = lda_topics, name = 'all readers')/sum(mag_news_sub_all[lda_topics].sum(axis =0))
combo_index = combo.copy()

# index = (group share / overall share - 1) * 100
for g in comparison_groups:
    group_share = combo[g]/100
    combo_index[g] = 100*(group_share/overal_dist - 1)

index_title = "\n".join(wrap('How much more likely does one group click a certain topic than average online readers? (%)',40))
plot_topic_heatmap(df = combo_index,
                   xlab = "",
                   ylab ="# clicks",
                   xticklab = combo_index.columns,
                   yticklab = [x.replace('LDA_','') for x in combo_index.index],
                   title = index_title,
                   fig_width = 14,
                   fig_height = 10)
        
 

In [None]:
# most popular titles among loyal subscribers (emails are taken from the group's index)
loyal_subscriber_emails = interested_groups['loyal subscribers'].index
print("Top 10 clicks from loyal subscribers")
find_most_popular_titles(df = click_data, emails = loyal_subscriber_emails)

In [None]:
# most popular titles among new donors (emails are taken from the group's index)
new_donor_emails = interested_groups['new donors'].index
print("Top 10 clicks from new donors")
find_most_popular_titles(df = click_data, emails = new_donor_emails)

## 4.3 Compare six different groups of donors and subscribers
only have oct campaign data. too little to analyze the factors that lead to donation or subscription

## Presentation summary

### definition of the six groups
- donation:
    - regular donor
    - high-value donor
    - non donor
- subscription
    - print only
    - online only
    - both
    
- compare the six groups:
    - % print sub
    - % online click
    - avg urls
    - avg topics
    - avg donation
    - topic index vs avg online reader
    - top 10 articles vs avg online reader

In [None]:
# Outer-join the subscription & donation data with the per-user click counts,
# then outer-join the result with the per-user topic clicks; index the result by Email.
sub_don_click_combined_temp = combine_dat_sets(
    combined, nclicks_by_users, 'Email', 'Email', join_method = 'outer')

all_dat = combine_dat_sets(
    sub_don_click_combined_temp, nclicks_by_various_topics, 'Email', 'Email', join_method = 'outer'
).set_index('Email')

In [None]:
# Build the groups compared in section 4.3 from the combined data.
# This rebinds `interested_groups`, replacing the two-group dictionary from section 4.2.
interested_groups = define_interested_groups(df = all_dat)

### compare magazine subscription rate and newsletter click rate by each donor group (page 32)

In [None]:
# one record per group, as returned by check_subscription_rate
subscription_rate_summary = [
    check_subscription_rate(df = dat, groupname = grpname)
    for grpname, dat in interested_groups.items()
]

# positional record fields -> readable column names
rate_column_names = {0: 'group name',
                     1: 'magazine subscription rate',
                     2: 'newsletter click rate'}
subscription_rate_summary_df = pd.DataFrame.from_records(subscription_rate_summary).rename(columns = rate_column_names)

print("compare magazine subscription rate and newsletter click rate by each donor group ")
# only the rows at positions 0, 1, 2 and 6 are shown
subscription_rate_summary_df.iloc[[0,1,2,6]]

### compare avg donations by each donor group

In [None]:
# boxplot of total donations per person for each group
groups_donations = {'group':[],'don_total': []}

for g, dat in interested_groups.items():
    groups_donations['group'] += [g]*dat.shape[0]
    groups_donations['don_total'] += list(dat['don_total'])

groups_donations_df = pd.DataFrame.from_dict(groups_donations)

# drop zero donations and leave out the high-value donor group
has_donation = groups_donations_df['don_total'] != 0
not_highvalue = groups_donations_df['group'] != 'highvalue donors (>= $500)'
groups_donations_df = groups_donations_df[has_donation & not_highvalue]

plot_group_boxplot(df = groups_donations_df,
                   x_colname = 'don_total',
                   y_colname = 'group',
                   xlab = "donation amount",
                   ylab = "",
                   xticklab = [],
                   yticklab = groups_donations_df['group'].unique(),
                   title = "donation comparision",
                   orient = 'h')




### compare avg clicks from each group

In [None]:
# url-count distribution per group, one column per group
click_summary = [
    check_click_distribution(df = dat, groupname = grpname)
    for grpname, dat in interested_groups.items()
]

print("compare # urls clicked by each donor group ")
pd.concat(click_summary, axis=1)


In [None]:
# boxplot of the 'Url' count per person for each group
groups_clicks = {'group':[],'Url': []}

for g, dat in interested_groups.items():
    groups_clicks['group'] += [g]*dat.shape[0]
    groups_clicks['Url'] += list(dat['Url'])

groups_clicks_df = pd.DataFrame.from_dict(groups_clicks)
# keep only rows with a non-zero click count
groups_clicks_df = groups_clicks_df[groups_clicks_df['Url'] != 0]

plot_group_boxplot(df = groups_clicks_df,
                   x_colname = 'Url',
                   y_colname = 'group',
                   xlab = "# clicks",
                   ylab = "",
                   xticklab = [],
                   yticklab = groups_clicks_df['group'].unique(),
                   title = "newsletter click rate comparision",
                   orient = 'h')



### compare avg number of topics by each donor group

In [None]:
# topic-count distribution per group, one column per group
topics_summary = [
    check_topics_distribution(df = dat, groupname = grpname)
    for grpname, dat in interested_groups.items()
]

print("compare # topics clicked by each donor group ")
display(pd.concat(topics_summary, axis=1))

### compare avg donation by each donor group

In [None]:
def check_avg_donor_rate(df, groupname):
    """Return (groupname, typical donation) for one group.

    The typical donation is the median of the strictly positive values in
    df['don_total'], rounded to 2 decimals. Note that it is a median, not a
    mean, despite the "avg" in the name. When the group has no positive
    donation the value is 0.
    """
    positive_donations = df['don_total'][df['don_total'] > 0]

    if len(positive_donations) == 0:
        return (groupname, 0)

    return (groupname, round(positive_donations.median(), 2))
                    
# one (group name, typical donation) record per group
avg_donation_summary = [
    check_avg_donor_rate(df = dat, groupname = grpname)
    for grpname, dat in interested_groups.items()
]

# positional record fields -> readable column names
donation_column_names = {0: 'group name',
                         1: 'avg donation'}
avg_donor_summary_df = pd.DataFrame.from_records(avg_donation_summary).rename(columns = donation_column_names)

print("compare average donation amounts by each donor group ")
avg_donor_summary_df

### compare topic click rate and index by each donor group

In [None]:
# MoJo topic click share and index for every group in interested_groups
topics = ['about','crime-justice', 'environment', 'food', 'kevin-drum',
       'media', 'mixed-media','politics']
comparison_groups = interested_groups.keys()
group_topic_sums = [sub_df_colsum(df = interested_groups[k],cols = topics, name = k) for k in comparison_groups]
combo = pd.concat(group_topic_sums, axis=1)

# convert to click percentage (relative to each group's total in the 'All' column)
for g in comparison_groups:
    group_total = sum(interested_groups[g]['All'])
    combo[g] = round(combo[g]/group_total*100,2)

# index: how much more or less likely a group is to click a topic than the
# 'all online readers' column, in percent
baseline_share = combo['all online readers']
combo_index = combo.copy()
for c in combo_index.columns:
    combo_index[c] = round((combo[c]/baseline_share - 1)*100,2)

# the print-only group is dropped from both tables; the baseline column is dropped from the index
combo_index = combo_index.drop(columns=['print subscriber only','all online readers'])
combo = combo.drop(columns=['print subscriber only'])

### Heatmap to compare groups’ interests MoJo topics

In [None]:
# heatmap of the MoJo topic index (first six columns at most) with short group labels
index_title = "\n".join(wrap('How much more likely does one group click a certain topic than average online readers? (%)',40))
short_group_labels = ['Reg Donors','High Donors','Non Donors','Online Only','Print & Online']

plot_topic_heatmap(df = combo_index.iloc[:,:6],
                   xlab = "",
                   ylab ='MoJo topics',
                   xticklab = short_group_labels,
                   yticklab = combo.index,
                   title = index_title,
                   fig_width = 14,
                   fig_height = 10)
     


In [None]:
# heatmap of the MoJo topic click share, including the 'all online readers' column
share_title = "\n".join(wrap('percentage of clicks from each topic (%)',40))
short_group_labels = ['Reg Donors','High Donors','Non Donors','Online Only','Print & Online','All Online']

plot_topic_heatmap(df = combo,
                   xlab = "",
                   ylab ='MoJo topics',
                   xticklab = short_group_labels,
                   yticklab = combo.index,
                   title = share_title,
                   fig_width = 14,
                   fig_height = 10)


### Heatmap to compare six groups’ interests LDA topics

In [None]:
# LDA topic click share for every group in interested_groups
comparison_groups = interested_groups.keys()
group_lda_sums = [sub_df_colsum(df = interested_groups[k],cols = lda_topics, name = k) for k in comparison_groups]
combo = pd.concat(group_lda_sums, axis=1)

# turn each group's column into a percentage of that group's total over all LDA topic columns
for g in comparison_groups:
    group_lda_total = sum(interested_groups[g][lda_topics].sum(axis =0))
    combo[g] = round(combo[g]/group_lda_total*100,2)

print('all the numbers are the percentage of urls containing the certain topics')
# sort topics by name and leave out the print-only group
combo = combo.sort_index().drop(columns=['print subscriber only'])

plot_topic_heatmap(df = combo,
                   xlab = "",
                   ylab ='LDA topics',
                   xticklab = combo.columns,
                   yticklab = [x.replace('LDA_','') for x in combo.index],
                   title ="% of clicks from LDA topics",
                   fig_width = 22,
                   fig_height = 10)


In [None]:
# LDA topic index for every group in interested_groups
comparison_groups = interested_groups.keys()
group_lda_sums = [sub_df_colsum(df = interested_groups[k],cols = lda_topics, name = k) for k in comparison_groups]
combo = pd.concat(group_lda_sums, axis=1)

# turn each group's column into a percentage of that group's total over all LDA topic columns
for g in comparison_groups:
    group_lda_total = sum(interested_groups[g][lda_topics].sum(axis=0))
    combo[g] = round(combo[g]/group_lda_total*100,2)

# index: how much more or less likely a group is to click a topic than the
# 'all online readers' column, in percent
baseline_share = combo['all online readers']
combo_index = combo.copy()
for c in combo_index.columns:
    combo_index[c] = round((combo[c]/baseline_share - 1)*100,2)

combo_index = combo_index.drop(columns=['print subscriber only','all online readers'])
combo = combo.drop(columns=['print subscriber only'])

index_title = "\n".join(wrap('How much more likely does one group click a certain topic than average online readers? (%)',40))
plot_topic_heatmap(df = combo_index,
                   xlab = "",
                   ylab ='LDA topics',
                   xticklab = combo_index.columns,
                   yticklab = [x.replace('LDA_','') for x in combo_index.index],
                   title = index_title,
                   fig_width = 22,
                   fig_height = 10)



### compare top clicked posts from the six groups

In [None]:
# most popular titles per group; only the title and email percentage columns are shown
print("Top 10 clicks from the six groups")
for k, group_rows in interested_groups.items():
    print(' #### {}'.format(k))
    group_top_titles = find_most_popular_titles(df = click_data, emails = group_rows.index)
    display(group_top_titles[['title','email_pct']])

## 4.4 engagement difference between different donation sources (page 40)

### load source code lookup data

In [None]:
# source-code lookup table; keep_default_na=False keeps empty cells as '' instead of NaN
src_lookup_path = os.path.join(data_folder,'ICN FULFILLMENT SOURCE LIST.csv')
src_p1_lookup = pd.read_csv(src_lookup_path, keep_default_na=False)

### Add source name to donation data
- find the names for the first and second letter of source code
- create a source-name column that is the combination of the first letter and its name
- filter and keep donation source name only

In [None]:
# function to return the first letter or the first 4 letters (SUST) of source codes
def return_position1(x):
    """Return the leading key of a donation source code.

    The value is converted with str() first. Codes starting with 'S' yield
    their first four characters (e.g. 'SUST'); every other code yields its
    first character. An empty code yields '' instead of raising IndexError.
    """
    code = str(x)
    if code.startswith('S'):
        return code[:4]
    # slicing (rather than indexing) keeps an empty code from raising
    return code[:1]
    
# name lookup based on the first two letters combo of source code    
def return_position2(x):
    """Look up a source name from the leading characters of a source code.

    The value is converted with str() first.
    - Codes starting with '4' are resolved from the following one or two
      characters; two-character keys ('LC', 'TS', 'PD') take precedence over
      the one-character keys.
    - Codes starting with '7' are resolved from the second character.
    - For both families an unmatched or missing second character gives 'unknown'.
    - Any other code, including the empty string, gives ''.

    Codes shorter than the characters inspected no longer raise IndexError.
    """
    y = str(x)
    # slices never raise on short strings, unlike y[0] / y[1]
    first, second, pair = y[:1], y[1:2], y[1:3]

    if first == '4':
        two_char_names = {
            'LC': 'Sub Conversion',
            'TS': 'Telemarketing Sustainer Invite',
            'PD': 'Planned Giving',
        }
        one_char_names = {
            'A': 'Appeals',
            'R': 'Renewals',
            # NOTE(review): the leading space is kept as in the original value -- confirm whether it is intended
            'L': ' Reinstatement',
            'B': 'Sub Inserts',
            'N': 'Gift Inserts',
            'U': 'Newsstand Inserts',
        }
        if pair in two_char_names:
            return two_char_names[pair]
        return one_char_names.get(second, 'unknown')

    if first == '7':
        # both original letter lists mapped to the same name, so they are merged here
        if second in ('9', 'E', 'G', 'H', 'I', 'Z', 'P', 'F', 'Q', 'J', 'K'):
            return 'ICN Web Forms'
        if second == 'C':
            return 'Convio Web Forms'
        if second == 'D':
            return 'Digital Edition'
        return 'unknown'

    return ''
    

# Add a source name to the donation data based on the leading key of each donation source code.
donation2 = donation.copy()
donation2['position1'] = [return_position1(x)  for x in donation2['ORD LST SRC']]

# Left-join the lookup table on the key; rows without a match get 'unknown'.
# NOTE(review): fillna('unknown') is applied to every column of the merged frame,
# not only to the lookup columns -- confirm this is intended.
donation2 = donation2.merge(src_p1_lookup, left_on = 'position1', right_on ='position1', how = 'left' ).fillna('unknown')

# Keep donation categories only. The explicit .copy() makes the filtered frame
# independent, so the column assignments below do not write to a slice of the
# merged frame (which triggers pandas' SettingWithCopyWarning).
donation2 = donation2[donation2['category'].isin(['FUNDRAISING DONATIONS','DONATIONS','unknown'])].copy()

# shorten two long source names, then build the '<key>-<name>' label
donation2['name'] = [x.replace('Web Donations, E-Campaigns','Web Donations').replace('Membership Direct Mail Fundraising Promotions','Direct Mail Fundraising') for x in donation2['name']]
donation2['source_name'] = donation2['position1'] + '-' + donation2['name']

### count the total donation of each source and select the sources that account for at least 0.1% of total donations

In [None]:
# total amount paid per source, with each source's share of the grand total
amount_by_source = donation2.groupby('source_name').agg({'AMT PAID':sum,})
amount_by_source['AMT PCT'] = amount_by_source['AMT PAID']/sum(amount_by_source['AMT PAID'])

# keep sources above 0.1% of the total, then the 10 largest by amount
above_threshold = amount_by_source['AMT PCT'] > 0.001
don_by_src = amount_by_source[above_threshold].sort_values('AMT PAID',ascending=False).head(n=10)

print("top donation sources")
print(don_by_src)

## summarize the click rate of the top donation sources


In [None]:
%%time
# boxplot of the click counts of donors from the top donation sources
# take around 1min
unique_counts = {'Url': pd.Series.nunique,'topic':pd.Series.nunique}
nclicks_by_users = click_data.groupby('Email').agg(unique_counts).reset_index()

# donations whose source is one of the top sources, joined with the donor's click counts
from_top_source = donation2['source_name'].isin(don_by_src.index)
top_src_emails = donation2[from_top_source]
top_src_emails_click = top_src_emails.merge(nclicks_by_users, left_on = 'EMAIL',right_on = 'Email')

plot_group_boxplot(df = top_src_emails_click,
                   x_colname = 'Url',
                   y_colname = 'source_name',
                   xlab = "# clicks",
                   ylab = "",
                   xticklab = [],
                   yticklab = top_src_emails_click['source_name'].unique(),
                   title = "newsletter click rate comparision (Top donation sources)",
                   orient="h")