In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag, ConditionalFreqDist
from collections import Counter
import numpy as np
import re
from itertools import chain

from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Load the raw campaign metadata and the train/test event logs.
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated literals only resolved on Windows.
# parse_dates=True merely asks pandas to parse the index (a no-op for the default
# integer index), so the timestamp column is named explicitly instead.
campaign_df = pd.read_csv('../data/raw/campaign_data.csv')
train_df = pd.read_csv('../data/raw/train.csv', parse_dates=['send_date'])
test_df = pd.read_csv('../data/raw/test.csv', parse_dates=['send_date'])

In [5]:
# Independent copy of the structural (non-text) campaign columns; the text-derived
# features built below are appended to this frame.
campaign_df_copy = campaign_df.loc[
    :,
    [
        'campaign_id',
        'communication_type',
        'total_links',
        'no_of_internal_links',
        'no_of_images',
        'no_of_sections',
    ],
].copy()

#### Transforming text data into features

In [6]:
# English stop words used by cleanData; kept in a set for fast membership tests.
stops = {word for word in stopwords.words("english")}
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    """Strip punctuation from a text value and optionally normalise its tokens.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed and newlines become spaces. The optional steps then run in the
    order lowercase -> stop-word removal -> stemming.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()``. ``None`` and float NaN
        are treated as missing.
    lowercase : bool
        Lower-case every token.
    remove_stops : bool
        Drop tokens found in the module-level ``stops`` set.
    stemming : bool
        Replace every token by its Porter stem.

    Returns
    -------
    str
        The cleaned text. When any option is enabled the tokens are re-joined
        with single spaces; otherwise the original spacing is kept. Missing
        input yields an empty string.
    """
    # Missing values would otherwise be stringified to the literal tokens
    # "None" / "nan" and could end up in the bag-of-words vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    txt = re.sub(r'[^A-Za-z0-9\s]', r'', str(text))
    txt = re.sub(r'\n', r' ', txt)

    if not (lowercase or remove_stops or stemming):
        return txt

    # Tokenise once instead of joining and re-splitting after every step.
    words = txt.split()

    if lowercase:
        words = [w.lower() for w in words]

    if remove_stops:
        # Compare case-insensitively so that capitalised stop words such as
        # "The" are also removed when lowercase=False.
        words = [w for w in words if w.lower() not in stops]

    if stemming:
        st = PorterStemmer()
        words = [st.stem(w) for w in words]

    return " ".join(words)

In [7]:
# Subject lines: lower-case, remove stop words and stem before vectorising.
clean_subs = campaign_df['subject'].map(
    lambda subject: cleanData(subject, lowercase=True, remove_stops=True, stemming=True)
)

In [8]:
# Subject-line vectorizers. A relatively high minimum document frequency keeps only
# terms shared by several campaigns; each vocabulary is capped at 50 features.
countvec_1 = CountVectorizer(  # single words
    analyzer='word',
    ngram_range=(1, 1),
    min_df=4,
    max_features=50,
)
countvec_2 = CountVectorizer(  # word pairs
    analyzer='word',
    ngram_range=(2, 2),
    min_df=3,
    max_features=50,
)

In [9]:
# Fit each vocabulary on the cleaned subjects and count its terms per campaign
# (unigram matrix first, bigram matrix second).
bow_1, bow_2 = [
    vectorizer.fit_transform(clean_subs)
    for vectorizer in (countvec_1, countvec_2)
]

In [10]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2. Prefer get_feature_names_out() and fall back to the old accessor only when
# the installed version does not provide it.
subject_terms_1 = getattr(countvec_1, "get_feature_names_out", None) or countvec_1.get_feature_names
subject_terms_2 = getattr(countvec_2, "get_feature_names_out", None) or countvec_2.get_feature_names

# The "sub - " prefix marks these columns as subject-line terms.
modified_features_1 = ["sub - " + s for s in subject_terms_1()]
modified_features_2 = ["sub - " + s for s in subject_terms_2()]

In [11]:
# Densify the sparse count matrices into frames with one column per prefixed subject term.
bow_1_df = pd.DataFrame(bow_1.todense(), columns=modified_features_1)
bow_2_df = pd.DataFrame(bow_2.todense(), columns=modified_features_2)

In [12]:
# Append the subject unigram and bigram counts as extra columns of the campaign frame.
# Note: re-running this cell appends the same columns again.
campaign_df_copy = pd.concat(
    [campaign_df_copy, bow_1_df, bow_2_df],
    axis='columns',
)

In [13]:
# Email body: same cleaning as the subject lines (lower-case, stop words removed, stemmed).
clean_body = campaign_df['email_body'].map(
    lambda body: cleanData(body, lowercase=True, remove_stops=True, stemming=True)
)

In [14]:
# Email-body vectorizers. A relatively high minimum document frequency keeps only
# terms shared by several campaigns; each vocabulary is capped at 100 features.
# These rebind countvec_1 / countvec_2, replacing the subject-line vectorizers.
countvec_1 = CountVectorizer(  # single words
    analyzer='word',
    ngram_range=(1, 1),
    min_df=5,
    max_features=100,
)
countvec_2 = CountVectorizer(  # word pairs
    analyzer='word',
    ngram_range=(2, 2),
    min_df=5,
    max_features=100,
)

In [15]:
# Fit each vocabulary on the cleaned email bodies and count its terms per campaign
# (unigram matrix first, bigram matrix second).
bow_1, bow_2 = [
    vectorizer.fit_transform(clean_body)
    for vectorizer in (countvec_1, countvec_2)
]

In [16]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2. Prefer get_feature_names_out() and fall back to the old accessor only when
# the installed version does not provide it.
body_terms_1 = getattr(countvec_1, "get_feature_names_out", None) or countvec_1.get_feature_names
body_terms_2 = getattr(countvec_2, "get_feature_names_out", None) or countvec_2.get_feature_names

# The "body - " prefix marks these columns as email-body terms.
modified_features_1 = ["body - " + s for s in body_terms_1()]
modified_features_2 = ["body - " + s for s in body_terms_2()]

In [17]:
# Densify the sparse count matrices into frames with one column per prefixed body term.
bow_1_df = pd.DataFrame(bow_1.todense(), columns=modified_features_1)
bow_2_df = pd.DataFrame(bow_2.todense(), columns=modified_features_2)

In [18]:
# Append the email-body unigram and bigram counts as extra columns of the campaign frame.
# Note: re-running this cell appends the same columns again.
campaign_df_copy = pd.concat(
    [campaign_df_copy, bow_1_df, bow_2_df],
    axis='columns',
)

In [19]:
campaign_df_copy.head()

Unnamed: 0,campaign_id,communication_type,total_links,no_of_internal_links,no_of_images,no_of_sections,sub - 2017,sub - 2018,sub - ai,sub - come,...,body - scienc practition,body - scienc profession,body - scientist data,body - session workshop,body - summit 2017,body - talent hunt,body - thought leader,body - ultim learn,body - upcom challeng,body - upcom event
0,29,Newsletter,67,61,12,3,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0
1,30,Upcoming Events,18,14,7,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
2,31,Conference,15,13,5,1,0,0,0,0,...,1,0,0,0,2,0,1,0,0,0
3,32,Conference,24,19,7,1,1,0,1,0,...,0,0,0,0,2,0,0,0,0,0
4,33,Others,7,3,1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


#### Merging Data

In [20]:
# Attach the campaign-level features to every train/test event row.
# how='left' keeps every event row even if its campaign is absent from the campaign
# table; the default inner join would silently drop such rows (and shrink the test set).
# validate='many_to_one' raises if campaign_id is duplicated in the campaign table,
# which would otherwise multiply event rows.
train = pd.merge(train_df, campaign_df_copy, on='campaign_id', how='left', validate='many_to_one')
test = pd.merge(test_df, campaign_df_copy, on='campaign_id', how='left', validate='many_to_one')

In [21]:
# Identifier columns are labels rather than quantities, so store them as strings.
for frame in (train, test):
    for id_column in ('user_id', 'campaign_id'):
        frame[id_column] = frame[id_column].astype('str')

#### Binning Numeric Features

In [22]:
# Bucket the image count with one shared set of edges so train and test cannot drift apart.
# pd.cut builds right-closed intervals here: (0, 3], (3, 6], ..., (18, 21];
# counts outside (0, 21] become NaN.
image_bin_edges = [0, 3, 6, 9, 12, 15, 18, 21]
image_bin_labels = ['1-3', '3-6', '6-9', '9-12', '12-15', '15-18', '18-21']

train['bin_no_of_images'] = pd.cut(train['no_of_images'], image_bin_edges, labels=image_bin_labels)
test['bin_no_of_images'] = pd.cut(test['no_of_images'], image_bin_edges, labels=image_bin_labels)

In [23]:
# Bucket the internal-link count into (0, 30], (30, 100] and everything above 100.
# The last bucket is labelled '>100', so its upper edge is left open: with the former
# edge of 200 any count above 200 silently became NaN instead of '>100'.
# Counts up to 200 are binned exactly as before.
internal_link_bin_edges = [0, 30, 100, np.inf]
internal_link_bin_labels = ['1-30', '30-100', '>100']

train['bin_no_of_internal_links'] = pd.cut(train['no_of_internal_links'],
                                           internal_link_bin_edges,
                                           labels=internal_link_bin_labels)

test['bin_no_of_internal_links'] = pd.cut(test['no_of_internal_links'],
                                          internal_link_bin_edges,
                                          labels=internal_link_bin_labels)

In [24]:
# External links are whatever remains of the total once internal links are removed.
for frame in (train, test):
    frame['no_of_external_links'] = frame['total_links'].sub(frame['no_of_internal_links'])

In [25]:
# Bucket the external-link count with one shared set of edges for train and test.
# pd.cut builds right-closed intervals here: (0, 3], (3, 6], (6, 9], (9, 12];
# counts outside (0, 12] (including 0) become NaN.
external_link_bin_edges = [0, 3, 6, 9, 12]
external_link_bin_labels = ['1-3', '3-6', '6-9', '9-12']

train['bin_no_of_external_links'] = pd.cut(train['no_of_external_links'],
                                           external_link_bin_edges,
                                           labels=external_link_bin_labels)
test['bin_no_of_external_links'] = pd.cut(test['no_of_external_links'],
                                          external_link_bin_edges,
                                          labels=external_link_bin_labels)

#### Date/Time Features

In [26]:
# Make sure send_date is a datetime column so the .dt accessor and .time() work below.
for frame in (train, test):
    frame['send_date'] = pd.to_datetime(frame['send_date'])

In [27]:
# Day of the week as an integer, Monday=0 ... Sunday=6.
for frame in (train, test):
    frame['day_of_week'] = frame['send_date'].dt.dayofweek

In [28]:
from datetime import datetime, time
def time_of_day(_datetime):
    """Map a timestamp to a coarse time-of-day bucket.

    Buckets are defined at minute resolution:

    * 0 -- 06:01 to 12:00
    * 1 -- 12:01 to 16:00
    * 2 -- 16:01 to 20:00
    * 3 -- any other time (20:01 to 06:00)

    Parameters
    ----------
    _datetime : datetime.datetime
        Any object whose ``.time()`` returns a ``datetime.time`` works.

    Returns
    -------
    int
        The bucket number, 0-3.
    """
    # Compare at minute resolution. Without truncation a timestamp such as 12:00:30
    # lies after 12:00:00 but before 12:01:00, matches none of the ranges and was
    # wrongly put into the catch-all bucket 3.
    clock = _datetime.time().replace(second=0, microsecond=0)

    if time(6, 1) <= clock <= time(12, 0):
        return 0
    if time(12, 1) <= clock <= time(16, 0):
        return 1
    if time(16, 1) <= clock <= time(20, 0):
        return 2
    return 3

In [29]:
# Time-of-day bucket (0-3) of every send timestamp.
for frame in (train, test):
    frame['time_group'] = frame['send_date'].map(time_of_day)

In [30]:
train.head()

Unnamed: 0,id,user_id,campaign_id,send_date,is_open,is_click,communication_type,total_links,no_of_internal_links,no_of_images,...,body - thought leader,body - ultim learn,body - upcom challeng,body - upcom event,bin_no_of_images,bin_no_of_internal_links,no_of_external_links,bin_no_of_external_links,day_of_week,time_group
0,42_14051,14051,42,2017-01-09 19:55:00,0,0,Newsletter,88,79,13,...,1,0,0,0,12-15,30-100,9,6-9,0,2
1,42_177808,177808,42,2017-01-09 20:13:00,0,0,Newsletter,88,79,13,...,1,0,0,0,12-15,30-100,9,6-9,0,3
2,42_133077,133077,42,2017-01-09 20:11:00,0,0,Newsletter,88,79,13,...,1,0,0,0,12-15,30-100,9,6-9,0,3
3,42_118677,118677,42,2017-01-09 20:15:00,0,0,Newsletter,88,79,13,...,1,0,0,0,12-15,30-100,9,6-9,0,3
4,42_25809,25809,42,2017-01-09 19:49:00,0,0,Newsletter,88,79,13,...,1,0,0,0,12-15,30-100,9,6-9,0,2


In [31]:
test.head()

Unnamed: 0,id,campaign_id,user_id,send_date,communication_type,total_links,no_of_internal_links,no_of_images,no_of_sections,sub - 2017,...,body - thought leader,body - ultim learn,body - upcom challeng,body - upcom event,bin_no_of_images,bin_no_of_internal_links,no_of_external_links,bin_no_of_external_links,day_of_week,time_group
0,63_122715,63,122715,2018-01-02 22:35:00,Newsletter,68,64,15,5,0,...,0,2,0,0,12-15,30-100,4,3-6,1,3
1,63_124394,63,124394,2018-01-02 22:47:00,Newsletter,68,64,15,5,0,...,0,2,0,0,12-15,30-100,4,3-6,1,3
2,63_95168,63,95168,2018-01-02 22:44:00,Newsletter,68,64,15,5,0,...,0,2,0,0,12-15,30-100,4,3-6,1,3
3,63_31556,63,31556,2018-01-02 23:09:00,Newsletter,68,64,15,5,0,...,0,2,0,0,12-15,30-100,4,3-6,1,3
4,63_138377,63,138377,2018-01-02 22:48:00,Newsletter,68,64,15,5,0,...,0,2,0,0,12-15,30-100,4,3-6,1,3


In [32]:
# day_of_week and time_group have been derived, so remove the raw timestamp column in place.
for frame in (train, test):
    del frame['send_date']

In [33]:
# Keep a copy of the identifier columns; they are removed from the feature frames
# next and re-attached after encoding.
train_ids_df = train.loc[:, ['id', 'campaign_id', 'user_id']].copy()
test_ids_df = test.loc[:, ['id', 'campaign_id', 'user_id']].copy()

In [34]:
# Remove the identifier columns in place so they are not one-hot encoded with the features.
for frame in (train, test):
    for id_column in ('id', 'campaign_id', 'user_id'):
        del frame[id_column]

In [35]:
train.head()

Unnamed: 0,is_open,is_click,communication_type,total_links,no_of_internal_links,no_of_images,no_of_sections,sub - 2017,sub - 2018,sub - ai,...,body - thought leader,body - ultim learn,body - upcom challeng,body - upcom event,bin_no_of_images,bin_no_of_internal_links,no_of_external_links,bin_no_of_external_links,day_of_week,time_group
0,0,0,Newsletter,88,79,13,4,1,0,0,...,1,0,0,0,12-15,30-100,9,6-9,0,2
1,0,0,Newsletter,88,79,13,4,1,0,0,...,1,0,0,0,12-15,30-100,9,6-9,0,3
2,0,0,Newsletter,88,79,13,4,1,0,0,...,1,0,0,0,12-15,30-100,9,6-9,0,3
3,0,0,Newsletter,88,79,13,4,1,0,0,...,1,0,0,0,12-15,30-100,9,6-9,0,3
4,0,0,Newsletter,88,79,13,4,1,0,0,...,1,0,0,0,12-15,30-100,9,6-9,0,2


In [36]:
test.head()

Unnamed: 0,communication_type,total_links,no_of_internal_links,no_of_images,no_of_sections,sub - 2017,sub - 2018,sub - ai,sub - come,sub - contest,...,body - thought leader,body - ultim learn,body - upcom challeng,body - upcom event,bin_no_of_images,bin_no_of_internal_links,no_of_external_links,bin_no_of_external_links,day_of_week,time_group
0,Newsletter,68,64,15,5,0,1,0,1,1,...,0,2,0,0,12-15,30-100,4,3-6,1,3
1,Newsletter,68,64,15,5,0,1,0,1,1,...,0,2,0,0,12-15,30-100,4,3-6,1,3
2,Newsletter,68,64,15,5,0,1,0,1,1,...,0,2,0,0,12-15,30-100,4,3-6,1,3
3,Newsletter,68,64,15,5,0,1,0,1,1,...,0,2,0,0,12-15,30-100,4,3-6,1,3
4,Newsletter,68,64,15,5,0,1,0,1,1,...,0,2,0,0,12-15,30-100,4,3-6,1,3


In [37]:
# Stack train on top of test so both receive identical dummy columns when encoded.
# test has no is_open / is_click columns, so those are NaN on its rows; each part
# keeps its own row labels, so index values repeat in the combined frame.
combined_df = pd.concat([train, test], axis='index')

In [38]:
combined_df.head()

Unnamed: 0,bin_no_of_external_links,bin_no_of_images,bin_no_of_internal_links,body - 10,body - 15,body - 2017,body - 2018,body - 24,body - across,body - ai,...,sub - scientist,sub - summit,sub - summit 2017,sub - top,sub - top data,sub - win,sub - win iphon,sub - win prize,time_group,total_links
0,6-9,12-15,30-100,0,0,1,0,0,1,0,...,0,1,1,0,0,0,0,0,2,88
1,6-9,12-15,30-100,0,0,1,0,0,1,0,...,0,1,1,0,0,0,0,0,3,88
2,6-9,12-15,30-100,0,0,1,0,0,1,0,...,0,1,1,0,0,0,0,0,3,88
3,6-9,12-15,30-100,0,0,1,0,0,1,0,...,0,1,1,0,0,0,0,0,3,88
4,6-9,12-15,30-100,0,0,1,0,0,1,0,...,0,1,1,0,0,0,0,0,2,88


In [40]:
combined_df[['is_open', 'is_click']].tail()

Unnamed: 0,is_open,is_click
773853,,
773854,,
773855,,
773856,,
773857,,


In [41]:
# One-hot encode the non-numeric columns (communication_type and the bin_* columns);
# numeric columns are passed through unchanged.
combined_df = pd.get_dummies(data=combined_df)

In [42]:
combined_df.head()

Unnamed: 0,body - 10,body - 15,body - 2017,body - 2018,body - 24,body - across,body - ai,body - analyt,body - analyt onlin,body - analyt vidhya,...,bin_no_of_internal_links_1-30,bin_no_of_internal_links_30-100,bin_no_of_internal_links_>100,communication_type_Conference,communication_type_Corporate,communication_type_Hackathon,communication_type_Newsletter,communication_type_Others,communication_type_Upcoming Events,communication_type_Webinar
0,0,0,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,0,0,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0,0,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4,0,0,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [43]:
# Split the encoded frame back apart: only rows that came from train have a
# non-missing is_open value.
is_unlabelled = combined_df['is_open'].isnull()
train_new = combined_df[~is_unlabelled]
test_new = combined_df[is_unlabelled]

In [44]:
# Renumber the rows of each part from 0, discarding the old row labels.
train_new = train_new.reset_index(drop=True)
test_new = test_new.reset_index(drop=True)

In [46]:
len(train_new) == len(train)

True

In [47]:
len(test_new) == len(test)

True

In [50]:
# Put the identifier columns back in front of the encoded features (rows are matched by index label).
train_new = pd.concat([train_ids_df, train_new], axis='columns')
test_new = pd.concat([test_ids_df, test_new], axis='columns')

In [51]:
train_new.head()

Unnamed: 0,id,campaign_id,user_id,body - 10,body - 15,body - 2017,body - 2018,body - 24,body - across,body - ai,...,bin_no_of_internal_links_1-30,bin_no_of_internal_links_30-100,bin_no_of_internal_links_>100,communication_type_Conference,communication_type_Corporate,communication_type_Hackathon,communication_type_Newsletter,communication_type_Others,communication_type_Upcoming Events,communication_type_Webinar
0,42_14051,42,14051,0,0,1,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
1,42_177808,42,177808,0,0,1,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
2,42_133077,42,133077,0,0,1,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
3,42_118677,42,118677,0,0,1,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
4,42_25809,42,25809,0,0,1,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0


In [52]:
test_new.head()

Unnamed: 0,id,campaign_id,user_id,body - 10,body - 15,body - 2017,body - 2018,body - 24,body - across,body - ai,...,bin_no_of_internal_links_1-30,bin_no_of_internal_links_30-100,bin_no_of_internal_links_>100,communication_type_Conference,communication_type_Corporate,communication_type_Hackathon,communication_type_Newsletter,communication_type_Others,communication_type_Upcoming Events,communication_type_Webinar
0,63_122715,63,122715,7,0,0,8,0,0,3,...,0,1,0,0,0,0,1,0,0,0
1,63_124394,63,124394,7,0,0,8,0,0,3,...,0,1,0,0,0,0,1,0,0,0
2,63_95168,63,95168,7,0,0,8,0,0,3,...,0,1,0,0,0,0,1,0,0,0
3,63_31556,63,31556,7,0,0,8,0,0,3,...,0,1,0,0,0,0,1,0,0,0
4,63_138377,63,138377,7,0,0,8,0,0,3,...,0,1,0,0,0,0,1,0,0,0


#### Storing in pickle


In [53]:
# Persist the processed feature frames.
# Forward slashes resolve on every platform; with the previous backslash-separated
# literals a non-Windows system writes a file literally named
# '..\data\processed\train_v1.pkl' into the working directory.
train_new.to_pickle('../data/processed/train_v1.pkl')
test_new.to_pickle('../data/processed/test_v1.pkl')