In [2]:
import pandas as pd
import matplotlib.pyplot as plt

tr = pd.read_csv('train/train.csv')
te = pd.read_csv('test/test_BDIfz5B.csv')
campaignData = pd.read_csv('train/campaign_data.csv')

In [3]:
campaignData.dtypes

campaign_id              int64
communication_type      object
total_links              int64
no_of_internal_links     int64
no_of_images             int64
no_of_sections           int64
email_body              object
subject                 object
email_url               object
dtype: object

<h1>Parsing text data

In [72]:
campaignData['communication_type'].unique()

array(['Newsletter', 'Upcoming Events', 'Conference', 'Others', 'Webinar',
       'Corporate', 'Hackathon'], dtype=object)

In [4]:
from sklearn.preprocessing import LabelEncoder

comType = LabelEncoder()
campaignData['commType'] = comType.fit_transform(campaignData['communication_type'])
print(campaignData['commType'].dtypes)
campaignData[['commType', 'communication_type']].sample(5)

int64


Unnamed: 0,commType,communication_type
16,2,Hackathon
23,3,Newsletter
40,2,Hackathon
1,5,Upcoming Events
49,2,Hackathon


<h1>Getting a taste</h1>

In [47]:
X = pd.merge(tr.iloc[:, [0,2,3]], campaignData, how='left', on='campaign_id')
X.dtypes

id                      object
campaign_id              int64
send_date               object
communication_type      object
total_links              int64
no_of_internal_links     int64
no_of_images             int64
no_of_sections           int64
email_body              object
subject                 object
email_url               object
commType                 int64
dtype: object

In [49]:
X['parsedDateTime'] = pd.to_datetime(X['send_date'], format='%d-%m-%Y %H:%M')
print(X['parsedDateTime'].sample())

X['hourSent'] = X['parsedDateTime'].dt.hour
X.dtypes

548446   2017-11-06 22:58:00
Name: parsedDateTime, dtype: datetime64[ns]


id                              object
campaign_id                      int64
send_date                       object
communication_type              object
total_links                      int64
no_of_internal_links             int64
no_of_images                     int64
no_of_sections                   int64
email_body                      object
subject                         object
email_url                       object
commType                         int64
parsedDateTime          datetime64[ns]
hourSent                         int64
dtype: object

In [37]:
del X['subject']
del X['email_url']
del X['email_body']
del X['communication_type']
del X['send_date']
del X['parsedDateTime']
X.dtypes

id                      object
campaign_id              int64
total_links              int64
no_of_internal_links     int64
no_of_images             int64
no_of_sections           int64
commType                 int64
hourSent                 int64
dtype: object

In [41]:
y = tr['is_click']

In [42]:
X.iloc[:,1:].sample(2)

Unnamed: 0,campaign_id,total_links,no_of_internal_links,no_of_images,no_of_sections,commType,hourSent
516824,42,88,79,13,4,3,20
32192,30,18,14,7,1,5,14


In [43]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='hinge', penalty='l2')
clf.fit(X.iloc[:,1:],y)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [44]:
campaignData.dtypes

campaign_id              int64
communication_type      object
total_links              int64
no_of_internal_links     int64
no_of_images             int64
no_of_sections           int64
email_body              object
subject                 object
email_url               object
commType                 int64
dtype: object

In [51]:
test = pd.merge(te.iloc[:, [0,1,3]], campaignData.iloc[:, [0,2,3,4,5,9]], how='left', on='campaign_id')
test.dtypes

id                      object
campaign_id              int64
send_date               object
total_links              int64
no_of_internal_links     int64
no_of_images             int64
no_of_sections           int64
commType                 int64
dtype: object

In [52]:
test['parsedDateTime'] = pd.to_datetime(test['send_date'], format='%d-%m-%Y %H:%M')
print(test['parsedDateTime'].sample())

test['hourSent'] = test['parsedDateTime'].dt.hour
test.dtypes

234460   2018-01-02 08:19:00
Name: parsedDateTime, dtype: datetime64[ns]


id                              object
campaign_id                      int64
send_date                       object
total_links                      int64
no_of_internal_links             int64
no_of_images                     int64
no_of_sections                   int64
commType                         int64
parsedDateTime          datetime64[ns]
hourSent                         int64
dtype: object

In [53]:
del test['send_date']
del test['parsedDateTime']
test.dtypes

id                      object
campaign_id              int64
total_links              int64
no_of_internal_links     int64
no_of_images             int64
no_of_sections           int64
commType                 int64
hourSent                 int64
dtype: object

In [54]:
test.iloc[:,1:].sample(2)

Unnamed: 0,campaign_id,total_links,no_of_internal_links,no_of_images,no_of_sections,commType,hourSent
357966,74,140,130,15,4,3,21
422368,64,49,45,14,4,5,15


In [55]:
result = clf.predict(test.iloc[:,1:])

In [56]:
result.shape

(773858,)

In [142]:
#pd.Dataframe(result, columns=['result']).to_csv('C:\code\AVlordOfTheMachines\resultSGDClassifier.csv', index=False)

#prediction = pd.DataFrame(predictions, columns=['predictions']).to_csv('prediction.csv')

#import numpy as np
#np.savetxt('resultSGDClassifier.csv', result, delimiter=',')

In [59]:
pd.DataFrame(result, columns=['is_click']).to_csv('resultSGDClassifierV2.csv')

In [60]:
pd.DataFrame(test['id'], columns=['id']).to_csv('1.csv')