### Preprocessing: date features (year, month, day, weekday, hour offset) and sender mail frequencies (previous week, month, six months, year and total period)

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

from dateutil import parser

In [3]:
## Load the training and test sets; the first CSV column becomes the row index
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'dsba-fml-foundations-of-machine-learning/train.csv',
        'dsba-fml-foundations-of-machine-learning/test.csv',
    )
)

In [4]:
# preview the first rows of the raw training data
train_df.head()

Unnamed: 0,date,org,tld,ccs,bcced,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body,label
0,"Mon, 6 Nov 2017 11:13:45 +0100",reply,ebay.in,0,0,multipart/alternative,35,120,0,0,49.0,80027,2
1,"Wed, 14 Feb 2018 11:00:16 -0000",edm,efinmail.com,0,0,multipart/alternative,1,7,0,0,107.0,2961,1
2,"Wed, 6 Jul 2016 19:53:37 +0000",usebackpack,com,0,0,text/html,4,17,0,0,35.0,25149,1
3,"Fri, 11 Oct 2019 11:25:40 +0200",granular,ai,0,0,multipart/mixed,0,0,0,0,15.0,635296,1
4,"Tue, 07 Nov 2017 11:07:18 +0000 (UTC)",github,com,1,0,multipart/alternative,2,11,0,0,49.0,2355,1


In [5]:
# Derive calendar and time-of-day features from the raw 'date' header strings.
# The values are collected in plain lists and attached to the dataframe in one go afterwards,
# which is much faster than writing into the dataframe while iterating over it.
# NOTE: this cell replaces the raw 'date' strings with date objects and inserts new columns,
# so it can only be run once on a freshly loaded train_df.
datetimes = []
dates = []
years = []
months = []
days = []
weekdays = []
times = []
offsets = []

for raw_date in train_df['date']:
    # some headers end with a comment such as "(UTC)"; drop it before parsing
    sent_at = parser.parse(raw_date.split('(')[0])
    # convert to the timezone of the machine running the notebook
    # NOTE(review): the features are only "Paris time" if the notebook runs on a machine
    # configured for that timezone - confirm before re-running elsewhere
    local_time = sent_at.astimezone()
    datetimes.append(local_time)

    local_date = local_time.date()  # calendar date without the time of day
    dates.append(local_date)
    years.append(local_date.year)
    months.append(local_date.month)
    days.append(local_date.day)
    weekdays.append(local_time.weekday())
    times.append(local_time.time())

    sender_utcoffset = sent_at.utcoffset()
    if sender_utcoffset is None:
        # the header carried no timezone information, so no offset can be computed
        offsets.append(0)
    else:
        # offset in hours between the sender's timezone and the local (recipient) timezone;
        # total_seconds() keeps the sign, whereas timedelta.seconds is never negative and
        # would turn a sender offset of -05:00 into +19 hours
        offsets.append((sender_utcoffset - local_time.utcoffset()).total_seconds() / 3600)

# insert all new features into the dataframe
train_df.insert(0, 'datetime', datetimes)
train_df['date'] = dates
train_df.insert(2, 'year', years)
train_df.insert(3, 'month', months)
train_df.insert(4, 'day', days)
train_df.insert(5, 'weekday', weekdays)
train_df.insert(6, 'time', times)
train_df.insert(7, 'hour offset', offsets)

In [6]:
# check the newly inserted date/time feature columns
train_df.head()

Unnamed: 0,datetime,date,year,month,day,weekday,time,hour offset,org,tld,ccs,bcced,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body,label
0,2017-11-06 11:13:45+01:00,2017-11-06,2017,11,6,0,11:13:45,0.0,reply,ebay.in,0,0,multipart/alternative,35,120,0,0,49.0,80027,2
1,2018-02-14 12:00:16+01:00,2018-02-14,2018,2,14,2,12:00:16,-1.0,edm,efinmail.com,0,0,multipart/alternative,1,7,0,0,107.0,2961,1
2,2016-07-06 21:53:37+02:00,2016-07-06,2016,7,6,2,21:53:37,-2.0,usebackpack,com,0,0,text/html,4,17,0,0,35.0,25149,1
3,2019-10-11 11:25:40+02:00,2019-10-11,2019,10,11,4,11:25:40,0.0,granular,ai,0,0,multipart/mixed,0,0,0,0,15.0,635296,1
4,2017-11-07 12:07:18+01:00,2017-11-07,2017,11,7,1,12:07:18,-1.0,github,com,1,0,multipart/alternative,2,11,0,0,49.0,2355,1


In [7]:
def _compose_sender(org, tld):
    """Join organisation and top level domain into one sender string.

    If one of the two parts is missing, the other one is returned unchanged
    (which is itself a missing value when both are missing).
    """
    if pd.isnull(tld):
        return org
    if pd.isnull(org):
        return tld
    # only add a dot when the tld does not already start with one
    separator = '' if tld[0] == '.' else '.'
    return org + separator + tld


# one sender per row, in dataframe order
senders = [_compose_sender(org, tld) for org, tld in zip(train_df['org'], train_df['tld'])]

In [8]:
# number of distinct senders in the training data (1187 in the recorded run)
len(set(senders))

1187

In [None]:
# add the sender feature to the dataframe as the column at position 8
# (insert() raises if the column already exists, so this cell can only be run once)
train_df.insert(8, 'sender', senders)

In [None]:
# check the newly inserted sender column
train_df.head()

In [None]:
from bisect import bisect_left, bisect_right

from dateutil.relativedelta import relativedelta

# Sender history features. For every mail we look at all mails of the same sender dated up to
# and including the date of that mail (so the mail itself and same-day mails are counted) and
# derive: the number of such mails, the date of the sender's first mail, and the daily mail
# frequency over the total period, the previous year, six months, month and week.


def _mails_since(sorted_dates, nb_until_today, window_start):
    """Count the mails dated from window_start up to today (both inclusive).

    sorted_dates is the ascending list of all mail dates of one sender and nb_until_today
    the number of entries in it that are not later than today.
    """
    return nb_until_today - bisect_left(sorted_dates, window_start)


total_mails_by_sender = []
first_mail_of_sender = []
sender_freq_total_period = []
sender_freq_one_year = []
sender_freq_six_months = []
sender_freq_one_month = []
sender_freq_one_week = []

# sort the mail dates of every sender once, instead of filtering the whole dataframe per row;
# groupby skips rows without a sender, so those rows find no history below
dates_by_sender = {
    sender: sorted(sender_dates)
    for sender, sender_dates in train_df.groupby('sender')['date']
}

for sender, today in zip(train_df['sender'], train_df['date']):
    sender_dates = dates_by_sender.get(sender, [])
    # number of mails of that sender dated up to and including the current mail's date
    nb_mails = bisect_right(sender_dates, today)
    total_mails_by_sender.append(nb_mails)

    if nb_mails == 0:
        # only reached when the row has no sender (org and tld both missing):
        # there is no history to look at, so all values are set to zero
        first_mail_of_sender.append(0)
        sender_freq_total_period.append(0)
        sender_freq_one_year.append(0)
        sender_freq_six_months.append(0)
        sender_freq_one_month.append(0)
        sender_freq_one_week.append(0)
        continue

    # date of the first mail that was sent by the sender
    first_date = sender_dates[0]
    first_mail_of_sender.append(first_date)

    # daily frequency over the entire period between the sender's first mail and the current
    # mail; zero when both fall on the same day
    timedelta_total_period = (today - first_date).days
    freq_total_period = nb_mails / timedelta_total_period if timedelta_total_period != 0 else 0
    sender_freq_total_period.append(freq_total_period)

    # daily frequency over the previous year (a fixed 365 days is used as the period length)
    one_year_ago = today + relativedelta(years=-1)
    sender_freq_one_year.append(_mails_since(sender_dates, nb_mails, one_year_ago) / 365)

    # daily frequency over the previous six months, using the actual number of days in them
    six_months_ago = today + relativedelta(months=-6)
    timedelta_six_months = (today - six_months_ago).days
    sender_freq_six_months.append(
        _mails_since(sender_dates, nb_mails, six_months_ago) / timedelta_six_months
    )

    # daily frequency over the previous month, using the actual number of days in it
    one_month_ago = today + relativedelta(months=-1)
    timedelta_one_month = (today - one_month_ago).days
    sender_freq_one_month.append(
        _mails_since(sender_dates, nb_mails, one_month_ago) / timedelta_one_month
    )

    # daily frequency over the previous week
    one_week_ago = today + relativedelta(weeks=-1)
    sender_freq_one_week.append(_mails_since(sender_dates, nb_mails, one_week_ago) / 7)
        

In [None]:
# attach the sender history features as consecutive columns, starting at position 9
sender_feature_columns = [
    ('total_mails_by_sender', total_mails_by_sender),
    ('first_mail_of_sender', first_mail_of_sender),
    ('sender_freq_total_period', sender_freq_total_period),
    ('sender_freq_prev_year', sender_freq_one_year),
    ('sender_freq_prev_six_months', sender_freq_six_months),
    ('sender_freq_prev_month', sender_freq_one_month),
    ('sender_freq_prev_week', sender_freq_one_week),
]
for position, (column_name, column_values) in enumerate(sender_feature_columns, start=9):
    train_df.insert(position, column_name, column_values)

In [None]:
# display the dataframe including the sender history features
train_df

In [None]:
# Per-sender summary: number of mails and the time span between first and last mail.
# The dates are grouped once instead of filtering the dataframe twice for every sender, and the
# results are kept in dictionaries instead of printing every sender with all of its dates.
sorted_dates_by_sender = {
    sender: sorted(sender_dates)
    for sender, sender_dates in train_df.groupby('sender')['date']
}

nb_mails_per_sender = {}
active_period_per_sender = {}
for sender in set(senders):
    # a missing sender matches no row, so it ends up with a count of zero
    all_dates = sorted_dates_by_sender.get(sender, [])
    nb_mails_per_sender[sender] = len(all_dates)
    if all_dates:
        first_date = all_dates[0]
        last_date = all_dates[-1]
        # last minus first, so that the time span is never negative
        active_period_per_sender[sender] = last_date - first_date

# show the ten senders with the most mails
pd.Series(nb_mails_per_sender).sort_values(ascending=False).head(10)

In [None]:
## Keep only the "mail_type" column as model input; missing values become the category 'None'
train_x = train_df[['mail_type']].fillna(value='None')
test_x = test_df[['mail_type']].fillna(value='None')
## The label column is the prediction target
train_y = train_df[['label']]

In [None]:
## Do one hot encoding of categorical feature
# The encoder is fitted on train and test together so that every category occurring in either
# set gets its own column. Fitting on a dataframe (instead of a bare numpy array) keeps the
# column name, so transforming the dataframes below does not raise a feature-name warning.
feat_enc = OneHotEncoder()
feat_enc.fit(pd.concat([train_x, test_x]))
train_x_featurized = feat_enc.transform(train_x)
test_x_featurized = feat_enc.transform(test_x)

In [None]:
## Train a simple KNN classifier using featurized data
neigh = KNeighborsClassifier(n_neighbors=3)
# train_y is a one-column dataframe; the classifier expects a 1d target, so flatten it
# explicitly instead of relying on scikit-learn's conversion (which emits a DataConversionWarning)
neigh.fit(train_x_featurized, train_y.values.ravel())
pred_y = neigh.predict(test_x_featurized)

In [None]:
## Write the predictions to the submission file, with the row number as 'Id' column
# NOTE(review): the Id is the position of the prediction (0, 1, 2, ...), not test_df's index -
# confirm that the test ids are consecutive numbers starting at zero
pred_df = pd.DataFrame({'label': pred_y})
pred_df.to_csv("knn_sample_submission.csv", index=True, index_label='Id')