# Data Preprocessing

For the competition, our goal was to produce as many rich features as possible in order to capture maximal amounts of information from the dataset

Below is our approach on preprocessing the data.

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn as sk 
import matplotlib.pyplot as plt 
import xgboost as xgb 
import seaborn as sns 
from math import inf
import scipy

In [484]:
#Copying TLD and ORG columns 
copy_train = pd.read_csv("C://Users//Sauraj (Work mode)//Desktop//train.csv")
copy_test = pd.read_csv("C://Users//Sauraj (Work mode)//Desktop//test.csv")

We suspected that the data must be having duplicated rows since some of the rows seemed to repeat at various locations, so in order to reduce data repetition and minimize the chance of having increased false-positives, our approach was heuristic in nature, by just removing the duplicates that were present in the dataset

In [485]:
cols_to_check = ['date','mail_type','images','urls','chars_in_subject','chars_in_body','org','tld']
copy_train[copy_train.duplicated(subset=cols_to_check, keep='first')]

Unnamed: 0.1,Unnamed: 0,date,org,tld,ccs,bcced,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body,label
232,232,4 Dec 2013 13:50:13 -0000,phpclasses,org,0,0,multipart/alternative,21,55,0,0,50.0,26592,1
434,434,"Thu, 4 Feb 2016 16:00:36 +0530",iiitd,ac.in,2,0,multipart/related,0,0,1,0,96.0,1521139,1
577,577,"Sat, 5 Mar 2016 12:23:17 +0530 (IST)",sampark,gov.in,0,0,text/html,52,101,0,0,75.0,28650,2
985,985,"Tue, 18 Aug 2015 12:14:47 +0530",iiitd,ac.in,7,0,multipart/alternative,2,13,1,1,34.0,9562,1
987,987,"Fri, 07 Apr 2017 15:30:01 +0530",nrsc,gov.in,0,0,multipart/alternative,0,4,1,0,32.0,1210,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80168,80168,"Fri, 15 Feb 2019 18:07:55 +0100",asvspoof,org,0,0,multipart/alternative,0,7,1,0,29.0,16533,1
80170,80170,"Tue, 07 Apr 2015 14:12:29 +0000 (UTC)",quora,com,0,0,multipart/alternative,0,88,1,1,98.0,80913,1
80172,80172,"Fri, 1 May 2015 11:48:55 +0530 (IST)",,,0,0,text/html,0,2,1,0,73.0,13464,0
80174,80174,"Fri, 14 Dec 2018 09:01:13 +0000 (UTC)",medium,com,0,0,multipart/alternative,32,239,1,1,169.0,105276,0


In [486]:
#This is another dataset we made with outliers removed (Our hypothesis was that removing outliers should improve performance )
train_data = pd.read_csv("C://Users//Sauraj (Work mode)//Desktop//total_train_removed_outlier_with_0.98.csv")
test_data = pd.read_csv("C://Users//Sauraj (Work mode)//Desktop//total_test_removed_outlier_with_0.98.csv")

In [487]:
#This is another dataset we made with our initial Feature engineering notebook (FE_initial.ipynb)
train_copy = pd.read_csv("C://Users//Sauraj (Work mode)//Desktop//Databases//train_data_processed.csv")
test_copy = pd.read_csv("C://Users//Sauraj (Work mode)//Desktop//Databases//test_data_processed.csv")

In [114]:
#Columns which we need to copy 
cols_to_copy = ['total_mails_by_sender','first_mail_of_sender','sender_freq_total_period','sender_freq_prev_year',
                'sender_freq_prev_week','sender_freq_prev_month','sender_freq_prev_six_months']

In [115]:
#Columns to copy into the original training and testing sets 
train_data = pd.concat([train_data, train_copy[cols_to_copy]], axis=1)
test_data = pd.concat([test_data, test_copy[cols_to_copy]], axis=1)

In [5]:
#Missing data calculator function to detect the number of missing values in each column 
def missing_data_calculator(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()).sort_values(ascending = False)
    missing_data = pd.concat([total,percent], axis = 1, keys = ['Total', 'Percent'])
    return missing_data.head(100)

In [438]:
#Mail-type is more of a categorical feature, so the code here is to convert it to categorical 
train_data['mail_type'] = train_data['mail_type'].astype('category')
train_data['mail_type'] = train_data['mail_type'].cat.codes
#Chars_in_subject had NaN values, so filling them up with 0
train_data['chars_in_subject'] = train_data['chars_in_subject'].fillna(0)
test_data['chars_in_subject'] = test_data['chars_in_subject'].fillna(0)

In [488]:
# train_data['label'] = train_copy['label']
# train_data['tld'] = copy_train['tld']
# train_data['org'] = copy_train['org']

# test_data['tld'] = copy_test['tld']
# test_data['org'] = copy_test['org']

# test_data['date'] = copy_test['date']
# train_data['date'] = copy_train['date']


# test_data['mail_type'] = copy_test['mail_type']
# train_data['mail_type'] = copy_train['mail_type']


In [421]:
cols_to_check = ['date','mail_type','images','urls','chars_in_subject','chars_in_body','org','tld']
dropping_index = train_data[train_data.duplicated(subset=cols_to_check, keep='first')].index

In [489]:
train_data = train_data.drop(dropping_index, axis=0)

In [486]:
train_data = train_data.iloc[indexes]

In [490]:
#Information about the train data after removing duplicates 
train_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49140 entries, 0 to 80173
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   49140 non-null  int64  
 1   hour_offset                  49140 non-null  float64
 2   ccs                          49140 non-null  int64  
 3   bcced                        49140 non-null  int64  
 4   total_mails_by_sender        49140 non-null  int64  
 5   sender_freq_total_period     49140 non-null  float64
 6   sender_freq_prev_year        49140 non-null  float64
 7   sender_freq_prev_week        49140 non-null  float64
 8   sender_freq_prev_month       49140 non-null  float64
 9   sender_freq_prev_six_months  49140 non-null  float64
 10  images                       49140 non-null  int64  
 11  urls                         49140 non-null  int64  
 12  salutations                  49140 non-null  int64  
 13  designation     

In [491]:
#Information about the test data
test_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34365 entries, 0 to 34364
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   34365 non-null  int64  
 1   hour_offset                  34365 non-null  float64
 2   ccs                          34365 non-null  int64  
 3   bcced                        34365 non-null  int64  
 4   total_mails_by_sender        34365 non-null  int64  
 5   sender_freq_total_period     34365 non-null  float64
 6   sender_freq_prev_year        34365 non-null  float64
 7   sender_freq_prev_week        34365 non-null  float64
 8   sender_freq_prev_month       34365 non-null  float64
 9   sender_freq_prev_six_months  34365 non-null  float64
 10  images                       34365 non-null  int64  
 11  urls                         34365 non-null  int64  
 12  salutations                  34365 non-null  int64  
 13  designation     

In [211]:
#Log-transformation function to alleviate skewness (This was just a prototype function to see if log-transform is needed or not)
def log_transformer(data, column):
    print("Pre-transformation Skewness: {}".format(scipy.stats.skew(data[column])))
    data[column] = np.log1p(data[column])
    print("Post-transformation Skewness: {}".format(scipy.stats.skew(data[column])))
    return data[column]


In [493]:
#Checking the skewness of the features (What the approach was that if skew is more than 1 or less than -1, we would log-transform it)
train_data.skew()

Unnamed: 0                      0.388898
hour_offset                     1.200528
ccs                             3.573887
bcced                          17.956983
total_mails_by_sender           2.115314
sender_freq_total_period        1.393086
sender_freq_prev_year           1.780520
sender_freq_prev_week           4.238770
sender_freq_prev_month          1.695669
sender_freq_prev_six_months     1.617779
images                          1.760993
urls                            1.884381
salutations                     0.430141
designation                     2.610617
chars_in_subject                1.059809
chars_in_body                  21.833610
label                           2.038638
dtype: float64

Most ML algorithms expect normality in their data since the gaussianity in the data helps to extract the optimal parameters that help to maximize the predictive power of the models. 

We had various approaches for tackling this, with the following used: 

1. ~StandardScaler~: Was dropped since it is sensitive to outliers and extreme datapoints, and we did not have enough statistical evidence to reject outliers from the data or impute them with a mean, since mean is also sensitive to outliers. 

2. ~Box-Cox transform~: Helped only for some columns, but came with the drawback that the skewness would often cross +1, leading to shape imbalance. 

3. ~MinMaxScaler~: Scaled the data between -1 and 1 to make it normalized, but only scaled the data and not change the shape of the distribution. 


Finally, we came to QuantileTransformer, a preprocessing method that converts the numeric columns into a normal distribution by partitioning them into bins and finding the parameter that helps to turn the columns into a normally distributed shape.

In [1]:
from sklearn.preprocessing import QuantileTransformer


#These columns are numeric in nature, so normalize them=
cols = ['total_mails_by_sender','sender_freq_total_period','sender_freq_prev_year','sender_freq_prev_week',
        'sender_freq_prev_month','sender_freq_prev_six_months','images','urls','chars_in_subject','chars_in_body']

#For loop (Because we don't do these things manually)
for col in cols:
    mms = QuantileTransformer(n_quantiles=100, output_distribution='normal', ignore_implicit_zeros=True)
    train_data[col] = mms.fit_transform(train_data[col].to_numpy().reshape(-1, 1))
    test_data[col] = mms.transform(test_data[col].to_numpy().reshape(-1, 1))

NameError: name 'train_data' is not defined

# Bringing Normality in the data

In [478]:
#Log-transformation (Was applied before, but dropped later)
train_data['images'] = np.log1p(train_data['images'])
train_data['urls'] = np.log1p(train_data['urls'])
train_data['chars_in_subject'] = np.log1p(train_data['chars_in_subject'])
train_data['chars_in_body'] = np.log1p(train_data['chars_in_body'])

test_data['images'] = np.log1p(test_data['images'])
test_data['urls'] = np.log1p(test_data['urls'])
test_data['chars_in_subject'] = np.log1p(test_data['chars_in_subject'])
test_data['chars_in_body'] = np.log1p(test_data['chars_in_body'])

In [288]:
# Dampening the effect of outliers in the training set 

# train_data['images'] = log_transformer(train_data, 'images')
# train_data['chars_in_body'] = log_transformer(train_data, 'chars_in_body')
# train_data['urls'] = log_transformer(train_data, 'urls')
# train_data['chars_in_subject'] = log_transformer(train_data, 'chars_in_subject')
# train_data['total_mails_by_sender'] = log_transformer(train_data, 'total_mails_by_sender')
# train_data['sender_freq_total_period'] = log_transformer(train_data, 'sender_freq_total_period')
# train_data['sender_freq_prev_year'] = log_transformer(train_data, 'sender_freq_prev_year')
# train_data['sender_freq_prev_week'] = log_transformer(train_data, 'sender_freq_prev_week')
# train_data['sender_freq_prev_month'] = log_transformer(train_data, 'sender_freq_prev_month')
# train_data['sender_freq_prev_six_months'] = log_transformer(train_data, 'sender_freq_prev_six_months')


# Dampening the effect of outliers in the testing set 

# test_data['images'] = log_transformer(test_data, 'images')
# test_data['chars_in_body'] = log_transformer(test_data, 'chars_in_body')
# test_data['urls'] = log_transformer(test_data, 'urls')
# test_data['chars_in_subject'] = log_transformer(test_data, 'chars_in_subject')
# test_data['total_mails_by_sender'] = log_transformer(test_data, 'total_mails_by_sender')
# test_data['sender_freq_total_period'] = log_transformer(test_data, 'sender_freq_total_period')
# test_data['sender_freq_prev_year'] = log_transformer(test_data, 'sender_freq_prev_year')
# test_data['sender_freq_prev_week'] = log_transformer(test_data, 'sender_freq_prev_week')
# test_data['sender_freq_prev_month'] = log_transformer(test_data, 'sender_freq_prev_month')
# test_data['sender_freq_prev_six_months'] = log_transformer(test_data, 'sender_freq_prev_six_months')



Pre-transformation Skewness: 259.2875244854419
Post-transformation Skewness: 0.4784786447007777
Pre-transformation Skewness: 20.794114254400494
Post-transformation Skewness: -0.8549831830804311
Pre-transformation Skewness: 140.9033857557555
Post-transformation Skewness: -0.3038343614984591
Pre-transformation Skewness: 2.3847461829109706
Post-transformation Skewness: -1.3569506502331155
Pre-transformation Skewness: 1.9650694911025917
Post-transformation Skewness: -0.4306094761764705
Pre-transformation Skewness: 1.5765969840800516
Post-transformation Skewness: 0.6977705114465274
Pre-transformation Skewness: 1.5922024544286655
Post-transformation Skewness: 0.8328332957332146
Pre-transformation Skewness: 4.743116650178704
Post-transformation Skewness: 0.6678900545232977
Pre-transformation Skewness: 1.398686194476134
Post-transformation Skewness: 0.6164297138092417
Pre-transformation Skewness: 1.413013345066901
Post-transformation Skewness: 0.7152531580243529
Pre-transformation Skewness: 12

### The Preprocessing functions 

In [86]:
from dateutil import parser

#Function that creates a new column called "sender", capturing the details of the sender by appending 
#the org and tld together. 
def create_senders(df):
    #Senders list
    senders = [] 

    for i, row in df.iterrows():
        if pd.isnull(row['tld']):
            sender = row['org']
        elif pd.isnull(row['org']):
            sender = row['tld']
        elif row['tld'][0] == '.':
            sender = row['org'] + row['tld']
        else:
            sender = row['org'] + '.' + row['tld']
        senders.append(sender)

#Insert the "sender" feature first
    df.insert(8, 'sender', senders)
    print("Sender information added")
    return df

#Date-time extraction feature for calculating the date and time 
def date_time_extraction(df):
    datetimes = []
    dates = []
    years = []
    months = []
    days = []
    weekdays = []
    times = []
    offsets = []
    for i, row in df.iterrows():
        time = parser.parse(row['date'].split('(')[0])
        local_time = time.astimezone()
        datetimes.append(local_time)
        date = local_time.date()
        dates.append(date)
        years.append(date.year)
        months.append(date.month)
        days.append(date.day)
        weekdays.append(local_time.weekday())
        times.append(local_time.time())
        try:
            offset = time.utcoffset().seconds/3600 - local_time.utcoffset().seconds/3600
            offsets.append(offset)
        except:
            offsets.append(None)
            #print("Error")

    df.insert(0,'datetime', datetimes)
    print('Dates added')
    df['date'] = dates
    df.insert(2,'year', years)
    print('Year added')
    df.insert(3,'month', months)
    print('Month added')
    df.insert(4,'day', days)
    print('Day added')
    df.insert(5,'weekday', weekdays)
    print('Weekday added')
    df.insert(6,'time', times)
    print('Time added')
    df.insert(7,'hour_offset', offsets)
    print('Hour-offset added')
    
    return df 

# def row_deleter(df):
#     df = df.drop('Unnamed: 0', axis=1)
#     print('Row deletion complete')
#     return df 

#Function to calculate img-url ratio
def img_url_ratio(df):
    df['img_url_ratio'] = df['images']/df['urls']
    df['img_url_ratio'].fillna(0)
    print('Img/url complete')
    return df


#Function to calculate body-subject ratio
def body_subject_ratio(df):
    df['body_subj_ratio'] = df['chars_in_body']/df['chars_in_subject']
    df['body_subj_ratio'].fillna(0)
    df.loc[df["body_subj_ratio"] == inf] = 0 
    print('Body/subject ratio added')
    return df 


def ht_hs_binary(df):
    """
    Function for representing in binary whether an email 
    has a subject/title or not 
    """
    df.loc[df["chars_in_subject"] != 0, "has_subject"] = 1
    df.loc[df["chars_in_subject"] == 0, "has_subject"] = 0

    df.loc[df["chars_in_body"] != 0, "has_body"] = 1
    df.loc[df["chars_in_body"] == 0, "has_body"] = 0
    print('HT-HS-binary encoding complete')
    return df


def mail_type_preprocessing(df):
    
    #Preprocessing "multipart/text" and "Multipart/Text" repetition issue 
    df['mail_type'] = df['mail_type'].str.lower().unique()
    df[['data_type','source_type']] = df['mail_type'].str.split('/', expand=True)
    return df 

def mailtype_dummy_vars(df):
    #Preprocessing for mailtype to make them into dummy variables 
#     df['mail_type'] = df['mail_type'].str.lower().unique()
    df['mail_type'] = df['mail_type'].astype('category')
    df['mail_type'] = df['mail_type'].cat.codes
    
    return df 

def forward_level(df):
    #Feature to determine if the mail was cced and bcced together 
    # If CC & BCC, forward level is type 2
    # If either CC or BCC, forward level is type 1
    # If none, then type 0 
    
    df.loc[(df["ccs"] == 0) & (df["bcced"] == 0), "forward_level"] = 0 
    df.loc[(df["ccs"] >= 1) | (df["bcced"] > 0), "forward_level"] = 1 
    df.loc[(df["ccs"] == 1) & (df["bcced"] == 1), "forward_level"] = 2
    print('Forward levels added')
    
    return df 

#Function to create dummy values for each column. 
def dummy_transform(df, column_name=str):
    df = pd.get_dummies(df, columns=[column_name])
    print("Dummy variable transformation for {} complete".format(column_name))
    return df 


#Function that detects if a certain email had a sender or not 
def no_sender_reported(df):
    """
    Certain emails are coming without any specified address or top level domain. 
    Because these emails are anomalous in nature, it is best to classify them as 
    a 'no-sender-reported' type email"
    """
    df['No_sender_reported'] = (train_data['org'].isnull()) & (train_data['tld'].isnull())
    df['No_sender_reported'] = 1 - (pd.get_dummies(train_data['No_sender_reported']))  # want 1 on NaN values, not the opposite
    print("No sender reported feature created")
    return df

#Function to create columns for organizations wrt to the labels they have 
def org_set_information(df):
    #Find all the unique values in each subset
    labelled_update = train_data[train_data['label']==0]['org'].unique()
    labelled_personal = train_data[train_data['label']==1]['org'].unique()
    labelled_promotions = train_data[train_data['label']==2]['org'].unique()
    labelled_forums = train_data[train_data['label']==3]['org'].unique()
    labelled_purchases = train_data[train_data['label']==4]['org'].unique()
    labelled_travel = train_data[train_data['label']==5]['org'].unique()
    labelled_spam = train_data[train_data['label']==6]['org'].unique()
    labelled_social = train_data[train_data['label']==7]['org'].unique()


    #Creating binary columns for the col "label" values
    df['update_org'] = df['org'].apply(lambda x : 1 if (x in labelled_update) else 0)
    df['personal_org'] = df['org'].apply(lambda x : 1 if (x in labelled_personal) else 0)
    df['promotions_org'] = df['org'].apply(lambda x : 1 if (x in labelled_promotions) else 0)
    df['forums_org'] = df['org'].apply(lambda x : 1 if (x in labelled_forums) else 0)
    df['purchases_org'] = df['org'].apply(lambda x : 1 if (x in labelled_purchases) else 0)
    df['travel_org'] = df['org'].apply(lambda x : 1 if (x in labelled_travel) else 0)
    df['spam_org'] = df['org'].apply(lambda x : 1 if (x in labelled_spam) else 0)
    df['social_org'] = df['org'].apply(lambda x : 1 if (x in labelled_social) else 0)
    print("Organization set information complete")

    
    return df
    
#Function to create columns for top-level domain wrt to the labels they have 
def tld_set_information(df):
    #Find all the unique values in each subset
    labelled_update = train_data[train_data['label']==0]['tld'].unique()
    labelled_personal = train_data[train_data['label']==1]['tld'].unique()
    labelled_promotions = train_data[train_data['label']==2]['tld'].unique()
    labelled_forums = train_data[train_data['label']==3]['tld'].unique()
    labelled_purchases = train_data[train_data['label']==4]['tld'].unique()
    labelled_travel = train_data[train_data['label']==5]['tld'].unique()
    labelled_spam = train_data[train_data['label']==6]['tld'].unique()
    labelled_social = train_data[train_data['label']==7]['tld'].unique()


    #Creating binary columns for the col "label" values
    df['update_tld'] = df['tld'].apply(lambda x : 1 if (x in labelled_update) else 0)
    df['personal_tld'] = df['tld'].apply(lambda x : 1 if (x in labelled_personal) else 0)
    df['promotions_tld'] = df['tld'].apply(lambda x : 1 if (x in labelled_promotions) else 0)
    df['forums_tld'] = df['tld'].apply(lambda x : 1 if (x in labelled_forums) else 0)
    df['purchases_tld'] = df['tld'].apply(lambda x : 1 if (x in labelled_purchases) else 0)
    df['travel_tld'] = df['tld'].apply(lambda x : 1 if (x in labelled_travel) else 0)
    df['spam_tld'] = df['tld'].apply(lambda x : 1 if (x in labelled_spam) else 0)
    df['social_tld'] = df['tld'].apply(lambda x : 1 if (x in labelled_social) else 0)
    
    print("TLD information set complete")
    return df
    
    
#Function to create columns for sender wrt to the labels they have 
def sender_set_information(df):
    update_set = set(train_data[train_data['label']==0]['sender'].unique())
    social_set = set(train_data[train_data['label']==1]['sender'].unique())
    forum_set = set(train_data[train_data['label']==2]['sender'].unique())
    promo_set = set(train_data[train_data['label']==3]['sender'].unique())
    
    df['forum_sender'] = df['sender'].apply(lambda x : int(x in forum_set))
    df['social_sender'] = df['sender'].apply(lambda x : int(x in social_set))
    df['promo_sender'] = df['sender'].apply(lambda x : int(x in promo_set))
    df['update_sender'] = df['sender'].apply(lambda x : int(x in update_set))
    
    df['update_sender_only'] = (df['update_sender'] * (1-df['promo_sender']) 
                                    * (1- df['social_sender']) * (1-df['forum_sender'])) 
    df['promo_sender_only'] = (df['promo_sender'] * (1-df['update_sender']) 
                                    * (1- df['social_sender']) * (1-df['forum_sender']))
    
    df['forum_sender_only'] = (df['forum_sender'] * (1-df['promo_sender']) 
                                    * (1- df['social_sender']) * (1-df['update_sender'])) 
    df['social_sender_only'] = (df['social_sender'] * (1-df['promo_sender']) 
                                    * (1- df['forum_sender']) * (1-df['update_sender'])) 
    df['promo_and_update_sender'] = df['promo_sender'] * df['update_sender'] 
    
    #add the sender domain depth by counting the number of "." in sender
    df['sender_depth'] = df['sender'].apply(lambda s : str(s).count('.'))
    
    print("Sender set information added")
    return df

And here, we run the preprocessing code

In [569]:
train_data = create_senders(train_data)
test_data = create_senders(test_data)

train_data = date_time_extraction(train_data)
test_data = date_time_extraction(test_data)

train_data = row_deleter(train_data)
test_data = row_deleter(test_data)

train_data = img_url_ratio(train_data)
test_data = img_url_ratio(test_data)

train_data = body_subject_ratio(train_data)
test_data = body_subject_ratio(test_data)

train_data = ht_hs_binary(train_data)
test_data = ht_hs_binary(test_data)

# train_data = mail_type_preprocessing(train_data)
# test_data = mail_type_preprocessing(test_data)

train_data = mailtype_dummy_vars(train_data)
test_data = mailtype_dummy_vars(test_data)

train_data = forward_level(train_data)
test_data = forward_level(test_data)

train_data = no_sender_reported(train_data)
test_data = no_sender_reported(test_data)

# train_data = dummy_transform(train_data, column_name='weekday')
# test_data = dummy_transform(test_data, column_name='weekday')
# train_data = dummy_transform(train_data, column_name='year')
# test_data = dummy_transform(test_data, column_name='year')
# train_data = dummy_transform(train_data, column_name='month')
# test_data = dummy_transform(test_data, column_name='month')
# train_data = dummy_transform(train_data, column_name='forward_level')
# test_data = dummy_transform(test_data, column_name='forward_level')
# train_data = dummy_transform(train_data, column_name='days')
# test_data = dummy_transform(test_data, column_name='days')

train_data = no_sender_reported(train_data)
test_data = no_sender_reported(test_data)

train_data = org_set_information(train_data)
test_data = org_set_information(test_data)

train_data = tld_set_information(train_data)
test_data = tld_set_information(test_data)

train_data = sender_set_information(train_data)
test_data = sender_set_information(test_data)

Sender information added
Sender information added
Dates added
Year added
Month added
Day added
Weekday added
Time added
Hour-offset added
Dates added
Year added
Month added
Day added
Weekday added
Time added
Hour-offset added
Row deletion complete
Row deletion complete
Img/url complete
Img/url complete
Body/subject ratio added
Body/subject ratio added
HT-HS-binary encoding complete
HT-HS-binary encoding complete
Forward levels added
Forward levels added
No sender reported feature created
No sender reported feature created
No sender reported feature created
No sender reported feature created
Organization set information complete
Organization set information complete
TLD information set complete
TLD information set complete
Sender set information added
Sender set information added


In [570]:
#Re-checking the missing data
missing_data_calculator(test_data)

Unnamed: 0,Total,Percent
No_sender_reported,6464,0.188098
sender,1536,0.044697
org,1536,0.044697
tld,1536,0.044697
hour_offset,106,0.003085
sender_depth,0,0.0
mail_type,0,0.0
date,0,0.0
chars_in_body,0,0.0
chars_in_subject,0,0.0


## Level 2 Features 

We decided to add in more information about the emails, such as aggregate information about the emails, the frequency count of the email, and creating more dummy columns 

In [246]:
#Z-score lambda function that will help to create a z-score feature for certain numeric columns 
zscore = lambda x: (x - x.mean()) / x.std()

In [571]:
#Fill in missing data
train_data['img_url_ratio'] = train_data['img_url_ratio'].fillna(0)
train_data['body_subj_ratio'] = train_data['img_url_ratio'].fillna(0)

test_data['img_url_ratio'] = test_data['img_url_ratio'].fillna(0)
test_data['body_subj_ratio'] = test_data['img_url_ratio'].fillna(0)

#Replace all infinity values with 0 
train_data = train_data.replace([np.inf, -np.inf],0)
test_data = test_data.replace([np.inf, -np.inf], 0)

In [572]:
#The processed train-data in action 
train_data

Unnamed: 0,datetime,year,month,day,weekday,time,hour_offset,ccs,bcced,total_mails_by_sender,...,forum_sender,social_sender,promo_sender,update_sender,update_sender_only,promo_sender_only,forum_sender_only,social_sender_only,promo_and_update_sender,sender_depth
0,2017-11-06 11:13:45+01:00,2017,11,6,0,11:13:45,0.0,0,0,-0.252945,...,1,1,0,1,0,0,0,0,0,2
1,2018-02-14 12:00:16+01:00,2018,2,14,2,12:00:16,-1.0,0,0,-1.399657,...,1,1,0,1,0,0,0,0,0,2
2,2016-07-06 21:53:37+02:00,2016,7,6,2,21:53:37,-2.0,0,0,0.926883,...,1,1,0,1,0,0,0,0,0,1
3,2019-10-11 11:25:40+02:00,2019,10,11,4,11:25:40,0.0,0,0,-0.094262,...,1,1,1,1,0,0,0,0,1,1
4,2017-11-07 12:07:18+01:00,2017,11,7,1,12:07:18,-1.0,1,0,-0.444660,...,0,1,1,1,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80162,2017-11-20 16:48:47+01:00,2017,11,20,0,16:48:47,17.0,0,0,-1.029957,...,1,1,0,0,0,0,0,0,0,1
80165,2020-04-25 14:20:07+02:00,2020,4,25,5,14:20:07,15.0,0,0,0.821920,...,1,1,0,1,0,0,0,0,0,1
80169,2017-07-19 02:24:33+02:00,2017,7,19,2,02:24:33,-2.0,0,0,0.228967,...,1,1,0,1,0,0,0,0,0,1
80171,2020-06-30 14:55:20+02:00,2020,6,30,1,14:55:20,15.0,1,0,0.315932,...,0,1,1,1,0,0,0,0,1,1


In [573]:
#Replacing missing values in 'hour_offset' with -100
train_data['hour_offset'] = train_data['hour_offset'].fillna(-100)
test_data['hour_offset'] = test_data['hour_offset'].fillna(-100)

In [574]:
#Filling the mising org, tld, sender for "org","tld" and "sender" with "DNE"

train_data['org'] = train_data['org'].fillna("DNE")
test_data['org'] = test_data['org'].fillna("DNE")

train_data['tld'] = train_data['tld'].fillna("DNE")
test_data['tld'] = test_data['tld'].fillna("DNE")

train_data['sender'] = train_data['sender'].fillna("DNE")
test_data['sender'] = test_data['sender'].fillna("DNE")

In [578]:
#Hour-offset is not a numeric column, so converting it to categorical
train_data['hour_offset'] = train_data['hour_offset'].astype('category')
test_data['hour_offset'] = test_data['hour_offset'].astype('category')

### Groupby transformations on the train_set

In [581]:
#Calculate the mean, std, skewness, count, median and zscores for all the columns in item, by grouping them on the basis 
#of their sender
for item in ['images','urls','chars_in_subject','chars_in_body','total_mails_by_sender',
             "img_url_ratio","body_subj_ratio"]:
    for indicator in ['mean', 'std','skew','count','median',zscores]:
        data_sender = train_data.groupby('sender')[item].transform(indicator).rename(f'sender_{item}_{indicator}')
        train_data = pd.concat([train_data, data_sender], axis=1)
        
#Calculate the mean, std, skewness, count, median and zscores for all the columns in "item", by grouping them on the basis 
#of their organization
for item in ['images','urls','chars_in_subject','chars_in_body','total_mails_by_sender',
             "img_url_ratio","body_subj_ratio"]:
    for indicator in ['mean', 'std','skew','count','median',zscores]:
        data_sender = train_data.groupby('org')[item].transform(indicator).rename(f'org_{item}_{indicator}')
        train_data = pd.concat([train_data, data_sender], axis=1)
        
        
#Calculate the mean, std, skewness, count, median and zscores for all the columns in "item", by grouping them on the basis 
#of their top-level domain     
for item in ['images','urls','chars_in_subject','chars_in_body','total_mails_by_sender',
             "img_url_ratio","body_subj_ratio"]:
    for indicator in ['mean', 'std','skew','count','median',zscores]:
        data_sender = train_data.groupby('tld')[item].transform(indicator).rename(f'tld_{item}_{indicator}')
        train_data = pd.concat([train_data, data_sender], axis=1)
        
        
#Calculate the mean, std, skewness, count, median and zscores for all the columns in "item", by grouping them on the basis 
#of their sender and the month on which the email was sent.
for item in ['chars_in_subject','chars_in_body','total_mails_by_sender']:
    for indicator in ['mean', 'std','count']:
        data_sender = train_data.groupby(['month','sender'])[item].transform(indicator).rename(f'month-sender_{item}_{indicator}')
        train_data = pd.concat([train_data, data_sender], axis=1)
        
        
#Calculate the mean, std, skewness, count, median and zscores for all the columns in "item", by grouping them on the basis 
#of their sender and the weekday on which the email was sent.
for item in ['chars_in_subject','chars_in_body','total_mails_by_sender']:
    for indicator in ['mean', 'std','count']:
        data_sender = train_data.groupby(['weekday','sender'])[item].transform(indicator).rename(f'weekday-sender_{item}_{indicator}')
        train_data = pd.concat([train_data, data_sender], axis=1)
        
#Calculate the mean, std, skewness, count, median and zscores for all the columns in "item", by grouping them on the basis 
#of their sender and the year on which the email was sent.
for item in ['chars_in_subject','chars_in_body','total_mails_by_sender']:
    for indicator in ['mean', 'std','count']:
        data_sender = train_data.groupby(['year','sender'])[item].transform(indicator).rename(f'year-sender_{item}_{indicator}')
        train_data = pd.concat([train_data, data_sender], axis=1)
        
        
#Calculate the mean, std, skewness, count, median and zscores for all the columns in "item", by grouping them on the basis 
#of their sender and the day on which the email was sent.
for item in ['chars_in_subject','chars_in_body','total_mails_by_sender']:
    for indicator in ['mean', 'std','count']:
        data_sender = train_data.groupby(['day','sender'])[item].transform(indicator).rename(f'day-sender_{item}_{indicator}')
        train_data = pd.concat([train_data, data_sender], axis=1)

0        0.346939
1        0.306569
2        0.183168
3        0.198473
4        0.342466
           ...   
80162    0.275229
80165    0.130841
80169    0.226027
80171    0.200000
80173    0.203125
Name: ccs, Length: 49140, dtype: float64

### Groupby transformations on the test_set

In [585]:
#Calculate the mean, std, skewness, count, median and zscores for all the columns in item, by grouping them on the basis 
#of their sender
for item in ['images','urls','chars_in_subject','chars_in_body','total_mails_by_sender',
             "img_url_ratio","body_subj_ratio"]:
    for indicator in ['mean', 'std','skew','count','median',zscores]:
        data_sender = test_data.groupby('sender')[item].transform(indicator).rename(f'sender_{item}_{indicator}')
        test_data = pd.concat([test_data, data_sender], axis=1)
             
#Calculate the mean, std, skewness, count, median and zscores for all the columns in "item", by grouping them on the basis 
#of their organization
for item in ['images','urls','chars_in_subject','chars_in_body','total_mails_by_sender',
             "img_url_ratio","body_subj_ratio"]:
    for indicator in ['mean', 'std','skew','count','median',zscores]:
        data_sender = test_data.groupby('org')[item].transform(indicator).rename(f'org_{item}_{indicator}')
        test_data = pd.concat([test_data, data_sender], axis=1)
        
#Calculate the mean, std, skewness, count, median and zscores for all the columns in "item", by grouping them on the basis 
#of their top-level domain    
for item in ['images','urls','chars_in_subject','chars_in_body','total_mails_by_sender',
             "img_url_ratio","body_subj_ratio"]:
    for indicator in ['mean', 'std','skew','count','median',zscores]:
        data_sender = test_data.groupby('tld')[item].transform(indicator).rename(f'tld_{item}_{indicator}')
        test_data = pd.concat([test_data, data_sender], axis=1)
           
#Calculate the mean, std, skewness, count, median and zscores for all the columns in "item", by grouping them on the basis 
#of their sender and the month on which the email was sent.            
for item in ['chars_in_subject','chars_in_body','total_mails_by_sender']:
    for indicator in ['mean', 'std','count']:
        data_sender = test_data.groupby(['month','sender'])[item].transform(indicator).rename(f'month-sender_{item}_{indicator}')
        test_data = pd.concat([test_data, data_sender], axis=1)

#Calculate the mean, std, skewness, count, median and zscores for all the columns in "item", by grouping them on the basis 
#of their sender and the weekday on which the email was sent.
for item in ['chars_in_subject','chars_in_body','total_mails_by_sender']:
    for indicator in ['mean', 'std','count']:
        data_sender = test_data.groupby(['weekday','sender'])[item].transform(indicator).rename(f'weekday-sender_{item}_{indicator}')
        test_data = pd.concat([test_data, data_sender], axis=1)

#Calculate the mean, std, skewness, count, median and zscores for all the columns in "item", by grouping them on the basis 
#of their sender and the year on which the email was sent.
for item in ['chars_in_subject','chars_in_body','total_mails_by_sender']:
    for indicator in ['mean','std','count']:
        data_sender = test_data.groupby(['year','sender'])[item].transform(indicator).rename(f'year-sender_{item}_{indicator}')
        test_data = pd.concat([test_data, data_sender], axis=1)
  
#Calculate the mean, std, skewness, count, median and zscores for all the columns in "item", by grouping them on the basis 
#of their sender and the day on which the email was sent.
for item in ['chars_in_subject','chars_in_body','total_mails_by_sender']:
    for indicator in ['mean','std','count']:
        data_sender = test_data.groupby(['day','sender'])[item].transform(indicator).rename(f'day-sender_{item}_{indicator}')
        test_data = pd.concat([test_data, data_sender], axis=1)

Similarly, further feature transformations were done for $\texttt{img_url_ratio}$ and $\texttt{body_subj_ratio}$

In [583]:
for item in ['images','urls','chars_in_subject','chars_in_body']:
    for indicator in ['mean', 'std','skew',zscores]:
        data_sender = train_data.groupby('img_url_ratio')[item].transform(indicator).rename(f'img_url_ratio_{item}_{indicator}')
        train_data = pd.concat([train_data, data_sender], axis=1)
        
for item in ['images','urls','chars_in_subject','chars_in_body']:
    for indicator in ['mean', 'std','skew',zscores]:
        data_sender = train_data.groupby('body_subj_ratio')[item].transform(indicator).rename(f'body_subj_ratio_{item}_{indicator}')
        train_data = pd.concat([train_data, data_sender], axis=1)
        
        
for item in ['images','urls','chars_in_subject','chars_in_body']:
    for indicator in ['mean', 'std','skew',zscores]:
        data_sender = test_data.groupby('img_url_ratio')[item].transform(indicator).rename(f'img_url_ratio_{item}_{indicator}')
        test_data = pd.concat([test_data, data_sender], axis=1)
        
for item in ['images','urls','chars_in_subject','chars_in_body']:
    for indicator in ['mean', 'std','skew',zscores]:
        data_sender = test_data.groupby('body_subj_ratio')[item].transform(indicator).rename(f'body_subj_ratio_{item}_{indicator}')
        test_data = pd.concat([test_data, data_sender], axis=1)
        

for indicator in ['mean', 'std','skew',zscores]:
    data_sender = train_data.groupby('year')['sender_freq_prev_year'].transform(indicator).rename(f'sender_freq_year_{indicator}')
    train_data = pd.concat([train_data, data_sender], axis=1)

for indicator in ['mean', 'std','skew',zscores]:
    data_sender = test_data.groupby('year')['sender_freq_prev_year'].transform(indicator).rename(f'sender_freq_year_{indicator}')
    test_data = pd.concat([test_data, data_sender], axis=1)
    
    
for indicator in ['mean', 'std','skew',zscores]:
    data_sender = train_data.groupby('weekday')['sender_freq_prev_week'].transform(indicator).rename(f'sender_freq_week_{indicator}')
    train_data = pd.concat([train_data, data_sender], axis=1)

for indicator in ['mean', 'std','skew',zscores]:
    data_sender = test_data.groupby('weekday')['sender_freq_prev_week'].transform(indicator).rename(f'sender_freq_week_{indicator}')
    test_data = pd.concat([test_data, data_sender], axis=1)

    
for indicator in ['mean', 'std','skew',zscores]:
    data_sender = train_data.groupby('month')['sender_freq_prev_month'].transform(indicator).rename(f'sender_freq_month_{indicator}')
    train_data = pd.concat([train_data, data_sender], axis=1)

for indicator in ['mean', 'std','skew',zscores]:
    data_sender = test_data.groupby('month')['sender_freq_prev_month'].transform(indicator).rename(f'sender_freq_month_{indicator}')
    test_data = pd.concat([test_data, data_sender], axis=1)


In [609]:
# Columns to remove after the processing is complete 
cols_to_remove = ["datetime","time","org","tld","sender"]
train_data = train_data.drop(cols_to_remove, axis=1)
test_data = test_data.drop(cols_to_remove, axis=1)
train_data  = train_data.drop(['date'], axis=1)
test_data  = test_data.drop(['date'], axis=1)

Creating dummy columns 

In [590]:
train_data = dummy_transform(train_data, column_name='mail_type')
test_data = dummy_transform(test_data, column_name='mail_type')

Dummy variable transformation for mail_type complete
Dummy variable transformation for mail_type complete


In [591]:
train_data = dummy_transform(train_data, column_name='year')
test_data = dummy_transform(test_data, column_name='year')

Dummy variable transformation for year complete
Dummy variable transformation for year complete


In [592]:
train_data = dummy_transform(train_data, column_name='day')
test_data = dummy_transform(test_data, column_name='day')

Dummy variable transformation for day complete
Dummy variable transformation for day complete


In [593]:
train_data = dummy_transform(train_data, column_name='month')
test_data = dummy_transform(test_data, column_name='month')

Dummy variable transformation for month complete
Dummy variable transformation for month complete


In [123]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [312]:
# test_data = test_data.drop('mail_type_-1',axis=1)

In [601]:
missing_data_calculator(test_data)

Unnamed: 0,Total,Percent
month_12,0,0.0
sender_images_<function <lambda> at 0x00000224B8D13798>,0,0.0
sender_urls_std,0,0.0
sender_urls_skew,0,0.0
sender_urls_count,0,0.0
...,...,...
purchases_org,0,0.0
travel_org,0,0.0
spam_org,0,0.0
social_org,0,0.0


In [597]:
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

train_data = train_data.replace([np.inf, -np.inf], 0)
test_data = test_data.replace([np.inf, -np.inf], 0)

TypeError: unhashable type: 'list'

In [157]:
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

In [206]:
y_vals = pd.get_dummies(train_data['label'])

In [290]:
import seaborn as sns
import numpy as np
import math
import scipy.stats as ss
import matplotlib.pyplot as plt

def skew_autotransform(DF, include = None, exclude = None, plot = False, threshold = 1, exp = False):
    
    #Get list of column names that should be processed based on input parameters
    if include is None and exclude is None:
        colnames = DF.columns.values
    elif include is not None:
        colnames = include
    elif exclude is not None:
        colnames = [item for item in list(DF.columns.values) if item not in exclude]
    else:
        print('No columns to process!')
    
    #Helper function that checks if all values are positive
    def make_positive(series):
        minimum = np.amin(series)
        #If minimum is negative, offset all values by a constant to move all values to positive teritory
        if minimum <= 0:
            series = series + abs(minimum) + 0.01
        return series
    
    
    #Go throug desired columns in DataFrame
    for col in colnames:
        #Get column skewness
        skew = DF[col].skew()
        transformed = True
        
        if plot:
            #Prep the plot of original data
            sns.set_style("darkgrid")
            sns.set_palette("Blues_r")
            fig, axes = plt.subplots(1, 2, figsize=(10, 5))
            ax1 = sns.distplot(DF[col], ax=axes[0])
            ax1.set(xlabel='Original ' + col)
        
        #If skewness is larger than threshold and positively skewed; If yes, apply appropriate transformation
        if abs(skew) > threshold and skew > 0:
            skewType = 'positive'
            #Make sure all values are positive
            DF[col] = make_positive(DF[col])
            
            if exp:
               #Apply log transformation 
               DF[col] = DF[col].apply(math.log)
            else:
                #Apply boxcox transformation
                DF[col] = ss.boxcox(DF[col])[0]
            skew_new = DF[col].skew()
         
        elif abs(skew) > threshold and skew < 0:
            skewType = 'negative'
            #Make sure all values are positive
            DF[col] = make_positive(DF[col])
            
            if exp:
               #Apply exp transformation 
               DF[col] = DF[col].pow(10)
            else:
                #Apply boxcox transformation
                DF[col] = ss.boxcox(DF[col])[0]
            skew_new = DF[col].skew()
        
        else:
            #Flag if no transformation was performed
            transformed = False
            skew_new = skew
        
        #Compare before and after if plot is True
        if plot:
            print('\n ------------------------------------------------------')     
            if transformed:
                print('\n %r had %r skewness of %2.2f' %(col, skewType, skew))
                print('\n Transformation yielded skewness of %2.2f' %(skew_new))
                sns.set_palette("Paired")
                ax2 = sns.distplot(DF[col], ax=axes[1], color = 'r')
                ax2.set(xlabel='Transformed ' + col)
                plt.show()
            else:
                print('\n NO TRANSFORMATION APPLIED FOR %r . Skewness = %2.2f' %(col, skew))
                ax2 = sns.distplot(DF[col], ax=axes[1])
                ax2.set(xlabel='NO TRANSFORM ' + col)
                plt.show()
                

    return DF


# Initial Modelling 

Our step was to go with an initial modelling of the data by using a basic ML model, such as Random forests

In [634]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler, StandardScaler
from skmultilearn.problem_transform import LabelPowerset
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import fbeta_score, precision_score, make_scorer, average_precision_score
# import cv2
import warnings


#Create a train and test partition 
X_train = train_data.drop('label', axis=1)
y_train = train_data['label']

In [294]:
#Using StandardScaler to standardize the data and make the distribution close to norma
mms = StandardScaler()
X_train_scaled = mms.fit_transform(X_train)
X_test_scaled = mms.transform(test_data)

In [611]:
#Fitting RF with 500 trees and a depth of 3
rf = RandomForestClassifier(n_estimators=500, max_depth=3)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, n_estimators=500)

In [612]:
prediction = rf.predict(X_train)

In [617]:
np.unique(y_train)

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int64)

In [635]:
#Using LGBM for training on the data (Boosting approach)



from sklearn.metrics import f1_score

#Custom F1-metric score for calculating F1 score during 5-fold CV on LGBM
def f1_eval(preds, dtrain):
    labels = dtrain.get_label()
    preds = preds.reshape(len(np.unique(labels)), -1)
    preds = preds.T.argmax(axis = 1)
    f_score = f1_score(preds, labels, average="micro")
    return 'f1_score', f_score, True



import lightgbm as lgb

from sklearn.preprocessing import MinMaxScaler

#MinMax scaling done for LGBM since LGBM performs best when the data is scaled to unit length 
mms = MinMaxScaler()
X_train_scaled = mms.fit_transform(X_train)
X_test_scaled = mms.transform(X_test)

#PCA applied to reduce the dimanesions of the dataset (We chose 75 components that captured more than 90% variance.)
from sklearn.decomposition import PCA 

pca = PCA(n_components=75)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)


#Because we have a class imbalance problem, we also 
from sklearn.utils.class_weight import compute_class_weight
class_weights = list(compute_class_weight('balanced',
                                             np.unique(y_train),
                                             y_train))

w_array = np.ones(y_train.shape[0])
for i, val in enumerate(y_train):
    w_array[i] = class_weights[val-1]


dataset = lgb.Dataset(X_train, label=y_train)
# test_dataset = lgb.Dataset(X_test)
params = {'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'n_jobs': -1,
          'num_leaves': 1000,
          'learning_rate': 0.026623466966581126,
          'max_depth': 500,
          'num_iterations': 1000,
          'lambda_l1': 2.959759088169741,
#           'lambda_l2': 1.331172832164913,
          'bagging_fraction': 0.9655406551472153,
          'bagging_freq': 5,
          'num_class': 8,
          'colsample_bytree': 0.6867118652742716,
         'learning_rate': 0.01}




#parameters dict for LightGBM
lgb_params =  {
    'boosting': 'gbdt', 
    'colsample_bytree': 1, 
    'class_weight':class_weights,
    'learning_rate': 0.01, 
    'max_depth': 200, 
    'min_child_samples': 100, 
    'n_iterations':100,
    'n_estimators': 1000, 
    'num_leaves': 500,  
    'objective': 'multiclass',
    'num_class':8,
    'reg_alpha': 0.6, 
    'reg_lambda': 0.3, 
    'subsample': 0.8,
    'verbose':1
    }


params_2 = {
    'application': 'multiclass', # for binary classification
    'num_class' : 8, # used for multi-classes
    'boosting_type': 'goss', # traditional gradient boosting decision tree
    'num_iterations': 100, 
    'learning_rate': 0.01,
    'num_leaves': 300,
    'device': 'cpu', # you can use GPU to achieve faster learning
    'max_depth': 0, # <0 means no limit
    # Small number of bins may reduce training accuracy but can deal with over-fitting
    'lambda_l1': 2, # L1 regularization
    'lambda_l2': 3, # L2 regularization
    'subsample_for_bin': 300, # number of samples for constructing bins
    'subsample': 1, # subsample ratio of the training instance
    'colsample_bytree': 0.8, # subsample ratio of columns when constructing the tree
    'min_split_gain': 0.5, # minimum loss reduction required to make further partition on a leaf node of the tree
    'min_child_weight': 1, # minimum sum of instance weight (hessian) needed in a leaf
    'min_child_samples': 5# minimum number of data needed in a leaf
}

1        1
2        1
3        1
4        1
        ..
80162    2
80165    0
80169    1
80171    3
80173    0
Name: label, Length: 48145, dtype: int64 as keyword args. From version 0.25 passing these as positional arguments will result in an error


In [636]:
mod = lgb.train(lgb_params, dataset, valid_sets=[dataset], feval=f1_eval,early_stopping_rounds = 25)



[1]	training's multi_logloss: 1.40541	training's f1_score: 0.455811
Training until validation scores don't improve for 25 rounds
[2]	training's multi_logloss: 1.39276	training's f1_score: 0.455811
[3]	training's multi_logloss: 1.38069	training's f1_score: 0.455811
[4]	training's multi_logloss: 1.36901	training's f1_score: 0.455811
[5]	training's multi_logloss: 1.35777	training's f1_score: 0.455811
[6]	training's multi_logloss: 1.34691	training's f1_score: 0.455811
[7]	training's multi_logloss: 1.33638	training's f1_score: 0.455811
[8]	training's multi_logloss: 1.32619	training's f1_score: 0.455811
[9]	training's multi_logloss: 1.31624	training's f1_score: 0.455811
[10]	training's multi_logloss: 1.30655	training's f1_score: 0.455811
[11]	training's multi_logloss: 1.2971	training's f1_score: 0.455811
[12]	training's multi_logloss: 1.28787	training's f1_score: 0.455811
[13]	training's multi_logloss: 1.27887	training's f1_score: 0.455811
[14]	training's multi_logloss: 1.27005	training's f1

In [637]:
train_preds = mod.predict(X_train)

In [642]:
X_train

Unnamed: 0,weekday,hour_offset,ccs,bcced,total_mails_by_sender,sender_freq_total_period,sender_freq_prev_year,sender_freq_prev_week,sender_freq_prev_month,sender_freq_prev_six_months,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,0,0.0,0,0,-0.252945,-0.785055,-0.170463,0.114185,0.191052,0.012660,...,0,0,0,0,0,0,0,0,1,0
1,2,-1.0,0,0,-1.399657,-1.288755,-1.275817,-0.799083,-1.120205,-1.223003,...,0,0,0,0,0,0,0,0,0,0
2,2,-2.0,0,0,0.926883,0.705070,1.022306,0.870846,0.498429,0.738248,...,0,0,0,0,1,0,0,0,0,0
3,4,0.0,0,0,-0.094262,-0.350209,-0.056577,0.308666,0.106119,-0.125140,...,0,0,0,0,0,0,0,1,0,0
4,1,-1.0,1,0,-0.444660,-1.103774,-0.414117,0.321971,-0.101452,-0.280212,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80162,0,17.0,0,0,-1.029957,-1.042092,-0.927754,-1.335178,-1.549706,-0.853898,...,0,0,0,0,0,0,0,0,1,0
80165,5,15.0,0,0,0.821920,0.238388,0.663768,0.375793,0.421485,0.529557,...,0,1,0,0,0,0,0,0,0,0
80169,2,-2.0,0,0,0.228967,-0.033017,0.357598,0.216904,0.330141,0.366399,...,0,0,0,0,1,0,0,0,0,0
80171,1,15.0,1,0,0.315932,-0.307292,0.188187,-0.037988,0.377650,0.137154,...,0,0,0,1,0,0,0,0,0,0


In [None]:
train_data.to_csv("C://Users//Sauraj (Work mode)//Desktop//OP.csv")
test_data.to_csv("C://Users//Sauraj (Work mode)//Desktop//OP_test.csv")

In [638]:
train_preds = train_preds.round().argmax(axis=1)

In [639]:
f1_score(y_train, train_preds, average='micro')

0.9326617509606397

In [643]:
preds = mod.predict(test_data)
pred_df = pd.DataFrame(preds.round().argmax(axis=1), columns=['label'])

In [644]:
pred_df['label'].value_counts()

0    14387
3    10087
2     5195
7     2496
1     2156
6       42
5        1
4        1
Name: label, dtype: int64

In [629]:
len(class_weights)

8

In [645]:
pred_df.to_csv('C://Users//Sauraj (Work mode)//Desktop//gg.csv')

In [646]:
pred_df['label'].unique()

array([3, 0, 2, 7, 1, 5, 6, 4], dtype=int64)