In [3]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score


In [4]:
train_df = pd.read_csv('train.csv', index_col=0)
test_df = pd.read_csv('test.csv', index_col=0)

In [5]:
train_df.head()

# date: unix style date format, date-time on which the email was received
# org: organisation of the sender
# tld: top level domain of the organisation eg. com, ac.in, fr, and org.
# ccs: number of emails cced with this email
# bcced: is the receiver bcc'd in the email (binary)
# mail_type: type of the mail body, e.g. text/plain and text/html
# images: number of images in the mail body
# urls: number of urls in the mail body
# salutations: is salutation used in the email (binary)
# designation: is designation of the sender mentioned in the email (binary)
# chars_in_subject: number of characters in the mail subject
# chars_in_body: number of characters in the mail body
# label: label of this email. Eight classes are 'Updates', 'Personal', ‘Promotions’, 'Forums', 'Purchases', 'Travel',
#                                                'Spam', and ‘Social’. Class ids start from 0 to 7




# Data cleaning important things:
# Removing or substituting NA values
# chars_in_subject should be turned into int64, object should be converted to string?


# Feature Engineering:
# We can create chars_in_email = chars_in_subject + chars_in_body
# data = images + urls
# Number of capitalized words (or capital letters)
# Sum of all character lengths of words
# Number of stopwords in email
# Number of words containing numbers and letters
# Number of punctuations
# number of unique urls in an email
# Number of words containing only letters
# Max Ratio of digit characters to all characters of each word in an email
# Binary variable indicating if the email subject contains special characters
# 1-7 day of the week (for example we know that promotions starts monday or friday)
# beginning-middle-end period of the month (promotions starts end of month)
# 1-12 month (promotions starts April, December, September)
# AM-PM take only the hour the email was sent (pay attention to the GMT time)
# word_freq_WORD =percentage of words in the e-mail that match WORD ('Updates', 'Personal', ‘Promotions’, 'Forums', 'Purchases', 'Travel',
#                                                'Spam', and ‘Social’)
# i.e. 100 * (number of times the  WORD appears in the email)/ total number of words in e-mail



# Data Cleaning:
# Lowering case
# Removal of special characters
# Removal of stopwords
# Removal of hyperlinks
# Removal of numbers
# Removal of whitespaces

# Dimensionality Reduction: PCA or LDA

# Feature Scaling:
# from sklearn.preprocessing import RobustScaler
# scaler = RobustScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

Unnamed: 0,date,org,tld,ccs,bcced,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body,label
0,"Mon, 6 Nov 2017 11:13:45 +0100",reply,ebay.in,0,0,multipart/alternative,35,120,0,0,49.0,80027,2
1,"Wed, 14 Feb 2018 11:00:16 -0000",edm,efinmail.com,0,0,multipart/alternative,1,7,0,0,107.0,2961,1
2,"Wed, 6 Jul 2016 19:53:37 +0000",usebackpack,com,0,0,text/html,4,17,0,0,35.0,25149,1
3,"Fri, 11 Oct 2019 11:25:40 +0200",granular,ai,0,0,multipart/mixed,0,0,0,0,15.0,635296,1
4,"Tue, 07 Nov 2017 11:07:18 +0000 (UTC)",github,com,1,0,multipart/alternative,2,11,0,0,49.0,2355,1


In [9]:
train_df.isnull().sum()

date                   0
org                 3451
tld                 3453
ccs                    0
bcced                  0
mail_type            172
images                 0
urls                   0
salutations            0
designation            0
chars_in_subject      16
chars_in_body          0
label                  0
dtype: int64

In [10]:
train_df.dtypes 

# df['column'].astype(str)
# df['column'] = df['column'].astype('|S')

date                 object
org                  object
tld                  object
ccs                   int64
bcced                 int64
mail_type            object
images                int64
urls                  int64
salutations           int64
designation           int64
chars_in_subject    float64
chars_in_body         int64
label                 int64
dtype: object

In [11]:
train_df.max() # binary respected

  train_df.max()


date                Wed, 9 Sep 2020 19:00:30 +0000
ccs                                            179
bcced                                            1
images                                       83481
urls                                         21540
salutations                                      1
designation                                      1
chars_in_subject                             606.0
chars_in_body                             74381084
label                                            7
dtype: object

In [12]:
train_df.min() # no negative values

  train_df.min()


date                01 Apr 2018 18:50:30 +0530
ccs                                          0
bcced                                        0
images                                       0
urls                                         0
salutations                                  0
designation                                  0
chars_in_subject                           0.0
chars_in_body                                4
label                                        0
dtype: object

In [13]:
train_x = train_df[['mail_type']]
train_x = train_x.fillna(value='None')
train_y = train_df[['label']]

test_x = test_df[['mail_type']]
test_x = test_x.fillna(value='None')

In [14]:
feat_enc = OneHotEncoder()
feat_enc.fit(np.vstack([train_x, test_x]))
train_x_featurized = feat_enc.transform(train_x)
test_x_featurized = feat_enc.transform(test_x)



In [15]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train_x_featurized, train_y)
pred_y = neigh.predict(test_x_featurized)

  return self._fit(X, y)


In [16]:
pred_df = pd.DataFrame(pred_y, columns=['label'])
pred_df.to_csv("knn_sample_submission.csv", index=True, index_label='Id')