In [1]:
import pandas as pd
import numpy as np
import os
import re

# Preprocessing Header

In [2]:
df = pd.read_csv("../Raw Data/extracted_header.csv")

In [3]:
df.head(n=3)

Unnamed: 0,sender_email,reply_to_email,return_path,inline_image_count,Label
0,rssfeeds@example.com,,rssfeeds@example.com,0,Ham
1,thompson@shelob.ce.ttu.edu,exmh-users@example.com,exmh-users-admin@example.com,0,Ham
2,beberg@mithral.com,,fork-admin@xent.com,0,Ham


In [4]:
df.isna().sum()

sender_email           130
reply_to_email        9312
return_path            340
inline_image_count       0
Label                    0
dtype: int64

In [5]:
df.shape

(12827, 5)

In [6]:
df.Label.value_counts() / len(df) * 100

Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64

In [7]:
df.groupby("Label")["inline_image_count"].agg(["mean","median"])

Unnamed: 0_level_0,mean,median
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
Ham,0.0,0
Spam,0.607085,0


In [8]:
df[df.Label == "Ham"]["inline_image_count"].unique() # All Ham emails don't have a single inline image

array([0])

In [9]:
a=df[df.Label == "Spam"]["inline_image_count"]

In [10]:
(a == 0).sum() / len(a) * 100 # 86% of Spam Emails don't have a single inline image

85.79245464145147

In [11]:
df["has_inline_image"] = df["inline_image_count"] >= 1
df.drop("inline_image_count",axis=1,inplace=True)

In [12]:
df[df.Label == "Ham"][["sender_email","reply_to_email","return_path"]].sample(10)

Unnamed: 0,sender_email,reply_to_email,return_path
1404,geege@barrera.org,,fork-admin@xent.com
1176,rssfeeds@example.com,,rssfeeds@example.com
193,rssfeeds@example.com,,rssfeeds@example.com
222,garym@canada.com,garym@canada.com,fork-admin@xent.com
434,fork_list@hotmail.com,,fork-admin@xent.com
1434,garym@canada.com,garym@canada.com,fork-admin@xent.com
1905,rssfeeds@example.com,,rssfeeds@example.com
1903,rssfeeds@example.com,,rssfeeds@example.com
2226,msquadrat.nospamplease@gmx.net,,spamassassin-talk-admin@example.sourceforge.net
1294,sati_home@yahoo.com,,fork-admin@xent.com


In [13]:
df[df.Label == "Spam"][["sender_email","reply_to_email","return_path"]].sample(10)

Unnamed: 0,sender_email,reply_to_email,return_path
4028,news@e-pepper.net,news@e-pepper.net,news@e-pepper.net
9438,oijih@jpb.o2.pl,,oijih@jpb.o2.pl
4254,ujuchkksdsxft@mycity.com.cn,,ujuchkksdsxft@mycity.com.cn
11434,tdtulstpdnnme@dwp.net,,tdtulstpdnnme@dwp.net
8436,fskweqssbtecpw@8848.net,,fskweqssbtecpw@8848.net
4825,hlclcoxkwpbb@freewebemail.com,,hlclcoxkwpbb@freewebemail.com
5628,nim@th.com,,nim@th.com
10609,WendyBerliner@lumbago.net,WendyBerliner@lumbago.net,WendyBerliner@lumbago.net
3521,eurostarlotto10@netscape.net,,eurostarlotto10@netscape.net
9930,Newtonjcwt@mauimail.com,,Newtonjcwt@mauimail.com


In [14]:
df.isna().sum() / len(df) * 100

sender_email         1.013487
reply_to_email      72.596866
return_path          2.650659
Label                0.000000
has_inline_image     0.000000
dtype: float64

In [15]:
df["no_reply_to_email"] = df["reply_to_email"].isna() # true -> does not have
df.groupby("Label")["no_reply_to_email"].agg(["mean"])*100

Unnamed: 0_level_0,mean
Label,Unnamed: 1_level_1
Ham,74.439834
Spam,72.170491


In [16]:
df["no_return_path"] = df["return_path"].isna() # true -> does not have
df.groupby("Label")["no_return_path"].agg(["mean"])*100

# 3.26% of spam don't have return path

Unnamed: 0_level_0,mean
Label,Unnamed: 1_level_1
Ham,0.0
Spam,3.263896


In [17]:
df[df["no_return_path"] == True]

Unnamed: 0,sender_email,reply_to_email,return_path,Label,has_inline_image,no_reply_to_email,no_return_path
2484,rkkss@redseven.de,,,Spam,False,True,True
2541,rkkss@redseven.de,,,Spam,False,True,True
2571,emailrewardz@emailrewardz.email-publisher.com,perf-remove.3565.64698.13893713.0.0.4@boing.to...,,Spam,False,False,True
2693,meg34807147238s03@isppan.waw.pl,meg34807147238s03@isppan.waw.pl,,Spam,False,False,True
2745,meg34807147238s03@isppan.waw.pl,meg34807147238s03@isppan.waw.pl,,Spam,False,False,True
...,...,...,...,...,...,...,...
12625,,,,Spam,False,True,True
12639,YCPZHUNZN@marchmail.com,YCPZHUNZN@marchmail.com,,Spam,False,False,True
12648,keh-ming@a-vip.com,,,Spam,False,True,True
12747,505jeff@acadia.net,,,Spam,False,True,True


In [18]:
a=df[(df["no_reply_to_email"]==True) & (df["no_return_path"] == True)]
print(a["Label"].value_counts() / len(a) * 100)
print("Overall Distribution of class:")
print(df.Label.value_counts() / len(df) * 100)
print("% of such records: " + str(len(a)/len(df)*100))
print(a.sample(7)[["Label","no_reply_to_email","no_return_path"]])

Spam    100.0
Name: Label, dtype: float64
Overall Distribution of class:
Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64
% of such records: 2.3388165588212364
      Label  no_reply_to_email  no_return_path
12483  Spam               True            True
11478  Spam               True            True
9490   Spam               True            True
8363   Spam               True            True
5046   Spam               True            True
3960   Spam               True            True
9621   Spam               True            True


In [19]:
a=df[(df["no_reply_to_email"]==False) & (df["no_return_path"] == False)]
print(a["Label"].value_counts() / len(a) * 100)
print("Overall Distribution of class:")
print(df.Label.value_counts() / len(df) * 100)
print("% of such records: " + str(len(a)/len(df)*100))
print(a.sample(7)[["Label","no_reply_to_email","no_return_path"]])

Spam    82.273381
Ham     17.726619
Name: Label, dtype: float64
Overall Distribution of class:
Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64
% of such records: 27.09129180634599
      Label  no_reply_to_email  no_return_path
9562   Spam              False           False
7534   Spam              False           False
6049   Spam              False           False
9114   Spam              False           False
10133  Spam              False           False
4979   Spam              False           False
1403    Ham              False           False


In [20]:
a=df[(df["no_reply_to_email"]==True) & (df["no_return_path"] == False)]
print(a["Label"].value_counts() / len(a) * 100)
print("Overall Distribution of class:")
print(df.Label.value_counts() / len(df) * 100)
print("% of such records: " + str(len(a)/len(df)*100))
print(a.sample(7)[["Label","no_reply_to_email","no_return_path"]])

Spam    80.093209
Ham     19.906791
Name: Label, dtype: float64
Overall Distribution of class:
Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64
% of such records: 70.25804942698994
      Label  no_reply_to_email  no_return_path
8959   Spam               True           False
2959   Spam               True           False
12693  Spam               True           False
11877  Spam               True           False
4958   Spam               True           False
1265    Ham               True           False
7363   Spam               True           False


In [21]:
a=df[(df["no_reply_to_email"]==False) & (df["no_return_path"] == True)]
print(a["Label"].value_counts() / len(a) * 100)
print("Overall Distribution of class:")
print(df.Label.value_counts() / len(df) * 100)
print("% of such records: " + str(len(a)/len(df)*100))
print(a.sample(7)[["Label","no_reply_to_email","no_return_path"]])

Spam    100.0
Name: Label, dtype: float64
Overall Distribution of class:
Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64
% of such records: 0.3118422078428315
      Label  no_reply_to_email  no_return_path
7395   Spam              False            True
11826  Spam              False            True
10915  Spam              False            True
7796   Spam              False            True
5080   Spam              False            True
8458   Spam              False            True
2571   Spam              False            True


In [22]:
df["no_reply_no_return"] = (df["no_reply_to_email"] == True) & (df["no_return_path"] == True)
df["yes_reply_no_return"] = (df["no_reply_to_email"] == False) & (df["no_return_path"] == True)
print(df.columns)

Index(['sender_email', 'reply_to_email', 'return_path', 'Label',
       'has_inline_image', 'no_reply_to_email', 'no_return_path',
       'no_reply_no_return', 'yes_reply_no_return'],
      dtype='object')


In [23]:
df.drop(["no_reply_to_email","reply_to_email","return_path"],axis=1,inplace=True)
df.head()

Unnamed: 0,sender_email,Label,has_inline_image,no_return_path,no_reply_no_return,yes_reply_no_return
0,rssfeeds@example.com,Ham,False,False,False,False
1,thompson@shelob.ce.ttu.edu,Ham,False,False,False,False
2,beberg@mithral.com,Ham,False,False,False,False
3,rssfeeds@example.com,Ham,False,False,False,False
4,garym@canada.com,Ham,False,False,False,False


In [24]:
df.isna().sum()

sender_email           130
Label                    0
has_inline_image         0
no_return_path           0
no_reply_no_return       0
yes_reply_no_return      0
dtype: int64

In [25]:
df = df.dropna()

In [26]:
tld_pattern_compiled = re.compile(r"(@.+\.)(\w+)")
def get_tld(email):
    return tld_pattern_compiled.search(email).groups()[1]
    
df["top_level_domain"] = df["sender_email"].apply(get_tld)

In [27]:
df["top_level_domain"].value_counts()

com     6873
net     2108
org      424
de       311
uk       302
        ... 
name       1
su         1
ma         1
er         1
ps         1
Name: top_level_domain, Length: 150, dtype: int64

In [28]:
df.head()

Unnamed: 0,sender_email,Label,has_inline_image,no_return_path,no_reply_no_return,yes_reply_no_return,top_level_domain
0,rssfeeds@example.com,Ham,False,False,False,False,com
1,thompson@shelob.ce.ttu.edu,Ham,False,False,False,False,edu
2,beberg@mithral.com,Ham,False,False,False,False,com
3,rssfeeds@example.com,Ham,False,False,False,False,com
4,garym@canada.com,Ham,False,False,False,False,com


In [29]:
account_pattern_compiled = re.compile(r"(.+@)")
def get_account(email):
    return account_pattern_compiled.search(email).groups()[0][:-1]
    
df["account_name"] = df["sender_email"].apply(get_account)

In [30]:
domain_pattern_compiled = re.compile(r"(@.+)(\.\w+)")
def get_domain(email):
    return domain_pattern_compiled.search(email).groups()[0][1:]
    
df["domain"] = df["sender_email"].apply(get_domain)

In [31]:
df[df.Label == "Spam"][["account_name","top_level_domain","domain","Label"]].sample(10)

Unnamed: 0,account_name,top_level_domain,domain,Label
2468,info,com,smokesdirect,Spam
7419,vhgkqpjvwejcrh,com,emiliomorenatti,Spam
10910,heinrich,net,3n,Spam
8861,noconnor_ep,net,mediaone,Spam
11000,kathleen_worley_35,nl,vdschoor.myweb,Spam
10354,TQYRV,com,chroddii,Spam
5723,bookstoredolan,com,lebaron,Spam
8630,kdgcistt,com,anjungcafe,Spam
7664,carton,com,smoke.savquest,Spam
9274,stockpoint,net,Venturepro.prserv,Spam


In [32]:
df[df.Label == "Ham"][["account_name","top_level_domain","domain","Label"]].sample(10)

Unnamed: 0,account_name,top_level_domain,domain,Label
1228,dl,com,silcom,Ham
988,rssfeeds,com,example,Ham
454,mail,net,vipul,Ham
1493,rssfeeds,com,example,Ham
61,hoppel,mil,opt.nrl.navy,Ham
393,rssfeeds,com,example,Ham
2409,listuser,edu,neo.pittstate,Ham
2374,deafbox,com,hotmail,Ham
505,rssfeeds,com,example,Ham
1326,geege4,net,bellsouth,Ham


In [33]:
df[df["Label"] == "Spam"]["top_level_domain"].value_counts()

com     5557
net     1652
de       299
uk       276
org      166
        ... 
name       1
su         1
ma         1
er         1
ps         1
Name: top_level_domain, Length: 145, dtype: int64

In [34]:
df[df["Label"] == "Ham"]["top_level_domain"].value_counts()

com    1316
net     456
org     258
edu      83
Com      39
ie       38
uk       26
AU       25
ch       22
au       21
ca       19
fm       13
fi       13
de       12
be        8
at        7
COM       6
fr        5
nu        5
FI        5
EDU       5
it        5
ru        4
to        4
za        2
ph        2
br        2
bz        2
us        2
es        1
mx        1
dk        1
mil       1
ee        1
Name: top_level_domain, dtype: int64

In [35]:
df[df["Label"] == "Spam"]["domain"].value_counts()

yahoo                                     529
hotmail                                   222
netscape                                   70
msn                                        66
yahoo.co                                   65
                                         ... 
rgid                                        1
millennium.co                               1
jositi.bigsavingsguy                        1
ACaen-251-1-37-214.w83-115.abo.wanadoo      1
dcsri                                       1
Name: domain, Length: 4915, dtype: int64

In [36]:
df[df["Label"] == "Ham"]["domain"].value_counts()

example         649
hotmail          58
slack            57
perl             56
comcast          45
               ... 
motorola          1
firstusa          1
blu               1
bite-ltd          1
kssp.upd.edu      1
Name: domain, Length: 355, dtype: int64

In [37]:
df[df["account_name"] == "rssfeeds"].size / len(df) * 100

43.45120894699535

In [38]:
df[(df["account_name"] == "rssfeeds") & (df["Label"] == "Spam")].size /df[df["account_name"] == "rssfeeds"].size  * 100

0.0

In [39]:
df[(df["account_name"] == "rssfeeds") & (df["Label"] == "Ham")].size /df[df["account_name"] == "rssfeeds"].size  * 100

100.0

### In Production while predicting:
    if rss in sender_mail, and user likes rss feeds: Ham else Spam

In [40]:
df.drop(df[df["account_name"] == "rssfeeds"].index,inplace=True)

In [41]:
df["free_in_domain"] = df["domain"].map(lambda x: "free" in x)
df["free_in_account"] = df["account_name"].map(lambda x: "free" in x)

In [42]:
df.groupby("Label")['free_in_domain',"free_in_account"].mean()*100

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,free_in_domain,free_in_account
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
Ham,0.111297,0.0
Spam,1.866433,0.087489


In [43]:
df.head()

Unnamed: 0,sender_email,Label,has_inline_image,no_return_path,no_reply_no_return,yes_reply_no_return,top_level_domain,account_name,domain,free_in_domain,free_in_account
1,thompson@shelob.ce.ttu.edu,Ham,False,False,False,False,edu,thompson,shelob.ce.ttu,False,False
2,beberg@mithral.com,Ham,False,False,False,False,com,beberg,mithral,False,False
4,garym@canada.com,Ham,False,False,False,False,com,garym,canada,False,False
5,kragen@pobox.com,Ham,False,False,False,False,com,kragen,pobox,False,False
6,paul@cwie.net,Ham,False,False,False,False,net,paul,cwie,False,False


In [46]:
df[df.Label == "Spam"][["account_name","Label"]].sample(10)

Unnamed: 0,account_name,Label
3174,zafvyst,Spam
4963,-UXWPOST,Spam
4427,wcypzmujhxzcoy,Spam
11755,bob,Spam
4751,psawyer_pf,Spam
6096,marketplaza,Spam
2774,home_loans,Spam
3093,Claudia_Fechtner,Spam
12235,maiolica,Spam
12412,g.velez_on,Spam


In [47]:
df[df.Label == "Ham"][["account_name","Label"]].sample(10)

Unnamed: 0,account_name,Label
2233,bigpeted,Ham
1533,yyyy,Ham
161,kevinc,Ham
854,matthias,Ham
2184,Axel.Thimm,Ham
1254,cdale,Ham
777,harley,Ham
336,tomwhore,Ham
581,tony,Ham
1666,thomas,Ham


In [48]:
df.drop(["sender_email","free_in_account","domain","top_level_domain"],axis=1,inplace=True)

In [49]:
df.head()

Unnamed: 0,Label,has_inline_image,no_return_path,no_reply_no_return,yes_reply_no_return,account_name,free_in_domain
1,Ham,False,False,False,False,thompson,False
2,Ham,False,False,False,False,beberg,False
4,Ham,False,False,False,False,garym,False
5,Ham,False,False,False,False,kragen,False
6,Ham,False,False,False,False,paul,False


# Feature Extraction From Subject, Text


In [50]:
df = pd.read_csv("../Raw Data/extracted_body.csv")

In [51]:
df.shape

(15720, 3)

In [52]:
df.isna().sum()

Subject    247
Text         7
Label        0
dtype: int64