In [49]:
import pandas as pd
import numpy as np
import os
import re

# Preprocessing Header

In [50]:
df = pd.read_csv("../Raw Data/extracted_header.csv")

In [51]:
df.head(n=3)

Unnamed: 0,sender_email,reply_to_email,return_path,inline_image_count,Label
0,rssfeeds@example.com,,rssfeeds@example.com,0,Ham
1,thompson@shelob.ce.ttu.edu,exmh-users@example.com,exmh-users-admin@example.com,0,Ham
2,beberg@mithral.com,,fork-admin@xent.com,0,Ham


In [52]:
df.isna().sum()

sender_email           130
reply_to_email        9312
return_path            340
inline_image_count       0
Label                    0
dtype: int64

In [53]:
df.shape

(12827, 5)

In [54]:
df.Label.value_counts() / len(df) * 100

Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64

In [55]:
df.groupby("Label")["inline_image_count"].agg(["mean","median"])

Unnamed: 0_level_0,mean,median
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
Ham,0.0,0
Spam,0.607085,0


In [56]:
df[df.Label == "Ham"]["inline_image_count"].unique() # All Ham emails don't have a single inline image

array([0])

In [57]:
a=df[df.Label == "Spam"]["inline_image_count"]

In [58]:
(a == 0).sum() / len(a) * 100 # 86% of Spam Emails don't have a single inline image

85.79245464145147

In [59]:
df["has_inline_image"] = df["inline_image_count"] >= 1
df.drop("inline_image_count",axis=1,inplace=True)

In [60]:
df[df.Label == "Ham"][["sender_email","reply_to_email","return_path"]].sample(10)

Unnamed: 0,sender_email,reply_to_email,return_path
1833,bill@whump.com,,fork-admin@xent.com
347,rssfeeds@example.com,,rssfeeds@example.com
704,rssfeeds@example.com,,rssfeeds@example.com
1819,waider@waider.ie,,ilug-admin@linux.ie
822,matthias@egwn.net,rpm-zzzlist@freshrpms.net,rpm-zzzlist-admin@freshrpms.net
1875,tomwhore@slack.net,,fork-admin@xent.com
878,rssfeeds@example.com,,rssfeeds@example.com
125,glen@netnoteinc.com,,ilug-admin@linux.ie
1824,matthias@rpmforge.net,rpm-zzzlist@freshrpms.net,rpm-zzzlist-admin@freshrpms.net
379,bitbitch@magnesium.net,bitbitch@magnesium.net,fork-admin@xent.com


In [61]:
df[df.Label == "Spam"][["sender_email","reply_to_email","return_path"]].sample(10)

Unnamed: 0,sender_email,reply_to_email,return_path
10932,pdvbmtbuahy@hotmail.com,,pdvbmtbuahy@hotmail.com
3555,1eosine9@tiscali.fr,,1eosine9@tiscali.fr
3620,ihyireiyles@smyth.net,,ihyireiyles@smyth.net
9344,rait@brem.ee,,
5844,Barberxdgjg@infoconex.com,,Barberxdgjg@infoconex.com
12360,publicdomain@consultant.com,publicdomain@consultant.com,publicdomain@consultant.com
6918,Martinez@interia.pl,,Martinez@interia.pl
3754,essie.palmerdn@openface.ca,,essie.palmerdn@openface.ca
2845,inconsiderable@fell.com,,40bruce-guenter.dyndns.org-S261713AbVEVAIi@vge...
5056,PLQOWP@kaiea.west.sun.com,,PLQOWP@kaiea.west.sun.com


In [62]:
df.isna().sum() / len(df) * 100

sender_email         1.013487
reply_to_email      72.596866
return_path          2.650659
Label                0.000000
has_inline_image     0.000000
dtype: float64

In [63]:
df["no_reply_to_email"] = df["reply_to_email"].isna() # true -> does not have
df.groupby("Label")["no_reply_to_email"].agg(["mean"])*100

Unnamed: 0_level_0,mean
Label,Unnamed: 1_level_1
Ham,74.439834
Spam,72.170491


In [64]:
df["no_return_path"] = df["return_path"].isna() # true -> does not have
df.groupby("Label")["no_return_path"].agg(["mean"])*100

# 3.26% of spam don't have return path

Unnamed: 0_level_0,mean
Label,Unnamed: 1_level_1
Ham,0.0
Spam,3.263896


In [65]:
df[df["no_return_path"] == True]

Unnamed: 0,sender_email,reply_to_email,return_path,Label,has_inline_image,no_reply_to_email,no_return_path
2484,rkkss@redseven.de,,,Spam,False,True,True
2541,rkkss@redseven.de,,,Spam,False,True,True
2571,emailrewardz@emailrewardz.email-publisher.com,perf-remove.3565.64698.13893713.0.0.4@boing.to...,,Spam,False,False,True
2693,meg34807147238s03@isppan.waw.pl,meg34807147238s03@isppan.waw.pl,,Spam,False,False,True
2745,meg34807147238s03@isppan.waw.pl,meg34807147238s03@isppan.waw.pl,,Spam,False,False,True
...,...,...,...,...,...,...,...
12625,,,,Spam,False,True,True
12639,YCPZHUNZN@marchmail.com,YCPZHUNZN@marchmail.com,,Spam,False,False,True
12648,keh-ming@a-vip.com,,,Spam,False,True,True
12747,505jeff@acadia.net,,,Spam,False,True,True


In [66]:
a=df[(df["no_reply_to_email"]==True) & (df["no_return_path"] == True)]
print(a["Label"].value_counts() / len(a) * 100)
print("Overall Distribution of class:")
print(df.Label.value_counts() / len(df) * 100)
print("% of such records: " + str(len(a)/len(df)*100))
print(a.sample(7)[["Label","no_reply_to_email","no_return_path"]])

Spam    100.0
Name: Label, dtype: float64
Overall Distribution of class:
Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64
% of such records: 2.3388165588212364
      Label  no_reply_to_email  no_return_path
10117  Spam               True            True
4795   Spam               True            True
9613   Spam               True            True
3773   Spam               True            True
2541   Spam               True            True
5436   Spam               True            True
9253   Spam               True            True


In [67]:
a=df[(df["no_reply_to_email"]==False) & (df["no_return_path"] == False)]
print(a["Label"].value_counts() / len(a) * 100)
print("Overall Distribution of class:")
print(df.Label.value_counts() / len(df) * 100)
print("% of such records: " + str(len(a)/len(df)*100))
print(a.sample(7)[["Label","no_reply_to_email","no_return_path"]])

Spam    82.273381
Ham     17.726619
Name: Label, dtype: float64
Overall Distribution of class:
Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64
% of such records: 27.09129180634599
      Label  no_reply_to_email  no_return_path
776     Ham              False           False
902     Ham              False           False
2879   Spam              False           False
12730  Spam              False           False
8929   Spam              False           False
636     Ham              False           False
6613   Spam              False           False


In [68]:
a=df[(df["no_reply_to_email"]==True) & (df["no_return_path"] == False)]
print(a["Label"].value_counts() / len(a) * 100)
print("Overall Distribution of class:")
print(df.Label.value_counts() / len(df) * 100)
print("% of such records: " + str(len(a)/len(df)*100))
print(a.sample(7)[["Label","no_reply_to_email","no_return_path"]])

Spam    80.093209
Ham     19.906791
Name: Label, dtype: float64
Overall Distribution of class:
Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64
% of such records: 70.25804942698994
      Label  no_reply_to_email  no_return_path
11469  Spam               True           False
2536   Spam               True           False
8648   Spam               True           False
4209   Spam               True           False
11616  Spam               True           False
6719   Spam               True           False
3072   Spam               True           False


In [69]:
a=df[(df["no_reply_to_email"]==False) & (df["no_return_path"] == True)]
print(a["Label"].value_counts() / len(a) * 100)
print("Overall Distribution of class:")
print(df.Label.value_counts() / len(df) * 100)
print("% of such records: " + str(len(a)/len(df)*100))
print(a.sample(7)[["Label","no_reply_to_email","no_return_path"]])

Spam    100.0
Name: Label, dtype: float64
Overall Distribution of class:
Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64
% of such records: 0.3118422078428315
      Label  no_reply_to_email  no_return_path
11417  Spam              False            True
11694  Spam              False            True
6502   Spam              False            True
11532  Spam              False            True
8753   Spam              False            True
8458   Spam              False            True
12639  Spam              False            True


In [70]:
df["no_reply_no_return"] = (df["no_reply_to_email"] == True) & (df["no_return_path"] == True)
df["yes_reply_no_return"] = (df["no_reply_to_email"] == False) & (df["no_return_path"] == True)
print(df.columns)

Index(['sender_email', 'reply_to_email', 'return_path', 'Label',
       'has_inline_image', 'no_reply_to_email', 'no_return_path',
       'no_reply_no_return', 'yes_reply_no_return'],
      dtype='object')


In [71]:
df.drop(["no_reply_to_email","reply_to_email","return_path"],axis=1,inplace=True)
df.head()

Unnamed: 0,sender_email,Label,has_inline_image,no_return_path,no_reply_no_return,yes_reply_no_return
0,rssfeeds@example.com,Ham,False,False,False,False
1,thompson@shelob.ce.ttu.edu,Ham,False,False,False,False
2,beberg@mithral.com,Ham,False,False,False,False
3,rssfeeds@example.com,Ham,False,False,False,False
4,garym@canada.com,Ham,False,False,False,False


In [72]:
df.isna().sum()

sender_email           130
Label                    0
has_inline_image         0
no_return_path           0
no_reply_no_return       0
yes_reply_no_return      0
dtype: int64

In [73]:
df = df.dropna()

In [74]:
tld_pattern_compiled = re.compile(r"(@.+\.)(\w+)")
def get_tld(email):
    return tld_pattern_compiled.search(email).groups()[1]
    
df["top_level_domain"] = df["sender_email"].apply(get_tld)

In [75]:
df["top_level_domain"].value_counts()

com     6873
net     2108
org      424
de       311
uk       302
        ... 
ps         1
kw         1
INFO       1
ba         1
name       1
Name: top_level_domain, Length: 150, dtype: int64

In [76]:
df.head()

Unnamed: 0,sender_email,Label,has_inline_image,no_return_path,no_reply_no_return,yes_reply_no_return,top_level_domain
0,rssfeeds@example.com,Ham,False,False,False,False,com
1,thompson@shelob.ce.ttu.edu,Ham,False,False,False,False,edu
2,beberg@mithral.com,Ham,False,False,False,False,com
3,rssfeeds@example.com,Ham,False,False,False,False,com
4,garym@canada.com,Ham,False,False,False,False,com


In [77]:
account_pattern_compiled = re.compile(r"(.+@)")
def get_account(email):
    return account_pattern_compiled.search(email).groups()[0][:-1]
    
df["account_name"] = df["sender_email"].apply(get_account)

In [78]:
domain_pattern_compiled = re.compile(r"(@.+)(\.\w+)")
def get_domain(email):
    return domain_pattern_compiled.search(email).groups()[0][1:]
    
df["domain"] = df["sender_email"].apply(get_domain)

In [79]:
df[df.Label == "Spam"][["account_name","top_level_domain","domain","Label"]].sample(10)

Unnamed: 0,account_name,top_level_domain,domain,Label
5646,franknkosi2002,com,yahoo,Spam
3400,rrsukkd,com,building,Spam
8236,roffie0scm,com,sepo,Spam
3946,amesfs,it,cima,Spam
3299,dmobleybk,dk,stardog,Spam
5650,qrjtehciwkb,com,myrealbox,Spam
11580,ravenna.tyler2001,mx,prod-infinitum.com,Spam
9321,qam3uhpzf,net,discover,Spam
4751,psawyer_pf,de,itmagic,Spam
12214,htinrnink,au,rocknet.net,Spam


In [80]:
df[df.Label == "Ham"][["account_name","top_level_domain","domain","Label"]].sample(10)

Unnamed: 0,account_name,top_level_domain,domain,Label
835,baartman,ca,lin12.triumf,Ham
900,vernon,com,b2unow,Ham
1367,hussein,org,stanfordalumni,Ham
2321,liblit,edu,eecs.berkeley,Ham
272,fork_list,com,hotmail,Ham
2158,baartman,ca,lin12.triumf,Ham
129,opslag,net,inklaar,Ham
134,rssfeeds,com,example,Ham
1267,dan,com,dankohn,Ham
2002,jeff.taylor,org,ieee,Ham


In [81]:
df[df["Label"] == "Spam"]["top_level_domain"].value_counts()

com     5557
net     1652
de       299
uk       276
org      166
        ... 
InFo       1
uy         1
ba         1
vg         1
INC        1
Name: top_level_domain, Length: 145, dtype: int64

In [82]:
df[df["Label"] == "Ham"]["top_level_domain"].value_counts()

com    1316
net     456
org     258
edu      83
Com      39
ie       38
uk       26
AU       25
ch       22
au       21
ca       19
fm       13
fi       13
de       12
be        8
at        7
COM       6
nu        5
FI        5
EDU       5
it        5
fr        5
ru        4
to        4
bz        2
us        2
za        2
br        2
ph        2
es        1
mil       1
ee        1
mx        1
dk        1
Name: top_level_domain, dtype: int64

In [83]:
df[df["Label"] == "Spam"]["domain"].value_counts()

yahoo                        529
hotmail                      222
netscape                      70
msn                           66
yahoo.co                      65
                            ... 
moirabaptist.freeserve.co      1
59b0j                          1
yojo                           1
stanislas.loria                1
genesshoes                     1
Name: domain, Length: 4915, dtype: int64

In [97]:
df[df["Label"] == "Ham"]["domain"].value_counts()

example      649
hotmail       58
slack         57
perl          56
comcast       45
            ... 
attbi          1
comp-wiz       1
gimp           1
trackbike      1
hotp           1
Name: domain, Length: 355, dtype: int64

In [98]:
len(df[df["account_name"] == "rssfeeds"]) / len(df) * 100

4.8279121052217056

In [99]:
df[df["account_name"] == "rssfeeds"].size / len(df) * 100

43.45120894699535

In [107]:
df[df["account_name"] == "rssfeeds"].shape

(613, 9)

In [109]:
613*9 == df[df["account_name"] == "rssfeeds"].size

True

In [86]:
df[(df["account_name"] == "rssfeeds") & (df["Label"] == "Spam")].size /df[df["account_name"] == "rssfeeds"].size  * 100

0.0

In [87]:
df[(df["account_name"] == "rssfeeds") & (df["Label"] == "Ham")].size /df[df["account_name"] == "rssfeeds"].size  * 100

100.0

In [95]:
613/12697*100

4.8279121052217056

In [93]:
df[df["account_name"] == "rssfeeds"]

Unnamed: 0,sender_email,Label,has_inline_image,no_return_path,no_reply_no_return,yes_reply_no_return,top_level_domain,account_name,domain
0,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
3,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
9,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
18,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
21,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
...,...,...,...,...,...,...,...,...,...
2387,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
2391,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
2393,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
2397,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example


In [92]:
df[df["sender_email"] == "rssfeeds@example.com"]

Unnamed: 0,sender_email,Label,has_inline_image,no_return_path,no_reply_no_return,yes_reply_no_return,top_level_domain,account_name,domain
0,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
3,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
9,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
18,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
21,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
...,...,...,...,...,...,...,...,...,...
2387,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
2391,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
2393,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example
2397,rssfeeds@example.com,Ham,False,False,False,False,com,rssfeeds,example


In [46]:
df.shape

(12697, 9)

In [48]:
df.shape

(12084, 9)

In [41]:
df["free_in_domain"] = df["domain"].map(lambda x: "free" in x)
df["free_in_account"] = df["account_name"].map(lambda x: "free" in x)

In [42]:
df.groupby("Label")['free_in_domain',"free_in_account"].mean()*100

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,free_in_domain,free_in_account
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
Ham,0.111297,0.0
Spam,1.866433,0.087489


In [43]:
df.head()

Unnamed: 0,sender_email,Label,has_inline_image,no_return_path,no_reply_no_return,yes_reply_no_return,top_level_domain,account_name,domain,free_in_domain,free_in_account
1,thompson@shelob.ce.ttu.edu,Ham,False,False,False,False,edu,thompson,shelob.ce.ttu,False,False
2,beberg@mithral.com,Ham,False,False,False,False,com,beberg,mithral,False,False
4,garym@canada.com,Ham,False,False,False,False,com,garym,canada,False,False
5,kragen@pobox.com,Ham,False,False,False,False,com,kragen,pobox,False,False
6,paul@cwie.net,Ham,False,False,False,False,net,paul,cwie,False,False


In [46]:
df[df.Label == "Spam"][["account_name","Label"]].sample(10)

Unnamed: 0,account_name,Label
3174,zafvyst,Spam
4963,-UXWPOST,Spam
4427,wcypzmujhxzcoy,Spam
11755,bob,Spam
4751,psawyer_pf,Spam
6096,marketplaza,Spam
2774,home_loans,Spam
3093,Claudia_Fechtner,Spam
12235,maiolica,Spam
12412,g.velez_on,Spam


In [47]:
df[df.Label == "Ham"][["account_name","Label"]].sample(10)

Unnamed: 0,account_name,Label
2233,bigpeted,Ham
1533,yyyy,Ham
161,kevinc,Ham
854,matthias,Ham
2184,Axel.Thimm,Ham
1254,cdale,Ham
777,harley,Ham
336,tomwhore,Ham
581,tony,Ham
1666,thomas,Ham


In [48]:
df.drop(["sender_email","free_in_account","domain","top_level_domain"],axis=1,inplace=True)

In [49]:
df.head()

Unnamed: 0,Label,has_inline_image,no_return_path,no_reply_no_return,yes_reply_no_return,account_name,free_in_domain
1,Ham,False,False,False,False,thompson,False
2,Ham,False,False,False,False,beberg,False
4,Ham,False,False,False,False,garym,False
5,Ham,False,False,False,False,kragen,False
6,Ham,False,False,False,False,paul,False


# Feature Extraction From Subject, Text


In [50]:
df = pd.read_csv("../Raw Data/extracted_body.csv")

In [51]:
df.shape

(15720, 3)

In [52]:
df.isna().sum()

Subject    247
Text         7
Label        0
dtype: int64