In [1]:
import pandas as pd
import numpy as np
import os
import re

# Preprocessing Header

In [2]:
df = pd.read_csv("../Raw Data/extracted_header.csv")

In [3]:
df.head(n=3)

Unnamed: 0,sender_email,reply_to_email,return_path,inline_image_count,Label
0,rssfeeds@example.com,,rssfeeds@example.com,0,Ham
1,thompson@shelob.ce.ttu.edu,exmh-users@example.com,exmh-users-admin@example.com,0,Ham
2,beberg@mithral.com,,fork-admin@xent.com,0,Ham


In [4]:
df.isna().sum()

sender_email           130
reply_to_email        9312
return_path            340
inline_image_count       0
Label                    0
dtype: int64

In [5]:
df.shape

(12827, 5)

In [6]:
df.Label.value_counts() / len(df) * 100

Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64

In [7]:
df.groupby("Label")["inline_image_count"].agg(["mean","median"])

Unnamed: 0_level_0,mean,median
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
Ham,0.0,0
Spam,0.607085,0


In [8]:
df[df.Label == "Ham"]["inline_image_count"].unique() # All Ham emails don't have a single inline image

array([0])

In [9]:
a=df[df.Label == "Spam"]["inline_image_count"]

In [10]:
(a == 0).sum() / len(a) * 100 # 86% of Spam Emails don't have a single inline image

85.79245464145147

In [11]:
df["has_inline_image"] = df["inline_image_count"] >= 1
df.drop("inline_image_count",axis=1,inplace=True)

In [12]:
df[df.Label == "Ham"][["sender_email","reply_to_email","return_path"]].sample(10)

Unnamed: 0,sender_email,reply_to_email,return_path
1416,bill@whump.com,,fork-admin@xent.com
1694,rssfeeds@example.com,,rssfeeds@example.com
1126,garym@canada.com,garym@canada.com,fork-admin@xent.com
2398,rssfeeds@example.com,,rssfeeds@example.com
1273,eugen@leitl.org,,fork-admin@xent.com
913,eh@mad.scientist.com,,fork-admin@xent.com
79,beberg@mithral.com,,fork-admin@xent.com
475,harley@argote.ch,,fork-admin@xent.com
323,bitbitch@magnesium.net,bitbitch@magnesium.net,fork-admin@xent.com
927,matthias@rpmforge.net,rpm-zzzlist@freshrpms.net,rpm-zzzlist-admin@freshrpms.net


In [13]:
df[df.Label == "Spam"][["sender_email","reply_to_email","return_path"]].sample(10)

Unnamed: 0,sender_email,reply_to_email,return_path
7350,istlearning@istlearning.com,,istlearning@istlearning.com
5518,Riggsowohi@lili.uib.no,,Riggsowohi@lili.uib.no
4946,aguardhouse@ustaxsolutions.com,,aguardhouse@ustaxsolutions.com
5946,dunbar@acadia.net,,dunbar@acadia.net
11611,hlaihrpklgmj@freeweb.de,,hlaihrpklgmj@freeweb.de
3451,dwvyjcvprda@surftimemag.com,,dwvyjcvprda@surftimemag.com
12557,eojgylsst@netnet.com.sg,,eojgylsst@netnet.com.sg
7244,adylbaev@yahoo.com,,
11940,Davisonryv@linux.org,,Davisonryv@linux.org
11134,carter_te@modul-bus.de,,carter_te@modul-bus.de


In [14]:
df.isna().sum() / len(df) * 100

sender_email         1.013487
reply_to_email      72.596866
return_path          2.650659
Label                0.000000
has_inline_image     0.000000
dtype: float64

In [15]:
df["no_reply_to_email"] = df["reply_to_email"].isna() # true -> does not have
df.groupby("Label")["no_reply_to_email"].agg(["mean"])*100

Unnamed: 0_level_0,mean
Label,Unnamed: 1_level_1
Ham,74.439834
Spam,72.170491


In [16]:
df["no_return_path"] = df["return_path"].isna() # true -> does not have
df.groupby("Label")["no_return_path"].agg(["mean"])*100

# 3.26% of spam don't have return path

Unnamed: 0_level_0,mean
Label,Unnamed: 1_level_1
Ham,0.0
Spam,3.263896


In [17]:
df[df["no_return_path"] == True]

Unnamed: 0,sender_email,reply_to_email,return_path,Label,has_inline_image,no_reply_to_email,no_return_path
2484,rkkss@redseven.de,,,Spam,False,True,True
2541,rkkss@redseven.de,,,Spam,False,True,True
2571,emailrewardz@emailrewardz.email-publisher.com,perf-remove.3565.64698.13893713.0.0.4@boing.to...,,Spam,False,False,True
2693,meg34807147238s03@isppan.waw.pl,meg34807147238s03@isppan.waw.pl,,Spam,False,False,True
2745,meg34807147238s03@isppan.waw.pl,meg34807147238s03@isppan.waw.pl,,Spam,False,False,True
...,...,...,...,...,...,...,...
12625,,,,Spam,False,True,True
12639,YCPZHUNZN@marchmail.com,YCPZHUNZN@marchmail.com,,Spam,False,False,True
12648,keh-ming@a-vip.com,,,Spam,False,True,True
12747,505jeff@acadia.net,,,Spam,False,True,True


In [18]:
a=df[(df["no_reply_to_email"]==True) & (df["no_return_path"] == True)]
print(a["Label"].value_counts() / len(a) * 100)
print("Overall Distribution of class:")
print(df.Label.value_counts() / len(df) * 100)
print("% of such records: " + str(len(a)/len(df)*100))
print(a.sample(7)[["Label","no_reply_to_email","no_return_path"]])

Spam    100.0
Name: Label, dtype: float64
Overall Distribution of class:
Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64
% of such records: 2.3388165588212364
      Label  no_reply_to_email  no_return_path
7289   Spam               True            True
7787   Spam               True            True
11332  Spam               True            True
8003   Spam               True            True
11478  Spam               True            True
12586  Spam               True            True
9734   Spam               True            True


In [19]:
a=df[(df["no_reply_to_email"]==False) & (df["no_return_path"] == False)]
print(a["Label"].value_counts() / len(a) * 100)
print("Overall Distribution of class:")
print(df.Label.value_counts() / len(df) * 100)
print("% of such records: " + str(len(a)/len(df)*100))
print(a.sample(7)[["Label","no_reply_to_email","no_return_path"]])

Spam    82.273381
Ham     17.726619
Name: Label, dtype: float64
Overall Distribution of class:
Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64
% of such records: 27.09129180634599
      Label  no_reply_to_email  no_return_path
2249    Ham              False           False
824     Ham              False           False
8400   Spam              False           False
11846  Spam              False           False
7786   Spam              False           False
6511   Spam              False           False
4826   Spam              False           False


In [20]:
a=df[(df["no_reply_to_email"]==True) & (df["no_return_path"] == False)]
print(a["Label"].value_counts() / len(a) * 100)
print("Overall Distribution of class:")
print(df.Label.value_counts() / len(df) * 100)
print("% of such records: " + str(len(a)/len(df)*100))
print(a.sample(7)[["Label","no_reply_to_email","no_return_path"]])

Spam    80.093209
Ham     19.906791
Name: Label, dtype: float64
Overall Distribution of class:
Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64
% of such records: 70.25804942698994
      Label  no_reply_to_email  no_return_path
2715   Spam               True           False
12017  Spam               True           False
3677   Spam               True           False
7750   Spam               True           False
2442   Spam               True           False
3949   Spam               True           False
2291    Ham               True           False


In [21]:
a=df[(df["no_reply_to_email"]==False) & (df["no_return_path"] == True)]
print(a["Label"].value_counts() / len(a) * 100)
print("Overall Distribution of class:")
print(df.Label.value_counts() / len(df) * 100)
print("% of such records: " + str(len(a)/len(df)*100))
print(a.sample(7)[["Label","no_reply_to_email","no_return_path"]])

Spam    100.0
Name: Label, dtype: float64
Overall Distribution of class:
Spam    81.211507
Ham     18.788493
Name: Label, dtype: float64
% of such records: 0.3118422078428315
      Label  no_reply_to_email  no_return_path
9796   Spam              False            True
5184   Spam              False            True
11687  Spam              False            True
7926   Spam              False            True
6094   Spam              False            True
10807  Spam              False            True
6502   Spam              False            True


In [22]:
df["no_reply_no_return"] = (df["no_reply_to_email"] == True) & (df["no_return_path"] == True)
df["yes_reply_no_return"] = (df["no_reply_to_email"] == False) & (df["no_return_path"] == True)
print(df.columns)

Index(['sender_email', 'reply_to_email', 'return_path', 'Label',
       'has_inline_image', 'no_reply_to_email', 'no_return_path',
       'no_reply_no_return', 'yes_reply_no_return'],
      dtype='object')


In [23]:
df.drop(["no_reply_to_email","reply_to_email","return_path"],axis=1,inplace=True)
df.head()

Unnamed: 0,sender_email,Label,has_inline_image,no_return_path,no_reply_no_return,yes_reply_no_return
0,rssfeeds@example.com,Ham,False,False,False,False
1,thompson@shelob.ce.ttu.edu,Ham,False,False,False,False
2,beberg@mithral.com,Ham,False,False,False,False
3,rssfeeds@example.com,Ham,False,False,False,False
4,garym@canada.com,Ham,False,False,False,False


In [24]:
df.isna().sum()

sender_email           130
Label                    0
has_inline_image         0
no_return_path           0
no_reply_no_return       0
yes_reply_no_return      0
dtype: int64

In [25]:
df = df.dropna()

In [26]:
tld_pattern_compiled = re.compile(r"(@.+\.)(\w+)")
def get_tld(email):
    return tld_pattern_compiled.search(email).groups()[1]
    
df["top_level_domain"] = df["sender_email"].apply(get_tld)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [27]:
df["top_level_domain"].value_counts()

com          6873
net          2108
org           424
de            311
uk            302
             ... 
18              1
kw              1
tj              1
specified       1
name            1
Name: top_level_domain, Length: 150, dtype: int64

In [28]:
df.head()

Unnamed: 0,sender_email,Label,has_inline_image,no_return_path,no_reply_no_return,yes_reply_no_return,top_level_domain
0,rssfeeds@example.com,Ham,False,False,False,False,com
1,thompson@shelob.ce.ttu.edu,Ham,False,False,False,False,edu
2,beberg@mithral.com,Ham,False,False,False,False,com
3,rssfeeds@example.com,Ham,False,False,False,False,com
4,garym@canada.com,Ham,False,False,False,False,com


In [29]:
account_pattern_compiled = re.compile(r"(.+@)")
def get_account(email):
    return account_pattern_compiled.search(email).groups()[0][:-1]
    
df["account_name"] = df["sender_email"].apply(get_account)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [30]:
domain_pattern_compiled = re.compile(r"(@.+)(\.\w+)")
def get_domain(email):
    return domain_pattern_compiled.search(email).groups()[0][1:]
    
df["domain"] = df["sender_email"].apply(get_domain)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [36]:
df[df.Label == "Spam"][["account_name","top_level_domain","domain","Label"]].sample(10)

Unnamed: 0,account_name,top_level_domain,domain,Label
10951,epvxvhclc,com,drizzle,Spam
6755,j.myers_ld,net,gateway,Spam
7560,Whitfield,com,bradsnet,Spam
8059,SWVAWL,it,freemail,Spam
10162,cyan.preston,fr,tiscali,Spam
7351,5laurinda,com,access-one,Spam
10676,kmlgvcvk,com,freeze,Spam
3570,transformers,lv,andrews,Spam
10188,ldaniels_15,at,removethis.siemens,Spam
7064,19671488,edu,winthrop,Spam


In [37]:
df[df.Label == "Ham"][["account_name","top_level_domain","domain","Label"]].sample(10)

Unnamed: 0,account_name,top_level_domain,domain,Label
981,rssfeeds,com,example,Ham
2379,aeriksson,fm,fastmail,Ham
278,pudge,org,perl,Ham
1661,owen,net,permafrost,Ham
230,owen,net,permafrost,Ham
896,rssfeeds,com,example,Ham
1732,tim.one,net,comcast,Ham
674,mcmasjc,com,tatanka.stortek,Ham
1364,garym,com,canada,Ham
2268,cwg-exmh,Com,DeepEddy,Ham


In [46]:
df[df["Label"] == "Spam"]["top_level_domain"].value_counts()

com     5557
net     1652
de       299
uk       276
org      166
        ... 
iNFo       1
kw         1
18         1
tj         1
name       1
Name: top_level_domain, Length: 145, dtype: int64

In [47]:
df[df["Label"] == "Ham"]["top_level_domain"].value_counts()

com    1316
net     456
org     258
edu      83
Com      39
ie       38
uk       26
AU       25
ch       22
au       21
ca       19
fm       13
fi       13
de       12
be        8
at        7
COM       6
FI        5
EDU       5
nu        5
fr        5
it        5
to        4
ru        4
us        2
bz        2
za        2
br        2
ph        2
dk        1
es        1
ee        1
mx        1
mil       1
Name: top_level_domain, dtype: int64

In [57]:
df[df["Label"] == "Spam"]["domain"].value_counts()

yahoo           529
hotmail         222
netscape         70
msn              66
yahoo.co         65
               ... 
vsso.sll          1
dr                1
g4                1
meganet           1
lowestoft.ac      1
Name: domain, Length: 4915, dtype: int64

In [58]:
df[df["Label"] == "Ham"]["domain"].value_counts()

example         649
hotmail          58
slack            57
perl             56
comcast          45
               ... 
impression        1
kssp.upd.edu      1
mceahern          1
realsoftware      1
tuxfan            1
Name: domain, Length: 355, dtype: int64

In [68]:
df[df["account_name"] == "rssfeeds"].size / len(df) * 100

53.107033157438764

In [69]:
df[(df["account_name"] == "rssfeeds") & (df["Label"] == "Spam")].size /df[df["account_name"] == "rssfeeds"].size  * 100

0.0

In [70]:
df[(df["account_name"] == "rssfeeds") & (df["Label"] == "Ham")].size /df[df["account_name"] == "rssfeeds"].size  * 100

100.0

### In Production while predicting:
    if rss in sender_mail, and user likes rss feeds: Ham else Spam

In [78]:
df.drop(df[df["account_name"] == "rssfeeds"].index,inplace=True)

In [79]:
df["free_in_domain"] = df["domain"].map(lambda x: "free" in x)
df["free_in_account"] = df["account_name"].map(lambda x: "free" in x)

In [80]:
df.groupby("Label")['free_in_domain',"free_in_account"].mean()*100

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,free_in_domain,free_in_account
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
Ham,0.111297,0.0
Spam,1.866433,0.087489


In [81]:
df.drop(["sender_email","free_in_account","domain","top_level_domain"],axis=1,inplace=True)

In [85]:
df.head()

Unnamed: 0,Label,has_inline_image,no_return_path,no_reply_no_return,yes_reply_no_return,account_name,free_in_domain
1,Ham,False,False,False,False,thompson,False
2,Ham,False,False,False,False,beberg,False
4,Ham,False,False,False,False,garym,False
5,Ham,False,False,False,False,kragen,False
6,Ham,False,False,False,False,paul,False


In [87]:
!pip3 install pyenchant

Collecting pyenchant
  Downloading pyenchant-3.1.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 1.0 MB/s eta 0:00:011
[?25hInstalling collected packages: pyenchant
Successfully installed pyenchant-3.1.1
You should consider upgrading via the '/Users/susmitvengurlekar/opt/miniconda3/bin/python -m pip install --upgrade pip' command.[0m


In [88]:
import enchant

ImportError: The 'enchant' C library was not found and maybe needs to be installed.
See  https://pyenchant.github.io/pyenchant/install.html
for details
