In [9]:
import tarfile
import os
from pathlib import Path

### Step 1: Download the dataset (already done)

### Step 2: Exploratory Data Analysis (EDA)

In [6]:
# Locations of the two compressed archives (spam mail and easy-ham mail),
# given relative to the notebook's working directory.
spam_data_path, ham_data_path = (
    Path(archive)
    for archive in (
        "./data/20030228_spam.tar.bz2",
        "./data/20030228_easy_ham.tar.bz2",
    )
)

In [14]:
# Decompression: unpack both archives into data/
# NOTE(review): extractall() trusts the member paths stored inside the archive; the
# tarfile docs recommend an extraction filter (filter="data", where the running
# Python supports it) for archives that come from an untrusted source.
for path in [spam_data_path, ham_data_path]:
    # The context manager closes the archive even when extraction raises part-way.
    with tarfile.open(path) as tar_obj:
        tar_obj.extractall(path="data/")

In [16]:
# Sanity check after extraction: data/ should now list the unpacked spam/ and
# easy_ham/ directories alongside the two original archives.
os.listdir("data/")

['spam', '20030228_spam.tar.bz2', '20030228_easy_ham.tar.bz2', 'easy_ham']

In [18]:
# Directories produced by the extraction step, one per email class.
spam_path, ham_path = (
    Path(folder)
    for folder in (
        "./data/spam/",
        "./data/easy_ham/",
    )
)

In [45]:
# Full paths of every entry in the spam and ham directories, in the order
# os.listdir() reports them. The entry named 'cmds' is left out -- presumably a
# non-email helper file shipped with each archive; verify against the data.
spam_files, ham_files = (
    [folder / entry for entry in os.listdir(folder) if entry != 'cmds']
    for folder in (spam_path, ham_path)
)

In [46]:
# Peek at the first five paths of each list, one list per output line.
print(spam_files[:5], ham_files[:5], sep="\n")

[PosixPath('data/spam/00075.28a918cd03a0ef5aa2f1e0551a798108'), PosixPath('data/spam/00376.f4ed5f002f9b6b320a67f1da9cacbe72'), PosixPath('data/spam/00029.de865ad8d5ad0df985ae2f72388befba'), PosixPath('data/spam/00192.e5a6bb15ae1e965f3b823c75e435651a'), PosixPath('data/spam/00409.e59f63e813b6766a9a4ddf0790634ca3')]
[PosixPath('data/easy_ham/01263.40cec40ea12c55f2ac9a98dc07c55d1c'), PosixPath('data/easy_ham/00238.dab1868a3b43de1e01ebdfd0e53de50f'), PosixPath('data/easy_ham/00088.945614c3f6213f59548ab21306451675'), PosixPath('data/easy_ham/02051.58e196144807bd76d7b77d4b7efb6d32'), PosixPath('data/easy_ham/01232.2f44f5a2186e97cf4d65cf191d98e646')]


In [47]:
# Number of files per class, displayed as (spam, ham).
tuple(len(files) for files in (spam_files, ham_files))

(500, 2500)

In [48]:
# File format -> Email format
# Dump the first spam file as raw text to see what an email looks like on disk.
# The context manager closes the handle even if reading or decoding fails.
# NOTE(review): the file is opened in text mode with the platform default encoding;
# a message containing bytes that encoding cannot decode would raise here.
with open(spam_files[0]) as f:
    content = f.read()
print(content)

From iiu-owner@taint.org  Mon Aug 26 15:48:26 2002
Return-Path: <iiu-owner@taint.org>
Delivered-To: zzzz@localhost.spamassassin.taint.org
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 35D3247C86
	for <zzzz@localhost>; Mon, 26 Aug 2002 10:41:37 -0400 (EDT)
Received: from phobos [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for zzzz@localhost (single-drop); Mon, 26 Aug 2002 15:41:37 +0100 (IST)
Received: from dogma.slashnull.org (localhost [127.0.0.1]) by
    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7NIi2Z03983 for
    <zzzz-list-admin-iiu@jmason.org>; Fri, 23 Aug 2002 19:44:02 +0100
Received: from linux.local ([213.9.245.86]) by dogma.slashnull.org
    (8.11.6/8.11.6) with SMTP id g7NIh1Z03950 for <iiu-admin@taint.org>;
    Fri, 23 Aug 2002 19:43:02 +0100
Message-Id: <200208231843.g7NIh1Z03950@dogma.slashnull.org>
Received: (qmail 28875 invoked from network); 23 Aug 2002 18:18:58 -0000
Received: from un

In [49]:
# Using python's email library as it makes parsing easy
# https://docs.python.org/3/library/email.examples.html
from email import policy
from email.parser import BytesParser

In [50]:
def email_loader(filenames):
    """Parse each file in ``filenames`` into an email message object.

    Every path is opened in binary mode and handed to a ``BytesParser``
    configured with ``policy.default``.

    Parameters
    ----------
    filenames : iterable of path-like
        Locations of the raw email files to parse.

    Returns
    -------
    list
        One parsed message per input path, in input order.
    """
    def _parse_one(location):
        # Binary mode: BytesParser takes care of decoding the message itself.
        with open(location, 'rb') as handle:
            return BytesParser(policy=policy.default).parse(handle)

    return [_parse_one(location) for location in filenames]

In [51]:
# Parse both corpora: spam first, then ham.
spam_email, ham_email = map(email_loader, (spam_files, ham_files))

In [63]:
# The parsed message exposes the same information as the raw dump above:
# items() returns the headers as (name, value) pairs and get_content() returns
# the body, which is stripped of surrounding whitespace before printing.
print(spam_email[0].items())
print(spam_email[0].get_content().strip())

[('Return-Path', '<iiu-owner@taint.org>'), ('Delivered-To', 'zzzz@localhost.spamassassin.taint.org'), ('Received', 'from localhost (localhost [127.0.0.1])\tby phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 35D3247C86\tfor <zzzz@localhost>; Mon, 26 Aug 2002 10:41:37 -0400 (EDT)'), ('Received', 'from phobos [127.0.0.1]\tby localhost with IMAP (fetchmail-5.9.0)\tfor zzzz@localhost (single-drop); Mon, 26 Aug 2002 15:41:37 +0100 (IST)'), ('Received', 'from dogma.slashnull.org (localhost [127.0.0.1]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7NIi2Z03983 for    <zzzz-list-admin-iiu@jmason.org>; Fri, 23 Aug 2002 19:44:02 +0100'), ('Received', 'from linux.local ([213.9.245.86]) by dogma.slashnull.org    (8.11.6/8.11.6) with SMTP id g7NIh1Z03950 for <iiu-admin@taint.org>;    Fri, 23 Aug 2002 19:43:02 +0100'), ('Message-Id', '<200208231843.g7NIh1Z03950@dogma.slashnull.org>'), ('Received', '(qmail 28875 invoked from network); 23 Aug 2002 18:18:58 -0000'), ('Received', 'f

In [69]:
# Checking the headers and items for the files
# Print every header of one sample spam message as "name : value".
# items() yields the (header, value) pairs directly, in message order.
for key, val in spam_email[23].items():
    print("{} : {}".format(key, val))

Return-Path : <evtwqmigru@datcon.co.uk>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (jalapeno [127.0.0.1])	by zzzzason.org (Postfix) with ESMTP id F3CF116F1B	for <zzzz@localhost>; Tue,  8 Oct 2002 11:02:13 +0100 (IST)
Received : from jalapeno [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Tue, 08 Oct 2002 11:02:13 +0100 (IST)
Received : from webnote.net (mail.webnote.net [193.120.211.219]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g989wGK10152 for    <zzzz@jmason.org>; Tue, 8 Oct 2002 10:58:16 +0100
Received : from sanaga.camtel.cm ([195.24.194.61]) by webnote.net    (8.9.3/8.9.3) with ESMTP id KAA21887 for <zzzz@spamassassin.taint.org>;    Tue, 8 Oct 2002 10:59:00 +0100
Received : from ens.fr (host42-226.pool8173.interbusiness.it    [81.73.226.42]) by sanaga.camtel.cm with SMTP (Microsoft Exchange Internet    Mail Service Version 5.5.1960.3) id 431SJ0H6; Tue, 8 Oct 2002 06:55:54    -0000
Message-I