In [3]:
import tarfile
import urllib.request
from pathlib import Path

def fetch_spam_data():
    """Download and unpack the SpamAssassin ham/spam archives if they are missing.

    The archives are stored under ``../datasets/spam`` (relative to the current
    working directory). An archive is only downloaded and extracted when its
    target directory (``easy_ham`` or ``spam``) does not exist yet, so calling
    this function again is cheap.

    Returns:
        list[pathlib.Path]: ``[ham_dir, spam_dir]`` -- the ``easy_ham`` and
        ``spam`` directories inside ``../datasets/spam``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or ``tarfile`` raise on a failed
        download or a corrupt archive.
    """
    spam_root = "http://spamassassin.apache.org/old/publiccorpus/"
    ham_url = spam_root + "20030228_easy_ham.tar.bz2"
    spam_url = spam_root + "20030228_spam.tar.bz2"

    spam_path = Path() / ".." / "datasets" / "spam"
    spam_path.mkdir(parents=True, exist_ok=True)
    for dir_name, tar_name, url in (("easy_ham", "ham", ham_url),
                                    ("spam", "spam", spam_url)):
        if not (spam_path / dir_name).is_dir():
            path = (spam_path / tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            urllib.request.urlretrieve(url, path)
            # The context manager closes the archive even if extraction fails.
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive, and the download uses plain HTTP; consider an extraction
            # filter on Python versions that support one.
            with tarfile.open(path) as tar_bz2_file:
                tar_bz2_file.extractall(path=spam_path)
    return [spam_path / dir_name for dir_name in ("easy_ham", "spam")]

In [4]:
ham_dir, spam_dir = fetch_spam_data()

In [5]:
# Keep only directory entries whose file name is longer than 20 characters,
# in sorted order (filtering before sorting gives the same list).
ham_filenames = sorted(f for f in ham_dir.iterdir() if len(f.name) > 20)
spam_filenames = sorted(f for f in spam_dir.iterdir() if len(f.name) > 20)

In [6]:
len(ham_filenames)

2500

In [7]:
len(spam_filenames)

500

In [8]:
import email
import email.policy

def load_email(filepath):
    """Parse the file at ``filepath`` into an email message object.

    The file is read in binary mode and parsed with ``email.policy.default``,
    so the result is an ``email.message.EmailMessage``.

    Args:
        filepath: Path (``str`` or path-like) of a raw email file.

    Returns:
        The parsed message.
    """
    # Import the parser explicitly: ``import email`` and ``import email.policy``
    # alone do not guarantee that the ``email.parser`` submodule is loaded, so
    # ``email.parser.BytesParser`` would only work if another module happened
    # to import it first.
    from email.parser import BytesParser

    with open(filepath, "rb") as f:
        return BytesParser(policy=email.policy.default).parse(f)

In [9]:
ham_emails = [load_email(filepath) for filepath in ham_filenames]
spam_emails = [load_email(filepath) for filepath in spam_filenames]

In [10]:
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [11]:
print(spam_emails[6].get_content().strip())

Help wanted.  We are a 14 year old fortune 500 company, that is
growing at a tremendous rate.  We are looking for individuals who
want to work from home.

This is an opportunity to make an excellent income.  No experience
is required.  We will train you.

So if you are looking to be employed from home with a career that has
vast opportunities, then go:

http://www.basetel.com/wealthnow

We are looking for energetic and self motivated people.  If that is you
than click on the link and fill out the form, and one of our
employement specialist will contact you.

To be removed from our link simple go to:

http://www.basetel.com/remove.html


4139vOLW7-758DoDY1425FRhM1-764SMFc8513fCsLl40


In [12]:
def get_email_structure(email):
    """Describe the MIME layout of a message as a string.

    A ``str`` argument is returned unchanged. A message whose payload is a
    list is rendered as ``"multipart(<part>, <part>, ...)"`` with each part
    described recursively; any other message is described by its content type.
    """
    if isinstance(email, str):
        return email
    parts = email.get_payload()
    if not isinstance(parts, list):
        return email.get_content_type()
    described = (get_email_structure(part) for part in parts)
    return "multipart(" + ", ".join(described) + ")"

In [13]:
from collections import Counter

def structures_counter(emails):
    """Count how many of ``emails`` share each structure string.

    Returns a ``Counter`` keyed by the string that ``get_email_structure``
    produces for each message.
    """
    return Counter(get_email_structure(message) for message in emails)

In [14]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [15]:
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [16]:
for header, value in spam_emails[0].items():
    print(header, ":", value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [17]:
spam_emails[0]["Subject"]

'Life Insurance - Why Pay More?'

In [18]:
spam_emails[0]

<email.message.EmailMessage at 0x13e55e950>

In [19]:
spam_emails[0].items()

[('Return-Path', '<12a1mailbot1@web.de>'),
 ('Delivered-To', 'zzzz@localhost.spamassassin.taint.org'),
 ('Received',
  'from localhost (localhost [127.0.0.1])\tby phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32\tfor <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)'),
 ('Received',
  'from mail.webnote.net [193.120.211.219]\tby localhost with POP3 (fetchmail-5.9.0)\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)'),
 ('Received',
  'from dd_it7 ([210.97.77.167])\tby webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623\tfor <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100'),
 ('From', '12a1mailbot1@web.de'),
 ('Received',
  'from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);\t Sat, 24 Aug 2002 09:42:10 +0900'),
 ('To', 'dcek1a1@netsgo.com'),
 ('Subject', 'Life Insurance - Why Pay More?'),
 ('Date', 'Wed, 21 Aug 2002 20:31:57 -1600'),
 ('MIME-Version', '1.0'),
 ('Message-ID', '<0103c104200

In [20]:
import numpy as np
from sklearn.model_selection import train_test_split

# Stack ham followed by spam into one object array; labels are 0 for ham and
# 1 for spam, in the same order.
X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

# Hold out 20% of the emails as a test set; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

In [21]:
for part in spam_emails[0].walk():
    print(part)

Return-Path: <12a1mailbot1@web.de>
Delivered-To: zzzz@localhost.spamassassin.taint.org
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32
	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received: from mail.webnote.net [193.120.211.219]
	by localhost with POP3 (fetchmail-5.9.0)
	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received: from dd_it7 ([210.97.77.167])
	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623
	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From: 12a1mailbot1@web.de
Received: from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft
 SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To: <dcek1a1@netsgo.com>
Subject: Life Insurance - Why Pay More?
Date: Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version: 1.0
Message-ID: <0103c1042001882DD_IT7@dd_it7>
Content-Type: text/html; charset="iso-8859-1"
Content-Transfer-Encoding: quoted-

In [22]:
spam_emails[0].get_content()

'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\n<HTML><HEAD>\n<META content="text/html; charset=windows-1252" http-equiv=Content-Type>\n<META content="MSHTML 5.00.2314.1000" name=GENERATOR></HEAD>\n<BODY><!-- Inserted by Calypso -->\n<TABLE border=0 cellPadding=0 cellSpacing=2 id=_CalyPrintHeader_ rules=none \nstyle="COLOR: black; DISPLAY: none" width="100%">\n  <TBODY>\n  <TR>\n    <TD colSpan=3>\n      <HR color=black noShade SIZE=1>\n    </TD></TR></TD></TR>\n  <TR>\n    <TD colSpan=3>\n      <HR color=black noShade SIZE=1>\n    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso --><FONT \ncolor=#000000 face=VERDANA,ARIAL,HELVETICA size=-2><BR></FONT></TD></TR></TABLE><!-- End Calypso --><FONT color=#ff0000 \nface="Copperplate Gothic Bold" size=5 PTSIZE="10">\n<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=#ff0000 \nface="Copperplate Gothic Bold" size=5 PTSIZE="10">\n<CENTER>Why Spend More Than You Have To?\n<CENTER><FONT co

In [23]:
from bs4 import BeautifulSoup

In [24]:
soup = BeautifulSoup(spam_emails[0].get_content(), 'html.parser')

In [25]:
soup.get_text(separator='\n', strip=True)

"Save up to 70% on Life Insurance.\nWhy Spend More Than You Have To?\nLife Quote Savings\nEnsuring your \n      family's financial security is very important. Life Quote Savings makes \n      buying life insurance simple and affordable. We Provide FREE Access to The \n      Very Best Companies and The Lowest Rates.\nLife Quote Savings\nis FAST, EASY and \n            SAVES you money! Let us help you get started with the best values in \n            the country on new coverage. You can SAVE hundreds or even thousands \n            of dollars by requesting a FREE quote from Lifequote Savings. Our \n            service will take you less than 5 minutes to complete. Shop and \n            compare. SAVE up to 70% on all types of Life insurance!\nClick Here For Your \n            Free Quote!\nProtecting your family is the best investment you'll ever \n          make!\nIf you are in receipt of this email \n      in error and/or wish to be removed from our list,\nPLEASE CLICK HERE\nAND TYPE RE

In [26]:
def html_to_plain_text(content):
    """Strip the markup from an HTML string and return its visible text.

    The text fragments are stripped of surrounding whitespace and joined with
    newlines.
    """
    return BeautifulSoup(content, 'html.parser').get_text(separator='\n', strip=True)

In [27]:
def parse_email_content(email):
    """Return the text body of a message, preferring plain text over HTML.

    The message parts are visited with ``email.walk()``. The first
    ``text/plain`` part found is returned as is. If there is none, the last
    ``text/html`` part seen is converted with ``html_to_plain_text``.

    Args:
        email: A parsed message object providing ``walk()``.

    Returns:
        str | None: The body text, or ``None`` when the message has no
        non-empty ``text/plain`` or ``text/html`` part.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Decoding can fail (for example on an unknown charset); fall back
            # to the raw payload. Catching Exception rather than using a bare
            # except lets KeyboardInterrupt and SystemExit propagate.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return html_to_plain_text(html)
    return None

In [28]:
def extract_email_info(message):
    """Flatten a message into ``"<subject>\\n<body text>"``.

    The subject comes from the ``Subject`` header and the body from
    ``parse_email_content``. A missing subject or body contributes an empty
    string; previously it was formatted as the literal text ``"None"``, which
    ended up as a spurious word in the extracted text.

    Args:
        message: A parsed message object.

    Returns:
        str: Subject and body separated by a single newline.
    """
    title = message.get('Subject') or ""
    content = parse_email_content(message) or ""
    return f"{title}\n{content}"

In [29]:
extracted = extract_email_info(spam_emails[0])

In [30]:
from nltk.tokenize import word_tokenize

In [31]:
tokens = word_tokenize(extracted.lower())

In [32]:
tokens

['life',
 'insurance',
 '-',
 'why',
 'pay',
 'more',
 '?',
 'save',
 'up',
 'to',
 '70',
 '%',
 'on',
 'life',
 'insurance',
 '.',
 'why',
 'spend',
 'more',
 'than',
 'you',
 'have',
 'to',
 '?',
 'life',
 'quote',
 'savings',
 'ensuring',
 'your',
 'family',
 "'s",
 'financial',
 'security',
 'is',
 'very',
 'important',
 '.',
 'life',
 'quote',
 'savings',
 'makes',
 'buying',
 'life',
 'insurance',
 'simple',
 'and',
 'affordable',
 '.',
 'we',
 'provide',
 'free',
 'access',
 'to',
 'the',
 'very',
 'best',
 'companies',
 'and',
 'the',
 'lowest',
 'rates',
 '.',
 'life',
 'quote',
 'savings',
 'is',
 'fast',
 ',',
 'easy',
 'and',
 'saves',
 'you',
 'money',
 '!',
 'let',
 'us',
 'help',
 'you',
 'get',
 'started',
 'with',
 'the',
 'best',
 'values',
 'in',
 'the',
 'country',
 'on',
 'new',
 'coverage',
 '.',
 'you',
 'can',
 'save',
 'hundreds',
 'or',
 'even',
 'thousands',
 'of',
 'dollars',
 'by',
 'requesting',
 'a',
 'free',
 'quote',
 'from',
 'lifequote',
 'savings',
 

In [33]:
[w for w in tokens if w.isalnum()]

['life',
 'insurance',
 'why',
 'pay',
 'more',
 'save',
 'up',
 'to',
 '70',
 'on',
 'life',
 'insurance',
 'why',
 'spend',
 'more',
 'than',
 'you',
 'have',
 'to',
 'life',
 'quote',
 'savings',
 'ensuring',
 'your',
 'family',
 'financial',
 'security',
 'is',
 'very',
 'important',
 'life',
 'quote',
 'savings',
 'makes',
 'buying',
 'life',
 'insurance',
 'simple',
 'and',
 'affordable',
 'we',
 'provide',
 'free',
 'access',
 'to',
 'the',
 'very',
 'best',
 'companies',
 'and',
 'the',
 'lowest',
 'rates',
 'life',
 'quote',
 'savings',
 'is',
 'fast',
 'easy',
 'and',
 'saves',
 'you',
 'money',
 'let',
 'us',
 'help',
 'you',
 'get',
 'started',
 'with',
 'the',
 'best',
 'values',
 'in',
 'the',
 'country',
 'on',
 'new',
 'coverage',
 'you',
 'can',
 'save',
 'hundreds',
 'or',
 'even',
 'thousands',
 'of',
 'dollars',
 'by',
 'requesting',
 'a',
 'free',
 'quote',
 'from',
 'lifequote',
 'savings',
 'our',
 'service',
 'will',
 'take',
 'you',
 'less',
 'than',
 '5',
 'mi

In [34]:
import re

In [35]:
re.sub(r'-?\d+(\.\d+)?(e-?\d+)?', 'NUMBER', '-1.67e-3')

'NUMBER'

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
ctv= CountVectorizer()

In [40]:
# NOTE(review): `tokens` is a list of individual tokens, so each token is
# passed to the vectorizer as if it were a separate document; the result has
# one row per token rather than one row for the whole email.
ctv.fit_transform(tokens).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [41]:
ctv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)

In [42]:
matrix = ctv.fit_transform([extracted.lower()])

In [43]:
matrix.toarray()

array([[ 4,  2,  1,  1,  3,  1, 10,  1,  2,  2,  1,  1,  1,  1,  5,  1,
         1,  1,  1,  3,  1,  1,  1,  2,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  2,  1,  1,  1,  1,  2,  1,  1,  2,  3,  2,  1,  1,  1,  2,
         1,  2,  1,  4,  5,  1,  3,  1,  1,  7,  1,  1,  1,  1,  1,  1,
         1,  2,  1,  3,  3,  1,  2,  1,  2,  1,  1,  1,  5,  1,  1,  1,
         1,  1,  1,  3,  1,  4,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,
         5,  2,  1,  6,  1,  1,  2,  1,  1,  2,  1,  1,  2,  1,  1,  1,
         8,  3]])

In [44]:
ctv.vocabulary_

{'life': 57,
 'insurance': 52,
 '-': 5,
 'why': 108,
 'pay': 71,
 'more': 65,
 '?': 9,
 'save': 83,
 'up': 102,
 'to': 99,
 '70': 8,
 '%': 1,
 'on': 68,
 '.': 6,
 'spend': 91,
 'than': 95,
 'you': 112,
 'have': 45,
 'quote': 76,
 'savings': 85,
 'ensuring': 34,
 'your': 113,
 'family': 38,
 "'s": 3,
 'financial': 40,
 'security': 86,
 'is': 54,
 'very': 105,
 'important': 50,
 'makes': 62,
 'buying': 20,
 'simple': 89,
 'and': 14,
 'affordable': 12,
 'we': 106,
 'provide': 75,
 'free': 42,
 'access': 11,
 'the': 96,
 'best': 19,
 'companies': 24,
 'lowest': 60,
 'rates': 77,
 'fast': 39,
 ',': 4,
 'easy': 32,
 'saves': 84,
 'money': 64,
 '!': 0,
 'let': 56,
 'us': 103,
 'help': 46,
 'get': 44,
 'started': 92,
 'with': 111,
 'values': 104,
 'in': 51,
 'country': 27,
 'new': 66,
 'coverage': 28,
 'can': 22,
 'hundreds': 48,
 'or': 69,
 'even': 36,
 'thousands': 98,
 'of': 67,
 'dollars': 30,
 'by': 21,
 'requesting': 81,
 'a': 10,
 'from': 43,
 'lifequote': 58,
 'our': 70,
 'service': 87

In [45]:
import urlextract

url_extractor = urlextract.URLExtract()
some_text = "Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"
print(url_extractor.find_urls(some_text))

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


In [46]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem.snowball import SnowballStemmer

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """Turn parsed emails into a sparse matrix of word counts.

    Each email is flattened to text with ``extract_email_info``, normalised by
    ``preprocess_`` and counted with a ``CountVectorizer`` held in ``self.ctv``.

    Parameters
    ----------
    lower_case : bool, default=True
        Lower-case the text before any replacement is made.
    remove_punctuation : bool, default=True
        Replace every run of non-word characters with a single space.
    replace_urls : bool, default=True
        Replace each URL found by the module-level ``url_extractor`` with
        ``" URL "``. Ignored when ``url_extractor`` is ``None``.
    replace_numbers : bool, default=True
        Replace each number with ``"NUMBER"``.
    stemmer : object with a ``stem(word)`` method, or None
        Applied to every token produced by the default ``CountVectorizer``
        analyzer. ``None`` disables stemming. The default instance is created
        once, when the class is defined, and is shared by every transformer
        that relies on the default.
    use_word_tokenize : bool, default=True
        Stored on the instance but not currently used by any method.
    """

    def __init__(self, lower_case=True,
                 remove_punctuation=True, replace_urls=True,
                 replace_numbers=True, stemmer=nltk.PorterStemmer(),
                 use_word_tokenize=True):
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemmer = stemmer
        self.use_word_tokenize = use_word_tokenize
        # Kept so that ``ctv`` exists right after construction, as before;
        # ``fit`` and ``fit_transform`` replace it with a fresh vectorizer.
        self.ctv = self.build_count_vectorizer()

    def get_vocabulary(self):
        """Return the token -> column index mapping learned by the last fit."""
        return self.ctv.vocabulary_

    def preprocess_(self, email):
        """Convert one email into the normalised text fed to the vectorizer.

        Steps, each controlled by the matching constructor flag: lower-casing,
        URL replacement, number replacement, punctuation removal.
        """
        text = extract_email_info(email)
        if self.lower_case:
            text = text.lower()
        if self.replace_urls and url_extractor is not None:
            # Longest first, so a shorter URL that is a substring of a longer
            # one cannot break the longer one up before it is replaced.
            urls = sorted(set(url_extractor.find_urls(text)), key=len, reverse=True)
            for url in urls:
                text = text.replace(url, " URL ")
        if self.replace_numbers:
            # Optional sign, digits, optional fraction, optional exponent.
            text = re.sub(r'-?\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
        if self.remove_punctuation:
            text = re.sub(r'\W+', ' ', text)
        return text

    def build_count_vectorizer(self):
        """Create an unfitted ``CountVectorizer`` matching the current ``stemmer``.

        With a stemmer, tokens come from the default ``CountVectorizer``
        analyzer and are then stemmed; without one, a plain
        ``CountVectorizer`` is returned.
        """
        if self.stemmer is None:
            return CountVectorizer()

        # Build the default analyzer once instead of once per document.
        analyzer = CountVectorizer().build_analyzer()
        stem = self.stemmer.stem

        def stemmed_words(doc):
            return (stem(w) for w in analyzer(doc))

        return CountVectorizer(analyzer=stemmed_words)

    def fit(self, X, y=None):
        """Learn the vocabulary from the emails in ``X`` and return ``self``."""
        texts = [self.preprocess_(message) for message in X]
        # Rebuild so that parameters changed after construction (for example
        # through ``set_params``) are honoured.
        self.ctv = self.build_count_vectorizer()
        self.ctv.fit(texts)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` and return its count matrix, preprocessing each email once.

        The inherited implementation calls ``fit`` and then ``transform``,
        which runs ``preprocess_`` twice per email.
        """
        texts = [self.preprocess_(message) for message in X]
        self.ctv = self.build_count_vectorizer()
        return self.ctv.fit_transform(texts)

    def transform(self, X, y=None):
        """Return the sparse count matrix of ``X`` using the fitted vocabulary."""
        texts = [self.preprocess_(message) for message in X]
        return self.ctv.transform(texts)

In [47]:
EmailToWordCounterTransformer().preprocess_(spam_emails[0])

'life insurance why pay more save up to NUMBER on life insurance why spend more than you have to life quote savings ensuring your family s financial security is very important life quote savings makes buying life insurance simple and affordable we provide free access to the very best companies and the lowest rates life quote savings is fast easy and saves you money let us help you get started with the best values in the country on new coverage you can save hundreds or even thousands of dollars by requesting a free quote from lifequote savings our service will take you less than NUMBER minutes to complete shop and compare save up to NUMBER on all types of life insurance click here for your free quote protecting your family is the best investment you ll ever make if you are in receipt of this email in error and or wish to be removed from our list please click here and type remove if you reside in any state which prohibits e mail solicitations for insurance please disregard this email '

In [48]:
EmailToWordCounterTransformer().build_count_vectorizer().fit_transform([EmailToWordCounterTransformer().preprocess_(spam_emails[0])]).toarray()

array([[1, 1, 1, 6, 1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
        1, 1, 1, 2, 1, 1, 2, 3, 2, 1, 1, 1, 2, 1, 2, 1, 4, 5, 1, 3, 1, 1,
        7, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 3, 3, 3, 2, 2, 1, 2, 1, 1, 1, 5,
        1, 1, 2, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 5, 2, 1, 6, 2, 2,
        1, 1, 2, 1, 2, 1, 1, 1, 1, 8, 3]])

In [49]:
EmailToWordCounterTransformer().fit([spam_emails[0]])

In [50]:
EmailToWordCounterTransformer().fit_transform(spam_emails[:3])

<3x144 sparse matrix of type '<class 'numpy.int64'>'
	with 211 stored elements in Compressed Sparse Row format>

In [51]:
# Sanity check on the first three training emails: fit a fresh transformer and
# inspect the vocabulary it learned.
X_few = X_train[:3]
trasformer = EmailToWordCounterTransformer()  # NOTE(review): misspelled name ("trasformer"); a separate `transformer` is created in a later cell
X_few_wordcounts = trasformer.fit_transform(X_few)
# X_few_wordcounts.toarray()
trasformer.get_vocabulary()

{'re': 110,
 'hanson': 54,
 'sept': 119,
 'number': 96,
 'messag': 84,
 'in': 64,
 'the': 134,
 'nation': 92,
 'review': 114,
 'chuck': 21,
 'murcko': 88,
 'wrote': 155,
 'stuff': 127,
 'yawn': 157,
 'vox': 146,
 'found': 46,
 'father': 38,
 'on': 98,
 'religion': 112,
 'some': 125,
 'interest': 67,
 'quot': 108,
 'url': 144,
 'thoma': 137,
 'jefferson': 70,
 'have': 55,
 'examin': 35,
 'all': 4,
 'known': 75,
 'superstit': 129,
 'of': 97,
 'word': 153,
 'and': 8,
 'do': 25,
 'not': 94,
 'find': 40,
 'our': 101,
 'particular': 104,
 'christian': 20,
 'one': 99,
 'redeem': 111,
 'featur': 39,
 'they': 135,
 'are': 9,
 'alik': 3,
 'fabl': 36,
 'mytholog': 91,
 'million': 85,
 'innoc': 66,
 'men': 83,
 'women': 152,
 'children': 19,
 'sinc': 123,
 'introduct': 68,
 'been': 14,
 'burnt': 16,
 'tortur': 139,
 'fine': 41,
 'imprison': 63,
 'what': 150,
 'ha': 51,
 'effect': 30,
 'thi': 136,
 'coercion': 22,
 'to': 138,
 'make': 79,
 'half': 52,
 'world': 154,
 'fool': 43,
 'other': 100,
 'hy

In [52]:
transformer = EmailToWordCounterTransformer()

In [53]:
X_train_transformed = transformer.fit_transform(X_train)

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Baseline: 3-fold cross-validation of a logistic regression on the training
# word counts. No `scoring` argument is passed, so the estimator's default
# scorer is used.
# NOTE(review): X_train_transformed comes from a transformer fitted on all of
# X_train, so the vocabulary is shared across the folds.
log_clf = LogisticRegression(max_iter=1000, random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3)
score.mean()

0.9845833333333333

In [55]:
from sklearn.metrics import precision_score, recall_score

# The test emails are only transformed: the vocabulary fitted on the training
# set is reused.
X_test_transformed = transformer.transform(X_test)

# Refit a fresh classifier on the full training set before scoring the test set.
log_clf = LogisticRegression(max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

# Spam is the positive class (label 1) for both metrics.
print(f"Precision: {precision_score(y_test, y_pred):.2%}")
print(f"Recall: {recall_score(y_test, y_pred):.2%}")

Precision: 97.87%
Recall: 96.84%
