In [None]:
# ========== IMPORTS ==========

import os
import re
from typing import Optional

import nltk
import pandas as pd
import tldextract
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


In [2]:
# ========== INITIALIZATION ==========

# Fetch every NLTK resource the notebook relies on into the local
# 'nltk_data' folder; the order of the downloads is preserved.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(resource, download_dir='nltk_data')

# Add the local folder to NLTK's search path so the resources can be located.
nltk.data.path.append('./nltk_data')

# Shared lemmatizer instance reused by the feature helpers below.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# ========== READ DATA ==========

# Load the raw e-mail table from the working directory and preview
# its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='emails.csv')
df.head(n=5)

Unnamed: 0,subject,body,label
0,Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1
1,Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1
2,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1
3,Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0
4,SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1


In [4]:
# ========== CONSTANTS ==========

# Matches "http://" / "https://" URLs as well as bare "www." links. A match
# extends until the first whitespace character, angle bracket or double quote.
URL_PATTERN = r'https?:\/\/[^\s<>"]+|www\.[^\s<>"]+'

# Domains of URL-shortening / redirect services.
GENERAL_REDIRECTS = {
    'bit.ly',
    'tinyurl.com',
    'ow.ly',
    'rebrand.ly',
    'is.gd',
    'buff.ly',
    'adf.ly',
    'shorte.st',
    'cutt.ly',
    'clk.im',
    'yellkey.com',
    'v.gd',
}

# Vocabulary used to score how "urgent" an e-mail sounds, grouped by theme.
URGENT_KEYWORDS = {
    # time pressure
    'urgent', 'immediately', 'asap', 'now', 'deadline', 'final',
    'expire', 'expiration',
    # warnings and importance
    'important', 'alert', 'warning', 'attention', 'notice', 'critical',
    'risk', 'problem', 'issue',
    # calls to action
    'action', 'required', 'verify', 'validate', 'confirm', 'confirmation',
    'click', 'respond', 'response', 'update', 'reactivate',
    # account and access
    'account', 'login', 'log-in', 'access', 'security', 'suspend',
    'suspended', 'locked', 'limited', 'deactivation', 'unauthorized',
    'breach', 'failure', 'failed',
    # money
    'payment', 'invoice', 'bill', 'charge', 'refund', 'dispute', 'settlement',
    # legal
    'penalty', 'compliance', 'legal', 'violation',
}

# English stopword list from NLTK, held as a set for fast membership tests.
STOP_WORDS = {*stopwords.words('english')}

In [None]:
# ========== METHODS ==========

def extract_urls(text: str) -> list:
    """Return every substring of ``text`` matched by ``URL_PATTERN``.

    Anything that is not a ``str`` (for example a missing value) produces
    an empty list.
    """
    return re.findall(URL_PATTERN, text) if isinstance(text, str) else []

def count_urls(text: str) -> int:
    """Return how many ``URL_PATTERN`` matches ``text`` contains.

    Non-string input counts as zero, because ``extract_urls`` returns an
    empty list for it.
    """
    return len(extract_urls(text))
    
def get_domain(url: str) -> Optional[str]:
    """Return the ``domain.suffix`` part of ``url`` in lower case.

    Args:
        url: URL or host name to analyse.

    Returns:
        A string such as ``'bit.ly'``, or ``None`` when ``url`` is not a
        string or when ``tldextract`` finds no domain or no suffix in it.
    """
    # Missing values (NaN / None) cannot be parsed as URLs.
    if not isinstance(url, str):
        return None
    ext = tldextract.extract(url)
    if ext.domain and ext.suffix:
        # Host names are case-insensitive; normalise so that results compare
        # equal to lower-case lookup tables such as GENERAL_REDIRECTS.
        return f"{ext.domain}.{ext.suffix}".lower()
    return None

def count_urgent_words(text):
    """Count the tokens of ``text`` whose lemma is in ``URGENT_KEYWORDS``.

    ``text`` is converted with ``str()`` and lower-cased before it is
    tokenized, and every token is lemmatized before the lookup.
    """
    lowered = str(text).lower()
    hits = 0
    for token in word_tokenize(lowered):
        if lemmatizer.lemmatize(token) in URGENT_KEYWORDS:
            hits += 1
    return hits

In [6]:
# ========== TOTAL URLS ==========

# URLs found in the subject plus URLs found in the body, per e-mail.
df['num_urls'] = df['subject'].map(count_urls) + df['body'].map(count_urls)

In [7]:
# ========== REDIRECTS ==========

def _count_redirects(subject, body) -> int:
    """Count the URLs in subject + body that point at a known redirect service.

    The domain of every extracted URL is lower-cased before it is looked up in
    ``GENERAL_REDIRECTS``, so hosts written as ``Bit.ly`` or ``TinyURL.com``
    are recognised as well.
    """
    combined = str(subject) + ' ' + str(body)
    return sum(
        1 for url in extract_urls(combined)
        # get_domain returns None when no domain/suffix is found.
        if (get_domain(url) or '').lower() in GENERAL_REDIRECTS
    )


df['num_redirects'] = [
    _count_redirects(subject, body)
    for subject, body in zip(df['subject'], df['body'])
]

In [8]:
# ========== WORD COUNT ==========

def _count_body_words(text) -> int:
    """Return the number of whitespace-separated tokens in ``text``.

    A missing value counts as zero words. Without this guard ``str(NaN)``
    yields the text ``'nan'``, which would be counted as one word.
    """
    if pd.isna(text):
        return 0
    return len(str(text).split())


df['num_words'] = df['body'].map(_count_body_words)

In [9]:
# ========== NON-ASCII CHARS ==========

# Number of characters in the body that fall outside the ASCII range.
df['num_chars_foreign'] = df['body'].map(
    lambda body: len([ch for ch in str(body) if not ch.isascii()])
)

In [10]:
# ========== SPECIAL CHARS ==========

# Number of characters in the body that are neither alphanumeric nor whitespace.
df['num_chars_special'] = df['body'].map(
    lambda body: len(
        [ch for ch in str(body) if not (ch.isalnum() or ch.isspace())]
    )
)

In [11]:
# ========== URGENCY ==========

# Urgency score: how many body tokens lemmatize to an entry of URGENT_KEYWORDS.
df['num_urgent_words'] = df['body'].map(count_urgent_words)

In [12]:
# ========== STOPWORDS COUNT ==========

# Number of tokens of the lower-cased body that are English stopwords.
df['num_stopwords'] = df['body'].map(
    lambda body: len(
        [token for token in word_tokenize(str(body).lower()) if token in STOP_WORDS]
    )
)

In [13]:
# ========== NO STOPWORDS COLUMN ==========

def _strip_stopwords(text) -> str:
    """Return ``text`` lower-cased with stopwords and non-alphanumeric tokens removed.

    The remaining tokens are joined with single spaces. A missing value
    yields an empty string; without this guard ``str(NaN)`` would put the
    literal token ``'nan'`` into the cleaned text.
    """
    if pd.isna(text):
        return ''
    return ' '.join(
        token for token in word_tokenize(str(text).lower())
        if token.isalnum() and token not in STOP_WORDS
    )


df['body_no_stopwords'] = df['body'].map(_strip_stopwords)

In [14]:
# Summary statistics for the numeric columns (label and the engineered counts).
df.describe()

Unnamed: 0,label,num_urls,num_redirects,num_words,num_chars_foreign,num_chars_special,num_urgent_words,num_stopwords
count,82138.0,82138.0,82138.0,82138.0,82138.0,82138.0,82138.0,82138.0
mean,0.518773,1.942073,0.002252,273.637999,1.472254,142.146059,2.640495,84.412245
std,0.49965,13.261647,0.099219,812.450941,55.468074,1158.892219,10.22566,221.385045
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,51.0,0.0,12.0,0.0,14.0
50%,1.0,0.0,0.0,130.0,0.0,39.0,1.0,37.0
75%,1.0,1.0,0.0,302.0,0.0,123.0,3.0,89.0
max,1.0,3133.0,14.0,127119.0,14154.0,215985.0,2301.0,16873.0


In [None]:
# Persist the augmented table one directory above the notebook,
# leaving the DataFrame index out of the file.
df.to_csv(path_or_buf='../emails_augmented.csv', index=False)