In [1]:
import pickle
from pathlib import Path
from IPython.display import display

import numpy as np
import pandas as pd
from pandarallel import pandarallel
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from authorship_id.utils import preprocessing, setup_nltk

In [2]:
def parse_row(r: pd.Series):
    """Turn one raw e-mail row into a labelled, POS-tagged row.

    Parameters
    ----------
    r : pd.Series
        One row of the raw e-mail frame. It must contain ``"message"`` (the
        full message text, headers included) and ``"file"``.

    Returns
    -------
    pd.Series
        A new Series: ``r`` without ``"file"``, with ``"label"`` set to the
        ``From`` header, ``"message"`` replaced by the message body and
        ``"pos_tags"`` set to ``pos_tag_text(body)``. The input row is left
        untouched.

    Raises
    ------
    KeyError
        If ``r`` has no ``"message"`` or ``"file"`` entry.
    """
    # Imports live inside the function so that it is self-contained when it is
    # shipped to worker processes -- presumably needed for pandarallel's
    # multiprocessing workers; confirm before hoisting them to the import cell.
    import email
    from authorship_id.utils import pos_tag_text

    msg = email.message_from_string(r["message"])

    if msg.is_multipart():
        # For multipart mail get_payload() returns a list of sub-messages, not
        # text; flatten the textual leaf parts into a single string instead.
        body = "\n".join(
            part.get_payload()
            for part in msg.walk()
            if not part.is_multipart() and part.get_content_maintype() == "text"
        )
    else:
        body = msg.get_payload()

    # Build a fresh Series rather than dropping in place: the row handed in by
    # apply() should not be mutated by the applied function.
    parsed = r.drop(["message", "file"])
    parsed["label"] = msg["from"]
    parsed["message"] = body
    parsed["pos_tags"] = pos_tag_text(body)
    return parsed

In [3]:
# Initialise pandarallel with a progress bar; `DataFrame.parallel_apply` is
# used further down to parse the e-mails.
pandarallel.initialize(progress_bar=True)
# Project helper from authorship_id.utils -- presumably the source of the
# [nltk_data] download log lines (punkt and averaged-perceptron tagger
# resources) in this cell's output.
setup_nltk()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [4]:
# Directory holding the raw dataset and every artifact this notebook writes.
dataset_dir = Path("../../enron_dataset")

# Raw e-mails: one row per message with its file path and full message text.
df = pd.read_csv(dataset_dir.joinpath("emails.csv"))
display(df.head(n=5))

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [5]:
# Show the first raw message in full (row 0, column 1 -- the "message" column
# in the preview above) to see the header layout that has to be parsed.
print(df.iat[0, 1])

Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>
Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)
From: phillip.allen@enron.com
To: tim.belden@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Tim Belden <Tim Belden/Enron@EnronXGate>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Here is our forecast

 


In [6]:
%%time
# Run parse_row over every row in parallel via pandarallel. This is the
# expensive step of the notebook (about 22 minutes wall time in the recorded run).
# NOTE: `df` is rebound to the parsed frame, so this cell cannot simply be
# re-run: parse_row expects the raw "file" and "message" columns, which are
# gone afterwards. Reload emails.csv first.
df = df.parallel_apply(parse_row, axis=1)
display(df)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=64676), Label(value='0 / 64676')))…

Unnamed: 0,label,message,pos_tags
0,phillip.allen@enron.com,Here is our forecast\n\n,RB VBZ PRP$ NN
1,phillip.allen@enron.com,Traveling to have a business meeting takes the...,VBG TO VB DT NN NN VBZ DT NN IN IN DT NN . RB ...
2,phillip.allen@enron.com,test successful. way to go!!!,NN JJ . NN TO VB . . .
3,phillip.allen@enron.com,"Randy,\n\n Can you send me a schedule of the s...","NNP , NNP PRP VBP PRP DT NN IN DT JJ CC NN IN ..."
4,phillip.allen@enron.com,Let's shoot for Tuesday at 11:45.,VB POS NN IN NNP IN CD .
...,...,...,...
517396,john.zufferli@enron.com,This is a trade with OIL-SPEC-HEDGE-NG (John L...,DT VBZ DT NN IN NNP ( NNP NNP POS NN ) CC NNP ...
517397,john.zufferli@enron.com,Some of my position is with the Alberta Term b...,"DT IN PRP$ NN VBZ IN DT NNP NNP NN , PRP MD VB..."
517398,john.zufferli@enron.com,2\n\n -----Original Message-----\nFrom: \tDouc...,"CD : : JJ NN : : : IN : NNP , NNP NNP : NNP , ..."
517399,john.zufferli@enron.com,Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...,NNP NNP NNP NNP CD NNP NNP CD NNP NNP CD NNP N...


CPU times: total: 40.8 s
Wall time: 21min 59s


In [7]:
# Number of most prolific senders kept as classes.
NUM_TOP_AUTHORS = 10

# Sender addresses left out of the ranking (presumably automated or shared
# mailboxes rather than individual authors).
non_authors = [
    'enron.announcements@enron.com',
    'no.address@enron.com',
    '40enron@enron.com',
    'outlook.team@enron.com',
    'arsystem@enron.com',
    'updates@enron.com',
    'calendar@enron.com',
    'support@enron.com',
    'system@enron.com',
    'donotreply@enron.com',
    'mailer-daemon@enron.com',
    'postmaster@enron.com',
]

# Messages per sender, ignoring the excluded addresses.
author_counts = df.loc[~df['label'].isin(non_authors), 'label'].value_counts()
top_authors = list(author_counts.nlargest(NUM_TOP_AUTHORS).index)
display(top_authors)

# Keep only the messages sent by the selected authors.
df = df.loc[df['label'].isin(top_authors)]

['kay.mann@enron.com',
 'vince.kaminski@enron.com',
 'jeff.dasovich@enron.com',
 'pete.davis@enron.com',
 'chris.germany@enron.com',
 'sara.shackleton@enron.com',
 'tana.jones@enron.com',
 'steven.kean@enron.com',
 'kate.symes@enron.com',
 'matthew.lenhart@enron.com']

In [8]:
%%time
df.to_csv(dataset_dir / "emails_parsed.csv")

CPU times: total: 3.12 s
Wall time: 3.13 s


In [9]:
%%time

unique_labels = df['label'].unique()
label2id: dict[str, np.int64] = {
    lbl: np.int64(idx) for idx, lbl in enumerate(unique_labels)
}
id2label: dict[np.int64, str] = {idx: lbl for lbl, idx in label2id.items()}
df['label'] = df['label'].map(lambda x: label2id[x])

train_texts, test_texts, y_train, y_test = train_test_split(
    df['pos_tags'], df['label'], test_size=0.2, random_state=SEED
)
ngram_ranges_to_test = [(1, 1), (2, 2), (3, 3)]
min_df_to_test = [1, 2]

preprocessed_X = {}

print(f"Number of authors (classes): {NUM_TOP_AUTHORS}")
n_lda_components = NUM_TOP_AUTHORS - 1

for ngram_range in ngram_ranges_to_test:
    for min_df in min_df_to_test:
        print(f"--- ngram_range: {ngram_range}, min_df: {min_df} ---")
        X_train_orig, X_test_orig, _ = preprocessing(
                train_texts,
                test_texts,
                ngram_range=ngram_range,
                min_doc_freq=min_df,
                vectorizer_name="tfidf",
            )
        display(("X_train", "y_train", "X_test", "y_test"))
        display((X_train_orig.shape, y_train.shape, X_test_orig.shape, y_test.shape))
        n_lsa_components = min(500, X_train_orig.shape[1], X_train_orig.shape[0])
        lsa = TruncatedSVD(n_components=n_lsa_components, random_state=SEED)
        print(f"LSA components: {n_lsa_components}")

        lda = LinearDiscriminantAnalysis(n_components=min(n_lda_components, n_lsa_components-1))

        reduction_pipeline = Pipeline([
            ('lsa', lsa),
            ('lda', lda)
        ])

        X_train_reduced = reduction_pipeline.fit_transform(X_train_orig, y_train)
        X_test_reduced = reduction_pipeline.transform(X_test_orig)

        print(f"Reduced dimensions after LSA+LDA: {X_train_reduced.shape}")
        preprocessed_X[ngram_range, min_df] = X_train_reduced, X_test_reduced

Number of authors (classes): 10
--- ngram_range: (1, 1), min_df: 1 ---


('X_train', 'y_train', 'X_test', 'y_test')

((76154, 45), (76154,), (19039, 45), (19039,))

LSA components: 45
Reduced dimensions after LSA+LDA: (76154, 9)
--- ngram_range: (1, 1), min_df: 2 ---


('X_train', 'y_train', 'X_test', 'y_test')

((76154, 45), (76154,), (19039, 45), (19039,))

LSA components: 45
Reduced dimensions after LSA+LDA: (76154, 9)
--- ngram_range: (2, 2), min_df: 1 ---


('X_train', 'y_train', 'X_test', 'y_test')

((76154, 1640), (76154,), (19039, 1640), (19039,))

LSA components: 500
Reduced dimensions after LSA+LDA: (76154, 9)
--- ngram_range: (2, 2), min_df: 2 ---


('X_train', 'y_train', 'X_test', 'y_test')

((76154, 1619), (76154,), (19039, 1619), (19039,))

LSA components: 500
Reduced dimensions after LSA+LDA: (76154, 9)
--- ngram_range: (3, 3), min_df: 1 ---


('X_train', 'y_train', 'X_test', 'y_test')

((76154, 28179), (76154,), (19039, 28179), (19039,))

LSA components: 500
Reduced dimensions after LSA+LDA: (76154, 9)
--- ngram_range: (3, 3), min_df: 2 ---


('X_train', 'y_train', 'X_test', 'y_test')

((76154, 26813), (76154,), (19039, 26813), (19039,))

LSA components: 500
Reduced dimensions after LSA+LDA: (76154, 9)
CPU times: total: 17min 34s
Wall time: 2min 38s


In [10]:
# File name -> object to pickle; everything is written into dataset_dir.
artifacts = {
    "preprocessed_X.pkl": preprocessed_X,
    "preprocessed_y.pkl": (y_train, y_test),
    "label2id.pkl": label2id,
    "id2label.pkl": id2label,
}

for filename, payload in artifacts.items():
    with open(dataset_dir / filename, "wb") as f:
        pickle.dump(payload, f)