In [None]:
!pip install -U spacy gensim transformers numpy scipy scikit-learn # restart the session after run

In [None]:
# Mount Google Drive (Colab-only import) and read the Enron e-mail CSV into a DataFrame.
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')
# NOTE(review): despite its name, `data_dir` holds the path of the CSV file itself, not a directory.
data_dir = "/content/drive/MyDrive/Enron-emails.csv"
# Later cells pass entries of the 'message' column of this frame to process_raw_email.
raw_emails = pd.read_csv(data_dir)

Mounted at /content/drive


## Preprocessing

In [None]:
import email
import re
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import spacy
from transformers import pipeline
from spacy.lang.en.stop_words import STOP_WORDS
from gensim.utils import simple_preprocess

# Fetch NLTK's 'punkt_tab' resource; presumably the data sent_tokenize needs — TODO confirm.
nltk.download('punkt_tab')

# --------- Helper: Softmax ---------
def softmax(logits):
    """Return the softmax of ``logits`` along the last axis.

    Parameters
    ----------
    logits : array-like
        Raw scores; the last axis is the class axis. Leading axes, if any,
        are treated as independent rows.

    Returns
    -------
    numpy.ndarray
        Array of the same shape whose last axis sums to 1.
    """
    logits = np.asarray(logits)
    # Shift each row by its own maximum. Subtracting one global maximum would
    # let a row of much smaller scores underflow to all zeros and yield 0/0.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=-1, keepdims=True)

# --------- Step 1: Clean and Extract Body ---------
def extract_email_body(raw_email_text):
    """Parse a raw e-mail string into its main headers and plain-text body.

    Parameters
    ----------
    raw_email_text : str
        Full message source (headers, blank line, body).

    Returns
    -------
    dict
        Keys ``"subject"``, ``"from"``, ``"to"``, ``"date"`` and ``"body"``.
        Missing headers are ``""``. ``"body"`` is the decoded payload of a
        single-part message, or of the first non-empty ``text/plain`` part of
        a multipart message; it is ``""`` when no such payload exists.
        If parsing raises, the error is printed and every value is ``""``.
    """
    try:
        msg = email.message_from_string(raw_email_text)
        parsed = {
            "subject": msg.get("Subject", ""),
            "from": msg.get("From", ""),
            "to": msg.get("To", ""),
            "date": msg.get("Date", ""),
            # Default to an empty body so the headers survive even when no
            # usable payload is found (previously they were discarded).
            "body": "",
        }

        if msg.is_multipart():
            candidates = (
                part for part in msg.walk()
                if part.get_content_type() == 'text/plain'
            )
        else:
            candidates = (msg,)

        for part in candidates:
            payload = part.get_payload(decode=True)
            if payload:
                parsed["body"] = payload.decode(errors="ignore")
                break
        return parsed

    except Exception as e:
        # Best effort: report the problem and fall back to an all-empty record.
        print(f"Error parsing email: {e}")
        return {
            "subject": "",
            "from": "",
            "to": "",
            "date": "",
            "body": ""
        }

# --------- Step 2: Get Most Recent Message ---------
def extract_main_message(text):
    """Return the most recent message in an e-mail body, dropping quoted history.

    A "marker" is a line containing two or more dashes followed by
    "forwarded" or "original message" (case-insensitive).

    * No marker: the stripped text is returned.
    * Non-empty text before the first marker: that text is returned.
    * Marker at the very start: the marker line and the header block that
      follows it (everything up to the first ``Subject: ...`` line plus the
      blank line after it) are skipped, and the same rules are applied to the
      remainder. If no such header block is found, the stripped text after
      the marker is returned.

    Parameters
    ----------
    text : str
        E-mail body. Non-string input yields ``""``.

    Returns
    -------
    str
    """
    if not isinstance(text, str):
        return ""

    marker = re.compile(r'-{2,}.*?(forwarded|original message).*', re.IGNORECASE)
    header_block = re.compile(r'.*?Subject:.*?\n\n', flags=re.IGNORECASE | re.DOTALL)

    remaining = text
    while True:
        found = marker.search(remaining)
        if found is None:
            return remaining.strip()

        before = remaining[:found.start()].strip()
        if before:
            return before

        section = remaining[found.end():]
        # maxsplit=1 is essential: an unbounded split also consumes every
        # later "Subject: ..." block, which made the piece between the first
        # and second header come back as an empty string for nested forwards.
        pieces = header_block.split(section, maxsplit=1)
        if len(pieces) < 2:
            return section.strip()
        # Each pass drops at least the marker line, so the loop terminates.
        remaining = pieces[1]

# --------- Step 3: Strip Headers from Body ---------
def clean_messy_body(text):
    """Cut a body off at the first embedded reply-header block.

    A header block is recognised as a blank line followed by ``From:``,
    ``To:``, ``Subject:`` or ``Date:`` (case-insensitive).

    Parameters
    ----------
    text : str
        Message body. Non-string input yields ``""``, matching the other
        cleaning helpers in this notebook.

    Returns
    -------
    str
        The stripped text before the header block, or ``text`` unchanged
        (not stripped) when no header block is present.
    """
    if not isinstance(text, str):
        return ""
    reply_splitter = re.compile(r'(\n\n)(From:|To:|Subject:|Date:)', re.IGNORECASE)
    match = reply_splitter.search(text)
    if match is None:
        return text
    return text[:match.start()].strip()

# --------- Step 4: Final Cleanup ---------
def clean_email_text(text):
    """Normalise an e-mail body to a single line of printable text.

    Steps, in order:
      1. every run of whitespace (including newlines and tabs) becomes one space;
      2. characters for which ``str.isprintable()`` is false are removed;
      3. runs of spaces exposed by step 2 are collapsed again;
      4. leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : str
        Raw body text. Non-string input yields ``""``.

    Returns
    -------
    str
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'\s+', ' ', text)
    text = ''.join(c for c in text if c.isprintable())
    # Removing a control character that sat between two spaces leaves a double
    # space behind, so collapse once more after filtering.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()


# --------- Step 5: Stop words Removal & Lemmatization ---------
# Corpus-specific filler words removed in addition to spaCy's English stop words.
enron_stopwords = {
    'enron', 'attached', 'houston', 'please', 'thank', 'regards',
    'forwarded', 'original', 'email', 'message', 'subject', 'mailto',
    'sent', 'pm', 'am', 'also', 'let', 'know', 'get', 'one', 'would',
    'could', 'us', 'need', 'make', 'see', 'appreciate',
}

# Combined stop-word set consulted by remove_stopwords.
stop_words = STOP_WORDS.union(enron_stopwords)

# spaCy pipeline used by lemmatize_text; the parser and NER components are disabled.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def lemmatize_text(text):
    """Lower-case ``text`` and return the lemmas of its alphabetic tokens.

    The text is run through the module-level spaCy pipeline ``nlp``; tokens
    whose ``is_alpha`` flag is false are dropped and the remaining lemmas are
    joined with single spaces. Non-string input yields ``""``.
    """
    if isinstance(text, str):
        lemmas = []
        for token in nlp(text.lower()):
            if token.is_alpha:
                lemmas.append(token.lemma_)
        return " ".join(lemmas)
    return ""

def remove_stopwords(text):
    """Drop every whitespace-separated word found in ``stop_words``.

    Words are compared case-insensitively (each is lower-cased before the
    lookup) but kept in their original form. Non-string input yields ``""``.
    """
    if not isinstance(text, str):
        return ""
    kept = filter(lambda word: word.lower() not in stop_words, text.split())
    return " ".join(kept)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Extract Structural Elements

In [None]:
# --------- Load model/tokenizer ---------
# Directory on the mounted Drive holding the saved sequence-classification checkpoint.
MODEL_DIR = "/content/drive/MyDrive/deberta_structural_elements"
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
# Put the model in evaluation mode; it is only used for inference below.
model.eval()
# Class-index -> label-name mapping taken from the checkpoint's config;
# predict_structured_segments lower-cases these names before using them as keys.
id2label = model.config.id2label

# --------- Predict Structure ---------
def predict_structured_segments(email_text, confidence_threshold=0.5, max_length=512):
    """Classify each sentence of an e-mail into a structural section.

    The text is split with ``sent_tokenize``; every sentence is scored by the
    module-level ``model``/``tokenizer`` and assigned the most probable label
    from ``id2label`` (lower-cased). Sentences whose top probability is below
    ``confidence_threshold`` are discarded.

    Parameters
    ----------
    email_text : str
        Cleaned message text.
    confidence_threshold : float, default 0.5
        Minimum top-class probability for a sentence to be kept.
    max_length : int, default 512
        Token limit passed to the tokenizer. Without it ``truncation=True``
        has no effect when the tokenizer carries no predefined maximum (the
        notebook's recorded warning shows exactly that).
        NOTE(review): 512 is assumed to suit this checkpoint — confirm.

    Returns
    -------
    dict
        ``{}`` for empty or non-string input. Otherwise one entry per section
        (always ``greeting``, ``body``, ``closing``, ``signature``; plus any
        other label the model emits) of the form
        ``{"text": <sentences joined by spaces>, "avg_score": <mean confidence>}``;
        sections with no sentences get ``""`` and ``0.0``.
    """
    if not isinstance(email_text, str) or not email_text.strip():
        return {}

    grouped = {
        'greeting': [],
        'body': [],
        'closing': [],
        'signature': []
    }

    for sent in sent_tokenize(email_text):
        inputs = tokenizer(
            sent,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length,
        )
        with torch.no_grad():
            outputs = model(**inputs)
        # Single-sentence batch: take row 0 of the logits.
        probs = softmax(outputs.logits.detach().cpu().numpy()[0])
        label_id = int(np.argmax(probs))
        confidence = float(probs[label_id])
        if confidence < confidence_threshold:
            continue

        label = id2label[label_id].lower()
        # setdefault keeps a label outside the four expected sections from
        # raising KeyError; it simply becomes an extra entry in the result.
        grouped.setdefault(label, []).append((sent, confidence))

    result = {}
    for label, items in grouped.items():
        if items:
            texts, scores = zip(*items)
            result[label] = {
                "text": " ".join(texts),
                "avg_score": float(np.mean(scores))
            }
        else:
            result[label] = {
                "text": "",
                "avg_score": 0.0
            }
    return result

## Named Entity Recognition (NER)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Directory on the mounted Drive holding the saved token-classification checkpoint.
NER_MODEL_PATH = "/content/drive/MyDrive/mlm_ner_bert"

ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_PATH)
ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_PATH)

# Token-classification pipeline used by extract_named_entities.
# NOTE(review): with aggregation_strategy="simple" the recorded outputs still contain
# word-piece fragments such as '##gate'; another strategy may merge them — verify.
ner_pipeline = pipeline(
    "ner",
    model=ner_model,
    tokenizer=ner_tokenizer,
    aggregation_strategy="simple"
)

def extract_named_entities(text):
    """Run the module-level ``ner_pipeline`` on ``text`` and simplify its output.

    Parameters
    ----------
    text : str
        Text to tag. Empty, whitespace-only or non-string input yields ``[]``.

    Returns
    -------
    list of dict
        One dict per pipeline result with keys ``"entity"`` (the result's
        ``entity_group``), ``"score"`` (rounded to 3 decimals, as a built-in
        ``float``) and ``"word"``.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    return [
        {
            "entity": res["entity_group"],
            # float() guards against the pipeline handing back a NumPy scalar,
            # which the standard json module cannot serialise.
            "score": round(float(res["score"]), 3),
            "word": res["word"],
        }
        for res in ner_pipeline(text)
    ]

Device set to use cpu


## Theme Prediction


In [None]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
import pickle

# Folder on the mounted Drive holding the saved LDA model, its dictionary and the topic labels.
drive_path = "/content/drive/MyDrive/LDA"
lda_model = LdaModel.load(f"{drive_path}/lda_model.gensim")
dictionary = Dictionary.load(f"{drive_path}/lda_dictionary.dict")
# SECURITY: pickle.load can execute arbitrary code; only load this file from a trusted source.
# predict_topic calls topic_labels.get(topic_id, ...), so a dict-like object is expected here.
with open(f"{drive_path}/topic_labels.pkl", "rb") as f:
    topic_labels = pickle.load(f)

def predict_topic(subject, body, lda_model, dictionary, topic_labels, topn=1):
    """Predict the dominant LDA topic of an e-mail.

    Subject and body are concatenated, lemmatised, stripped of stop words,
    converted to a bag of words with ``dictionary`` and scored by
    ``lda_model``; the highest-probability topic wins.

    Parameters
    ----------
    subject, body : str
        Message subject and cleaned body. Non-string values (e.g. ``None``)
        are ignored rather than being rendered into the text.
    lda_model
        Object providing ``get_document_topics(bow)``.
    dictionary
        Object providing ``doc2bow(tokens)``.
    topic_labels
        Mapping from topic id to a human-readable label.
    topn : int, default 1
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    tuple of (str, float)
        ``(label, probability)``. ``("Unknown", 0.0)`` whenever no topic can
        be determined (no text, no in-vocabulary words, or no topic returned).
        A topic id missing from ``topic_labels`` is labelled ``"Topic <id>"``.
    """
    parts = [part for part in (subject, body) if isinstance(part, str)]
    text = " ".join(parts).strip()
    if not text:
        return "Unknown", 0.0

    clean_text = remove_stopwords(lemmatize_text(text))
    bow = dictionary.doc2bow(clean_text.split())
    if not bow:
        return "Unknown", 0.0

    topics = lda_model.get_document_topics(bow)
    if not topics:
        # Same sentinel as the other "no answer" branches (this one used to return "").
        return "Unknown", 0.0

    top_topic, confidence = max(topics, key=lambda pair: pair[1])
    label = topic_labels.get(top_topic, f"Topic {top_topic}")
    return label, float(confidence)

## Full pipeline

In [None]:
def process_raw_email(raw_email_text):
    """Run one raw e-mail through the whole pipeline.

    The message is parsed with ``extract_email_body``; its body is passed
    through ``extract_main_message``, ``clean_messy_body`` and
    ``clean_email_text`` in that order; the cleaned text is then given to
    ``predict_structured_segments``, ``extract_named_entities`` and (together
    with the subject) ``predict_topic``.

    Returns a dict with keys ``subject``, ``from``, ``to``, ``date``,
    ``structured``, ``named_entities``, ``topic_label`` and ``topic_score``.
    """
    parsed = extract_email_body(raw_email_text)
    subject = parsed.get("subject", "")

    # Cleaning chain: each step consumes the previous step's output.
    text = parsed.get("body", "")
    for step in (extract_main_message, clean_messy_body, clean_email_text):
        text = step(text)

    segments = predict_structured_segments(text)
    entities = extract_named_entities(text)
    label, score = predict_topic(subject, text, lda_model, dictionary, topic_labels)

    record = {"subject": subject}
    for header in ("from", "to", "date"):
        record[header] = parsed.get(header, "")
    record["structured"] = segments
    record["named_entities"] = entities
    record["topic_label"] = label
    record["topic_score"] = score
    return record

def flatten_output(email_json, include_empty_sections=False):
    """Flatten the nested result of ``process_raw_email`` into a one-level dict.

    This replaces two consecutive definitions of the same name, of which only
    the second ever took effect. Default behaviour matches that second
    definition; ``include_empty_sections=True`` restores the "every column
    always present" shape of the first one.

    Parameters
    ----------
    email_json : dict
        Mapping with optional keys ``subject``, ``from``, ``to``, ``date``,
        ``structured``, ``named_entities``, ``topic_label``, ``topic_score``.
        ``structured`` maps section names to
        ``{"text": str, "avg_score": float}``.
    include_empty_sections : bool, default False
        When False, ``greeting``/``closing``/``signature`` (and their
        ``*_score`` columns) are emitted only if the section text is
        non-empty after stripping. When True they are always emitted.

    Returns
    -------
    dict
        Flat record suitable for ``pd.DataFrame([record])``. Missing values
        default to ``""``, ``0.0`` or ``[]``.
    """
    structured = email_json.get("structured", {})
    body = structured.get("body", {})

    row = {
        "subject": email_json.get("subject", ""),
        "from": email_json.get("from", ""),
        "to": email_json.get("to", ""),
        "date": email_json.get("date", ""),
        "body": body.get("text", ""),
        "body_score": body.get("avg_score", 0.0),
        "named_entities": email_json.get("named_entities", []),
        "topic_label": email_json.get("topic_label", ""),
        "topic_score": email_json.get("topic_score", 0.0),
    }

    for field in ("greeting", "closing", "signature"):
        section = structured.get(field, {})
        text = section.get("text", "").strip()
        if text or include_empty_sections:
            row[field] = text
            row[f"{field}_score"] = section.get("avg_score", 0.0)

    return row


In [None]:
# Demo: run the full pipeline on the message stored under index 18 and show the flattened row.
# NOTE(review): this cell is repeated below with only the index changed; a small helper
# taking the index would remove the duplication.
example = raw_emails['message'][18]
structured_output = process_raw_email(example)

flattened_table = pd.DataFrame([flatten_output(structured_output)])
# Show full cell contents; this pandas option is global and stays set for later cells.
pd.set_option('display.max_colwidth', None)
flattened_table

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,subject,from,to,date,body,body_score,named_entities,topic_label,topic_score,signature,signature_score
0,Westgate,phillip.allen@enron.com,pallen70@hotmail.com,"Tue, 3 Oct 2000 09:30:00 -0700 (PDT)","Westgate Enclosed are demographics on the Westgate site from Investor's Alliance. Investor's Alliance says that these demographics are similar to the package on San Marcos that you received earlier. If there are any other questions or information requirements, let me know. Then, let me know your interest level in the Westgate project? San Marcos The property across the street from the Sagewood units in San Marcos is for sale and approved for 134 units. The land is selling for $2.50 per square foot as it is one of only two remaining approved multifamily parcels in West San Marcos, which now has a moratorium on development. Several new studies we have looked at show that the rents for our duplexes and for these new units are going to be significantly higher, roughly $1.25 per square foot if leased for the entire unit on a 12-month lease and $1.30-$1.40 psf if leased on a 12-month term, but by individual room. This property will have the best location for student housing of all new projects, just as the duplexes do now. If this project is of serious interest to you, please let me know as there is a very, very short window of opportunity. The equity requirement is not yet known, but it would be likely to be $300,000 to secure the land. I will know more on this question later today.",0.999931,"[{'entity': 'ORG', 'score': 0.653, 'word': 'Westgate'}, {'entity': 'ORG', 'score': 0.537, 'word': '##gate'}, {'entity': 'ORG', 'score': 0.988, 'word': 'Investor ' s Alliance'}, {'entity': 'ORG', 'score': 0.985, 'word': 'Investor ' s Alliance'}, {'entity': 'GPE', 'score': 0.9, 'word': 'San Marcos'}, {'entity': 'ORG', 'score': 0.504, 'word': '##gate'}, {'entity': 'GPE', 'score': 0.948, 'word': 'San Marcos'}, {'entity': 'GPE', 'score': 0.95, 'word': 'San Marcos'}, {'entity': 'MONEY', 'score': 0.993, 'word': '2. 
50'}, {'entity': 'GPE', 'score': 0.665, 'word': 'West San Marcos'}, {'entity': 'MONEY', 'score': 0.843, 'word': '1. 25'}, {'entity': 'DATE', 'score': 0.985, 'word': '12 - month'}, {'entity': 'MONEY', 'score': 0.92, 'word': '$ 1. 30 - $ 1. 40 psf'}, {'entity': 'DATE', 'score': 0.984, 'word': '12 - month'}, {'entity': 'MONEY', 'score': 0.875, 'word': '300, 000'}, {'entity': 'TIME', 'score': 0.825, 'word': 'later today'}, {'entity': 'PERSON', 'score': 0.992, 'word': 'George W. Richards'}, {'entity': 'ORG', 'score': 0.976, 'word': 'Creekside Builders, LLC'}]",Business Operations & Market Analysis,0.301003,"Sincerely, George W. Richards President, Creekside Builders, LLC - winmail.dat",0.993306


In [None]:
# Demo: same pipeline on the message stored under index 9.
# NOTE(review): the saved output of this cell has columns the current flatten_output would not
# produce (empty sections present, no named_entities) — it looks stale; re-run to refresh.
example = raw_emails['message'][9]
structured_output = process_raw_email(example)
flattened = pd.DataFrame([flatten_output(structured_output)])
pd.set_option('display.max_colwidth', None)
flattened

Unnamed: 0,subject,from,to,date,greeting,body,closing,signature,greeting_score,body_score,closing_score,signature_score,topic_label,topic_score
0,FW: fixed forward or other Collar floor gas price terms,phillip.allen@enron.com,zimam@enron.com,"Mon, 16 Oct 2000 06:44:00 -0700 (PDT)",,"Phillip, > As discussed during our phone conversation, In a Parallon 75 microturbine > power generation deal for a national accounts customer, I am developing a > proposal to sell power to customer at fixed or collar/floor price. To do > so I need a corresponding term gas price for same. Microturbine is an > onsite generation product developed by Honeywell to generate electricity > on customer site (degen). using natural gas. In doing so, I need your > best fixed price forward gas price deal for 1, 3, 5, 7 and 10 years for > annual/seasonal supply to microturbines to generate fixed kWh for > customer. We have the opportunity to sell customer kWh 's using > microturbine or sell them turbines themselves. kWh deal must have limited/ > no risk forward gas price to make deal work. Therein comes Sempra energy > gas trading, truly you. > > We are proposing installing 180 - 240 units across a large number of > stores (60-100) in San Diego. > Store number varies because of installation hurdles face at small percent. > > For 6-8 hours a day Microturbine run time: > Gas requirement for 180 microturbines 227 - 302 MMcf per year > Gas requirement for 240 microturbines 302 - 403 MMcf per year > > Gas will likely be consumed from May through September, during peak > electric period. > Gas price required: Burnertip price behind (LDC) San Diego Gas & Electric > Need detail breakout of commodity and transport cost (firm or > interruptible). > > Should you have additional questions, give me a call. > Let me assure you, this is real deal!!",,"> > Buck Buckner, P.E., MBA > Manager, Business Development and Planning > Big Box Retail Sales > Honeywell Power Systems, Inc. > 8725 Pan American Frwy > Albuquerque, NM 87113 > 505-798-6424 > 505-798-6050x > 505-220-4129 > 888/501-3145 >",0.0,0.999928,0.0,0.99344,Gas Trading & Deal Management,0.400602


In [None]:
# Demo: same pipeline on the message stored under index 11.
example = raw_emails['message'][11]
structured_output = process_raw_email(example)
flattened = pd.DataFrame([flatten_output(structured_output)])
pd.set_option('display.max_colwidth', None)
flattened

Unnamed: 0,subject,from,to,date,body,body_score,named_entities,topic_label,topic_score
0,,phillip.allen@enron.com,stagecoachmama@hotmail.com,"Fri, 13 Oct 2000 06:45:00 -0700 (PDT)","Lucy, Here are the rentrolls: Open them and save in the rentroll folder. Follow these steps so you don't misplace these files. 1. Click on Save As 2. Click on the drop down triangle under Save in: 3. Click on the (C): drive 4. Click on the appropriate folder 5. Click on Save: Phillip",0.999838,"[{'entity': 'PERSON', 'score': 0.992, 'word': 'Lucy'}, {'entity': 'ORG', 'score': 0.649, 'word': 'Save'}, {'entity': 'PERSON', 'score': 0.986, 'word': 'Phillip'}]",Data Requests & Project Reports,0.598155


In [None]:
# Demo: same pipeline on the message stored under index 16.
example = raw_emails['message'][16]
structured_output = process_raw_email(example)
flattened = pd.DataFrame([flatten_output(structured_output)])
pd.set_option('display.max_colwidth', None)
flattened

Unnamed: 0,subject,from,to,date,body,body_score,named_entities,topic_label,topic_score,closing,closing_score
0,"Var, Reporting and Resources Meeting",phillip.allen@enron.com,ina.rangel@enron.com,"Wed, 4 Oct 2000 09:23:00 -0700 (PDT)","Please plan to attend the below Meeting: Topic: Var, Reporting and Resources Meeting Date: Wednesday, October 11th Time: 2:30 - 3:30 Location: EB30C1 If you have any questions/conflicts, please feel free to call me.",0.999924,"[{'entity': 'DATE', 'score': 0.995, 'word': 'Wednesday, October 11th'}, {'entity': 'TIME', 'score': 0.996, 'word': '2 : 30 - 3 : 30'}, {'entity': 'PERSON', 'score': 0.986, 'word': 'Rain'}]",Meetings & Scheduling Coordination,0.420089,"Thanks, Rain x.31560",0.992642
