In [60]:
import re
import spacy
import pandas as pd
from faker import Faker
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import os

# Load environment variables (e.g. OPENAI_API_KEY, read later via os.getenv) from a .env file
load_dotenv()

# Seed Faker's shared random generator so the synthetic replacement values are
# reproducible across "Restart Kernel -> Run All". Change the seed (or remove
# the call) to obtain a different set of fake values.
FAKER_SEED = 42
Faker.seed(FAKER_SEED)

# Faker instance used to generate synthetic names, companies, cities, e-mail addresses and dates
faker = Faker()

# spaCy pipeline whose named entities (doc.ents) are used to locate text to replace
nlp = spacy.load("en_core_web_lg")



In [62]:
# Location of the Enron e-mail CSV; the notebook reads its "message" column.
EMAILS_CSV_PATH = r"emails.csv"  # Update with the correct file path

# Optional cap on the number of rows read from the CSV. None (the pandas
# default) reads the whole file. Later cells call the LLM once per subject and
# once per body for every row, so a small value here keeps a full re-run
# affordable.
MAX_EMAILS = None

enron_emails = pd.read_csv(EMAILS_CSV_PATH, nrows=MAX_EMAILS)

# Fail early with a clear message instead of a KeyError deep inside a later cell
if "message" not in enron_emails.columns:
    raise KeyError(
        f"{EMAILS_CSV_PATH!r} has no 'message' column; found {list(enron_emails.columns)}"
    )

print(enron_emails.head())

                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...


In [63]:
# Function to parse email fields
def parse_email(message):
    """
    Parse a raw e-mail message into a dict of structured fields.

    The message is split at the first blank line into a header block and a
    body. Header values are looked up only inside the header block and only
    at the start of a line, so "To:" cannot match "X-To:" and a "Subject:"
    line quoted in the body cannot be mistaken for the real header. Folded
    header lines (continuation lines starting with a space or tab) are joined
    before the lookup, so multi-line recipient lists are captured in full.

    Parameters
    ----------
    message : str
        Raw message text. Any non-string value (e.g. None or NaN coming from
        pandas) is treated as an empty message.

    Returns
    -------
    dict
        Keys "Message-ID", "From", "To", "Subject", "Date" and "Body", in that
        order. A header that is absent or has an empty value is None; a
        missing body is "".
    """
    header_names = ("Message-ID", "From", "To", "Subject", "Date")
    fields = dict.fromkeys(header_names)
    fields["Body"] = ""

    # Missing messages yield an all-empty record instead of raising in re.search
    if not isinstance(message, str):
        return fields

    # Headers end at the first blank line; everything after it is the body
    parts = re.split(r"\r?\n\r?\n", message, maxsplit=1)
    header_block = parts[0]
    if len(parts) > 1:
        fields["Body"] = parts[1].strip()

    # Unfold continuation lines so a header split over several lines reads as one
    header_block = re.sub(r"\r?\n[ \t]+", " ", header_block)

    for name in header_names:
        # ^...$ with MULTILINE anchors the match to a whole header line
        match = re.search(
            rf"^{re.escape(name)}:[ \t]*(.*)$", header_block, re.MULTILINE
        )
        if match:
            # An empty value is reported as None, same as a missing header
            fields[name] = match.group(1).strip() or None

    return fields

# Parse every raw message into a dict of fields, then expand the dicts into
# one DataFrame column per field
parsed_emails = enron_emails["message"].apply(parse_email)
email_records = parsed_emails.tolist()
parsed_emails_df = pd.DataFrame(data=email_records)
print(parsed_emails_df.head())


                                      Message-ID                     From  \
0  <18782981.1075855378110.JavaMail.evans@thyme>  phillip.allen@enron.com   
1  <15464986.1075855378456.JavaMail.evans@thyme>  phillip.allen@enron.com   
2  <24216240.1075855687451.JavaMail.evans@thyme>  phillip.allen@enron.com   
3  <13505866.1075863688222.JavaMail.evans@thyme>  phillip.allen@enron.com   
4  <30922949.1075863688243.JavaMail.evans@thyme>  phillip.allen@enron.com   

                        To    Subject                                   Date  \
0     tim.belden@enron.com       None  Mon, 14 May 2001 16:39:00 -0700 (PDT)   
1  john.lavorato@enron.com        Re:   Fri, 4 May 2001 13:51:00 -0700 (PDT)   
2   leah.arsdall@enron.com   Re: test  Wed, 18 Oct 2000 03:00:00 -0700 (PDT)   
3    randall.gay@enron.com       None  Mon, 23 Oct 2000 06:13:00 -0700 (PDT)   
4     greg.piper@enron.com  Re: Hello  Thu, 31 Aug 2000 05:07:00 -0700 (PDT)   

                                                Body  
0

In [64]:
# Pattern used to find e-mail addresses. They are matched with a regex rather
# than by entity label: the original check for an "EMAIL" entity label is
# dropped because it never replaced any address.
_EMAIL_PATTERN = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"


def replace_pii(text):
    """
    Replace sensitive information in the text using spaCy, a regex and Faker.

    Entities labelled PERSON, ORG, GPE or DATE by the spaCy pipeline `nlp`,
    and anything matching an e-mail address pattern, are replaced by values
    generated with `faker`. Within one call, the same original string always
    maps to the same replacement.

    All substitutions happen in a single left-to-right pass over the original
    text, so a replacement value is never scanned again and altered by a later
    substitution. Entity strings are replaced only where they are not embedded
    in a longer word (an entity "Ann" does not rewrite "Annabel").

    Parameters
    ----------
    text : str
        Text to de-identify.

    Returns
    -------
    str
        The de-identified text. Non-string, empty or whitespace-only input is
        returned unchanged without running the NLP pipeline.
    """
    if not isinstance(text, str) or not text.strip():
        return text

    # One Faker generator per entity label that is treated as sensitive
    fake_by_label = {
        "PERSON": faker.name,
        "ORG": faker.company,
        "GPE": faker.city,
        "DATE": faker.date,
    }

    # Original entity string -> synthetic value (first occurrence decides)
    entity_fakes = {}
    for ent in nlp(text).ents:
        make_fake = fake_by_label.get(ent.label_)
        surface = ent.text.strip()
        if make_fake is None or not surface or surface in entity_fakes:
            continue
        entity_fakes[surface] = make_fake()

    # Original e-mail address -> synthetic address, filled lazily while substituting
    email_fakes = {}

    # E-mail addresses come first so an address wins over an entity found
    # inside it; longer entity strings come before shorter ones so the longest
    # candidate at a position is preferred.
    alternatives = [f"(?P<email>{_EMAIL_PATTERN})"]
    for surface in sorted(entity_fakes, key=len, reverse=True):
        alternatives.append(r"(?<!\w)" + re.escape(surface) + r"(?!\w)")
    matcher = re.compile("|".join(alternatives))

    def _substitute(match):
        """Return the synthetic value for one matched e-mail address or entity."""
        found = match.group(0)
        if match.group("email") is not None:
            if found not in email_fakes:
                email_fakes[found] = faker.email()
            return email_fakes[found]
        return entity_fakes[found]

    return matcher.sub(_substitute, text)

# Missing text (e.g. a Subject parsed as None) becomes "" before de-identification
for source_column in ("Body", "Subject"):
    parsed_emails_df[source_column] = parsed_emails_df[source_column].fillna("")

# De-identify each text column into its own new column (body first, then subject)
deidentified_columns = {
    "Body": "De-Identified_Body",
    "Subject": "De-Identified_Subject",
}
for source_column, target_column in deidentified_columns.items():
    parsed_emails_df[target_column] = parsed_emails_df[source_column].apply(replace_pii)

In [68]:
# Chat model client used through LangChain; the API key is read from the environment
openai_api_key = os.getenv("OPENAI_API_KEY")
llm = ChatOpenAI(model="gpt-4", temperature=0.2, openai_api_key=openai_api_key)

# Prompt text for rewriting a subject line; {email_subject} is filled per row
SUBJECT_TEMPLATE = """
    Rewrite the subject line of the following email, ensuring it feels authentic, professional,
    and suitable for a different context. Maintain the overall intent and tone of the subject
    while introducing a subtle but realistic transformation.

    Original Subject:
    {email_subject}

    Provide the rewritten subject line.
    """

# Prompt text for rewriting an e-mail body; {email_body} is filled per row
BODY_TEMPLATE = """
    Transform the body of the following email into a version that feels realistic, professional,
    and contextually suitable for a different organization and domain.

    Preserve the logical flow, professional tone, and structure of the email, ensuring it remains
    natural and authentic. Replace any identifiable details such as names, organizations, locations,
    and numbers with contextually appropriate alternatives. Subtly adapt industry-specific terms to
    align with the new context while maintaining the overall meaning and intent of the original message.

    Original Body:
    {email_body}

    Provide the transformed email body.
    """

subject_prompt_template = PromptTemplate(
    template=SUBJECT_TEMPLATE,
    input_variables=["email_subject"],
)
body_prompt_template = PromptTemplate(
    template=BODY_TEMPLATE,
    input_variables=["email_body"],
)

# One chain per prompt, both backed by the same chat model
subject_transformation_chain = LLMChain(llm=llm, prompt=subject_prompt_template)
body_transformation_chain = LLMChain(llm=llm, prompt=body_prompt_template)


def _run_chain_on_text(text, chain, variable_name):
    """Run `chain` with `text` bound to `variable_name`; falsy text yields "" without a call."""
    if not text:
        return ""
    return chain.run({variable_name: text})


# Generate the synthetic subject and body from the de-identified columns
parsed_emails_df["Synthetic_Subject"] = parsed_emails_df["De-Identified_Subject"].apply(
    _run_chain_on_text, args=(subject_transformation_chain, "email_subject")
)
parsed_emails_df["Synthetic_Body"] = parsed_emails_df["De-Identified_Body"].apply(
    _run_chain_on_text, args=(body_transformation_chain, "email_body")
)

# Persist the full frame, including the new synthetic columns
subject_and_body_csv = "synthetic_emails_with_subject_and_body.csv"
parsed_emails_df.to_csv(subject_and_body_csv, index=False)
print("Synthetic emails with separate columns for subject and body saved successfully!")

Synthetic emails with separate columns for subject and body saved successfully!


In [69]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used by the similarity calculation
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def calculate_similarity(original, synthetic):
    """
    Score how similar two texts are via the cosine similarity of their embeddings.

    Both texts are encoded with the module-level `model` (original first,
    then synthetic); the result of `util.cos_sim` on the two embeddings is
    returned as a scalar through `.item()`.
    """
    embeddings = [model.encode(passage) for passage in (original, synthetic)]
    return util.cos_sim(*embeddings).item()

def _row_similarity(row):
    """Similarity between one row's original body and its synthetic body."""
    return calculate_similarity(row["Body"], row["Synthetic_Body"])


# Score every e-mail, one row at a time
parsed_emails_df["Similarity_Score"] = parsed_emails_df.apply(_row_similarity, axis=1)

# Write the frame, now including the similarity column, to disk
similarity_csv = "synthetic_emails.csv"
parsed_emails_df.to_csv(similarity_csv, index=False)
print("Synthetic emails saved successfully!")

Synthetic emails saved successfully!


In [70]:
# Summary statistics of the similarity scores
similarity_summary = parsed_emails_df["Similarity_Score"].describe()
print(similarity_summary)

count    100.000000
mean       0.517384
std        0.155810
min        0.168305
25%        0.400129
50%        0.527207
75%        0.615530
max        0.869910
Name: Similarity_Score, dtype: float64
