In [1]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
import spacy
from sklearn.model_selection import train_test_split
from tensorflow.keras import regularizers

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("wcukierski/enron-email-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\stefa\.cache\kagglehub\datasets\wcukierski\enron-email-dataset\versions\2


In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
df_raw = pd.read_csv("emails.csv")
df_raw.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [5]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove specific email headers and metadata
    text = re.sub(r'^message-id:.*$', '', text, flags=re.MULTILINE | re.IGNORECASE)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)                      # Remove URLs
    text = re.sub(r"(?i)^from:.*|^sent:.*|^to:.*|^subject:.*", "", text)     # Remove common headers
    text = re.sub(r"<.*?>", "", text)                                        # Remove HTML tags

    # Replace personal info with placeholders
    text = re.sub(r"\b[\w.-]+?@\w+?\.\w+?\b", "[EMAIL]", text)
    text = re.sub(r"\b\d{10,}\b", "[PHONE]", text)
    text = re.sub(r"\b\d{4}-\d{2}-\d{2}\b", "[DATE]", text)

    # Strip leading/trailing whitespace
    text = text.strip()
    
    # Tokenize with spaCy
    doc = nlp(text)
    tokens = []

    for token in doc:
        # Keep words, numbers, and punctuation like . , ! ?
        if not token.is_space:
            tokens.append(token.text)

    return " ".join(tokens)

In [6]:
df_raw.info

<bound method DataFrame.info of                              file  \
0           allen-p/_sent_mail/1.   
1          allen-p/_sent_mail/10.   
2         allen-p/_sent_mail/100.   
3        allen-p/_sent_mail/1000.   
4        allen-p/_sent_mail/1001.   
...                           ...   
517396  zufferli-j/sent_items/95.   
517397  zufferli-j/sent_items/96.   
517398  zufferli-j/sent_items/97.   
517399  zufferli-j/sent_items/98.   
517400  zufferli-j/sent_items/99.   

                                                  message  
0       Message-ID: <18782981.1075855378110.JavaMail.e...  
1       Message-ID: <15464986.1075855378456.JavaMail.e...  
2       Message-ID: <24216240.1075855687451.JavaMail.e...  
3       Message-ID: <13505866.1075863688222.JavaMail.e...  
4       Message-ID: <30922949.1075863688243.JavaMail.e...  
...                                                   ...  
517396  Message-ID: <26807948.1075842029936.JavaMail.e...  
517397  Message-ID: <25835861.1075842029959

In [7]:
MAX_LEN = 1_000_000  # spaCy's limit

def safe_preprocess(text):
    if isinstance(text, str) and len(text) < MAX_LEN:
        return preprocess_text(text)
    else:
        return None  # or text[:MAX_LEN] to truncate



In [8]:

# Testing set
df_small = df_raw.head(2000).copy()  # Use only the first 2000 messages
# Apply the cleaning function to the 'message' column
df_small['message'] = df_small['message'].apply(safe_preprocess)
df_small.dropna(subset=['message'], inplace=True)
df_small.reset_index(drop=True, inplace=True)
df_small.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,"date : mon , 14 may 2001 16:39:00 -0700 ( pdt ..."
1,allen-p/_sent_mail/10.,"date : fri , 4 may 2001 13:51:00 -0700 ( pdt )..."
2,allen-p/_sent_mail/100.,"date : we d , 18 oct 2000 03:00:00 -0700 ( pdt..."
3,allen-p/_sent_mail/1000.,"date : mon , 23 oct 2000 06:13:00 -0700 ( pdt ..."
4,allen-p/_sent_mail/1001.,"date : thu , 31 aug 2000 05:07:00 -0700 ( pdt ..."


In [9]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_small['message'].values)

total_words = len(tokenizer.word_index) + 1

In [10]:
# Create input sequences
input_sequences = []
for line in df_small['message']:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(2, len(token_list)):
        n_gram_seq = token_list[:i+1]
        input_sequences.append(n_gram_seq)

In [11]:
MAX_SEQUENCES = 50000
input_sequences = input_sequences[:MAX_SEQUENCES]

In [12]:
# Pad sequences
max_sequence_len = 50  # Adjustable — 50 to 200 is typical

input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre', truncating='pre')

In [13]:
# Split X and y
X = input_sequences[:, :-1]
y = input_sequences[:, -1]  

In [14]:
# Build LSTM model
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=100, input_length=max_sequence_len - 1))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [15]:
# Train model
history = model.fit(X, y, epochs=5, verbose=1)

Epoch 1/5
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 81ms/step - accuracy: 0.0702 - loss: 6.5601
Epoch 2/5
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 82ms/step - accuracy: 0.2980 - loss: 4.6430
Epoch 3/5
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 84ms/step - accuracy: 0.3873 - loss: 3.9619
Epoch 4/5
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 83ms/step - accuracy: 0.4159 - loss: 3.5607
Epoch 5/5
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 83ms/step - accuracy: 0.4412 - loss: 3.1985


In [16]:
# Predict function
def generate_text(seed_text, next_words=20):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted, axis=-1)[0]
        for word, index in tokenizer.word_index.items():
            if index == predicted_word_index:
                seed_text += " " + word
                break
    return seed_text


In [17]:
# Test input
test_input = "please let me know"
print("Generated text:")
print(generate_text(test_input, next_words=10))

Generated text:
please let me know to email subject re mime version 1 0 content type
