In [None]:
# Mount Google Drive at /content/drive (google.colab is only importable on Colab)
from google.colab import drive
drive.mount('/content/drive')

# Project folders on the mounted drive; every later read/write is built from these
root_path = '/content/drive/MyDrive/learning/nus/cs5260/project'
data_path = root_path + '/data'
# Destination folder for the cleaned CSV written at the end of the notebook
cleaned_data_path = root_path + '/data/cleaned'

Mounted at /content/drive


In [None]:
import pandas as pd

In [None]:
# Load enron.csv from the data folder
emails = pd.read_csv(data_path + '/enron.csv')

# Rename the column Spam/Ham to spam
emails = emails.rename(columns={'Spam/Ham': 'spam'})

# Update ham to 0 and spam = 1 in spam column
# (map() turns any label other than 'ham'/'spam' into NaN)
emails['spam'] = emails['spam'].map({'ham': 0, 'spam': 1})

# Fill empty string for empty / NA value in Subject and Message columns.
# The result is assigned back: fillna(inplace=True) on emails['col'] is a
# chained in-place update that newer pandas warns about and that leaves
# `emails` unchanged when copy-on-write is active.
emails['Subject'] = emails['Subject'].fillna('')
emails['Message'] = emails['Message'].fillna('')

# Create a new column called text by concatenating Subject and Message
emails['text'] = emails['Subject'] + ' ' + emails['Message']
# Keep the unmodified concatenation next to the column that is cleaned further on
emails['original_text'] = emails['text']
emails = emails.drop(columns=['Subject', 'Message', 'Message ID', 'Date'])

# Count by spam status
emails['spam'].value_counts()

spam
1    17171
0    16545
Name: count, dtype: int64

In [None]:
# Report the frame shape on either side of exact-duplicate row removal
print(f'Before removing duplicate emails: {emails.shape}')

# Rows identical across every column are collapsed to their first occurrence
emails.drop_duplicates(inplace=True)
print(f'After removing duplicate emails: {emails.shape}')

In [None]:
# Spam emails: 10 random rows, seeded so the same rows appear on every run
emails[emails['spam'] == 1].sample(10, random_state=42)

Unnamed: 0,spam,text,original_text
30342,1,"i am so happy "" do i have to dress for the nex...","i am so happy "" do i have to dress for the nex..."
3713,1,learn to make a fortune on ebay !,learn to make a fortune on ebay !
29690,1,get a free wireless ready toshiba laptop ! con...,get a free wireless ready toshiba laptop ! con...
29524,1,how to make 400 dollars a day from home . pop ...,how to make 400 dollars a day from home . pop ...
22141,1,"solid loans for the usa ! dear sir or madam ,\...","solid loans for the usa ! dear sir or madam ,\..."
20844,1,"offce x * p professional $ 60 minnesota , whic...","offce x * p professional $ 60 minnesota , whic..."
19057,1,stoop it deaaaaad in it ' s tracks hi saundra ...,stoop it deaaaaad in it ' s tracks hi saundra ...
22324,1,get laid tonight there is this free date site ...,get laid tonight there is this free date site ...
30365,1,congratulations bank giro loterij\ninternation...,congratulations bank giro loterij\ninternation...
3801,1,"be a men byronizes munched ! armpit , budweise...","be a men byronizes munched ! armpit , budweise..."


In [None]:
# Non-spam emails: 10 random rows, seeded so the same rows appear on every run
emails[emails['spam'] == 0].sample(10, random_state=42)

Unnamed: 0,spam,text,original_text
662,0,"aspect resources - meter # 78033 , # 78032 - 3...","aspect resources - meter # 78033 , # 78032 - 3..."
12681,0,ena trading track update any changes to the at...,ena trading track update any changes to the at...
23442,0,enron technology showcase announcement that ' ...,enron technology showcase announcement that ' ...
16933,0,pdci / iso share ( nob ) south to north derate...,pdci / iso share ( nob ) south to north derate...
14461,0,re : integration meeting i have arranged for t...,re : integration meeting i have arranged for t...
12009,0,fw : enron canada update - 06 / 25 / 01 fyi\n-...,fw : enron canada update - 06 / 25 / 01 fyi\n-...
23524,0,request submitted : access request for kenneth...,request submitted : access request for kenneth...
6650,0,"re : reminder thanks , so much for your support !","re : reminder thanks , so much for your support !"
14236,0,re : enron online counterparties mark\ni will ...,re : enron online counterparties mark\ni will ...
3507,0,2001 special stock option grant awards we are ...,2001 special stock option grant awards we are ...


In [None]:
import ast

def parse_and_concatenate(value):
    """Flatten a string that is a Python list literal into space-joined text.

    Args:
        value: The cell value to inspect, normally a string.

    Returns:
        The list items converted with ``str`` and joined by single spaces when
        ``value`` parses as a list literal; otherwise ``value`` unchanged.
    """
    try:
        # Attempt to parse the string as a Python literal
        parsed_value = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval can fail in more ways than ValueError/SyntaxError, e.g.
        # TypeError for "{[1]: 2}" (unhashable key) or RecursionError for
        # deeply nested input. Any failure means "not a list literal".
        return value

    if isinstance(parsed_value, list):
        # Join the list elements into a single string
        return ' '.join(str(item) for item in parsed_value)

    # A valid literal that is not a list (number, tuple, dict, ...) is kept as-is
    return value

# Apply parse_and_concatenate to every 'text' value: strings that are list
# literals become space-joined text, every other value is left unchanged
emails['text'] = emails['text'].apply(parse_and_concatenate)

In [None]:
from bs4 import BeautifulSoup

def extract_text(html):
    """Parse ``html`` with BeautifulSoup's "lxml" parser and return its ``get_text()``."""
    return BeautifulSoup(html, "lxml").get_text()

# Replace each 'text' value with the text BeautifulSoup extracts from it
emails['text'] = emails['text'].apply(extract_text)

  soup = BeautifulSoup(html, "lxml")


In [None]:
# Replace the literal two-character sequences backslash+n, backslash+r and
# backslash+t with a space (real whitespace is handled by the \s+ pass below).
emails['text'] = emails['text'].apply(lambda x: x.replace("\\n", " ").replace("\\r", " ").replace("\\t", " "))

# Raw string: the pattern used to sit in a normal string literal, where
# sequences such as \! and \? are invalid Python escapes (a SyntaxWarning on
# recent Python versions). The resulting regex is unchanged.
# NOTE(review): the trailing \+ keeps '+'; the backslash character itself is
# not in the allowed set and is therefore removed - confirm that is intended.
DISALLOWED_CHARS = r"""[^a-zA-Z0-9 \!\?\.,;\:\-\—\(\)\[\]\{\}'"%\$&\*\/=@#\+]"""

emails['text'] = (
    emails['text']
    .str.replace(r'\s+', ' ', regex=True)            # collapse whitespace runs to one space
    .str.replace(DISALLOWED_CHARS, '', regex=True)   # drop characters outside the allowed set
)

emails

Unnamed: 0,spam,text,original_text
0,0,christmas tree farm pictures,christmas tree farm pictures
1,0,"vastar resources , inc . gary , production fro...","vastar resources , inc . gary , production fro..."
2,0,calpine daily gas nomination - calpine daily g...,calpine daily gas nomination - calpine daily g...
3,0,re : issue fyi - see note below - already done...,re : issue fyi - see note below - already done...
4,0,meter 7268 nov allocation fyi . - - - - - - - ...,meter 7268 nov allocation fyi .\n- - - - - - -...
...,...,...,...
33709,1,"cure premature ejaculation hello , did you eja...","cure premature ejaculation hello ,\ndid you ej..."
33711,1,= ? iso - 8859 - 1 ? q ? good news c = edali...,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...
33712,1,all prescript medicines are on special . to be...,all prescript medicines are on special . to be...
33713,1,the next generation online pharmacy . are you ...,the next generation online pharmacy . are you ...


In [None]:
# Overwrite 'original_text' with the current (cleaned) 'text'; the value stored
# in 'original_text' at load time is replaced by this assignment.
emails['original_text'] = emails['text']

In [None]:
from transformers import BertTokenizer

# BERT tokenizer built from the 'bert-base-uncased' pretrained name; used by tokenize()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def tokenize(text):
    """Run ``text`` through the global ``tokenizer`` and return its tokens joined by single spaces."""
    pieces = tokenizer.tokenize(text)
    return ' '.join(pieces)

# Replace 'text' with its space-joined token form
emails['text'] = emails['text'].apply(tokenize)

# Show the first rows of the updated DataFrame
emails.head()

Unnamed: 0,spam,text,original_text
0,0,christmas tree farm pictures,christmas tree farm pictures
1,0,"vast ##ar resources , inc . gary , production ...","vastar resources , inc . gary , production fro..."
2,0,cal ##pine daily gas nomination - cal ##pine d...,calpine daily gas nomination - calpine daily g...
3,0,re : issue f ##yi - see note below - already d...,re : issue fyi - see note below - already done...
4,0,meter 72 ##6 ##8 nov allocation f ##yi . - - -...,meter 7268 nov allocation fyi .\n- - - - - - -...


In [None]:
# Texts with at most this many characters (after stripping) are discarded
MIN_TEXT_LENGTH = 50

print(emails.shape)

# Strip whitespace from the text column
emails['text'] = emails['text'].str.strip()

# Keep only emails whose text is longer than MIN_TEXT_LENGTH characters.
# .copy() makes the result an independent frame, so later column assignments
# and in-place operations do not raise SettingWithCopyWarning.
emails = emails[emails['text'].str.len() > MIN_TEXT_LENGTH].copy()

print(emails.shape)

(30494, 3)
(30067, 3)


In [None]:
# Number of rows per label (0 = ham, 1 = spam) at this point in the pipeline
emails['spam'].value_counts()

spam
0    15788
1    14279
Name: count, dtype: int64

In [None]:
# Spam emails: 10 random rows, seeded so the same rows appear on every run
emails[emails['spam'] == 1].sample(10, random_state=42)

Unnamed: 0,spam,text,original_text
30946,1,. via ##g ' ra 80 % discount ' new cl ##all ##...,. viag ' ra 80 % discount ' new clalls softtab...
31340,1,it is your way to get quick relief it is a sma...,it is your way to get quick relief it is a sma...
18610,1,"this is super ! h real time , online , instant...","this is super ! h real time ,\nonline , instan..."
22144,1,"michael romero tu ##e , 21 jun 2005 07 : 33 : ...","michael romero tue , 21 jun 2005 07 : 33 : 23 ..."
25072,1,herald community herald - article on pc ' s mi...,herald community herald - article on pc ' s mi...
26915,1,our cool ##l med ##z how to save on your med #...,our cooll medz how to save on your medlcations...
21175,1,spec ##al offer for w ##ndo ##ws 20 o ##o adva...,specal offer for wndows 20 oo advanced server ...
19099,1,gr ##ox ##w your penis our pg ##f - q 3 pe ##c...,groxw\nyour\npenis\nour pgf - q 3 pecunis enl...
15684,1,con ##fi ##dant . department of minerals and e...,confidant . department of minerals and energy\...
4785,1,all of these lonely people would like someone ...,all of these lonely people would like someone ...


In [None]:
# Non-spam emails: 10 random rows, seeded so the same rows appear on every run
emails[emails['spam'] == 0].sample(10, random_state=42)

Unnamed: 0,spam,text,original_text
903,0,wc 53 ##3 revision and notice of force maj ##e...,wc 533 revision and notice of force majeure ( ...
2406,0,"en ##ron / hp ##l actual ##s for january 15 , ...","enron / hpl actuals for january 15 , 2001 janu..."
3521,0,re : tx ##u may 01 this deal was not transport...,re : txu may 01 this deal was not transported ...
17689,0,start date : 1 / 12 / 02 ; hour ##ah ##ead hou...,start date : 1 / 12 / 02 ; hourahead hour : 16...
3002,0,re : no ##m / actual flow for april 4 th we ag...,re : nom / actual flow for april 4 th we agree...
11586,0,forthcoming un ##fa ##vor ##able new york time...,forthcoming unfavorable new york times article...
580,0,re : 98 - 68 ##9 ##2 for 3 / 15 / 2000 and 3 /...,re : 98 - 6892 for 3 / 15 / 2000 and 3 / 23 / ...
6171,0,here ' s your chance what do we need to know t...,here ' s your chance what do we need to know t...
2182,0,"en ##ron / hp ##l actual ##s for december 6 , ...","enron / hpl actuals for december 6 , 2000 teco..."
23595,0,request submitted : access request for kelly ....,request submitted : access request for kelly ....


In [None]:
# prompt: Select out emails with email addresses

import re

def extract_emails(text):
  """
  Return every email-address-like substring of ``text`` as a list of strings.

  Only lowercase letters, digits and the listed punctuation are accepted, and
  the address must be contiguous: whitespace around '@' or '.' prevents a match.
  """
  address_pattern = re.compile(r"[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}")
  return [found.group(0) for found in address_pattern.finditer(text)]

# Collect the addresses on a temporary frame: assign() returns a new DataFrame,
# so `emails` never gains (and then has to lose) a scratch 'emails' column and
# no value is written into a possibly sliced frame.
with_addresses = emails.assign(emails=emails['text'].apply(extract_emails))

# Print the shape of the rows in which at least one address was found
print(with_addresses[with_addresses['emails'].str.len() > 0].shape)

(0, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emails['emails'] = emails['text'].apply(extract_emails)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emails.drop(columns=['emails'], inplace=True)


In [None]:
# prompt: Find rows that contain email forwarding information
# Selects every row whose 'text' contains the substring 'forward'

emails[emails['text'].str.contains(r'forward')]

Unnamed: 0,spam,text,original_text
1,0,"vastar resources , inc . gary , production fro...","vastar resources , inc . gary , production fro..."
3,0,re : issue fyi - see note below - already done...,re : issue fyi - see note below - already done...
4,0,meter 7268 nov allocation fyi . - - - - - - - ...,meter 7268 nov allocation fyi . - - - - - - - ...
7,0,duns number changes fyi - - - - - - - - - - - ...,duns number changes fyi - - - - - - - - - - - ...
11,0,lst rev dec . 1999 josey ranch nom fyi - - - -...,lst rev dec . 1999 josey ranch nom fyi - - - -...
...,...,...,...
33589,1,is it a microcap miracle ? investor alert - im...,is it a microcap miracle ? investor alert - im...
33629,1,wall - street pulse * * * watch this one july ...,wall - street pulse * * * watch this one july ...
33655,1,notification adexec investerings firma max - h...,notification adexec investerings firma max - h...
33691,1,quicker effects . lesser costs . best bargain ...,quicker effects . lesser costs . best bargain ...


In [None]:
# Find duplicate emails: keep=False flags every member of a duplicate group,
# and sorting by text places the members of a group next to each other
shares_text = emails.duplicated(subset='text', keep=False)
duplicate_emails = emails[shares_text]
duplicate_emails.sort_values(by='text')

Unnamed: 0,spam,text,original_text
2828,0,"04 / 01 assignment , termination , expiration ...","04 / 01 assignment , termination , expiration ..."
2831,0,"04 / 01 assignment , termination , expiration ...","04 / 01 assignment , termination , expiration ..."
9456,0,2 - survey / information email 5 - 7 - 01 curr...,2 - survey / information email 5 - 7 - 01 curr...
9458,0,2 - survey / information email 5 - 7 - 01 curr...,2 - survey / information email 5 - 7 - 01 curr...
9455,0,3 - urgent - to prevent loss of information cr...,3 - urgent - to prevent loss of information cr...
...,...,...,...
8920,0,"yen outlook vince , as a followup to our meeti...","yen outlook vince , as a followup to our meeti..."
9428,0,"your advice is appreciated vince , in the morn...","your advice is appreciated vince , in the morn..."
9423,0,"your advice is appreciated vince , in the morn...","your advice is appreciated vince , in the morn..."
23973,0,your help needed please - cross - commodity re...,your help needed please - cross - commodity re...


In [None]:
# Drop duplicate emails (same text AND same label; the first occurrence is kept).
# Reassigning a copy instead of inplace=True avoids the SettingWithCopyWarning
# that an in-place drop raises when `emails` is a slice of another frame.
# NOTE(review): rows sharing a text but carrying different labels are all kept - confirm that is intended.
emails = emails.drop_duplicates(subset=['text', 'spam']).copy()
emails.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emails.drop_duplicates(subset=['text', 'spam'], inplace=True)


(29711, 3)

In [None]:
# Number of rows per label (0 = ham, 1 = spam) after dropping duplicate text/label pairs
emails['spam'].value_counts()

spam
0    15468
1    14243
Name: count, dtype: int64

In [None]:
# Spam emails: 10 random rows, seeded so the same rows appear on every run
emails[emails['spam'] == 1].sample(10, random_state=42)

Unnamed: 0,spam,text,original_text
31529,1,p % ill to improve cu & m flavour and v ! olum...,p % ill to improve cu & m flavour and v ! olum...
4526,1,we have vicodin and anything else save over 50...,we have vicodin and anything else save over 50...
4132,1,the only fix to penis growth limited time offe...,the only fix to penis growth limited time offe...
20823,1,over 150 fda approved meds % rnd alt % rnd a...,over 150 fda approved meds % rnd alt % rnd a...
15495,1,discounted microsoft money 2004 standard get s...,discounted microsoft money 2004 standard get s...
33193,1,"it ' s not a joke on our thousand , side happy...","it ' s not a joke on our thousand , side happy..."
30936,1,warez cd ' s microsoft windows xp professional...,warez cd ' s microsoft windows xp professional...
19305,1,"$ 54873 hello , i sent you an email a few days...","$ 54873 hello , i sent you an email a few days..."
9957,1,take positi 0 ns before breaking news expiosio...,take positi 0 ns before breaking news expiosio...
29898,1,inexpensive online medication here handout all...,inexpensive online medication here handout all...


In [None]:
# # prompt: Compute document embedding of all emails; then find all emails very similar to each other

# # Import necessary libraries
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# # Initialize a TfidfVectorizer object
# vectorizer = TfidfVectorizer()

# # Fit the vectorizer on the text column of the emails DataFrame
# X = vectorizer.fit_transform(emails['text'])

# # Compute the cosine similarity between each pair of emails
# cosine_matrix = cosine_similarity(X, X)

# # Find the indices of the most similar emails for each email
# most_similar_indices = cosine_matrix.argsort()[:, -5:-1]

# # Print the most similar emails for the first 10 emails
# for i in range(10):
#     print(f"Most similar emails for email {i + 1}:")
#     for j in most_similar_indices[i]:
#         print(f" - Email {j + 1}: {emails['text'].iloc[j]}")


In [None]:
# How many unique whitespace-separated words are there across all text values?
unique_words = {word for text in emails['text'] for word in text.split()}
print(len(unique_words))

158272


In [None]:
# Rows whose 'text' contains the substring 'html' anywhere
emails[emails['text'].str.contains(r'html')]

Unnamed: 0,spam,text,original_text
304,0,http : / / www . pge - texas . com / www / gtt...,http : / / www . pge - texas . com / www / gtt...
959,0,june specials ! ! * * summer specials * * valu...,june specials ! ! * * summer specials * * valu...
1224,0,kingwood cove - $ 10 before 10 am ! only $ 10 ...,kingwood cove - $ 10 before 10 am ! only $ 10 ...
1295,0,fw : please read . . don ' t delete - - - - - ...,fw : please read . . don ' t delete - - - - - ...
1315,0,"kingwood cove - 2 play "" for "" $ 20 thanks for...","kingwood cove - 2 play "" for "" $ 20 thanks for..."
...,...,...,...
33485,1,"web design & development hello , as a professi...","web design & development hello , as a professi..."
33526,1,compare p ) rescription drug prices find all y...,compare p ) rescription drug prices find all y...
33530,1,"urgent message dearest one , i am madam . adel...","urgent message dearest one , i am madam . adel..."
33566,1,"need more energy , more money ? free sample ! ...","need more energy , more money ? free sample ! ..."


In [None]:
import pandas as pd
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
import re

# Ensure you have the necessary NLTK datasets downloaded
# ('words' backs english_words below; 'punkt' and 'wordnet' are presumably
# needed by word_tokenize and WordNetLemmatizer - verify against the NLTK docs)
nltk.download('words')
nltk.download('punkt')
nltk.download('wordnet')

# Set built from the NLTK word list; count_non_english_words tests membership against it
english_words = set(words.words())
# Lemmatizer shared by all calls to count_non_english_words
lemmatizer = nltk.WordNetLemmatizer()

def count_non_english_words(text):
    """Count the tokens of ``text`` that are missing from ``english_words``.

    Every character other than ASCII letters, digits and spaces is removed
    first; the remainder is tokenized, lower-cased and lemmatized.

    Returns:
        A ``(non_english_count, total_count)`` tuple of token counts.
    """
    # Delete (not replace) every character outside letters, digits and spaces
    alnum_only = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    # Lemmatize the lower-cased tokens
    lemmas = [lemmatizer.lemmatize(token.lower()) for token in word_tokenize(alnum_only)]
    # Tokens whose lemma is not in the English word set
    unknown = [lemma for lemma in lemmas if lemma not in english_words]

    return len(unknown), len(lemmas)

# Emails whose fraction of non-English tokens exceeds this value are set aside
NON_ENGLISH_THRESHOLD = 0.60

# Score the 'original_text' column and derive the fraction of non-English words.
# assign() builds a new frame, so no column is written into a possibly sliced
# `emails` (which is what triggers SettingWithCopyWarning).
emails = emails.assign(
    non_english_info=emails['original_text'].apply(count_non_english_words)
)
emails = emails.assign(
    # x is the (non_english, total) tuple; an email with no tokens scores 0
    percentage_non_english=emails['non_english_info'].apply(
        lambda x: x[0] / x[1] if x[1] > 0 else 0
    )
)

# Select emails containing more than 60% non-English words, highest fraction first
filtered_emails = emails[emails['percentage_non_english'] > NON_ENGLISH_THRESHOLD].sort_values(by='percentage_non_english', ascending=False)

filtered_emails.shape

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emails['non_english_info'] = emails['original_text'].apply(count_non_english_words)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emails['percentage_non_english'] = emails['non_english_info'].apply(lambda x: x[0] / x[1] if x[1] > 0 else 0)


(1422, 5)

In [None]:
# Remove the rows selected in filtered_emails from the emails dataframe,
# printing the shape before and after
print(emails.shape)
is_filtered = emails.index.isin(filtered_emails.index)
emails = emails[~is_filtered]
print(emails.shape)

(29711, 5)
(28289, 5)


In [None]:
# Count unique whitespace-separated words in the text column
unique_words = {word for text in emails['text'] for word in text.split()}

print(len(unique_words))

128424


In [None]:
# Number of rows per label (0 = ham, 1 = spam) in the frame that is about to be saved
emails['spam'].value_counts()

spam
0    14577
1    13712
Name: count, dtype: int64

In [None]:
# Store into the cleaned folder: only text, original_text and spam are written
# (the other columns are left out), and the row index is not saved
emails[['text', 'original_text', 'spam']].to_csv(cleaned_data_path + '/enron.csv', index=False)