In [1]:
import re
import nltk
import os
import numpy as np
import codecs
import multiprocessing as mp
import pandas as pd
from nltk import stem

stemmer = stem.PorterStemmer()

from nltk.corpus import stopwords
from collections import Counter
from typing import Dict, List, Tuple

# find the path of emails
def listdir(directory: str) -> List:
    """
    A specialized version of os.listdir() that ignores files that
    start with a leading period.
    
    Especially dismissing .DS_STORE s.
    """
    filelist = os.listdir(directory)
    return [x for x in filelist if not (x.startswith('.'))]

stopwords = stopwords.words('english')
cut_model = nltk.WordPunctTokenizer()


# Preprocess emails
def enron_processor(emails_dir: str, return_list: list) -> list:    
    """
    * remove numbers
    * remove stopwords
    * add lables
    """
    dirs = [os.path.join(emails_dir, f) for f in os.listdir(emails_dir)]
    for d in dirs:
        emails = [os.path.join(d, f) for f in os.listdir(d)]
        for mail in emails:
            # print(mail)
            with codecs.open(mail, "rb", encoding='utf_8_sig', errors='ignore') as m:
                email_list = []
                line_str = ""
                for line in m:
                    for word in line:
                        if word.startswith("http"):
                            print(word)
                            word = "URL"
                            print(word)
                        word = stemmer.stem(word)
                    line = re.sub(r'[^a-zA-Z\s]', '', string=line)
                    line = line.lower()
                    line = line.strip()
                    tokens = cut_model.tokenize(line)
                    line = [stemmer.stem(token) for token in tokens if token not in stopwords]
           
                    line = ' '.join(line)
                    line_str = line_str+line+" "
                email_list.append(line_str)
                
                if mail.split(".")[-2] == 'spam':
                    email_list.append("spam")
                else: 
                    email_list.append("ham")
                email_list.append(mail)
                return_list.append(email_list)
                

                
return_dict = {}   
root_dir = 'spampy/datasets/enron'
emails_dirs = [os.path.join(root_dir, f) for f in listdir(root_dir)]
return_list = []
for emails_dir in emails_dirs:
    enron_processor(emails_dir, return_list)


messages = pd.DataFrame(return_list, columns=['message', 'label', 'path'])

messages

Unnamed: 0,message,label,path
0,christma tree farm pictur,ham,spampy/datasets/enron\enron1\ham\0001.1999-12-...
1,vastar resourc inc gari product high island la...,ham,spampy/datasets/enron\enron1\ham\0002.1999-12-...
2,calpin daili ga nomin calpin daili ga nomin doc,ham,spampy/datasets/enron\enron1\ham\0003.1999-12-...
3,issu fyi see note alreadi done stella forward ...,ham,spampy/datasets/enron\enron1\ham\0004.1999-12-...
4,meter nov alloc fyi forward lauri allen hou ec...,ham,spampy/datasets/enron\enron1\ham\0005.1999-12-...
...,...,...,...
33711,iso q good news c edaliss val edumm vl eoggra ...,spam,spampy/datasets/enron\enron6\spam\5995.2005-07...
33712,prescript medicin special precis put buck back...,spam,spampy/datasets/enron\enron6\spam\5997.2005-07...
33713,next gener onlin pharmaci readi rock let man r...,spam,spampy/datasets/enron\enron6\spam\5998.2005-07...
33714,bloow time time learn last time longer bed rea...,spam,spampy/datasets/enron\enron6\spam\5999.2005-07...


In [1]:
messages.to_csv("messages.csv")

NameError: name 'messages' is not defined

In [2]:
from sklearn.model_selection import train_test_split
messages['label'] = messages['label'].replace('ham', 0)
messages['label'] = messages['label'].replace('spam', 1)

spam = messages[messages.label == 1]
ham = messages[messages.label == 0]


from sklearn.feature_extraction.text import TfidfVectorizer

# Tf-idf for spam datasets
vect_spam = TfidfVectorizer()
tfidf_spam = vect_spam.fit_transform(spam['message'])
tfidf_matrix_spam = pd.DataFrame(tfidf_spam.toarray(), columns = vect_spam.get_feature_names())
header_spam = vect_spam.get_feature_names()

# Tf-idf for ham datasets
vect_ham = TfidfVectorizer()
tfidf_ham = vect_ham.fit_transform(ham['message'])
tfidf_matrix_ham = pd.DataFrame(tfidf_ham.toarray(), columns = vect_ham.get_feature_names())
header_ham = vect_ham.get_feature_names()

In [3]:
print(len(spam))
print(len(ham))

17171
16545


In [4]:
ham_unique = list(set(header_ham).difference(set(header_spam)))
header_ham1 = pd.DataFrame(ham_unique)
header_ham1.to_csv("ham_unique.csv")

spam_unique = list(set(header_spam).difference(set(header_ham)))
header_spam1 = pd.DataFrame(spam_unique)
header_spam1.to_csv("spam_unique.csv")

In [1]:
# ham unique
import csv
with open("ham_unique.csv", "r") as csvfile:
    reader = csv.reader(csvfile)
    ham_unique = []
    for row in reader:
        ham_unique.append(row[1])
ham_unique = ham_unique[1:]
ham_unique

['raghuram',
 'marni',
 'danka',
 'yoho',
 'unterberg',
 'alwya',
 'hensley',
 'highval',
 'caisop',
 'graem',
 'ramachandran',
 'wadel',
 'ngai',
 'jester',
 'burchfield',
 'osonenko',
 'kleeg',
 'keeter',
 'fyre',
 'boerger',
 'nadal',
 'rothrock',
 'chee',
 'astra',
 'vinio',
 'hambrick',
 'vtaylor',
 'helpful',
 'barend',
 'opsdoc',
 'hgreer',
 'unimpress',
 'chome',
 'doublecount',
 'riskpubl',
 'highfield',
 'reit',
 'dopi',
 'okazja',
 'vonbock',
 'untest',
 'tingleaf',
 'hoelscher',
 'oilgram',
 'braker',
 'narzeczona',
 'lohs',
 'blackabi',
 'unlead',
 'intellimous',
 'huong',
 'prerat',
 'severin',
 'canu',
 'etsdb',
 'updateaccount',
 'iutben',
 'perfer',
 'kilgard',
 'morongo',
 'annemiek',
 'pppo',
 'reprecuss',
 'clik',
 'purswel',
 'methodolgi',
 'midlothian',
 'mktw',
 'churbock',
 'trrovh',
 'jhender',
 'ruebsahm',
 'tamullen',
 'thconfsess',
 'orli',
 'lichti',
 'yowman',
 'celestica',
 'duketecotap',
 'ziplip',
 'genieand',
 'multiparti',
 'zatarain',
 'hbr',
 'maldo

In [2]:
# spam unique
import csv
with open("spam_unique.csv", "r") as csvfile:
    reader = csv.reader(csvfile)
    spam_unique = []
    for row in reader:
        spam_unique.append(row[1])
spam_unique = spam_unique[1:]
spam_unique

['beforejun',
 'referralwar',
 'dickey',
 'offhand',
 'rkfqebbc',
 'buik',
 'pjmmfpecj',
 'mugwump',
 'custodiandot',
 'guyanagao',
 'mmtroq',
 'libhgmh',
 'qrtqqk',
 'bzovsaebl',
 'lawrentian',
 'joyou',
 'reminis',
 'benardo',
 'feldspath',
 'cowlumn',
 'tute',
 'honda',
 'ahmpan',
 'krja',
 'nformatlon',
 'emliti',
 'loquat',
 'achgh',
 'noticepend',
 'auro',
 'alkyl',
 'woodnymph',
 'glyptic',
 'wbo',
 'ainsi',
 'rnedi',
 'rachmaninoff',
 'valpso',
 'bjzvn',
 'goona',
 'aeiou',
 'eggotti',
 'investimento',
 'bedroulett',
 'jvsauukip',
 'vteed',
 'liverymen',
 'csp',
 'nondecreas',
 'brnnhg',
 'jacobit',
 'yhxpgtnzbr',
 'finhancinq',
 'chey',
 'shail',
 'wsq',
 'hershel',
 'icerigi',
 'ciy',
 'dkamkfk',
 'semen',
 'tjptavpsf',
 'zahdhi',
 'ptjial',
 'defilad',
 'dfzxj',
 'custa',
 'wtmym',
 'cyfsix',
 'vaudevillemov',
 'pblica',
 'aristogenesi',
 'onlinesuppli',
 'electoral',
 'pamactu',
 'nerveless',
 'khartoum',
 'wasserman',
 'nbort',
 'easnot',
 'rxkqggxxmn',
 'qklbmcml',
 'retr

In [2]:
# variance: average = 0
import csv
with open("x2result.csv", "r") as csvfile:
    reader = csv.reader(csvfile)
    top100_features = []
    for row in reader:
        top100_features.append(row[1])
top100_features = top100_features[1:]
top100_features

['enron',
 'vinc',
 'louis',
 'thank',
 'attach',
 'dave',
 'doc',
 'employe',
 'listbot',
 'jim',
 'question',
 'schedul',
 'congratul',
 'monday',
 'houston',
 'file',
 'salli',
 'daren',
 'kaminski',
 'energi',
 'risk',
 'texa',
 'deal',
 'password',
 'eb',
 'tomorrow',
 'jeff',
 'pm',
 'steve',
 'desk',
 'enrononlin',
 'edu',
 'cera',
 'bout',
 'confer',
 'bob',
 'kevin',
 'discuss',
 'plan',
 'lokay',
 'mike',
 'research',
 'know',
 'model',
 'capac',
 'counterparti',
 'spreadsheet',
 'request',
 'mark',
 'blackberri',
 'meter',
 'fyi',
 'util',
 'ga',
 'pleas',
 'contract',
 'volum',
 'eol',
 'pdf',
 'laci',
 'april',
 'access',
 'clickathom',
 'sitara',
 'chang',
 'ken',
 'accomplish',
 'bid',
 'url',
 'juno',
 'issu',
 'pjm',
 'san',
 'et',
 'resum',
 'locker',
 'chri',
 'hope',
 'tw',
 'revis',
 'topica',
 'wassup',
 'ect',
 'tom',
 'calger',
 'expedia',
 'tax',
 'draft',
 'duke',
 'juli',
 'fax',
 'option',
 'ena',
 'friday',
 'tanya',
 'kal',
 'octob',
 'cdnow',
 'beenladen'

In [4]:
# ori > 0, ad = 0
# The disappearing features
import csv
with open("ori1_ad0_columns.csv", "r") as csvfile:
    reader = csv.reader(csvfile)
    ad_diff = []
    for row in reader:
        ad_diff.append(row[1])
ori1_ad0_diff = ad_diff[1:]

# ori = 0, ad > 0
# The adding features
with open("ori0_ad1_columns.csv", "r") as csvfile:
    reader = csv.reader(csvfile)
    ori0_ad1_diff = []
    for row in reader:
        ori0_ad1_diff.append(row[1])
ori0_ad1_diff = ori0_ad1_diff[1:]
ori0_ad1_diff

['aaa',
 'aarhu',
 'aaron',
 'abackof',
 'abacu',
 'abacustech',
 'abandon',
 'abarch',
 'abash',
 'abba',
 'abbamont',
 'abbl',
 'abbot',
 'abc',
 'abdo',
 'abdullah',
 'abeckley',
 'abhay',
 'abi',
 'abid',
 'abigail',
 'abiiiti',
 'abil',
 'abix',
 'abklaeren',
 'abl',
 'ablx',
 'abnorm',
 'abof',
 'aborm',
 'abort',
 'aboveboard',
 'abovenet',
 'abq',
 'abr',
 'abraham',
 'abramo',
 'abreast',
 'abreo',
 'abroad',
 'absa',
 'abscissa',
 'absenc',
 'absolut',
 'absorbowac',
 'absout',
 'abstrus',
 'abyss',
 'ac',
 'academi',
 'academia',
 'acadian',
 'accentur',
 'accept',
 'access',
 'accid',
 'acco',
 'accompani',
 'accomplish',
 'accord',
 'accordingli',
 'account',
 'accountonlin',
 'accret',
 'accru',
 'accrual',
 'acctg',
 'accumul',
 'accur',
 'accuraci',
 'acervant',
 'acf',
 'acheiv',
 'achiev',
 'ack',
 'acknowledg',
 'aco',
 'acobl',
 'acoc',
 'acock',
 'acoop',
 'acorss',
 'acostello',
 'acquaint',
 'acquir',
 'acquisit',
 'acreag',
 'acredirect',
 'acrid',
 'acrobat',
 

In [3]:
# in ham & top100
ham_unique_in_top = list(set(ham_unique).intersection(set(top100_features)))
print(len(ham_unique_in_top))
ham_unique_in_top

16


['cera',
 'counterparti',
 'kal',
 'lokay',
 'enrononlin',
 'kaminski',
 'wassup',
 'topica',
 'listbot',
 'clickathom',
 'cdnow',
 'calger',
 'beenladen',
 'ena',
 'pjm',
 'sitara']

In [7]:
words14str = ""
for item in ham_unique_in_top:
    words14str = words14str + " " + item
    
words14str

' enrononlin ena reactionsnet cdnow kaminski lokay sitara counterparti clickathom topica cera eyeforenergi'

In [6]:
# ori > 1, ad = 0 & top100
ori1_ad0_in_top = list(set(ori1_ad0_diff).intersection(set(top100_features)))
print(len(ori1_ad0_in_top))
ori1_ad0_in_top

0


[]

In [7]:
# ori = 0, ad > 0 & in top100
ori0_ad1_in_top = list(set(ori0_ad1_diff).intersection(set(top100_features)))
print(len(ori0_ad1_in_top))
ori0_ad1_in_top

100


['discuss',
 'everyon',
 'vinc',
 'beenladen',
 'et',
 'pm',
 'steve',
 'cera',
 'congratul',
 'volum',
 'shirley',
 'pleas',
 'question',
 'daren',
 'amazon',
 'dave',
 'research',
 'cdnow',
 'friday',
 'april',
 'fyi',
 'eyeforenergi',
 'tanya',
 'contract',
 'risk',
 'expedia',
 'util',
 'probabl',
 'monday',
 'houston',
 'tim',
 'would',
 'password',
 'eol',
 'sitara',
 'pager',
 'accomplish',
 'chang',
 'bid',
 'energi',
 'file',
 'ga',
 'kaminski',
 'john',
 'deal',
 'saturday',
 'ken',
 'ena',
 'blackberri',
 'bob',
 'kevin',
 'tomorrow',
 'septemb',
 'employe',
 'enron',
 'ect',
 'louis',
 'jeff',
 'outag',
 'jhherbert',
 'know',
 'desk',
 'chart',
 'eb',
 'spreadsheet',
 'enrononlin',
 'texa',
 'portland',
 'var',
 'resum',
 'doc',
 'confer',
 'topica',
 'laci',
 'post',
 'capac',
 'attach',
 'octob',
 'counterparti',
 'updat',
 'chri',
 'san',
 'edu',
 'salli',
 'access',
 'duke',
 'model',
 'california',
 'meet',
 'jim',
 'mike',
 'schedul',
 'calger',
 'clickathom',
 'thank

In [8]:
# ori > 0, ad = 0 & in spam
ori1_ad0_in_spam = list(set(ori1_ad0_in_top).intersection(set(spam_unique)))
print(len(ori1_ad0_in_spam))
print(len(ori1_ad0_in_top), len(spam_unique))
ori1_ad0_in_spam

0
0 80643


[]

In [9]:
# ori > 0, ad = 0 & in ham
ori1_ad0_in_ham = list(set(ori1_ad0_in_top).intersection(set(ham_unique)))
print(len(ori1_ad0_in_ham))
print(len(ori1_ad0_in_ham), len(ham_unique))
ori1_ad0_in_ham

0
0 20471


[]

In [10]:
ori0_ad1_in_ham = list(set(ori0_ad1_in_top).intersection(set(ham_unique)))
print(len(ori0_ad1_in_ham))
print(len(ori0_ad1_in_ham), len(ham_unique))
ori0_ad1_in_ham

13
13 20471


['topica',
 'cdnow',
 'cera',
 'clickathom',
 'ena',
 'counterparti',
 'kaminski',
 'eyeforenergi',
 'beenladen',
 'calger',
 'sitara',
 'enrononlin',
 'jhherbert']

In [11]:
ori0_ad1_in_spam = list(set(ori0_ad1_in_top).intersection(set(spam_unique)))
print(len(ori0_ad1_in_spam))
print(len(ori0_ad1_in_spam), len(spam_unique))
ori0_ad1_in_spam

0
0 80643


[]