### Download Spam Mail Dataset


In [1]:
! curl http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/preprocessed/enron1.tar.gz --output enron1.tar.gz
! tar -xf enron1.tar.gz enron1

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1760k  100 1760k    0     0   118k      0  0:00:14  0:00:14 --:--:--  228k


In [2]:
! ls -1 enron1/ham/*.txt | wc -l # print the number of non-spam emails

    3672


In [3]:
! ls -1 enron1/spam/*.txt | wc -l # print the number of spam emails

    1500


In [4]:
! cat enron1/ham/0007.1999-12-14.farmer.ham.txt # print an example of non-spam (ham) email

Subject: mcmullen gas for 11 / 99
jackie ,
since the inlet to 3 river plant is shut in on 10 / 19 / 99 ( the last day of
flow ) :
at what meter is the mcmullen gas being diverted to ?
at what meter is hpl buying the residue gas ? ( this is the gas from teco ,
vastar , vintage , tejones , and swift )
i still see active deals at meter 3405 in path manager for teco , vastar ,
vintage , tejones , and swift
i also see gas scheduled in pops at meter 3404 and 3405 .
please advice . we need to resolve this as soon as possible so settlement
can send out payments .
thanks

### Download Packages

In [6]:
! pip3 install nltk
! python -m nltk.downloader all #이 부분에서 많은 시간을 소요합니다.

Defaulting to user installation because normal site-packages is not writeable
[31mERROR: Could not find a version that satisfies the requirement ntlk (from versions: none)[0m
[31mERROR: No matching distribution found for ntlk[0m
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
/System/Library/Frameworks/Python.framework/Versions/2.7/Resources/Python.app/Contents/MacOS/Python: No module named nltk


### Load Dataset

In [7]:
import glob, os

# init
"""
emails: a set of email
labels: a set of label representing whetere the gien email is spam or ham
  - spam: 1
  - ham: 0
"""

emails, labels = [], []
parition = 0

In [8]:
# load spam dataset
file_path = 'enron1/spam'

for fname in glob.glob(os.path.join(file_path, '*.txt')):
    with open(fname, 'r', encoding='ISO-8859-1') as f: # [!important] check encofing format
      emails.append(f.read())
      labels.append(1)

file_path = 'enron1/ham'
for fname in glob.glob(os.path.join(file_path, '*.txt')):
    with open(fname, 'r', encoding='ISO-8859-1') as f: 
      emails.append(f.read())
      labels.append(0) 

print('# of emails = {}\n# of labels = {}'.format(len(emails), len(labels)))

# of emails = 5172
# of labels = 5172


### Data Preprocessing
  - remove number and punctuation
  - remove name entity
  - remove stopword
  - lemmatization

In [55]:
# remove number and punctuation 
def letters_only(word):
    return word.isalpha()

# remove name entity
from nltk.corpus import names
all_names = set(names.words())

# lemmaization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


# put all together to clean texts
def clean_text(doc):
    cleaned_doc = []
    for word in doc.split(' '): # split doc. by blank (' ')
        word = word.lower() # ABD -> abd
        if letters_only(word) and word not in all_names and len(word) > 2: # remove number and punc. and name entity
            cleaned_doc.append(lemmatizer.lemmatize(word))
            
    return ' '.join(cleaned_doc) 

cleaned_emails = [clean_text(doc) for doc in emails]

### Data Preparation
* Split data into train and test set.

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

X_train, X_test, Y_train, Y_test = train_test_split(cleaned_emails, labels, test_size=0.33, random_state=486)

cv = CountVectorizer(stop_words='english', max_features=500)
term_docs_train = cv.fit_transform(X_train) # get counter vector for X_train
term_docs_test = cv.transform(X_test) # get counter vector for X_test

### Import Model and Train

In [61]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB

gnb = MultinomialNB()
gnb.fit(term_docs_train.todense(), Y_train)
y_pred = gnb.predict(term_docs_test.todense())



### Simple Evaluation

In [60]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(Y_test, y_pred)
print("Accuracy of the model is: {:.2f}".format(acc))


Accuracy of the model is: 0.90
