In [1]:
import os
import numpy as np
import re
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import nltk
from collections import Counter
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
import pickle
   


In [2]:
ps = PorterStemmer()
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

In [3]:
# Creates dictionary from all the emails in the directory
def build_dictionary(dir):
  # Read the file names
  emails = os.listdir(dir)
  emails.sort()
  # Array to hold all the words in the emails
  dictionary = []

  # Collecting all words from those emails
  for email in emails:
    m = open(os.path.join(dir, email))
    for i, line in enumerate(m):
      if i == 2: # Body of email is only 3rd line of text file
        words = line.split()
        dictionary += words

  # We now have the array of words, whoch may have duplicate entries
  dictionary = list(set(dictionary)) # Removes duplicates

  # Removes puctuations and non alphabets
  for index, word in enumerate(dictionary):
    if (word.isalpha() == False) or (len(word) == 1):
      del dictionary[index]
   #remove stopwords 
  for count,word in enumerate(dictionary):
    if word in stop_words:
        del dictionary[count]   
  dictionar_stem=[]
  for w in dictionary:
        dictionar_stem.append(lemmatizer.lemmatize(w.lower()))  
#   dictionary = Counter(dictionar_stem)
#   dictionary = dictionary.most_common(most)
  

  return dictionar_stem

In [4]:
def build_features(dir, dictionary):
  # Read the file names
  emails = os.listdir(dir)
  emails.sort()
  # ndarray to have the features
  features_matrix = np.zeros((len(emails), len(dictionary)))

  # collecting the number of occurances of each of the words in the emails
  for email_index, email in enumerate(emails):
    m = open(os.path.join(dir, email))
    for line_index, line in enumerate(m):
      if line_index == 2: #in each email body is at third line, which make index 2
        words = line.split()
        for word_index, word in enumerate(dictionary):
          features_matrix[email_index, word_index] = words.count(word)

  return features_matrix

In [5]:
def build_labels(dir):
  # Read the file names
  emails = os.listdir(dir)
  emails.sort()
  # ndarray of labels
  labels_matrix = np.zeros(len(emails))

  for index, email in enumerate(emails):
    labels_matrix[index] = 1 if re.search('spms*', email) else 0

  return labels_matrix

In [6]:
#choose 9 folder for traing and the remaining one for testing
files=list(range(1,10))
print(files)
print('Building dictionary')
train_dir=[]
test_file=files.pop(7)
for i in files:
        train_dir.append('../input/emaildataset/emaildataset/part{}'.format(i))
sub_dictionary=[]
for i in train_dir:
    sub_dictionary.append(build_dictionary(i))
#convert sub list (dictionary) to one flat list
flat_list = [item for sublist in sub_dictionary for item in sublist]
dictionary=list(set(flat_list)) #remove duplicated
print(files)
print('dictionary length', len(dictionary))

[1, 2, 3, 4, 5, 6, 7, 8, 9]
Building dictionary
[1, 2, 3, 4, 5, 6, 7, 9]
dictionary length 45591


In [7]:
len(dictionary)

45591

In [8]:
%%time
print('Building training features and labels')
sub_features_train=[]
sub_labels_train=[]
for i in train_dir:
    sub_features_train.append(build_features(i, dictionary)) #X train
    sub_labels_train.append(build_labels(i)) #y train
features_train=np.concatenate(sub_features_train)
labels_train=np.concatenate(sub_labels_train)    

Building training features and labels
CPU times: user 17min 18s, sys: 2.38 s, total: 17min 21s
Wall time: 17min 22s


In [9]:
test_file

8

In [10]:
test_dir = '../input/emaildataset/emaildataset/part{}'.format(test_file)
print('4. Building the test features and labels')
features_test = build_features(test_dir, dictionary) #X_test
labels_test = build_labels(test_dir) #y_test

4. Building the test features and labels


In [11]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='liblinear',random_state = 0)
# from sklearn.tree import DecisionTreeClassifier
# classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
#classifier.fit(X_train, y_train)
print('3. Training the classifier')
classifier.fit(features_train, labels_train) #X train, y_train

3. Training the classifier




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [12]:
print('5. Calculating accuracy of the trained classifier')
accuracy = classifier.score(features_test, labels_test)
print(accuracy)

5. Calculating accuracy of the trained classifier
0.986159169550173


In [13]:
#Classification report
label_pred = classifier.predict(features_test) # ypred #feature test ---> y test
print(classification_report(labels_test , label_pred ))


              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       241
         1.0       1.00      0.92      0.96        48

   micro avg       0.99      0.99      0.99       289
   macro avg       0.99      0.96      0.97       289
weighted avg       0.99      0.99      0.99       289



In [14]:
indices = [i for i in range(len(labels_test)) if labels_test[i] != label_pred[i]]
wrong_predictions = features_test[indices,:]


In [15]:
with open('text_classifier', 'wb') as picklefile:  
    pickle.dump(classifier,picklefile)

In [16]:
from sklearn.externals import joblib

# Save to file in the current working directory
joblib_file = "text_classification.pkl"  
joblib.dump(classifier, joblib_file)


['text_classification.pkl']

In [17]:
with open('dictionary', 'wb') as f:
    pickle.dump(dictionary, f)

In [18]:
print(indices)

[262, 264, 265, 267]


In [19]:
print(test_dir)

../input/emaildataset/emaildataset/part8


In [20]:
import glob
import os
all_files = glob.glob(os.path.join(test_dir, "*.txt")) 

In [21]:
for i in indices:
    print(open(all_files[i]).read())

Subject: analysts choose " adlu " microcap bestock pick

* * * * * we believe in opt-in policies . if you did not request this investor email , please do not reply , you will be taken off our email list automatically , or reply with remove * * adlu - otc : bb - patented , trademarked products , top management , sec form 10 reporting compliance , big four accounting and audits , international advertising campaign , digital imaging and internet applications , good investor relations , small float . this micro - cap is doing everything right ! - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - dear investor , the analysts at bestockpix have chosen adlu as a micro - cap june ' 99 first pick . this swiss based , surface coating technology company is doing everything investors look for . the adlu consu