In [1]:
import pickle

import numpy as np
import sklearn
import sklearn.datasets as skd
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.metrics import mutual_info_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# joblib is a standalone package in current scikit-learn releases; older
# releases only expose it as sklearn.externals.joblib.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [2]:
# load_files treats every sub-directory of the given folder as one class and
# the files inside it as that class's documents.
# Parts 1 and 2 of the Ling-Spam dataset were pre-processed so that spam and
# legitimate e-mails sit in separate folders; the remaining parts need the same
# treatment (by hand or with a script) before they can be loaded this way.

ls_train = skd.load_files('./lingspam_public/lemm_stop/train')
ls_test = skd.load_files('./lingspam_public/lemm_stop/part10')

In [3]:
# Learn the vocabulary (one column per unique term) from the training documents
# and build the sparse document-term count matrix in the same step.
count_vect = CountVectorizer()
x_train = count_vect.fit_transform(ls_train.data)

# The vocabulary is fixed at this point, so the test documents are only
# projected onto it with transform().
x_test = count_vect.transform(ls_test.data)

In [4]:
## TF Multinomial / Bernoulli
#############################

# use_idf=False turns off the inverse-document-frequency weighting, so the raw
# counts are only rescaled into term frequencies.
tf_transformer = TfidfTransformer(use_idf = False)
x_train_tf = tf_transformer.fit_transform(x_train)
# The transformer is fitted on the training matrix only; the test matrix must
# go through transform() so nothing is ever re-learned from test data.
x_test_tf = tf_transformer.transform(x_test)

In [5]:
## Information Gain

# One mutual-information score per column of the term-frequency matrix,
# measured against the training labels.
# NOTE(review): per the scikit-learn docs, discrete_features='auto' treats a
# sparse X as discrete -- confirm this is intended for normalised frequencies.
ig_train_tf = mutual_info_classif(
    x_train_tf,
    ls_train.target,
    discrete_features='auto',
    n_neighbors=3,
    copy=True,
    random_state=None,
)


In [6]:
# Sort the column indices once by decreasing information gain; the three
# selections are just prefixes of that single ranking.
ranked_tf = np.argsort(-ig_train_tf)
top_train_10 = ranked_tf[:10]
top_train_100 = ranked_tf[:100]
top_train_1000 = ranked_tf[:1000]

# Keep a handle on the 10 best term-frequency features; top_train_10 is
# rebound later for the binary representation.
feature_names_arg_1 = top_train_10

In [7]:
from scipy.sparse import hstack
from scipy.sparse import csr_matrix

# Seed each reduced training matrix with the single best-ranked column of the
# corresponding selection; further columns are appended in later cells.
new_x_t_10, new_x_t_100, new_x_t_1000 = (
    x_train_tf[:, ranking[0]]
    for ranking in (top_train_10, top_train_100, top_train_1000)
)

# Same seeding for the test matrices, using the column indices chosen on the
# training data.
new_x_te_10, new_x_te_100, new_x_te_1000 = (
    x_test_tf[:, ranking[0]]
    for ranking in (top_train_10, top_train_100, top_train_1000)
)

In [8]:
# 10 features case

# Select the 10 top-ranked columns in one indexing step. The result depends
# only on x_train_tf / x_test_tf and top_train_10, so re-running the cell gives
# the same matrices instead of appending the columns a second time.
new_x_train_10 = x_train_tf[:, top_train_10]
new_x_test_10 = x_test_tf[:, top_train_10]

In [9]:
# Multinomial NB trained on the 10 selected term-frequency features.
# fit() returns the estimator itself; the trailing ';' hides its repr.
mNomTF = naive_bayes.MultinomialNB().fit(new_x_train_10, ls_train.target);

In [10]:
# Evaluate the 10-feature term-frequency classifier on the test fold.
y_pred_mNomTF_10 = mNomTF.predict(new_x_test_10)

acc_mNomTF_10 = mNomTF.score(new_x_test_10, ls_test.target)

print("MultiNomial NB Term Frequency with 10 features:")
print("Acc: %s" % acc_mNomTF_10)

# The metric arrays hold one entry per class; entry 1 is reported as spam.
precision, recall, f1, _ = precision_recall_fscore_support(ls_test.target, y_pred_mNomTF_10)
# Format both numbers explicitly: '"%s" % a, b' only formats a and relies on
# print() receiving b as a second argument.
print("Spam precision & recall: %s %s" % (precision[1], recall[1]))

MultiNomial NB Term Frequency with 10 features:
Acc: 0.831615120275
Spam precision & recall: 0.0 0.0


  'precision', 'predicted', average, warn_for)


In [11]:
# Cm = confusion_matrix(ls_test.target,y_pred_mNomTF_10)
# C = np.sum(Cm)
# Cm = Cm/C
# print('Confusion Matrix:')
# print(np.array_str(Cm, precision=4, suppress_small=True))
# print(precision_recall_fscore_support(ls_test.target, y_pred_mNomTF_10))

In [12]:
# 100 features case

# Select the 100 top-ranked columns in one indexing step. The result depends
# only on x_train_tf / x_test_tf and top_train_100, so the cell can be re-run
# safely and avoids rebuilding the matrix once per appended column.
new_x_train_100 = x_train_tf[:, top_train_100]
new_x_test_100 = x_test_tf[:, top_train_100]

In [13]:
# Multinomial NB on the 100 selected term-frequency features.
mNomTF = sklearn.naive_bayes.MultinomialNB()
mNomTF.fit(new_x_train_100, ls_train.target)

# Evaluate on the test fold.
y_pred_mNomTF_100 = mNomTF.predict(new_x_test_100)

acc_mNomTF_100 = mNomTF.score(new_x_test_100, ls_test.target)

print("MultiNomial NB Term Frequency with 100 features:")
print("Acc: %s" % acc_mNomTF_100)

# The metric arrays hold one entry per class; entry 1 is reported as spam.
precision, recall, f1, _ = precision_recall_fscore_support(ls_test.target, y_pred_mNomTF_100)
# Format both numbers explicitly instead of relying on print()'s extra argument.
print("Spam precision & recall: %s %s" % (precision[1], recall[1]))

MultiNomial NB Term Frequency with 100 features:
Acc: 0.900343642612
Spam precision & recall: 1.0 0.408163265306


In [14]:
# 1000 features case

# Select the 1000 top-ranked columns in one indexing step. Appending them one
# at a time rebuilt the whole matrix on every iteration and made the cell
# non-repeatable (each re-run added the columns again).
new_x_train_1000 = x_train_tf[:, top_train_1000]
new_x_test_1000 = x_test_tf[:, top_train_1000]

In [15]:
# Multinomial NB on the 1000 selected term-frequency features.
mNomTF = sklearn.naive_bayes.MultinomialNB()
mNomTF.fit(new_x_train_1000, ls_train.target)

# Evaluate on the test fold.
y_pred_mNomTF_1000 = mNomTF.predict(new_x_test_1000)

acc_mNomTF_1000 = mNomTF.score(new_x_test_1000, ls_test.target)

print("MultiNomial NB Term Frequency with 1000 features:")
print("Acc: %s" % acc_mNomTF_1000)

# The metric arrays hold one entry per class; entry 1 is reported as spam.
precision, recall, f1, _ = precision_recall_fscore_support(ls_test.target, y_pred_mNomTF_1000)
# Format both numbers explicitly instead of relying on print()'s extra argument.
print("Spam precision & recall: %s %s" % (precision[1], recall[1]))

MultiNomial NB Term Frequency with 1000 features:
Acc: 0.979381443299
Spam precision & recall: 1.0 0.877551020408


In [16]:
## Multinomial NB on binary term features
#########################################

# binary=True records only whether a term occurs in a document, not how often.
count_vect_binary = CountVectorizer(binary=True)
x_train_binary = count_vect_binary.fit_transform(ls_train.data)

# Project the test documents onto the vocabulary learned from the training set.
x_test_binary = count_vect_binary.transform(ls_test.data)

In [17]:
## Information Gain
# One mutual-information score per column of the binary term matrix, measured
# against the training labels.
ig_train_binary = mutual_info_classif(
    x_train_binary,
    ls_train.target,
    discrete_features='auto',
    n_neighbors=3,
    copy=True,
    random_state=None,
)

In [18]:
# Sort the column indices once by decreasing information gain (binary
# features); the three selections are prefixes of that single ranking.
# Note that top_train_* are rebound here and no longer refer to the
# term-frequency ranking.
ranked_binary = np.argsort(-ig_train_binary)
top_train_10 = ranked_binary[:10]
top_train_100 = ranked_binary[:100]
top_train_1000 = ranked_binary[:1000]

# Stable handles on the selected column indices for the later export and
# feature-name listing.
feature_names_arg_2 = top_train_10
feature_names_arg_2_100 = top_train_100
feature_names_arg_2_1000 = top_train_1000

In [19]:
from scipy.sparse import hstack
from scipy.sparse import csr_matrix

# Seed each reduced binary training matrix with the single best-ranked column
# of the corresponding selection; further columns are appended in later cells.
new_x_t_10, new_x_t_100, new_x_t_1000 = (
    x_train_binary[:, ranking[0]]
    for ranking in (top_train_10, top_train_100, top_train_1000)
)

# Same seeding for the binary test matrices, using the training-set ranking.
new_x_te_10, new_x_te_100, new_x_te_1000 = (
    x_test_binary[:, ranking[0]]
    for ranking in (top_train_10, top_train_100, top_train_1000)
)

In [20]:
# 10 features case

# Select the 10 top-ranked binary columns in one indexing step. The result
# depends only on x_train_binary / x_test_binary and top_train_10, so
# re-running the cell gives the same matrices instead of appending the columns
# a second time.
new_x_train_10 = x_train_binary[:, top_train_10]
new_x_test_10 = x_test_binary[:, top_train_10]

In [21]:
# Multinomial NB on the 10 selected binary features.
mNom = sklearn.naive_bayes.MultinomialNB()
mNom.fit(new_x_train_10, ls_train.target)

# Evaluate on the test fold.
y_pred_mNom_10 = mNom.predict(new_x_test_10)

acc_mNom_10 = mNom.score(new_x_test_10, ls_test.target)

print("MultiNomial NB with 10 features:")
print("Acc: %s" % acc_mNom_10)

# The metric arrays hold one entry per class; entry 1 is reported as spam.
precision, recall, f1, _ = precision_recall_fscore_support(ls_test.target, y_pred_mNom_10)
# Format both numbers explicitly instead of relying on print()'s extra argument.
print("Spam precision & recall: %s %s" % (precision[1], recall[1]))

MultiNomial NB with 10 features:
Acc: 0.951890034364
Spam precision & recall: 0.888888888889 0.816326530612


In [22]:
# 100 features case

# Select the 100 top-ranked binary columns in one indexing step; the cell no
# longer reads or extends a matrix built by an earlier cell, so it can be
# re-run safely.
new_x_train_100 = x_train_binary[:, top_train_100]
new_x_test_100 = x_test_binary[:, top_train_100]

# Multinomial NB on the 100 selected binary features.
mNom = sklearn.naive_bayes.MultinomialNB()
mNom.fit(new_x_train_100, ls_train.target)

# Evaluate on the test fold.
y_pred_mNom_100 = mNom.predict(new_x_test_100)

acc_mNom_100 = mNom.score(new_x_test_100, ls_test.target)

print("MultiNomial NB with 100 features:")
print("Acc: %s" % acc_mNom_100)

# The metric arrays hold one entry per class; entry 1 is reported as spam.
precision, recall, f1, _ = precision_recall_fscore_support(ls_test.target, y_pred_mNom_100)
# Format both numbers explicitly instead of relying on print()'s extra argument.
print("Spam precision & recall: %s %s" % (precision[1], recall[1]))

MultiNomial NB with 100 features:
Acc: 0.982817869416
Spam precision & recall: 0.978260869565 0.918367346939


In [23]:
# 1000 features case

# Select the 1000 top-ranked binary columns in one indexing step; appending
# them one at a time rebuilt the matrix on every iteration and made the cell
# non-repeatable.
new_x_train_1000 = x_train_binary[:, top_train_1000]
new_x_test_1000 = x_test_binary[:, top_train_1000]

# Multinomial NB on the 1000 selected binary features.
mNom = sklearn.naive_bayes.MultinomialNB()
mNom.fit(new_x_train_1000, ls_train.target)
# Keep a separate reference: this is the model that is exported later.
clf3 = mNom

# Evaluate on the test fold.
y_pred_mNom_1000 = mNom.predict(new_x_test_1000)

acc_mNom_1000 = mNom.score(new_x_test_1000, ls_test.target)

print("MultiNomial NB with 1000 features:")
print("Acc: %s" % acc_mNom_1000)

# The metric arrays hold one entry per class; entry 1 is reported as spam.
precision, recall, f1, _ = precision_recall_fscore_support(ls_test.target, y_pred_mNom_1000)
# Format both numbers explicitly instead of relying on print()'s extra argument.
print("Spam precision & recall: %s %s" % (precision[1], recall[1]))

MultiNomial NB with 1000 features:
Acc: 0.989690721649
Spam precision & recall: 1.0 0.938775510204


In [24]:
# Bernouli NB Case
##################


In [25]:
# 10 features
# Bernoulli NB on the current 10-column train/test matrices
# (new_x_train_10 / new_x_test_10 as last assigned above).
berNB_10 = sklearn.naive_bayes.BernoulliNB()
berNB_10.fit(new_x_train_10, ls_train.target)

y_pred_berNB_10 = berNB_10.predict(new_x_test_10)

acc_berNB_10 = berNB_10.score(new_x_test_10, ls_test.target)

print("Bernouli NB with 10 features:")
print("Acc: %s" % acc_berNB_10)

# The metric arrays hold one entry per class; entry 1 is reported as spam.
precision, recall, f1, _ = precision_recall_fscore_support(ls_test.target, y_pred_berNB_10)
# Format both numbers explicitly instead of relying on print()'s extra argument.
print("Spam precision & recall: %s %s" % (precision[1], recall[1]))

Bernouli NB with 10 features:
Acc: 0.948453608247
Spam precision & recall: 0.869565217391 0.816326530612


In [26]:
# 100 features
# Bernoulli NB on the current 100-column train/test matrices
# (new_x_train_100 / new_x_test_100 as last assigned above).
berNB_100 = sklearn.naive_bayes.BernoulliNB()
berNB_100.fit(new_x_train_100, ls_train.target)

y_pred_berNB_100 = berNB_100.predict(new_x_test_100)

acc_berNB_100 = berNB_100.score(new_x_test_100, ls_test.target)

print("Bernouli NB with 100 features:")
print("Acc: %s" % acc_berNB_100)

# The metric arrays hold one entry per class; entry 1 is reported as spam.
precision, recall, f1, _ = precision_recall_fscore_support(ls_test.target, y_pred_berNB_100)
# Format both numbers explicitly instead of relying on print()'s extra argument.
print("Spam precision & recall: %s %s" % (precision[1], recall[1]))

Bernouli NB with 100 features:
Acc: 0.945017182131
Spam precision & recall: 1.0 0.673469387755


In [27]:
# 1000 features
# Bernoulli NB on the current 1000-column train/test matrices
# (new_x_train_1000 / new_x_test_1000 as last assigned above).
berNB_1000 = sklearn.naive_bayes.BernoulliNB()
berNB_1000.fit(new_x_train_1000, ls_train.target)

y_pred_berNB_1000 = berNB_1000.predict(new_x_test_1000)

acc_berNB_1000 = berNB_1000.score(new_x_test_1000, ls_test.target)

print("Bernouli NB with 1000 features:")
print("Acc: %s" % acc_berNB_1000)

# The metric arrays hold one entry per class; entry 1 is reported as spam.
precision, recall, f1, _ = precision_recall_fscore_support(ls_test.target, y_pred_berNB_1000)
# Format both numbers explicitly instead of relying on print()'s extra argument.
print("Spam precision & recall: %s %s" % (precision[1], recall[1]))

Bernouli NB with 1000 features:
Acc: 0.93470790378
Spam precision & recall: 1.0 0.612244897959


In [33]:
# Single hold-out split of the 1000-feature training matrix: 40% of the rows
# are set aside for validation, with a fixed seed so the split is repeatable.
svm_split = train_test_split(
    new_x_train_1000, ls_train.target, test_size=0.4, random_state=0)
x_train_svm, x_test_svm, y_train_svm, y_test_svm = svm_split

In [34]:
# SVM
# Support-vector classifier with library-default settings, trained and scored
# on the hold-out split of the training data (not on the part10 test fold).
clf = SVC()
clf.fit(x_train_svm, y_train_svm)

y_pred_svm_1000 = clf.predict(x_test_svm)

# Accuracy = fraction of hold-out predictions that match the true label.
svm_acc = np.mean(y_pred_svm_1000 == y_test_svm)
print('Acc: {0:f}'.format(svm_acc))

# The metric arrays hold one entry per class; entry 1 is reported as spam.
precision, recall, f1, _ = precision_recall_fscore_support(y_test_svm, y_pred_svm_1000)
# Format both numbers explicitly instead of relying on print()'s extra argument.
print("Spam precision & recall: %s %s" % (precision[1], recall[1]))

Acc: 0.928915
Spam precision & recall: 1.0 0.581920903955


In [36]:
# Persist the column indices of the 1000 selected features. Despite the .txt
# extension the file is a binary pickle (opened in "wb" mode).
with open("top_1000.txt", "wb") as fp:   #Pickling
    pickle.dump(feature_names_arg_2_1000, fp)

# Persist the fitted 1000-feature multinomial NB model (clf3). joblib.dump
# returns the list of written file names, which is shown as the cell output.
joblib.dump(clf3, 'multinomial_NB_1000.pkl')

['multinomial_NB_1000.pkl']

In [37]:
# print out the top 10 features selected
# (iterate feature_names_arg_2_100 or feature_names_arg_2_1000 instead to list
# the top 100 / top 1000)

# The indices were ranked on the binary count matrix, so the names are looked
# up on the vectorizer that produced that matrix.
# get_feature_names_out is the accessor in newer scikit-learn releases;
# fall back to get_feature_names where it is not available.
_get_names = getattr(count_vect_binary, "get_feature_names_out", None)
if _get_names is None:
    _get_names = count_vect_binary.get_feature_names

# Build the vocabulary list once rather than on every loop iteration.
feature_names = _get_names()

# top 10
for item in feature_names_arg_2:
    print(feature_names[int(item)])



language
remove
free
linguistic
university
money
click
market
our
business
