# 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'

# 2. Import data

In [2]:
data = pd.read_csv("../data/real/spam.csv",encoding='latin-1', header = 0, sep='\t')
data['Class'] = data.Class.map({'ham':0, 'spam':1})

In [40]:
data.head()

Unnamed: 0,Class,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [41]:
#Count observations in each label
print data.Class.value_counts()

ham     4825
spam     747
Name: Class, dtype: int64


In [3]:
# convert label to a numerical variable
x_train = data['Text'].values
y_train = data['Class'].values

# 4.Text Transformation
Various text transformation techniques such as stop word removal, lowering the texts, tfidf transformations, prunning, stemming can be performed using sklearn.feature_extraction libraries. Then, the data can be convereted into bag-of-words. <br> <br>
For this problem, Let us see how our model performs without removing stop words.

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
vect = TfidfVectorizer()
x_train = vect.fit_transform(x_train)

In [5]:
x_train.shape

(5572, 8749)

In [6]:
# Reduce data dimension
x_train = TruncatedSVD(n_components=100).fit_transform(x_train)

In [220]:
import pickle
pickle.dump({'x':x_train, 'y':np.eye(2)[y_train.astype('int32')]}, open('../data/real/spam/train.pkl', 'wb+'))

# 6. Machine Learning models:

In [7]:
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import LinearSVC as SVM
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.cross_validation import train_test_split # to split the data
from sklearn.cross_validation import KFold # For cross vbalidation
from sklearn.model_selection import GridSearchCV # for tunnig hyper parameter it will use all combination of given parameters
from sklearn.model_selection import RandomizedSearchCV # same for tunning hyper parameter but will use random combinations of parameters
from sklearn.metrics import (confusion_matrix,recall_score,precision_recall_curve,auc,roc_curve,roc_auc_score,classification_report,
                             accuracy_score,confusion_matrix,classification_report)
from imblearn.pipeline import make_pipeline
from time import time



In [8]:
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import precision_recall_curve, auc, f1_score, precision_recall_fscore_support
from imblearn.metrics import geometric_mean_score

def evaluate_auc_prc(y, pred):
    precision, recall, thresholds = precision_recall_curve(y, pred)
    aucprc = auc(recall, precision)
    #print 'AUPRC:{}'.format(aucprc)
    #plt.title('Precision Recall Curve')
    #plt.plot(precision, recall, 'b',label='AUC = %0.2f'% aucprc)
    #plt.legend(loc='lower right')
    #plt.ylabel('Precision')
    #plt.xlabel('Recall')
    #plt.show()
    return aucprc


def evaluate_f1(y, y_pred, pos_label=1):
    precision, recall, f1, support = precision_recall_fscore_support(y, y_pred, pos_label=1)
    print classification_report(y, y_pred)
    return f1[1]

def evaluate_f2(y, y_pred, pos_label=1):
    precision, recall, f1, support = precision_recall_fscore_support(y, y_pred, pos_label=1)
    #print classification_report(y, y_pred)
    f2 = (1+0.5**2)*(precision[1]*recall[1])/(0.5**2*precision[1]+recall[1])
   
    return f2

aucprc_score = make_scorer(evaluate_auc_prc, greater_is_better=True)

gms = make_scorer(geometric_mean_score, greater_is_better=True)

f1_scorer = make_scorer(evaluate_f1, greater_is_better=True)

f2_scorer = make_scorer(evaluate_f2, greater_is_better=True)

def data_split(x, test_size=0.4, seed=1): # preparing data for training and testing as we are going to use different data 
    #again and again so make a function
    x_features= x.ix[:,x.columns != "Class"]
    x_labels=x.ix[:,x.columns=="Class"]
    x_features_train,x_features_test,x_labels_train,x_labels_test = train_test_split(x_features,x_labels,
                                                                                     test_size=test_size, 
                                                                                     random_state=seed)
    x_features_train.index = np.arange(len(x_features_train))
    x_labels_train.index = np.arange(len(x_labels_train))
    x_features_test.index = np.arange(len(x_features_test))
    x_labels_test.index = np.arange(len(x_labels_test))
    print("length of training data")
    print(len(x_features_train))
    print("length of test data")
    print(len(x_features_test))
    return(x_features_train,x_features_test,x_labels_train,x_labels_test)

In [10]:
# Choose classifer with cross validation
# Supervised Model: LR, SVM (linear), SVM (rbf), DT, KNN, MLP
# Dataset: original
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import matthews_corrcoef

# LR
# best_c = 0
# best_reward = 0
# for c in [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]:
#     clf = LR(C=c, random_state=0)
#     reward = np.array(cross_val_score(clf, x_train, y_train, cv=5, n_jobs=5, scoring=f2_scorer)).mean()
#     if reward > best_reward:
#         best_reward = reward
#         best_c = c
# print 'best_c for LR:{}'.format(best_c)
# print 'best_reward for LR:{}'.format(best_reward)

# # SVM (linear)
# best_c = 0
# best_reward = 0
# for c in [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]:
#     clf = SVM(C=c, random_state=0)
#     reward = np.array(cross_val_score(clf, x_train, y_train, cv=5, n_jobs=5, scoring=f2_scorer)).mean()
#     if reward > best_reward:
#         best_reward = reward
#         best_c = c
# print 'best_c for SVM:{}'.format(best_c)
# print 'best_reward for SVM:{}'.format(best_reward)

# # SVM (rbf)
# best_c = 0
# best_reward = 0
# for c in [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]:
#     clf = SVC(C=c, kernel='rbf', random_state=0)
#     reward = np.array(cross_val_score(clf, x_train, y_train, cv=5, n_jobs=5, scoring=f2_scorer)).mean()
#     if reward > best_reward:
#         best_reward = reward
#         best_c = c
# print 'best_c for SVC:{}'.format(best_c)
# print 'best_reward for SVC:{}'.format(best_reward)

# # DT 
# best_c = 0
# best_reward = 0
# for c in [1, 2, 3, 4, 5, 6, 7, 8]:
#     clf = DT(max_depth=c, random_state=0)
#     reward = np.array(cross_val_score(clf, x_train, y_train, cv=5, n_jobs=5, scoring=f2_scorer)).mean()
#     if reward > best_reward:
#         best_reward = reward
#         best_c = c
# print 'best_c for DT:{}'.format(best_c)
# print 'best_reward DT:{}'.format(best_reward)

# # KNN 
# best_c = 0
# best_reward = 0
# for c in [1, 2, 3, 4, 5, 6, 7, 8]:
#     clf = KNN(n_neighbors=c)
#     reward = np.array(cross_val_score(clf, x_train, y_train, cv=5, n_jobs=5, scoring=f2_scorer)).mean()
#     if reward > best_reward:
#         best_reward = reward
#         best_c = c
# print 'best_c for KNN:{}'.format(best_c)
# print 'best_reward KNN:{}'.format(best_reward)

# MLP 
best_c = 0
best_reward = 0
for c in [1, 2, 3, 4, 5, 6, 7, 8]:
    clf = MLP(hidden_layer_sizes=[c], random_state=0)
    reward = np.array(cross_val_score(clf, x_train, y_train, cv=5, n_jobs=5, scoring=f2_scorer)).mean()
    if reward > best_reward:
        best_reward = reward
        best_c = c
print 'best_c for MLP:{}'.format(best_c)
print 'best_reward MLP:{}'.format(best_reward)

best_c for MLP:8
best_reward MLP:0.93270938744


###  Multinomial Naive Bayes
Generally, Naive Bayes works well on text data. Multinomail Naive bayes is best suited for classification with discrete features. 

In [192]:
# Choose hyper-parameters with cross validation
# Supervised Model: NB
# Dataset: original
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC


best_c = 0
best_reward = 0
for c in [1e2]:
    print('c:{}'.format(c))
    clf = MultinomialNB()
    clf.fit(x_train, y_train)
    preds = clf.predict(x_train)
    reward = np.array(cross_val_score(clf, x_train, y_train, cv=2, n_jobs=2, 
                                      scoring=f2_scorer)).mean()
    if reward > best_reward:
        best_reward = reward
        best_c = c
print 'best_c:{}'.format(best_c)
print 'best_reward for NB:{}'.format(best_reward)

c:100.0
best_c:0
best_reward for NB:0


###  Logistic Regression

In [216]:
# Choose hyper-parameters with cross validation
# Supervised Model: LR
# Dataset: original
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC


best_c = 0
best_reward = 0
for c in [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6]:
    clf = LR(C=c, random_state=0)
    reward = np.array(cross_val_score(clf, x_train, y_train, cv=5, n_jobs=5, 
                                      scoring=f2_scorer)).mean()
    if reward > best_reward:
        best_reward = reward
        best_c = c
print 'best_c:{}'.format(best_c)
print 'best_reward for LR:{}'.format(best_reward)
clf = LR(C=best_c, random_state=0)

best_c:10.0
best_reward for LR:0.930359664121


In [217]:
clf =LR(C=1e1, random_state=0)
clf.fit(x_train, y_train)
preds = clf.predict(x_train)
print evaluate_f2(y_train, preds)

0.935938877461


### Support Vector Machine

In [215]:
# Choose hyper-parameters with cross validation
# Supervised Model: LR
# Dataset: original
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

best_c = 0
best_reward = 0
for c in [1e1, 1e2, 1e3]:
    clf = SVM(C=c, random_state=0)
    reward = np.array(cross_val_score(clf, x_train, y_train, cv=5, n_jobs=5, 
                                      scoring=f2_scorer)).mean()
    if reward > best_reward:
        best_reward = reward
        best_c = c
print 'best_c:{}'.format(best_c)
print 'best_reward for linear SVM:{}'.format(best_reward)

best_c:1000.0
best_reward for linear SVM:0.92530228667


In [198]:
clf = SVM(C=1e1, random_state=0)
clf.fit(x_train, y_train)
preds = clf.predict(x_train)
print evaluate_f2(y_train, preds)

0.939892259711


###  $k$-NN classifier

In [213]:
# Choose model with cross validation
# Supervised Model: KNN
# Dataset: original
from sklearn.model_selection import cross_val_score

best_c = 0
best_reward = 0
for c in np.arange(3,10):
    clf = KNN(n_neighbors=c)
    reward = np.array(cross_val_score(clf, x_train, y_train, cv=5, n_jobs=5, 
                                      scoring=f2_scorer)).mean()
    if reward > best_reward:
        best_reward = reward
        best_c = c
print 'best_c:{}'.format(best_c)
print 'best_reward for KNN:{}'.format(best_reward)

best_c:6
best_reward for KNN:0.922153122758


In [214]:
clf = KNN(n_neighbors=6)
clf.fit(x_train, y_train)
preds = clf.predict(x_train)
print evaluate_f2(y_train, preds)

0.947978690066


###  Decision Tree

In [219]:
# Choose model with cross validation
# Supervised Model: DT
# Dataset: original
from sklearn.model_selection import cross_val_score

best_c = 0
best_reward = 0
for c in np.arange(2, 20):
    clf = DT(max_depth=c)
    reward = np.array(cross_val_score(clf, x_train, y_train, cv=5, n_jobs=5, 
                                      scoring=f2_scorer)).mean()
    if reward > best_reward:
        best_reward = reward
        best_c = c
print 'best_c:{}'.format(best_c)
print 'best_reward for DT:{}'.format(best_reward)

best_c:6
best_reward for DT:0.839226818602


In [218]:
clf = DT(max_depth=6)
clf.fit(x_train, y_train)
preds = clf.predict(x_train)
print evaluate_f2(y_train, preds)

0.936002255427


###  AdaBoost

In [207]:
# Choose model with cross validation
# Supervised Model: AdaBoost
# Dataset: original
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

best_c = 0
best_reward = 0
for c in np.arange(1, 2):
    clf = AdaBoostClassifier()
    reward = np.array(cross_val_score(clf, x_train, y_train, cv=5, n_jobs=5, 
                                      scoring=f2_scorer)).mean()
    if reward > best_reward:
        best_reward = reward
        best_c = c
print 'best_c:{}'.format(best_c)
print 'best_reward for AdaBoost:{}'.format(best_reward)

best_c:1
best_reward for AdaBoost:0.902406624162


# Sampling Evaluation with LR

In [None]:
# Performance on original dataset with the chosen model.
clf = LR(C=1e1, random_state=0)
clf.fit(x_train, y_train)
preds = clf.predict(x_train)
print evaluate_f2(y_train, preds)
# print 'reward on original dataset:{}'.format(reward)

In [228]:
# Choose under-sampling ratio for different under-sampling methods
# Supervised Model: SVM(C=1e2, kernel='rbf', random_state=0)
# Under sampling methods: Random, ENN, Cluster, TomekLinks, ALLKNN
from imblearn.under_sampling import (NearMiss, RandomUnderSampler, EditedNearestNeighbours, 
                                     CondensedNearestNeighbour, ClusterCentroids, TomekLinks,
                                     RepeatedEditedNearestNeighbours, AllKNN)
from imblearn.pipeline import make_pipeline
from collections import Counter


train_x = x_train
train_y = y_train
pos_num = (train_y == 1).sum()
neg_num = (train_y == 0).sum()

for sampler, name, time in zip([RandomUnderSampler, EditedNearestNeighbours, ClusterCentroids, TomekLinks, AllKNN],
                         ['Random', 'ENN', 'Cluster', 'TomekLinks', 'ALLKNN'], [50, 1, 1, 1, 1]):
    max_i = 1
    best_reward = -1
    for i in np.arange(1, 1.0*neg_num/pos_num, 0.5):
        sample = sampler(ratio={0:int(i*pos_num)})
        train_x_s, train_y_s = sample.fit_sample(train_x, train_y)
        clf.fit(train_x_s, train_y_s)
        preds = clf.predict(train_x)
        reward = evaluate_f2(train_y, preds)
        if best_reward < reward:
            best_reward = reward    
            max_i = i
    print 'best under-sampling ratio with {}:{}'.format(name, max_i)
    
    best_reward = 0
    for i in np.arange(time):
        sample = sampler(ratio={0:int(max_i*pos_num)})
        train_x_s, train_y_s = sample.fit_sample(train_x, train_y)
        clf.fit(train_x_s, train_y_s)
        preds = clf.predict(train_x)
        reward = evaluate_f2(train_y, preds)
        if best_reward < reward:
            best_reward = reward    
    print 'best reward with {}:{}'.format(name, best_reward)

best under-sampling ratio with Cluster:6.0
best reward:0.931982633864
best reward with Cluster:0.931982633864
best under-sampling ratio with TomekLinks:6.0
best reward:0.935938877461
best reward with TomekLinks:0.935938877461
best under-sampling ratio with ALLKNN:6.0
best reward:0.933485583785
best reward with ALLKNN:0.933485583785


In [12]:
# Perform EasyEnsemble and BalanceCascade
from imblearn.ensemble import EasyEnsemble, BalanceCascade

train_x = x_train
train_y = y_train
pos_num = (train_y == 1).sum()
neg_num = (train_y == 0).sum()

for sampler, name in zip([EasyEnsemble, BalanceCascade],
                         ['EasyEnsemble']):
    max_i = 1
    best_reward = -1
    for i in np.arange(1, 1.0*neg_num/pos_num, 0.2):
        sample = sampler(ratio={0:int(i*pos_num)}, replacement=False, n_subsets=10)
        train_x_s, train_y_s = sample.fit_sample(x_train, y_train)
        preds = None
        for x, y in zip(train_x_s, train_y_s):
            clf.fit(x, y)
            if preds is None:
                preds = clf.predict(train_x)
            else:
                preds += clf.predict(train_x)
        preds = (preds >= ((len(train_x_s)+1)/2)).astype('int32')
        reward = evaluate_f2(train_y, preds)
        if best_reward < reward:
            best_reward = reward    
            max_i = i
    print 'best under-sampling ratio for {}:{}'.format(name, max_i)
    print 'best reward for {}:{}'.format(name, best_reward)

best under-sampling ratio for EasyEnsemble:6.2
best reward for EasyEnsemble:0.938864628821


In [247]:
# Choose over-sampling ratio for different orver-sampling methods
# Supervised Model: DT(max_depth=4)
# Over sampling methods: Random, SMOTE, ADASYN
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.pipeline import make_pipeline
from collections import Counter

for sampler, name, time in zip([RandomOverSampler, SMOTE, ADASYN],
                         ['RandomOverSampler', 'SMOTE', 'ADASYN'], [10, 10, 10]):
    max_i = 1
    best_reward = -1
    for i in np.arange(1, 6, 0.2):
        sample = sampler(ratio={1:int(i*pos_num)})
        train_x_s, train_y_s = sample.fit_sample(train_x, train_y)
        clf.fit(train_x_s, train_y_s)
        preds = clf.predict(train_x)
        reward = evaluate_f2(train_y, preds)
        if best_reward < reward:
            best_reward = reward    
            max_i = i
    print 'best over-sampling ratio for {}:{}'.format(name, max_i)
    best_reward = 0
    bset_set = None
    for i in np.arange(time):
        sample = sampler(ratio={1:int(max_i*pos_num)})
        train_x_s, train_y_s = sample.fit_sample(train_x, train_y)
        clf.fit(train_x_s, train_y_s)
        preds = clf.predict(train_x)
        reward = evaluate_f2(train_y, preds)
        if best_reward < reward:
            best_reward = reward   
            bset_set = (train_x_s, train_y_s)
    print 'best reward with {}:{}'.format(name, best_reward)
    pickle.dump({'x':bset_set[0], 'y':np.eye(2)[bset_set[1].astype('int32')]}, open('../data/real/spam/train_{}.pkl'.format(name), 'wb+'))
    

best over-sampling ratio for RandomOverSampler:1.0
best reward with RandomOverSampler:0.935938877461
best over-sampling ratio for SMOTE:1.0
best reward with SMOTE:0.935938877461
best over-sampling ratio for ADASYN:1.0
best reward with ADASYN:0.935938877461


In [241]:
pos_num

747