In [1]:
import pandas as pd
import numpy as np
import random

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_curve, auc

from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv('../data/SMSSpamCollection', sep='\t', header=None)
data['Text'] = data[1].str.replace('[^\w\s]','')
data.columns = ['label', 'Full Text', 'Text']
data['Lower Case Text'] = data['Text'].str.lower()

In [3]:
data

Unnamed: 0,label,Full Text,Text,Lower Case Text
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...
5,spam,FreeMsg Hey there darling it's been 3 week's n...,FreeMsg Hey there darling its been 3 weeks now...,freemsg hey there darling its been 3 weeks now...
6,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me T...,even my brother is not like to speak with me t...
7,ham,As per your request 'Melle Melle (Oru Minnamin...,As per your request Melle Melle Oru Minnaminun...,as per your request melle melle oru minnaminun...
8,spam,WINNER!! As a valued network customer you have...,WINNER As a valued network customer you have b...,winner as a valued network customer you have b...
9,spam,Had your mobile 11 months or more? U R entitle...,Had your mobile 11 months or more U R entitled...,had your mobile 11 months or more u r entitled...


In [4]:
labels, counts = np.unique(data['label'], return_counts=True)
encoder = preprocessing.LabelEncoder()
encoder.fit(labels[np.argsort(-counts)])
data['y'] = encoder.transform(data['label'])

In [8]:
data['y'].sum()/len(data['y'])

0.13406317300789664

In [5]:
random.seed(42)
mask_train = np.random.random(data.shape[0]) < 0.8
data_train = data[mask_train]
data_test = data.iloc[~mask_train, :]

# Count Vectorizer

In [6]:
count_vect = CountVectorizer()
count_data_train_transformed = count_vect.fit_transform(data_train['Lower Case Text'])
count_data_test_transformed = count_vect.transform(data_test['Lower Case Text'])

In [7]:
classifier = LogisticRegression()

In [8]:
classifier.fit(count_data_train_transformed, data_train['y'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
count_fpr, count_tpr, count_thresholds = roc_curve(data_train['y'], classifier.predict_proba(count_data_train_transformed)[:,1])
auc(count_fpr, count_tpr)

0.9999480684912223

In [10]:
count_fpr, count_tpr, count_thresholds = roc_curve(data_test['y'], classifier.predict_proba(count_data_test_transformed)[:,1])
auc(count_fpr, count_tpr)

0.9901939530581569

# TFIDF Vectorizer

In [11]:
tfidf_vect = TfidfVectorizer()
tfidf_data_train_transformed = tfidf_vect.fit_transform(data_train['Lower Case Text'])
tfidf_data_test_transformed = tfidf_vect.transform(data_test['Lower Case Text'])

In [12]:
classifier.fit(tfidf_data_train_transformed, data_train['y'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
tfidf_fpr, tfidf_tpr, tfidf_thresholds = roc_curve(data_train['y'], classifier.predict_proba(tfidf_data_train_transformed)[:,1])
auc(tfidf_fpr, tfidf_tpr)

0.9974077164213438

In [14]:
tfidf_fpr, tfidf_tpr, tfidf_thresholds = roc_curve(data_test['y'], classifier.predict_proba(tfidf_data_test_transformed)[:,1])
auc(tfidf_fpr, tfidf_tpr)

0.988835414498786

# N-Gram Classification

## BiGram Count Vect

In [15]:
bigram_count_vect = CountVectorizer(ngram_range = (2,2))
bicount_data_train_transformed = bigram_count_vect.fit_transform(data_train['Lower Case Text'])
bicount_data_test_transformed = bigram_count_vect.transform(data_test['Lower Case Text'])

In [16]:
classifier.fit(bicount_data_train_transformed, data_train['y'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
bicount_fpr, bicount_tpr, bicount_thresholds = roc_curve(data_train['y'], classifier.predict_proba(bicount_data_train_transformed)[:,1])
auc(bicount_fpr, bicount_tpr)

0.9999905579074949

In [18]:
bicount_fpr, bicount_tpr, bicount_thresholds = roc_curve(data_test['y'], classifier.predict_proba(bicount_data_test_transformed)[:,1])
auc(bicount_fpr, bicount_tpr)

0.9795135275754422

## BiGram TFIDF Vect

In [19]:
bigram_tfidf_vect = TfidfVectorizer(ngram_range = (2,2))
bitfidf_data_train_transformed = bigram_tfidf_vect.fit_transform(data_train['Lower Case Text'])
bitfidf_data_test_transformed = bigram_tfidf_vect.transform(data_test['Lower Case Text'])

In [20]:
classifier.fit(bitfidf_data_train_transformed, data_train['y'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
bitfidf_fpr, bitfidf_tpr, bitfidf_thresholds = roc_curve(data_train['y'], classifier.predict_proba(bitfidf_data_train_transformed)[:,1])
auc(bitfidf_fpr, bitfidf_tpr)

0.9999892703494261

In [22]:
bitfidf_fpr, bitfidf_tpr, bitfidf_thresholds = roc_curve(data_test['y'], classifier.predict_proba(bitfidf_data_test_transformed)[:,1])
auc(bitfidf_fpr, bitfidf_tpr)

0.9795568851890392

## NGram Count Vect

In [23]:
ngram_count_vect = CountVectorizer(ngram_range = (1,2))
ncount_data_train_transformed = ngram_count_vect.fit_transform(data_train['Lower Case Text'])
ncount_data_test_transformed = ngram_count_vect.transform(data_test['Lower Case Text'])

In [24]:
classifier.fit(ncount_data_train_transformed, data_train['y'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
ncount_fpr, ncount_tpr, ncount_thresholds = roc_curve(data_train['y'], classifier.predict_proba(ncount_data_train_transformed)[:,1])
auc(ncount_fpr, ncount_tpr)

0.9999974248838623

In [26]:
ncount_fpr, ncount_tpr, ncount_thresholds = roc_curve(data_test['y'], classifier.predict_proba(ncount_data_test_transformed)[:,1])
auc(ncount_fpr, ncount_tpr)

0.9878309631171234

## NGram TFIDF Vect

In [27]:
ngram_tfidf_vect = TfidfVectorizer(ngram_range = (1,2))
ntfidf_data_train_transformed = ngram_tfidf_vect.fit_transform(data_train['Lower Case Text'])
ntfidf_data_test_transformed = ngram_tfidf_vect.transform(data_test['Lower Case Text'])

In [28]:
classifier.fit(ntfidf_data_train_transformed, data_train['y'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [29]:
ntfidf_fpr, ntfidf_tpr, ntfidf_thresholds = roc_curve(data_train['y'], classifier.predict_proba(ntfidf_data_train_transformed)[:,1])
auc(ntfidf_fpr, ntfidf_tpr)

0.9993952768936545

In [30]:
ntfidf_fpr, ntfidf_tpr, ntfidf_thresholds = roc_curve(data_test['y'], classifier.predict_proba(ntfidf_data_test_transformed)[:,1])
auc(ntfidf_fpr, ntfidf_tpr)

0.9897531506532548