In [1]:
import numpy as np
import pandas as pd

In [3]:
#LOAD THE TAB-SEPARATED SMS FILE INTO A PANDAS DATAFRAME
df = pd.read_csv(
    'smsspamcollection.tsv',
    sep='\t',
)

In [4]:
#PREVIEW THE FIRST 5 ROWS OF THE DATAFRAME
df.head(5)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [6]:
#TEXT FEATURE EXTRACTION FROM THE MESSAGE IS NOT COVERED YET,
#SO ONLY THE ALREADY-NUMERICAL LENGTH AND PUNCT COLUMNS ARE USED

#COUNT MISSING VALUES PER COLUMN: EACH NULL CELL IS TRUE, WHICH SUMS AS 1
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [7]:
#NUMBER OF ROWS/RECORDS (FIRST ENTRY OF THE (ROWS, COLS) SHAPE)
df.shape[0]

5572

In [8]:
#SELECT A SINGLE COLUMN BY NAME (ALL ROWS)
df.loc[:, 'label']

0        ham
1        ham
2       spam
3        ham
4        ham
5       spam
6        ham
7        ham
8       spam
9       spam
10       ham
11      spam
12      spam
13       ham
14       ham
15      spam
16       ham
17       ham
18       ham
19      spam
20       ham
21       ham
22       ham
23       ham
24       ham
25       ham
26       ham
27       ham
28       ham
29       ham
        ... 
5542     ham
5543     ham
5544     ham
5545     ham
5546     ham
5547    spam
5548     ham
5549     ham
5550     ham
5551     ham
5552     ham
5553     ham
5554     ham
5555     ham
5556     ham
5557     ham
5558     ham
5559     ham
5560     ham
5561     ham
5562     ham
5563     ham
5564     ham
5565     ham
5566    spam
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [9]:
#DISTINCT VALUES PRESENT IN THE LABEL COLUMN
df.loc[:, 'label'].unique()

array(['ham', 'spam'], dtype=object)

In [10]:
#HOW MANY ROWS CARRY EACH UNIQUE LABEL VALUE
#(THE METHOD IS value_counts; THE PREVIOUS value_countscounts DOES NOT EXIST AND RAISES AttributeError)
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [11]:
#SPAM MESSAGES ARE USUALLY LONGER THAN HAM MESSAGES
#THE PUNCTUATION COUNT SHOWS NO CLEAR DIFFERENCE BETWEEN SPAM AND HAM

In [12]:
#SPLIT INTO TRAINING AND TESTING SET
from sklearn.model_selection import train_test_split


In [13]:
#FEATURE MATRIX: THE TWO NUMERICAL COLUMNS
X = df.loc[:, ['length', 'punct']]
#TARGET: THE HAM/SPAM LABEL
Y = df.loc[:, 'label']

#HOLD OUT 30% OF THE ROWS FOR TESTING; A FIXED RANDOM_STATE GIVES THE SAME SPLIT ON EVERY RUN
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [15]:
#DIMENSIONS OF THE TRAINING FEATURES AS (ROWS, COLS)
X_TRAIN.shape

(3900, 2)

In [21]:
#CREATE AND TRAIN A ML MODEL

#IMPORT
from sklearn.linear_model import LogisticRegression

In [22]:
#CREATE THE LOGISTIC REGRESSION CLASSIFIER WITH THE LBFGS SOLVER
lr_model = LogisticRegression(
    solver='lbfgs',
)

In [23]:
#FIT THE CLASSIFIER ON THE TRAINING FEATURES AND LABELS
lr_model.fit(X=X_TRAIN, y=Y_TRAIN)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [24]:
#TEST ACCURACY OF MODEL USING TEST DATA


#IMPORT
from sklearn import metrics

In [25]:
#PREDICTED LABELS FOR THE HELD-OUT TEST FEATURES
predictions = lr_model.predict(X=X_TEST)

In [27]:
#TRUE VALUES: Y_TEST

In [30]:
#BUILD OUT THE CONFUSION MATRIX (ROWS = TRUE LABELS, COLUMNS = PREDICTED LABELS)
#THE LABEL ORDER IS PASSED EXPLICITLY SO THE MATRIX AXES ALWAYS MATCH THE ROW/COLUMN NAMES USED BELOW
class_labels = ['ham', 'spam']
conf_matrix = metrics.confusion_matrix(Y_TEST, predictions, labels=class_labels)
print(conf_matrix)

#KEEP THE LABELLED MATRIX UNDER ITS OWN NAME: REASSIGNING df HERE WOULD OVERWRITE THE SMS DATASET
#AND BREAK ANY RE-RUN OF THE EARLIER CELLS THAT READ df['length'], df['punct'] OR df['label']
conf_matrix_df = pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels)
conf_matrix_df

[[1404   44]
 [ 219    5]]


Unnamed: 0,ham,spam
ham,1404,44
spam,219,5


In [31]:
#ONLY 5 SPAM MESSAGES WERE CORRECTLY CLASSIFIED
#THE MESSAGE TEXT ITSELF SHOULD BE TAKEN INTO ACCOUNT TOO

#PER-CLASS PRECISION / RECALL / F1 REPORT
print(
    metrics.classification_report(y_true=Y_TEST, y_pred=predictions)
)

              precision    recall  f1-score   support

         ham       0.87      0.97      0.91      1448
        spam       0.10      0.02      0.04       224

   micro avg       0.84      0.84      0.84      1672
   macro avg       0.48      0.50      0.48      1672
weighted avg       0.76      0.84      0.80      1672



In [32]:
#OVERALL ACCURACY, EXPRESSED AS A PERCENTAGE
print(metrics.accuracy_score(y_true=Y_TEST, y_pred=predictions) * 100)

84.27033492822966


In [41]:
#SAME TRAIN / PREDICT / EVALUATE STEPS WITH A MULTINOMIAL NAIVE BAYES MODEL
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X=X_TRAIN, y=Y_TRAIN)

#PREDICTED LABELS FOR THE TEST FEATURES (REPLACES THE PREVIOUS MODEL'S PREDICTIONS)
predictions = nb_model.predict(X=X_TEST)

#CONFUSION MATRIX FOLLOWED BY THE PER-CLASS REPORT
print(metrics.confusion_matrix(y_true=Y_TEST, y_pred=predictions))

print(metrics.classification_report(y_true=Y_TEST, y_pred=predictions))

[[1438   10]
 [ 224    0]]
              precision    recall  f1-score   support

         ham       0.87      0.99      0.92      1448
        spam       0.00      0.00      0.00       224

   micro avg       0.86      0.86      0.86      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.86      0.80      1672



In [45]:
#SAME TRAIN / PREDICT / EVALUATE STEPS WITH A SUPPORT VECTOR CLASSIFIER
from sklearn.svm import SVC

svc_model = SVC(gamma='auto')
svc_model.fit(X=X_TRAIN, y=Y_TRAIN)

#PREDICTED LABELS FOR THE TEST FEATURES (REPLACES THE PREVIOUS MODEL'S PREDICTIONS)
predictions = svc_model.predict(X=X_TEST)

#CONFUSION MATRIX FOLLOWED BY THE PER-CLASS REPORT
print(metrics.confusion_matrix(y_true=Y_TEST, y_pred=predictions))

print(metrics.classification_report(y_true=Y_TEST, y_pred=predictions))

[[1373   75]
 [ 121  103]]
              precision    recall  f1-score   support

         ham       0.92      0.95      0.93      1448
        spam       0.58      0.46      0.51       224

   micro avg       0.88      0.88      0.88      1672
   macro avg       0.75      0.70      0.72      1672
weighted avg       0.87      0.88      0.88      1672

