In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load the SMS spam collection; the file is tab-separated.
df = pd.read_csv(
    'smsspamcollection.tsv',
    sep='\t',
)

In [5]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [6]:
# Count the missing (NaN/None) entries in each column.
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [7]:
# Class distribution: number of messages per label.
# The classes are imbalanced (far more ham than spam), so accuracy alone
# can look good even for a weak model -- check per-class metrics as well.
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [8]:
#SPLIT DATA INTO TRAINING AND TEST SET
from sklearn.model_selection import train_test_split

In [9]:
# Use only the raw message text as the feature; the label column is the target.
X, Y = df['message'], df['label']

In [10]:
# Hold out 33% of the messages for testing; the fixed seed keeps the split reproducible.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [11]:
#COUNT-VECTORISATION
#-> Tokenises and lowercases the text, then counts how often each token occurs per message.
#   NOTE: with the default arguments used below no stop words are removed;
#   pass stop_words='english' (or a custom list) to enable that.

#IMPORT
from sklearn.feature_extraction.text import CountVectorizer

#INSTANTIATION
count_vect= CountVectorizer()


In [12]:
#FIT VECTORISER TO THE TRAINING DATA ONLY (learns the vocabulary)

#TRANSFORM THE ORIGINAL TEXT MESSAGES --> SPARSE TOKEN-COUNT MATRIX
X_TRAIN_COUNTS = count_vect.fit_transform(X_TRAIN)

In [14]:
#SPARSE ROW FORMAT STORAGE OF X_TRAIN_COUNTS
# Sanity check: vectorisation must yield exactly one row per training message.
assert X_TRAIN_COUNTS.shape[0] == X_TRAIN.shape[0]

# (n_messages, n_vocabulary_terms) -- the fitted vocabulary has over 7000 unique tokens
X_TRAIN_COUNTS.shape

(3733, 7082)

In [16]:
#RAW COUNTS OVER-EMPHASISE VERY COMMON WORDS; RE-WEIGHT THEM WITH TF-IDF
#-> Terms that appear in many messages are down-weighted; rarer, more distinctive terms get more weight

#IMPORT
from sklearn.feature_extraction.text import TfidfTransformer

#INSTANTIATE
tfidf_transformer = TfidfTransformer()

In [17]:

#NOTE: PASSING THE COUNT MATRIX (NOT THE RAW TEXT) AS ARGUMENT
X_TRAIN_TFIDF = tfidf_transformer.fit_transform(X_TRAIN_COUNTS)

# Re-weighting keeps the matrix dimensions: same rows and columns as the count matrix.
X_TRAIN_TFIDF.shape

(3733, 7082)

In [18]:
#COMBINE STEPS OF COUNT_VECTORISATION AND TFIDF TRANSFORMATION
# TfidfVectorizer goes from raw text to TF-IDF features in a single estimator.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# NOTE: this reassigns X_TRAIN_TFIDF, replacing the matrix produced by the two-step approach.
X_TRAIN_TFIDF = vectorizer.fit_transform(X_TRAIN)

In [19]:
#TRAIN A CLASSIFIER

from sklearn.svm import LinearSVC

# Seed the solver so repeated runs fit the same model (same seed as the train/test split).
clf = LinearSVC(random_state=42)

# fit() returns the estimator, so the cell output shows the fitted model's settings.
clf.fit(X_TRAIN_TFIDF, Y_TRAIN)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [20]:
#NOTE: THE VOCABULARY WAS BUILT FROM THE TRAINING SET ONLY
#
#THE TEST MESSAGES MUST GO THROUGH THE SAME *FITTED* VECTORISER (transform only --
#never re-fit on test data) BEFORE THEY CAN BE CLASSIFIED. DOING THAT BY HAND IS
#TEDIOUS AND ERROR-PRONE.

#USE ONE SINGLE PIPELINE STEP: it chains vectorisation and classification, so
#fit() learns both from the training text and predict() accepts raw text directly.

from sklearn.pipeline import Pipeline

text_classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # Seed the solver so repeated runs fit the same model (same seed as the train/test split).
    ('clf', LinearSVC(random_state=42)),
])

text_classifier.fit(X_TRAIN, Y_TRAIN)

  if LooseVersion(joblib_version) < '0.12':


Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [21]:
# Predict labels for the held-out messages; the pipeline vectorises the raw text
# with the vocabulary learned during fit before classifying.
predictions = text_classifier.predict(X_TEST)

In [22]:
# Inspect the held-out messages that the predictions above correspond to.
X_TEST

3245    Squeeeeeze!! This is christmas hug.. If u lik ...
944     And also I've sorta blown him off a couple tim...
1044    Mmm thats better now i got a roast down me! i...
2484        Mm have some kanji dont eat anything heavy ok
812     So there's a ring that comes with the guys cos...
2973    Sary just need Tim in the bollox &it hurt him ...
2991    Love isn't a decision, it's a feeling. If we c...
2942    My supervisor find 4 me one lor i thk his stud...
230                    Dear good morning now only i am up
1181                           I'm in chennai velachery:)
1912    Lol grr my mom is taking forever with my presc...
1992    No other Valentines huh? The proof is on your ...
5435                    I'm wif him now buying tix lar...
4805    Er, hello, things didn‘t quite go to plan – is...
401     FREE RINGTONE text FIRST to 87131 for a poly o...
1859                     Sir, i am waiting for your call.
1344    Crazy ar he's married. Ü like gd looking guys ...
2952          

In [23]:
from sklearn.metrics import confusion_matrix, classification_report

# Rows are the true labels and columns the predicted labels, both in sorted
# label order (ham, spam).
print(confusion_matrix(Y_TEST, predictions))

[[1586    7]
 [  12  234]]


In [24]:
# Per-class precision, recall and F1 -- more informative than accuracy for imbalanced classes.
print(classification_report(Y_TEST, predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [26]:
# Overall accuracy on the held-out set, expressed as a percentage.
from sklearn import metrics

accuracy = metrics.accuracy_score(Y_TEST, predictions) * 100

print(accuracy)

98.9668297988037


In [33]:
# Classify unseen messages; the pipeline takes raw strings and returns one label per message.
text_classifier.predict([
    "Hi how are you doing today?",
    "You have been selected as a winner for a lottery",
])

array(['ham', 'spam'], dtype=object)