In [None]:
# Building Your Own NLP System
# ------------------------------
# Steps:
# 1- Build a vocabulary
#   1.a. Assign to each word an ID
# 2- Perform Feature Extraction 

In [1]:
import numpy as np
import pandas as pd

In [2]:
from pathlib import Path

current_path = str(Path('.').absolute())
data_path = str(current_path) + '/data/smsspamcollection.tsv'

In [3]:
df = pd.read_csv(data_path, sep='\t')
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [None]:
# Let's vectorize the message attribute

In [4]:
df.isnull()

Unnamed: 0,label,message,length,punct
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [5]:
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [6]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

X = df['message']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [8]:
# Perform Count Vectorization

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()

In [9]:
X
# X is holding text, we are going to transform it to numerical values

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
5       FreeMsg Hey there darling it's been 3 week's n...
6       Even my brother is not like to speak with me. ...
7       As per your request 'Melle Melle (Oru Minnamin...
8       WINNER!! As a valued network customer you have...
9       Had your mobile 11 months or more? U R entitle...
10      I'm gonna be home soon and i don't want to tal...
11      SIX chances to win CASH! From 100 to 20,000 po...
12      URGENT! You have won a 1 week FREE membership ...
13      I've been searching for the right words to tha...
14                    I HAVE A DATE ON SUNDAY WITH WILL!!
15      XXXMobileMovieClub: To use your credit, click ...
16                             Oh k...i'm watching here:)
17      Eh u r

In [10]:
# What steps do we need to do with the CountVectorizer?
count_vect.fit(X_train) # -> building the vocabulary, counting the number of words
X_train_counts = count_vect.transform(X_train) # -> Do the text to numerical transformation

# X_train_counts = count_vect.fit_transform(X_train)

In [11]:
X_train_counts.shape

(3900, 7263)

In [12]:
X_train.shape

(3900,)

In [14]:
# Things to be aware of
# - Not all words are very important
# - A bunch of word does not "weight" the same as others
# - TD-IDF give us information about which words are mort importants than others

from sklearn.feature_extraction.text import TfidfTransformer

tdidf_transf = TfidfTransformer()
X_train_tfidf = tdidf_transf.fit_transform(X_train_counts)
X_train_tfidf.shape

(3900, 7263)

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# vect = TfidfVectorizer()
# X_train_tfidf = vect.fit_transform(X_train)

In [15]:
# Training a Classifier

from sklearn.svm import LinearSVC

clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [16]:
# sklearn Pipeline object to reduce reptitive process
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# With the pipeline object we will perform TF-IDF, then Classification in a single step
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC())
])
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [17]:
predictions = text_clf.predict(X_test)

In [18]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

print(confusion_matrix(y_test, predictions))


[[1445    3]
 [  10  214]]


In [19]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      1448
        spam       0.99      0.96      0.97       224

   micro avg       0.99      0.99      0.99      1672
   macro avg       0.99      0.98      0.98      1672
weighted avg       0.99      0.99      0.99      1672



In [20]:
print(accuracy_score(y_test, predictions))

0.9922248803827751


In [70]:
# Let's make predictions

In [21]:
text_clf.predict(['Heyyyy how areee you? I just wanna get in touch with you for playing a song.'])

array(['ham'], dtype=object)

In [22]:
text_clf.predict(['Congratulations! You have been selected as a winner. Please click in the following link and send a text message to 2452435 number'])

array(['spam'], dtype=object)