In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Location of the SMS spam collection (tab-separated file).
# The original author's local path is kept as the fallback so existing setups keep
# working; set the SMS_SPAM_TSV environment variable to point at the file on any
# other machine instead of editing this cell.
DATA_PATH = os.environ.get('SMS_SPAM_TSV') or (
    r'C:\Users\ck\natural language processing\UPDATED_NLP_COURSE\TextFiles\smsspamcollection.tsv'
)
df = pd.read_csv(DATA_PATH, sep='\t')

In [3]:
# (rows, columns) of the loaded frame
df.shape

(5572, 4)

In [4]:
# Preview the first five rows
df.head(5)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


# Check for missing values

In [5]:
# Number of missing values in each column
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

There are no missing values in the data

In [6]:
# Frequency count of each category in the label column (shows the ham/spam balance)
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

# Split the data into train & test sets:

In [7]:
# Features: the raw message text; target: the ham/spam label
X = df['message']  
y = df['label']

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 33% of the messages for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Scikit-learn's CountVectorizer

Text preprocessing, tokenizing and the ability to filter out stopwords are all included in CountVectorizer, which builds a dictionary of features and transforms documents to feature vectors. (Stop-word filtering is optional and is not enabled by the default settings used below.)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Default settings are used: no arguments (e.g. no stop-word list) are passed
count_vect = CountVectorizer()

In [11]:
# fit_transform learns the vocabulary from the training messages and returns their
# token-count matrix in vectorized (sparse) form.
# NOTE: count_vect was created with default arguments, so stop words are NOT
# filtered here (scikit-learn's default is stop_words=None).
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(3733, 7082)

In [12]:
# Show the matrix summary (type, shape, number of stored elements)
X_train_counts

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

So our message feature is converted into a large sparse matrix with 3733 documents (rows) and 7082 features (distinct words).

# Transforming Counts to Frequencies with Tf-idf

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer
# Transformer that will re-weight the raw count matrix with tf-idf
tfidf_transformer = TfidfTransformer()

In [14]:
# Fit the idf weights on the training counts and transform them to tf-idf values;
# the shape is the same as the count matrix
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(3733, 7082)

Note: we can combine the CountVectorizer and TfidfTransformer steps into one using TfidfVectorizer, as is done in the pipeline section below.

# Train a classifier

We will use a linear support vector classifier, as it handles sparse features well.

In [15]:
from sklearn.svm import LinearSVC
# Linear support vector classifier with default hyperparameters
clf = LinearSVC()

In [16]:
# Fit the classifier on the tf-idf features of the training messages
clf.fit(X_train_tfidf,y_train)

LinearSVC()

Now that we have fitted the classifier to the training data, we can predict the labels for the test set and evaluate the performance by comparing the predicted values with the actual values of the test set.

But remember that we only preprocessed and transformed the training data. The same operations have to be performed on the test set.
So instead of repeating all the steps (i.e. count vectorization and tf-idf transformation), we can use a scikit-learn pipeline to perform all of these tasks.

# Building a Pipeline

In [17]:
from sklearn.pipeline import Pipeline

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): `vectorizer` is not referenced again in the visible cells; the
# pipeline below constructs its own TfidfVectorizer instance.
vectorizer = TfidfVectorizer()

In [19]:
# Pipeline that first transforms the input text into tf-idf vectors (TfidfVectorizer
# combines the CountVectorizer and TfidfTransformer steps into one) and then fits a
# linear support vector classifier on them.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
]
text_clf = Pipeline(pipeline_steps)

In [20]:
# Feed the raw training messages and labels through the pipeline
# (vectorization and classifier fitting happen in one call)
text_clf.fit(X_train, y_train)  

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

Note here that we passed the raw X_train and y_train: text_clf.fit(X_train, y_train) performs all the operations that we did manually to get X_train_tfidf, and then fits the classifier.

# Testing the classifier 

In [21]:
# Form a prediction set: the pipeline is given the raw test messages directly
predictions = text_clf.predict(X_test)

In [22]:
# Inspect the predicted labels
predictions

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [23]:
from sklearn import metrics

# Confusion matrix: rows are the true labels, columns are the predicted labels
conf_matrix = metrics.confusion_matrix(y_test, predictions)
print(conf_matrix)

[[1586    7]
 [  12  234]]


In [24]:
# Print a classification report (per-class precision, recall, f1-score and support)
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

    accuracy                           0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [25]:
# Print the overall accuracy on the test set
print(metrics.accuracy_score(y_test,predictions))

0.989668297988037


# Prediction on new message

In [26]:
# The pipeline accepts a list of raw strings, so a new message can be classified directly
text_clf.predict(['Congratulations! you have won free entry ticket passes for the sunburn'])

array(['spam'], dtype=object)

In [27]:
# Second ad-hoc check with an everyday conversational message
text_clf.predict(["Hi! It's been a long time since we have met"])

array(['ham'], dtype=object)