In [119]:
import numpy as np 
import pandas as pd
import re, string
import matplotlib.pyplot as plt

In [120]:
# Load spam.csv into a DataFrame, decoding the file as latin-1.
df = pd.read_csv("spam.csv", encoding="latin-1")

In [121]:
# Display the frame just loaded from spam.csv to inspect its columns.
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


# Removing the unnamed columns and giving the remaining columns meaningful names

In [122]:
# Drop the three "Unnamed: N" columns by name.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [123]:
# v1 holds the ham/spam label and v2 the message text; rename them accordingly.
df = df.rename(columns={'v1': 'target', 'v2': 'message'})

In [124]:
# Character count of every message, in row order.
len_text = [len(message) for message in df['message']]

In [125]:
# Display the frame again after the column drop and rename above.
df

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Data Cleaning and Preprocessing 

In [126]:
def clean_text(text):
    '''Normalise a raw message before it is vectorised.

    The input is coerced with ``str()`` and lower-cased, then the following
    are removed, in this order:

    1. text enclosed in square brackets,
    2. links starting with ``http://``, ``https://`` or ``www.``,
    3. angle-bracket tags such as ``<b>``,
    4. every character listed in ``string.punctuation``,
    5. newline characters,
    6. every word that contains a digit.

    Each removed span is replaced by the empty string: no separator is
    inserted, so the whitespace around a removed word is kept (which can leave
    double spaces) and the text on both sides of removed punctuation or of a
    removed newline is joined together.

    Parameters
    ----------
    text : object
        The message to clean. Non-string values are converted with ``str()``
        (for example ``None`` becomes ``'none'``).

    Returns
    -------
    str
        The cleaned, lower-cased text.
    '''
    text = str(text).lower()
    # All patterns are raw strings: sequences such as "\[" or "\S" are invalid
    # escapes in a normal string literal and trigger a SyntaxWarning on
    # recent Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # re.escape makes every punctuation character safe inside the [...] class.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    # Runs after punctuation removal, so tokens like "2nd" or "win4u" go too.
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [127]:
# Store a cleaned copy of every message next to the original text.
df['message_clean'] = df['message'].map(clean_text)
df.head()

Unnamed: 0,target,message,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [128]:
from sklearn.preprocessing import LabelEncoder

# Learn the label classes and encode the column in a single step.
le = LabelEncoder()
df['target_encoded'] = le.fit_transform(df['target'])
df.head()

Unnamed: 0,target,message,message_clean,target_encoded
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...,1
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,0


# Count Vectorizer

## defining X and y for use with Count Vectorizer

In [129]:
# Features are the cleaned messages; labels are the integer-encoded targets.
x, y = df['message_clean'], df['target_encoded']

print(len(x), len(y))

5572 5572


## Split into train and test sets

In [130]:
from sklearn.model_selection import train_test_split

# The fixed seed keeps the split reproducible; no test_size is passed, so
# scikit-learn's default proportion applies.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

4179 4179
1393 1393


## instantiate the vectorizer

In [131]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the vocabulary from the training messages only.
vect = CountVectorizer().fit(x_train)
vect

CountVectorizer()

## Using the fitted vectorizer to create document-term matrices from the train and test sets

In [132]:
# Encode both splits with the vocabulary already learned by `vect`.
x_train_dtm, x_test_dtm = vect.transform(x_train), vect.transform(x_test)

## Tuning Count Vectorizer using stop_words, ngram_range, min_df, max_df, max_features

In [133]:
# NOTE(review): this vectorizer is only constructed here; no visible cell fits
# or uses it — confirm whether the cells below were meant to.
vect_tunned = CountVectorizer(
    stop_words='english',   # drop scikit-learn's built-in English stop words
    ngram_range=(1, 2),     # unigrams and bigrams
    min_df=0.1,             # float: minimum share of documents a term must appear in
    max_df=0.7,             # float: maximum share of documents a term may appear in
    max_features=100,       # cap on the vocabulary size
)

In [134]:
# tokenize and build vocab
vect.fit(x)
# summarize
print(vect.vocabulary_)
# encode document
vector = vect.transform(x)
# summarize encoded vector
print(vector.shape)
print(type(vector))
print(vector.toarray())

(5572, 8188)
<class 'scipy.sparse.csr.csr_matrix'>


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# TF-IDF Vectorizer

In [135]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the training counts and apply it in one call.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_dtm)

x_train_tfidf

<4179x6939 sparse matrix of type '<class 'numpy.float64'>'
	with 51763 stored elements in Compressed Sparse Row format>

In [136]:
# tokenize and build vocab
vect.fit(x)
# summarize
print(vect.vocabulary_)
# encode document
vector = vect.transform(x)
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

(5572, 8188)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# Hashing with Hashing Vectorizer

In [137]:
from sklearn.feature_extraction.text import HashingVectorizer

# One example document to encode.
text = ["Free entry in 2 a wkly comp to win FA Cup finals."]
# No fit step is run: the document is transformed directly into 20 hashed columns.
vectorizer = HashingVectorizer(n_features=20)
vector = vectorizer.transform(text)
# Show the shape first, then the dense values.
print(vector.shape)
print(vector.toarray())

(1, 20)
[[ 0.          0.          0.          0.26726124  0.          0.
  -0.26726124 -0.26726124  0.          0.80178373  0.          0.26726124
   0.          0.          0.          0.          0.          0.26726124
   0.          0.        ]]


# Machine Learning techniques

In [138]:
from sklearn.model_selection import train_test_split

# Fresh 80/20 split for the models below; this rebinds the split variables
# created earlier in the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

## Logistic Regression

In [139]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(max_iter=10000)
cv = CountVectorizer()
# NOTE(review): x_train is rebound from the raw text to its count matrix, so
# re-running this cell would feed that matrix back into fit_transform; the
# cells below read x_train in its vectorized form.
x_train = cv.fit_transform(x_train)

In [140]:
# Train on the vectorized training set, then score the held-out messages
# encoded with the same fitted vectorizer.
pred_1 = lr.fit(x_train, y_train).predict(cv.transform(x_test))
accuracy_1 = accuracy_score(y_test, pred_1)
accuracy_1

0.9766816143497757

## Multinomial Naive Bayes

In [141]:
from sklearn.naive_bayes import MultinomialNB

# Same train/score procedure as the previous model, with Multinomial Naive Bayes.
nb = MultinomialNB()
pred_2 = nb.fit(x_train, y_train).predict(cv.transform(x_test))
accuracy_2 = accuracy_score(y_test, pred_2)
accuracy_2

0.97847533632287

## Support Vector Machine (SVM)

In [142]:
from sklearn.svm import SVC

# Same train/score procedure again, with a support vector classifier
# left at its default settings.
svm = SVC()
pred_3 = svm.fit(x_train, y_train).predict(cv.transform(x_test))
accuracy_3 = accuracy_score(y_test, pred_3)
accuracy_3

0.9739910313901345