In [1]:
import pandas as pd
import nltk

In [2]:
# Load the tab-separated SMS collection. The file has no header row, so the
# two columns are named at read time: the class label and the message text.
data = pd.read_csv(
    'dataset/SMSSpamCollection',
    sep="\t",
    header=None,
    names=["label", "feature"],
)

In [3]:
# Preview the first five rows to confirm the label/feature columns parsed correctly.
data.head()

Unnamed: 0,label,feature
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Work on an independent (deep) copy; the text-cleaning cell reads its messages
# from `second_data`, leaving `data` as the untouched source of the labels.
second_data = data.copy(deep=True)
second_data.head()

Unnamed: 0,label,feature
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [6]:
# Build `corpus`: one cleaned string per SMS, in the same order as `second_data`.
#
# Each message is reduced to lower-cased alphabetic tokens, English stop words
# are dropped, and every remaining token is lemmatized (mapped to a dictionary
# base form). Note this is lemmatization, not stemming: no suffixes are chopped
# off blindly. `lemmatize` is called without a part-of-speech tag, so NLTK
# applies its default POS to every token.
lemmatizer = WordNetLemmatizer()

# Build the stop-word set once. Rebuilding it inside the comprehension reloads
# the whole stop-word list for every single token, which dominates the runtime.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of `second_data["feature"][i]`,
# which is a label-based lookup and only works while the index is 0..n-1.
for message in second_data["feature"]:
    # Anything that is not an ASCII letter (digits, punctuation, symbols)
    # becomes a space, then the text is lower-cased and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()

    # Drop stop words, lemmatize the rest, and re-join into a single string,
    # which is the input format the vectorizer expects.
    corpus.append(' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ))

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer 
# Instead of TfidfVectorizer we could use CountVectorizer (bag of words), which counts the number of occurrences of each word.

In [8]:
# Turn the cleaned corpus into TF-IDF features over unigrams, bigrams and trigrams.
tv = TfidfVectorizer(ngram_range=(1, 3))

# fit_transform learns the vocabulary and IDF weights and returns a sparse matrix.
tfidf_sparse = tv.fit_transform(corpus)

# Densify for the downstream cells (they call len() on the splits).
# NOTE(review): at 5572 x 66679 this dense array is roughly 3 GB if the dtype is
# the default float64 - consider keeping the sparse matrix instead.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so the vocabulary and IDF weights also see the test messages.
X = tfidf_sparse.toarray()

In [9]:
# (number of messages, number of n-gram features) produced by the vectorizer.
X.shape

(5572, 66679)

In [10]:
# Target vector: the ham/spam label of every message, row-aligned with X.
y = data.loc[:, "label"]

In [11]:
# Quick look at the first few labels.
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: label, dtype: object

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
split_parts = train_test_split(X, y, random_state=1, test_size=0.33)
X_train, X_test, y_train, y_test = split_parts

In [13]:
# Report the number of rows in each split; features and labels must agree
# within the train pair and within the test pair.
for split_name, split_rows in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f'{split_name} size = {len(split_rows)}')

X_train size = 3733
X_test size = 1839
y_train size = 3733
y_test size = 1839


In [14]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with its default settings; trained in a later cell.
classifier = MultinomialNB()

In [15]:
from sklearn import metrics
import numpy as np
import itertools

In [16]:
# Train on the training split only; the fitted estimator is echoed as the cell output.
classifier.fit(X_train, y_train)

MultinomialNB()

In [24]:
# Score the fitted classifier on the held-out rows: predict a label for each
# test message, then compute the fraction that match the true labels.
prediction = classifier.predict(X_test)
score = metrics.accuracy_score(y_true=y_test, y_pred=prediction)

# Show the accuracy first, then the (abbreviated) array of predicted labels.
print(f'Accuracy is: {score}', prediction, sep='\n')

Accuracy is: 0.9347471451876019
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [25]:
# Classify one new message.
#
# predict() needs input in the same 66679-column TF-IDF feature space the
# classifier was trained on. Passing raw strings produced a 1 x 6 input and the
# "size 66679 is different from 6" ValueError. The message is therefore cleaned
# with the same steps as the training corpus and vectorized with the
# already-fitted `tv`. transform (not fit_transform) is used so that the
# vocabulary and IDF weights learned during training are reused.
message = "Hi you have got free cheque of 5000"

english_stopwords = set(stopwords.words('english'))
# Same cleaning as the corpus: letters only, lower-case, whitespace tokens,
# stop words removed, remaining tokens lemmatized.
tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
cleaned_message = ' '.join(
    lemmatizer.lemmatize(token)
    for token in tokens
    if token not in english_stopwords
)

# A single-row sparse matrix is fine here; n-grams unseen in training are ignored.
classifier.predict(tv.transform([cleaned_message]))