In [19]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found, one per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, entry) for entry in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the SMS spam CSV; it is read with latin-1 encoding (presumably the file is not valid UTF-8).
sms = pd.read_csv(
    r'/kaggle/input/sms-spam-collection-dataset/spam.csv',
    encoding='latin-1',
)
# Keep an independent copy of the frame exactly as loaded; `sms` itself is modified by later cells.
sms_1 = sms.copy(deep=True)
sms.head(5)

In [3]:
# Drop the three "Unnamed" columns and keep the rest of the frame.
# The result is derived from the untouched raw copy `sms_1` rather than from `sms` itself so that
# this cell can be re-run safely: dropping from `sms` a second time raises KeyError because the
# columns are already gone.
sms = sms_1.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
sms.head()

PREPROCESSING TEXT

Dealing with missing values / NaN values

In [4]:
# Count the missing values in each column (`isnull` is pandas' alias for `isna`).
sms.isnull().sum()

In [5]:
# Print pandas' summary of the frame (columns, dtypes, non-null counts).
sms.info()

Removing URLs

In [6]:
import re
def clean_url(text):
    """Replace every URL in ``text`` with a single space.

    A URL is taken to be a run of non-whitespace characters that either starts
    with ``http`` (which covers ``http://`` and ``https://``) or starts with
    ``www.`` at a word boundary. Matching is case-insensitive because this step
    runs before the text is lower-cased, so links written as ``HTTP://...`` or
    ``WWW....`` must be caught here.

    Parameters
    ----------
    text : str
        The message to clean.

    Returns
    -------
    str
        ``text`` with each matched URL replaced by one space character.
    """
    return re.sub(r'http\S+|\bwww\.\S+', ' ', text, flags=re.IGNORECASE)
# Run clean_url over every value in column `v2`, then preview the frame.
sms['v2'] = sms['v2'].map(clean_url)
sms.head(5)

Removing irrelevant characters (numbers and punctuation)

In [7]:
def clean_alphanum(text):
    """Return ``text`` with every character that is not an ASCII letter (a-z, A-Z) replaced by a space."""
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub(' ', text)
# Run clean_alphanum over every value in column `v2`.
sms['v2'] = sms['v2'].map(clean_alphanum)


In [8]:
# Preview the first five rows after the character clean-up.
sms.head(5)

Converting all letters to lower case, which makes the text easier to process

In [9]:
def to_lower(text):
    """Convert ``text`` to ``str`` and return it lower-cased."""
    as_string = str(text)
    return as_string.lower()
# Lower-case every value in column `v2`, then preview the frame.
sms['v2'] = sms['v2'].map(to_lower)
sms.head(5)

In [10]:
import nltk 
from nltk.tokenize import word_tokenize

def clean_tok(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the result."""
    tokens = word_tokenize(text)
    return tokens

# Replace each value in column `v2` with its tokenized form, then preview the frame.
# NOTE(review): after this cell `v2` no longer holds plain strings, so running the cell a second
# time would hand the tokenized values back to word_tokenize — confirm before re-running.
sms['v2'] = sms['v2'].map(clean_tok)
sms.head(5)

In [11]:
from nltk.corpus import stopwords

# NLTK's English stop-word list, stored as a set for membership tests.
stp_wrds = set(stopwords.words('english'))

def clean_stpwrds(text):
    """Return a list of the items in ``text`` that are not in ``stp_wrds``, in their original order."""
    kept = []
    for item in text:
        if item in stp_wrds:
            continue
        kept.append(item)
    return kept

# Filter stop words out of every value in column `v2`, then preview the frame.
sms['v2'] = sms['v2'].map(clean_stpwrds)
sms.head(5)

In [12]:
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

def lem(text):
    """Lemmatize each item of ``text`` with ``pos='v'`` and return the results as a list."""
    lemmas = []
    for w in text:
        lemmas.append(lemma.lemmatize(w, pos='v'))
    return lemmas

# Lemmatize every value in column `v2`, then preview the frame.
sms['v2'] = sms['v2'].map(lem)
sms.head(5)

In [13]:
# Pick out the processed text column and the label column.
# `y` is the label column as it exists at this point, i.e. before the later cell that
# label-encodes sms['v1'].
x, y = sms['v2'], sms['v1']

(x.shape, y.shape)

In [14]:
# Join the items of each `v2` value back into one space-separated string per message.
corpus = [' '.join(tokens) for tokens in sms['v2']]
corpus[:5]

In [15]:
# Fit a TF-IDF vectorizer on the whole corpus and convert the result with .toarray().
# NOTE(review): the vectorizer is fitted before the train/test split, so the held-out messages
# contribute to the fitted vocabulary/weights — consider fitting on the training portion only.
tf = TfidfVectorizer()
tfidf_matrix = tf.fit_transform(corpus)
x_fit = tfidf_matrix.toarray()
x_fit[:5]

In [16]:
# Encode the class labels in column `v1` with scikit-learn's LabelEncoder.
label_enc = LabelEncoder()
sms['v1'] = label_enc.fit_transform(sms['v1'])
# Rebind `y` to the encoded column. `y` was taken from sms['v1'] in an earlier cell, before the
# assignment above replaced that column, so without this line the split and the model would
# still receive the un-encoded labels and the encoding would have no effect.
y = sms['v1']
sms['v1']

In [17]:
# Split features and labels into train and test parts with a fixed seed.
# test_size=0.25 is what train_test_split uses when neither size is given; it is only written out here.
X_train, X_test, y_train, y_test = train_test_split(
    x_fit, y, test_size=0.25, random_state=12
)

In [18]:
# Fit a Gaussian naive Bayes classifier on the training split (fit() returns the estimator itself).
nb = GaussianNB().fit(X_train, y_train)
# Score on the training split and on the test split, in that order.
(nb.score(X_train, y_train), nb.score(X_test, y_test))

In [20]:
# Predict a label for every row of the test features and print the resulting predictions.
y_pred = nb.predict(X_test)
print(y_pred)

In [21]:
# Evaluate the test-set predictions: compute both metrics, then print the confusion matrix
# followed by the accuracy.
cm, accuracy = confusion_matrix(y_test, y_pred), accuracy_score(y_test, y_pred)
print(cm)
print(accuracy)