# Importing Libraries

In [1]:
import warnings
import pickle
import pandas as pd
import plotly_express as px
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Veto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Veto\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Veto\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Veto\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Optional styling options
%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option("max_rows", None)

# Loading Data

In [3]:
# Read the raw dataset from spam.csv (decoded as latin-1) and preview it
data = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Discard the three unnamed columns, then give v1 / v2 descriptive names.
# Written as one non-mutating chain that rebinds `data` to the result.
data = (
    data
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={"v1": "Target", "v2": "Content"})
)
data.head()


Unnamed: 0,Target,Content
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Visualization

In [5]:
# Histogram of the Target column, coloured per Target value
fig = px.histogram(
    data,
    x="Target",
    color="Target",
    color_discrete_sequence=["#61ff79", "#ff4e2b"],
)
fig.show()

## Data Preprocessing

### Cleaning Text
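Remove punctuation and symbols, replace e-mail addresses, URLs, currency symbols, phone numbers and other numbers with placeholder tokens, normalize whitespace, and convert the text to lower case.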

In [6]:
def Clean(Text):
    """Normalise a raw message into lower-case text with placeholder tokens.

    Steps, in order:
      1. e-mail addresses  -> 'emailaddress'
      2. http(s) URLs      -> 'webaddress'
      3. '£' / '$'         -> 'money-symbol'
      4. 10-digit phone numbers -> 'phone-number'
      5. remaining numbers -> 'number'
      6. punctuation is removed (this also strips the hyphen from the
         placeholders, so they end up as 'moneysymbol' / 'phonenumber')
      7. runs of whitespace are collapsed to a single space
      8. the text is lower-cased

    The patterns match anywhere inside the message. Anchoring them with
    ``^...$`` would only replace a message that consists solely of an
    address / URL / phone number, and a greedy anchored e-mail pattern would
    swallow a whole message that merely ends with an address.

    Args:
        Text: The raw message string.

    Returns:
        The cleaned, lower-cased string.
    """
    # Replace email address with 'emailaddress'
    result = re.sub(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', 'emailaddress', Text)

    # Replace urls with 'webaddress'
    result = re.sub(
        r'https?://[a-zA-Z0-9\-.]+\.[a-zA-Z]{2,}(?:/\S*)?', 'webaddress', result)

    # Replace money symbol with 'money-symbol'
    result = re.sub(r'£|\$', 'money-symbol', result)

    # Replace 10 digit phone number with 'phone-number'.
    # The look-arounds stop the pattern from matching inside a longer digit run.
    result = re.sub(
        r'(?<!\d)\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{4}(?!\d)', 'phone-number', result)

    # Replace normal number with 'number'
    result = re.sub(r'\d+(\.\d+)?', 'number', result)

    # Remove punctuation
    result = re.sub(r'[^\w\d\s]', '', result)

    # Collapse whitespace between terms to a single space and trim the ends
    result = re.sub(r'\s+', ' ', result).strip()

    # Change words to lower case
    return result.lower()


# Run every message in "Content" through Clean and keep the result in a new column
data["Clean_Text"] = data["Content"].map(Clean)
data.head()

Unnamed: 0,Target,Content,Clean_Text
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in number a wkly comp to win fa cup...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


### Tokenization
Splitting sentences into words, or tokens

In [7]:
# Tokenize each cleaned message. The tokenizer is applied to the column itself:
# DataFrame.apply(axis=1) would build a full row Series for every message
# just to read a single field from it.
data["Tokenize_Text"] = data["Clean_Text"].apply(nltk.word_tokenize)

data["Tokenize_Text"].head()

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, number, a, wkly, comp, to, w...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, dont, think, he, goes, to, usf, he, l...
Name: Tokenize_Text, dtype: object

### Removing Stopwords
Stopwords are words that occur very frequently in sentences but contribute little to the meaning of a text, so they are removed before further NLP processing
(e.g. "the", "a", "an", "in", "but")

In [8]:
# Removing the stopwords function

# Cache for the stop-word set so the corpus is read once, not once per message.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE["english"]


def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Args:
        text: Iterable of word tokens.
        stop_words: Optional container of words to drop. When omitted, the
            NLTK English stop-word list is used.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return [word for word in text if word not in stop_words]


# Filter the stop words out of every token list
data["Nostopword_Text"] = data["Tokenize_Text"].map(remove_stopwords)
data["Nostopword_Text"].head()

0    [go, jurong, point, crazy, available, bugis, n...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, number, wkly, comp, win, fa, cup...
3        [u, dun, say, early, hor, u, c, already, say]
4    [nah, dont, think, goes, usf, lives, around, t...
Name: Nostopword_Text, dtype: object

### Lemmatization / Stemming
Converting words into their base form (e.g. "playing", "plays" and "played" are all converted to "play"). Only lemmatization is applied in this notebook; no stemmer is used.

In [9]:
# Single WordNet lemmatizer instance shared by lemmatize_word
lemmatizer = WordNetLemmatizer()


def lemmatize_word(text):
    """Lemmatize every token in ``text``, treating each one as a verb.

    Args:
        text: Iterable of word tokens.

    Returns:
        A list with one lemma per input token, in the same order.
    """
    verb_lemmas = []
    for token in text:
        verb_lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return verb_lemmas


# Lemmatize the stop-word-free token list of every message
data["Lemmatized_Text"] = data["Nostopword_Text"].map(lemmatize_word)
data["Lemmatized_Text"].head()

0    [go, jurong, point, crazy, available, bugis, n...
1                         [ok, lar, joke, wif, u, oni]
2    [free, entry, number, wkly, comp, win, fa, cup...
3        [u, dun, say, early, hor, u, c, already, say]
4    [nah, dont, think, go, usf, live, around, though]
Name: Lemmatized_Text, dtype: object

## Vectorizing
### Creating a Text Corpus

In [10]:
# One space-joined string per message, in the same order as the DataFrame rows
corpus = [' '.join(tokens) for tokens in data["Lemmatized_Text"]]

corpus[:5]

['go jurong point crazy available bugis n great world la e buffet cine get amore wat',
 'ok lar joke wif u oni',
 'free entry number wkly comp win fa cup final tkts numberst may number text fa number receive entry questionstd txt ratetcs apply numberovernumbers',
 'u dun say early hor u c already say',
 'nah dont think go usf live around though']

### Creating a Vector, using Term Frequency - Inverse Document Frequency (TF-IDF)

In [11]:
# Changing text data into numbers: fit TF-IDF on the corpus and convert the
# result to a dense array.
# NOTE(review): the vectorizer is fitted on the full corpus, before the
# train/test split made later in the notebook, so its IDF statistics also
# include the messages that end up in the test set.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus).toarray()

# Saving the tfidf model. The context manager closes the file handle as soon
# as the pickle has been written instead of leaving it open.
with open('./models/tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

X[:2, :3]

array([[0., 0., 0.],
       [0., 0., 0.]])

## Creating the Model

In [12]:
# Fit a LabelEncoder on the Target column and overwrite the column with the
# resulting integer codes (used as y below)
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(data["Target"])
data["Target"] = encoded_target
data['Target'].head()

0    0
1    0
2    1
3    0
4    0
Name: Target, dtype: int32

In [13]:
# Labels for the classifier
y = data["Target"]
# Hold out 33% of the samples for testing; the fixed random_state makes the
# split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Train the model using Multinomial Naive-Bayes Classifier

In [14]:
# Fit a Multinomial Naive-Bayes classifier on the training split
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

MultinomialNB()

### Testing the accuracy of the model

In [15]:
# Score the fitted model on the held-out test split
model.score(X=X_test, y=y_test)

0.9575856443719413

### Model Demonstration

In [16]:
def predict(msg):
    """Classify a single raw message with the trained model.

    The message goes through the same preprocessing as the training corpus
    (Clean -> tokenize -> stop-word removal -> lemmatization) before it is
    vectorized, so its features line up with the vocabulary the TF-IDF
    vectorizer was fitted on.

    Args:
        msg: Raw message text.

    Returns:
        The encoded label predicted by ``model`` for the message.
    """
    tokens = nltk.word_tokenize(Clean(msg))
    tokens = lemmatize_word(remove_stopwords(tokens))
    # tfidf.transform expects an iterable of documents, hence the one-item list.
    vector = tfidf.transform([' '.join(tokens)]).toarray()
    prediction = model.predict(vector)
    return prediction[0]


# Two hand-written sample messages to run through predict
ham_msg = 'Hello world. Happy birthday to you!'
spam_msg = 'Congratulations, you just won $50,000 cash to claim your prize, please email me at veto@gmail.com to claim your prize now. URGENT'


# Print the predicted label for each sample, ham first
for sample_msg in (ham_msg, spam_msg):
    print(predict(sample_msg))

0
1


## Saving the Model

In [17]:
# Persist the trained classifier. The context manager closes the file handle
# as soon as the pickle has been written instead of leaving it open.
with open('./models/model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

## END