# 1. Preprocessing and Cleaning
# 2. Train Test Split 
# 3. Bow and TF-Idf (sentences --> vectors) {preventing data Leakage}
# 4. Trained our Models

In [96]:
import pandas as pd 


In [97]:
messages = pd.read_csv('./SMSSpamCollection.txt',sep='\t',names=['label','message'])

In [98]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Data Cleaning and Preprocessing

In [99]:
import re 
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [100]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [101]:
corpus = [] 

for i in range(0,len(messages)):
    review = re.sub('[^a-zA-Z]',' ',messages['message'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

# train test split

In [102]:
## out features

y = pd.get_dummies(messages['label'], dtype=int)
y = y.iloc[:,0].values


In [103]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(corpus,y,test_size=0.2)

## Create Bag of Words

In [104]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2500,ngram_range=(1,2))

In [105]:
x_train_b = cv.fit_transform(x_train).toarray()
x_test_b = cv.transform(x_test).toarray()

In [106]:
cv.vocabulary_

{'hungri': np.int64(998),
 'gay': np.int64(778),
 'guy': np.int64(900),
 'feel': np.int64(689),
 'call': np.int64(247),
 'min': np.int64(1299),
 'stop': np.int64(1993),
 'text': np.int64(2080),
 'call min': np.int64(259),
 'min stop': np.int64(1302),
 'stop text': np.int64(1999),
 'text call': np.int64(2082),
 'pick': np.int64(1572),
 'anoth': np.int64(70),
 'th': np.int64(2091),
 'done': np.int64(557),
 'ok': np.int64(1473),
 'thanx': np.int64(2097),
 'gd': np.int64(783),
 'nite': np.int64(1426),
 'ok thanx': np.int64(1482),
 'still': np.int64(1986),
 'get': np.int64(789),
 'good': np.int64(841),
 'loverboy': np.int64(1220),
 'keep': np.int64(1077),
 'come': np.int64(388),
 'queen': np.int64(1672),
 'hmmm': np.int64(960),
 'ach': np.int64(15),
 'speak': np.int64(1958),
 'miss': np.int64(1313),
 'cool': np.int64(429),
 'see': np.int64(1819),
 'ya': np.int64(2463),
 'see ya': np.int64(1826),
 'went': np.int64(2377),
 'attend': np.int64(113),
 'two': np.int64(2211),
 'round': np.int64(17

In [107]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(x_train_b,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [108]:
y_pred =model.predict(x_test_b)

In [109]:
from sklearn.metrics import accuracy_score,classification_report

print(accuracy_score(y_pred,y_test))
print(classification_report(y_pred,y_test))

0.9829596412556054
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       168
           1       1.00      0.98      0.99       947

    accuracy                           0.98      1115
   macro avg       0.96      0.98      0.97      1115
weighted avg       0.98      0.98      0.98      1115



# create TF-IDF model

In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(max_features=2500,ngram_range=(1,2)
                     )

In [111]:
x_train_t = tv.fit_transform(x_train).toarray()
x_test_t = tv.transform(x_test).toarray()

In [112]:
tv.vocabulary_

{'hungri': np.int64(998),
 'gay': np.int64(778),
 'guy': np.int64(900),
 'feel': np.int64(689),
 'call': np.int64(247),
 'min': np.int64(1299),
 'stop': np.int64(1993),
 'text': np.int64(2080),
 'call min': np.int64(259),
 'min stop': np.int64(1302),
 'stop text': np.int64(1999),
 'text call': np.int64(2082),
 'pick': np.int64(1572),
 'anoth': np.int64(70),
 'th': np.int64(2091),
 'done': np.int64(557),
 'ok': np.int64(1473),
 'thanx': np.int64(2097),
 'gd': np.int64(783),
 'nite': np.int64(1426),
 'ok thanx': np.int64(1482),
 'still': np.int64(1986),
 'get': np.int64(789),
 'good': np.int64(841),
 'loverboy': np.int64(1220),
 'keep': np.int64(1077),
 'come': np.int64(388),
 'queen': np.int64(1672),
 'hmmm': np.int64(960),
 'ach': np.int64(15),
 'speak': np.int64(1958),
 'miss': np.int64(1313),
 'cool': np.int64(429),
 'see': np.int64(1819),
 'ya': np.int64(2463),
 'see ya': np.int64(1826),
 'went': np.int64(2377),
 'attend': np.int64(113),
 'two': np.int64(2211),
 'round': np.int64(17

In [113]:
from sklearn.naive_bayes import MultinomialNB

t_model = MultinomialNB().fit(x_train_t,y_train)

In [114]:
y_pred_t = t_model.predict(x_test_t)

In [115]:
print(classification_report(y_pred_t,y_test))

              precision    recall  f1-score   support

           0       0.80      1.00      0.89       143
           1       1.00      0.96      0.98       972

    accuracy                           0.97      1115
   macro avg       0.90      0.98      0.93      1115
weighted avg       0.97      0.97      0.97      1115

