# Exercise 4: Naives Bayes

### Step 1: Read txt file and pre-process data
- remove all words with numbers and pure numbers
- remove all puncutation
- convert capital to small letter

In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('SMSSpamCollection.txt', sep="\t", header=None)
df.columns=['label','text']
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
import re
import string

pattern_alphanumeric="\w*\d\w"
pattern_punctuation="["+re.escape(string.punctuation)+"]"
df['new_text']='empty'

for ind in df.index:
    temp=re.sub(pattern_alphanumeric,'',df['text'][ind])
    df['new_text'][ind]=re.sub(pattern_punctuation, '',temp).lower()
df_cleaned=df[['label','new_text']].copy()
df_cleaned.head()

Unnamed: 0,label,new_text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


### Step 2: split dataset
- test: 30%, training: 70%
- randomstate=42 to get same random train and test split

In [4]:
X = df_cleaned['new_text']
Y = df_cleaned['label']

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test=train_test_split(X, Y, test_size=0.3, random_state=42)

In [6]:
print(f"""
size of original: {X.shape}
training: {X_train.shape}
test: {X_test.shape}
      """)


size of original: (5572,)
training: (3900,)
test: (1672,)
      


### Step 3: convert text to vectors with count vectorizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(stop_words='english', ngram_range=(1,1))
X_train_cv=cv.fit_transform(X_train)
X_test_cv=cv.transform(X_test) #make sure X_test is TRANSFORM and not fit_transform

#print dimensions and features 
print(f"""
training: {X_train_cv.toarray().shape}
test: {X_test_cv.toarray().shape}

features: {cv.get_feature_names_out()}
      """)

# print(X_train_cv)


training: (3900, 6861)
test: (1672, 6861)

features: ['11' '12' '13' ... 'zouk' 'zyada' 'üll']
      


### Step 4: fit model on training data
- fit model to training data
- apply fitted model to test data
- predict test data

In [12]:
#MAIN DIFFERENCE IN CODE
from sklearn.naive_bayes import MultinomialNB

nb=MultinomialNB()

#train
nb.fit(X_train_cv, y_train)

#move to X_test
y_pred_cv=nb.predict(X_test_cv)
y_pred_cv

array(['ham', 'ham', 'ham', ..., 'ham', 'spam', 'ham'], dtype='<U4')

### Step 5: evaluation 
- confusion matrix
- precision
- recall
- f1-score
- accuracy

In [13]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test, y_pred_cv)
cm

array([[1437,   11],
       [  14,  210]], dtype=int64)

In [14]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_cv, target_names=['ham','spam']))

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1448
        spam       0.95      0.94      0.94       224

    accuracy                           0.99      1672
   macro avg       0.97      0.96      0.97      1672
weighted avg       0.98      0.99      0.99      1672



### Step 6: Save model and count vectorizer

In [15]:
import pickle
from datetime import datetime

time=datetime.now().strftime('%Y-%m-%d')
print(time)

with open(f'Naives Bayes{time}.pkl', 'wb') as f1: 
    pickle.dump(nb, f1)
    
with open(f'CountVectorizer{time}','wb') as f2: 
    pickle.dump(cv,f2)


2024-11-27
