In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import accuracy_score
import nltk
nltk.download('stopwords')
import math
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
# Load the raw SMS dataset. read_csv already returns a DataFrame, so the
# extra pd.DataFrame(...) wrap that used to follow was redundant.
# NOTE(review): the explicit ISO-8859-1 encoding presumably means the CSV is
# not valid UTF-8 — confirm against the file.
data = pd.read_csv('/content/spam.csv', encoding="ISO-8859-1")


In [25]:
port_stem = PorterStemmer()

# Built once at cell execution: the original evaluated
# stopwords.words('english') for every token and scanned the returned list
# linearly. A frozenset gives the same membership answers in O(1).
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_NON_LETTERS = re.compile('[^a-zA-Z]')


def stemming(content):
    """Normalise one message into a space-joined string of stemmed tokens.

    Steps, in order:
      1. every character that is not an ASCII letter becomes a space,
      2. the text is lower-cased and split on whitespace,
      3. English stop words are dropped,
      4. the remaining tokens are Porter-stemmed and re-joined with spaces.

    Parameters
    ----------
    content : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message; an empty string if no token survives.
    """
    tokens = _NON_LETTERS.sub(' ', content).lower().split()
    return ' '.join(
        port_stem.stem(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS
    )


In [26]:
# Keep only the label / message columns and give them descriptive names.
# rename() returns a new frame, so no .copy() or inplace=True is needed.
data = data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'text'})

# Encode the target: ham -> 0, spam -> 1.
# The previous chained assignment (data.label[mask] = value) raises
# SettingWithCopyWarning and does not update the frame under pandas
# Copy-on-Write; a single map() is unambiguous and yields an integer column.
# Any label other than 'ham'/'spam' becomes NaN here.
data['label'] = data['label'].map({'ham': 0, 'spam': 1})
data.shape

(5572, 2)

In [27]:
# Run every message through the stemming helper, then preview the first rows.
data['text'] = data['text'].map(stemming)
data.head()

Unnamed: 0,label,text
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri wkli comp win fa cup final tkt st m...
3,0,u dun say earli hor u c alreadi say
4,0,nah think goe usf live around though


In [28]:
# Target is the 'label' column; features are every remaining column.
y = data['label']
x = data.drop(columns='label')

# 80/20 hold-out split, stratified on the label, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=3, stratify=y
)

In [29]:
def score1(y,y_):
  """Return the percentage (0-100) of positions where ``y`` and ``y_`` agree.

  The two sequences are compared position by position, so pandas Series with
  arbitrary index labels work as well as lists and numpy arrays (the previous
  ``y[i]`` lookup was label-based on a Series and failed on non-default
  indexes).

  Parameters
  ----------
  y : sequence
      Reference labels.
  y_ : sequence
      Predicted labels; must have the same length as ``y``.

  Returns
  -------
  float
      Accuracy in percent. ``0.0`` for empty input (previously a
      ZeroDivisionError).

  Raises
  ------
  ValueError
      If the two sequences differ in length.
  """
  total = len(y)
  if len(y_) != total:
    raise ValueError(
        "score1 expects sequences of equal length, got %d and %d" % (total, len(y_))
    )
  if total == 0:
    return 0.0
  matches = sum(1 for truth, guess in zip(y, y_) if truth == guess)
  return (matches/total)*100

In [30]:
class BernouliNB:
  """Two-class (ham = 0, spam = 1) naive-Bayes-style text classifier.

  Each class keeps the vocabulary that CountVectorizer extracts from its own
  training messages. A message is scored for a class as

      prior(class) * product over tokens of (1 + [token in class vocabulary]) / (n + 2)

  where ``n`` is the combined size of both vocabularies. The class with the
  larger score wins; ties go to spam. Scores are accumulated as logarithms so
  long messages cannot underflow to zero.

  NOTE(review): the per-token factor only records whether a token was ever
  seen in the class, not in how many training messages it occurred, so this
  is a simplified variant of textbook Bernoulli naive Bayes — confirm that
  this is intended.
  """

  def __init__(self):
    pass

  def _prior(self):
    """Set ``ham_prob`` / ``spam_prob`` to each class's share of the training messages."""
    total = self.ham_mail + self.spam_mail
    self.ham_prob = self.ham_mail / total
    self.spam_prob = self.spam_mail / total

  def feature_extract(self, x, y):
    """Count the messages per class and collect each class's vocabulary.

    Parameters
    ----------
    x : DataFrame with a 'text' column.
    y : Series named 'label' (0 = ham, 1 = spam), joined to ``x`` on the index.

    Sets ``ham_mail`` / ``spam_mail`` (message counts) and
    ``feature_names_ham`` / ``feature_names_spam`` (arrays of vocabulary terms).
    """
    data_train = pd.concat([x, y], axis=1, join='inner')
    data_spam = data_train[data_train['label'] == 1]
    data_ham = data_train[data_train['label'] == 0]
    self.spam_mail, _ = data_spam.shape
    self.ham_mail, _ = data_ham.shape
    # Only the learned vocabulary is needed, so fit() replaces the former
    # fit_transform() whose count matrices were never used.
    self.feature_names_ham = (
        CountVectorizer().fit(data_ham['text'].values).get_feature_names_out()
    )
    self.feature_names_spam = (
        CountVectorizer().fit(data_spam['text'].values).get_feature_names_out()
    )

  def _log_score(self, tokens, prior, vocabulary):
    """Log of ``prior * prod((1 + [token in vocabulary]) / (n + 2))`` over ``tokens``."""
    if prior <= 0:
      # A class with no training messages can never win.
      return -math.inf
    hits = sum(1 for token in tokens if token in vocabulary)
    # Every token contributes -log(n + 2); tokens known to the class add log(2).
    return math.log(prior) + hits * math.log(2) - len(tokens) * math.log(self.n + 2)

  def probability_calc(self, text):
    """Classify one whitespace-tokenised message: 0 for ham, 1 for spam.

    Requires ``fit`` to have been called. Ties are resolved as spam.
    """
    tokens = text.split()
    ham_score = self._log_score(tokens, self.ham_prob, self._vocab_ham)
    spam_score = self._log_score(tokens, self.spam_prob, self._vocab_spam)
    if ham_score > spam_score:
      return 0
    return 1

  def fit(self, x, y, verbose=True):
    """Learn class priors and vocabularies from the training data.

    Parameters
    ----------
    x : DataFrame with a 'text' column.
    y : Series named 'label' (0 = ham, 1 = spam).
    verbose : bool, default True
        When true, print the true labels, the predictions on the training
        data and the training accuracy in percent. Pass False to skip that
        extra prediction pass.
    """
    self.x = x
    self.y = y
    self.feature_extract(x, y)
    # Hash-based lookups: membership tests on the numpy arrays were linear
    # scans repeated for every token of every message.
    self._vocab_ham = frozenset(self.feature_names_ham)
    self._vocab_spam = frozenset(self.feature_names_spam)
    # Combined vocabulary size. The previous len(<array>.shape) is the number
    # of array dimensions (always 1), which made n a constant 2.
    self.n = len(self.feature_names_spam) + len(self.feature_names_ham)
    self._prior()
    if verbose:
      y_calc = x['text'].apply(self.probability_calc)
      print(y.to_numpy())
      print(y_calc.to_numpy())
      print(score1(y.to_numpy(), y_calc.to_numpy()))

  def predict(self, x):
    """Return a numpy array of 0/1 predictions for the 'text' column of ``x``."""
    return x['text'].apply(self.probability_calc).to_numpy()

  def score(self, x, y):
    """Return the accuracy in percent of ``predict(x)`` against the labels ``y``."""
    y_calc = self.predict(x)
    return score1(y.to_numpy(), y_calc)





In [31]:
# Train the classifier on the hold-out training split; fit() prints the
# training labels, the training predictions and the training accuracy.
bernouli = BernouliNB()
bernouli.fit(x_train, y_train)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
96.43257796724254


In [32]:
# Accuracy (in percent) of the fitted classifier on the held-out test split.
y_calc = bernouli.predict(x_test)
print(score1(y_test.to_numpy(), y_calc))


93.36322869955157


In [33]:
def get_score(model,x_train,y_train,x_test,y_test):
  """Fit ``model`` on the training split and return its score on the test split.

  ``model`` only needs ``fit(x, y)`` and ``score(x, y)`` methods; whatever
  ``score`` returns is passed back unchanged.
  """
  model.fit(x_train, y_train)
  test_score = model.score(x_test, y_test)
  return test_score


In [34]:
# Show the cleaned message column (pandas truncates the printed Series).
print(data['text'])

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri wkli comp win fa cup final tkt st m...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    nd time tri contact u u pound prize claim easi...
5568                                b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: text, Length: 5572, dtype: object


In [35]:
from sklearn.model_selection import KFold

# 10-fold cross-validation of the classifier over the whole cleaned dataset.
X = data['text']
y = data['label']

# NOTE(review): KFold without shuffle splits the rows in file order and does
# not stratify on the label — consider StratifiedKFold if class balance per
# fold matters.
fold = KFold(n_splits=10)

# Test accuracy (in percent) of each fold, in fold order.
score = []

for train_idx, test_idx in fold.split(X, y):
    # Fold-local names: the loop previously rebound the global
    # x_train / x_test / y_train / y_test of the hold-out split to per-fold
    # Series, which broke re-running the earlier fit / predict cells.
    fold_x_train = X.iloc[train_idx].to_frame(name='text')
    fold_x_test = X.iloc[test_idx].to_frame(name='text')
    fold_y_train = y.iloc[train_idx]
    fold_y_test = y.iloc[test_idx]

    # Reuse the existing fit-then-score helper instead of repeating it inline.
    score.append(
        get_score(bernouli, fold_x_train, fold_y_train, fold_x_test, fold_y_test)
    )


[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
96.05105704028719
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
96.15077782209812
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
96.11166500498504
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
95.67298105682951
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
95.85244267198405
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
96.01196410767697
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
96.39082751744765
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
96.23130608175472
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
96.01196410767697
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
95.97208374875373


In [36]:
# Summarise the per-fold accuracies collected in `score`.
fold_scores = np.asarray(score)
print(fold_scores.mean())
print(np.median(fold_scores))

93.44935425957028
92.99820466786356
