In [267]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import accuracy_score
import nltk
nltk.download('stopwords')
import math
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [268]:
data=pd.read_csv('/content/spam.csv',encoding="ISO-8859-1")
data=pd.DataFrame(data)

In [269]:
data=data[['v1','v2']].copy()

In [270]:
data.rename(columns={'v1':'label','v2':'text'},inplace=True)

In [271]:
data.label[data.label=='ham']=0
data.label[data.label=='spam']=1

In [272]:
data.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [273]:
port_stem=PorterStemmer()
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [274]:
data['text']=data['text'].apply(stemming)

In [275]:
data.head()

Unnamed: 0,label,text
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri wkli comp win fa cup final tkt st m...
3,0,u dun say earli hor u c alreadi say
4,0,nah think goe usf live around though


In [276]:
x=data.drop(columns=['label'],axis=1)
y=data['label']

In [277]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=2,test_size=0.2,stratify=y)

In [278]:
def score1(y,y_):
  c=0
  for i in range(len(y)):
    if(y[i]==y_[i]):
      c+=1
  return (c/len(y))*100

In [279]:
class MulitNB:
  def _prior(self):
    self.ham_prob=self.ham_mails/(self.ham_mails+self.spam_mails)
    self.spam_prob=self.spam_mails/(self.ham_mails+self.spam_mails)
  def fit(self,x,y):
    cv=CountVectorizer()
    data = pd.concat([x, y], axis=1, join='inner')
    ham_data=data[data.label==0]
    self.ham_mails,_=ham_data.shape
    spam_data=data[data.label==1]
    self.spam_mails,_=spam_data.shape
    vector=cv.fit_transform(ham_data['text'].values)
    self.ham_vector=pd.DataFrame(vector.toarray(),columns=cv.get_feature_names_out())
    self.ham_features=cv.get_feature_names_out()
    vector=cv.fit_transform(spam_data['text'].values)
    self.spam_vector=pd.DataFrame(vector.toarray(),columns=cv.get_feature_names_out())
    self.spam_features=cv.get_feature_names_out()
    self._prior()
    self.ham_words=self.ham_vector.sum().sum()
    self.spam_words=self.spam_vector.sum().sum()
    y_calc=data['text'].apply(self.calc_prob)
    # return score1(y_calc.to_numpy(),y)
    y_calc=y_calc.to_numpy()
    print(y_calc)
    print(y.to_numpy())
    val=score1(y_calc,y.to_numpy())
    # return score1(y_calc,y.to_numpy())
    print(val)

  def calc_prob(self,text):
    text=text.split()
    # # ham
    ham_prob=[]
    ham_prob.append(self.ham_prob)

    for word in text:
      num=1
      if word in self.ham_features:
        num+=self.ham_vector[word].sum()
      den=self.ham_words+2
      ham_prob.append(num/den)

    log_ham_prob= np.sum(np.log(np.array(ham_prob)))

    # # spam
    spam_prob=[]
    spam_prob.append(self.spam_prob)

    for word in text:
      num=1
      if word in self.spam_features:
        num+=self.spam_vector[word].sum()
      den=self.spam_words+2
      spam_prob.append(num/den)

    log_spam_prob= np.sum(np.log(np.array(spam_prob)))

    if log_ham_prob>log_spam_prob:
      return 0
    else:
      return 1

  def predict(self,x):
    return x['text'].apply(self.calc_prob).to_numpy()
  def score(self,x,y):
    y_calc=self.predict(x)
    return score1(y_calc,y.to_numpy())



In [280]:
multi=MulitNB()
multi.fit(x_train,y_train)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
97.21785954678035


In [281]:
from sklearn.model_selection import KFold


X = data['text']
y = data['label']


fold = KFold(n_splits=10)


score = []


for train_idx, test_idx in fold.split(X, y):

    x_train, x_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]


    multi.fit(pd.DataFrame({'text': x_train}), y_train)
    acc_score = multi.score(pd.DataFrame({'text': x_test}), y_test)


    score.append(acc_score)


[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
97.12804148384522
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
97.24770642201835
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
97.38783649052841
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
97.28813559322033
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
97.26819541375873
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
97.26819541375873
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
97.2482552342971
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
97.34795613160519
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
97.40777666999003
[0 0 1 ... 0 0 0]
[0 0 1 ... 0 0 0]
97.16849451645064


In [282]:
# print(score)
print(np.mean(score))
print(np.median(score))

94.14950161837288
94.25493716337522
