# Naive Bayes Classifier for Email Classification.

In [1]:
import pandas as pd
import numpy as np

## Step 1: Data Loading

In [2]:
data=pd.read_csv("/content/drive/MyDrive/Concepts and Technologies of AI/Week 10/Dataset/spam_ham_dataset.csv",index_col=0)
data.head()

Unnamed: 0,label,text,label_num
605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
4685,spam,"Subject: photoshop , windows , office . cheap ...",1
2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [3]:
length=len(data['text'])
length

5171

## Step 2: Text Cleaning with NLTK Library

In [4]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
corpus=[]
for i in range(0,length):
  #Replacing punctuations with space
  text=re.sub('[^a-zA-Z]',' ',data['text'][i])
  #converting to lowercase
  text=text.lower()
  #Stemming
  text=text.split()
  ps=PorterStemmer()
  all_stopwords=stopwords.words('english')
  text=[ps.stem(word) for word in text if not word in set(all_stopwords)]
  text=' '.join(text)
  corpus.append(text)

## Step 3: Re-verifying and further cleaning of data

In [6]:
data_check=data.copy()
data_check['cleanText']=corpus
data_check.head()

Unnamed: 0,label,text,label_num,cleanText
605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject christma tree farm pictur
2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject vastar resourc inc gari product high i...
3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject calpin daili ga nomin calpin daili ga ...
4685,spam,"Subject: photoshop , windows , office . cheap ...",1,subject issu fyi see note alreadi done stella ...
2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,subject meter nov alloc fyi forward lauri alle...


In [7]:
#Removing subject
data_check['cleanText']=data_check['cleanText'].str.replace('subject','')
data_check.head()

Unnamed: 0,label,text,label_num,cleanText
605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,christma tree farm pictur
2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,vastar resourc inc gari product high island l...
3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,calpin daili ga nomin calpin daili ga nomin doc
4685,spam,"Subject: photoshop , windows , office . cheap ...",1,issu fyi see note alreadi done stella forward...
2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,meter nov alloc fyi forward lauri allen hou e...


## Step 4: Constructing Feature Matrix and Label Vector

In [8]:
x=data_check.loc[:,'cleanText'].values
y=data_check.loc[:,'label_num'].values

## Step 5: Text Representation using Count Vectorization

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
x=cv.fit_transform(x).toarray()

## Step 6: Final Model Building using Scikit Learn

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [11]:
model=MultinomialNB()
model.fit(X_train,y_train)

## Step 7: Final Evaluation

In [12]:
y_pred=model.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,y_pred))
print("Classification Report:\n",classification_report(y_test,y_pred))

Accuracy:  0.645618556701031
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.83      0.77      1121
           1       0.28      0.17      0.21       431

    accuracy                           0.65      1552
   macro avg       0.50      0.50      0.49      1552
weighted avg       0.60      0.65      0.62      1552



# For IMDB dataset

In [13]:
movie_data=pd.read_csv("/content/drive/MyDrive/Concepts and Technologies of AI/Week 10/Dataset/IMDB Dataset.csv")
movie_data.head(2)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


In [14]:
length_md=len(movie_data['review'])
length_md

50000

In [15]:
mcorpus=[]
for i in range(0,length_md):
  #Replacing punctuations with space
  mtext=re.sub('[^a-zA-Z]',' ',movie_data['review'][i])
  #converting to lowercase
  mtext=mtext.lower()
  #Stemming
  mtext=mtext.split()
  ps=PorterStemmer()
  all_stopwords=stopwords.words('english')
  mtext=[ps.stem(word) for word in mtext if not word in set(all_stopwords)]
  mtext=' '.join(mtext)
  mcorpus.append(mtext)

## Re-verifying and further cleaning of the Data:


In [16]:
mdata_check = movie_data.copy()
mdata_check['cleanText'] = mcorpus
mdata_check.head()

Unnamed: 0,review,sentiment,cleanText
0,One of the other reviewers has mentioned that ...,positive,one review mention watch oz episod hook right ...
1,A wonderful little production. <br /><br />The...,positive,wonder littl product br br film techniqu unass...
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,basic famili littl boy jake think zombi closet...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...


## Converting labels to binary

In [28]:
mdata_check['sentiment']=mdata_check['sentiment'].map({'positive': 1, 'negative': 0})

## Construction Feature Matrix and Label Vector:

In [29]:
x=mdata_check.loc[:,'cleanText'].values
y=mdata_check.loc[:,'sentiment'].values

## Text Representation using Count Vectorization:

In [30]:
cv = CountVectorizer(max_features=10000)
x = cv.fit_transform(x).toarray()

In [31]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [32]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [33]:
y_pred = model.predict(X_test)

In [34]:
from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score,confusion_matrix,roc_auc_score

In [36]:
print(f"The accuracy is {accuracy_score(y_test,y_pred)}")
print(f"The f1-score is {f1_score(y_test,y_pred)}")
print(f"The precision score is {precision_score(y_test,y_pred)}")
print(f"The recall score is {recall_score(y_test,y_pred)}")
print(f"Confusion matrix: {confusion_matrix(y_test,y_pred)}")
print(f"The roc-auc score is {roc_auc_score(y_test,y_pred)}")

The accuracy is 0.8498
The f1-score is 0.8493480441323972
The precision score is 0.858649361184344
The recall score is 0.8402460805715419
Confusion matrix: [[4264  697]
 [ 805 4234]]
The roc-auc score is 0.8498751064014735


# Feature Selection using Wrapper Methods

## Data Loading and PreProcessing

In [79]:
breastcancer_data=pd.read_csv("/content/drive/MyDrive/Concepts and Technologies of AI/Week 10/Dataset/wpbc.data",header=None)
breastcancer_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,119513,N,31,18.02,27.6,117.5,1013.0,0.09489,0.1036,0.1086,...,139.7,1436.0,0.1195,0.1926,0.314,0.117,0.2677,0.08113,5.0,5
1,8423,N,61,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,...,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,3.0,2
2,842517,N,116,21.37,17.44,137.5,1373.0,0.08836,0.1189,0.1255,...,159.1,1949.0,0.1188,0.3449,0.3414,0.2032,0.4334,0.09067,2.5,0
3,843483,N,123,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,...,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,2.0,0
4,843584,R,27,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,...,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,3.5,0


In [80]:
breastcancer_data.shape

(198, 35)

In [81]:
col_names=[str(i) for i in range(1,36)]
breastcancer_data.columns=col_names

In [83]:
breastcancer_data.replace('?',np.nan,inplace=True)

In [90]:
breastcancer_data.dropna(inplace=True)
breastcancer_data.isna().sum()

Unnamed: 0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
10,0


In [91]:
breastcancer_data.columns

Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
       '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25',
       '26', '27', '28', '29', '30', '31', '32', '33', '34', '35'],
      dtype='object')

In [100]:
breastcancer_data['2']=breastcancer_data['2'].map({'R': 1, 'N': 0})

In [101]:
X=breastcancer_data.drop(columns=['2']).values
y=breastcancer_data['2'].values

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Applying a Wrapper method

In [103]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [104]:
model = LogisticRegression(max_iter=20000)
n_features_to_select = 5 # Number of features to keep
rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)
# Fit RFE
rfe.fit(X_train, y_train)
# Get selected features
selected_features = rfe.support_ # Boolean mask of selected features
ranking = rfe.ranking_ # Feature rankings (1 indicates selected features)
# Transform the dataset to include only selected features
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)
# Train the model on the selected features
model.fit(X_train_rfe, y_train)
# Predict and evaluate
y_pred = model.predict(X_test_rfe)
accuracy = accuracy_score(y_test, y_pred)
print(f"Selected Features Mask: {selected_features}")
print(f"Feature Ranking: {ranking}")

Selected Features Mask: [False False False False False False False False False False False False
  True  True  True False False False False False False False False False
 False False False  True  True False False False False False]
Feature Ranking: [30 14  7 13 16 26 15  3  5 22 29 23  1  1  1 18 25 10 11 20 27 24  6 17
 21 28  4  1  1 12  8 19  2  9]


In [105]:
print(f"Model Accuracy with Selected Features: {accuracy}")
print(f"Model precision with Selected Features: {precision_score(y_test,y_pred)}")
print(f"Model Recall score with Selected Features: {recall_score(y_test,y_pred)}")
print(f"Model f1-score with Selected Features: {f1_score(y_test,y_pred)}")
print(f"Model roc-auc score with Selected Features: {roc_auc_score(y_test,y_pred)}")

Model Accuracy with Selected Features: 0.8461538461538461
Model precision with Selected Features: 1.0
Model Recall score with Selected Features: 0.14285714285714285
Model f1-score with Selected Features: 0.25
Model roc-auc score with Selected Features: 0.5714285714285714
