# Basic Imports

In [1]:
!pip install wordcloud 
!pip install scikit-learn
!pip install nltk
!pip install xgboost



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder # for encoding categorical labels
from nltk.stem.porter import PorterStemmer # for stemming words used to reduce them to their root form
import string # for string manipulation
from wordcloud import WordCloud # Importing WordCloud for text visualization
import nltk
from nltk.corpus import stopwords # for text preprocessing and cleaning
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer # for converting text data into numerical format
from sklearn.model_selection import train_test_split # for splitting the dataset into training and testing sets
from sklearn.metrics import accuracy_score,precision_score,recall_score # for evaluating model performance

nltk.download('stopwords') # Downloading stopwords from NLTK
nltk.download('punkt') # Downloading punkt tokenizer from NLTK
nltk.download('punkt_tab') # Downloading punkt_tab tokenizer from NLTK


[nltk_data] Downloading package stopwords to C:\Users\Yashraj
[nltk_data]     Sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Yashraj
[nltk_data]     Sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Yashraj
[nltk_data]     Sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
df=pd.read_csv('spam.csv', encoding='latin-1') # Reading the dataset
df.head() # Displaying the first few rows of the dataset

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True) # Dropping unnecessary columns
df.rename(columns={'v1':'target', 'v2':'text'}, inplace=True) # Renaming columns for clarity
df.head() # Displaying the first few rows after dropping columns

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Data Preprocessing

In [5]:
encoder=LabelEncoder() # Initializing LabelEncoder
df['target']=encoder.fit_transform(df['target']) # Encoding the 'target' column
df.head() # Displaying the first few rows after encoding

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.duplicated().sum() # Checking for duplicate entries in the dataset
len(df) # Getting the total number of entries in the dataset

5572

In [7]:
df=df.drop_duplicates(keep='first') # Dropping duplicate entries
len(df) # Getting the total number of entries after dropping duplicates

5169

# Feature engineering

In [8]:
ps=PorterStemmer() # Initializing PorterStemmer for stemming words
def transform_text(text):
    text=text.lower() # Converting text to lowercase
    text=nltk.word_tokenize(text) # Tokenizing the text into words

    y=[] # List to hold processed words
    for i in text:
        if i.isalnum(): # Checking if the token is alphanumeric
            y.append(i)

    text=y[:] # Copying the list of alphanumeric tokens
    y.clear() # Clearing the list for reuse

    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation: # Removing stopwords and punctuation
            y.append(i)

    text=y[:] # Copying the cleaned list of tokens
    y.clear() # Clearing the list for reuse

    for i in text:
        y.append(ps.stem(i)) # Stemming each word

    return " ".join(y) # Joining the processed words back into a single string

In [9]:
transform_text("Hello friends, how are you doing today?") # Testing the transform_text function

'hello friend today'

In [10]:
df['transformed_text']=df['text'].apply(transform_text) # Applying the transform_text function to the 'text' column
df.head() # Displaying the first few rows after text transformation

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [11]:
tfidf= TfidfVectorizer(max_features=500) # Initializing TfidfVectorizer with a maximum of 500 features
# TF-IDF ranks words by their importance within a document, relative to the entire corpus.
x=tfidf.fit_transform(df['transformed_text']).toarray() # Converting the transformed text into TF-IDF feature vectors
y=df['target'].values # Extracting the target labels


# Train Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2) # Splitting the dataset into training and testing sets

# Model Traning

In [13]:
from sklearn.linear_model import LogisticRegression # Importing Logistic Regression model
from sklearn.svm import SVC # Importing Support Vector Classifier
from sklearn.naive_bayes import MultinomialNB # Importing Multinomial Naive Bayes model
from sklearn.tree import DecisionTreeClassifier # Importing Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier # Importing Random Forest Classifier
from sklearn.neighbors import KNeighborsClassifier # Importing K-Nearest Neighbors Classifier
from sklearn.ensemble import AdaBoostClassifier # Importing AdaBoost Classifier
from sklearn.ensemble import BaggingClassifier # Importing Bagging Classifier
from sklearn.ensemble import GradientBoostingClassifier # Importing Gradient Boosting Classifier
from sklearn.ensemble import ExtraTreesClassifier # Importing Extra Trees Classifier
from xgboost import XGBClassifier # Importing XGBoost Classifier

In [14]:
svm=SVC(kernel='sigmoid',gamma='scale') # Initializing SVC with sigmoid kernel
kn=KNeighborsClassifier(n_neighbors=7) # Initializing KNN with 7 neighbors
mnb=MultinomialNB() # Initializing Multinomial Naive Bayes
dft=DecisionTreeClassifier(max_depth=5) # Initializing Decision Tree Classifier
lr=LogisticRegression(solver='liblinear',penalty='l1') # Initializing Logistic Regression
rfc=RandomForestClassifier(n_estimators=50, random_state=2) # Initializing Random Forest Classifier
abc=AdaBoostClassifier(n_estimators=50, random_state=2) # Initializing AdaBoost Classifier
bgc=BaggingClassifier(n_estimators=50, random_state=2) # Initializing Bagging Classifier
ec=ExtraTreesClassifier(n_estimators=50, random_state=2) # Initializing Extra Trees Classifier
gbc=GradientBoostingClassifier(n_estimators=50, random_state=2) # Initializing Gradient Boosting Classifier
xgb=XGBClassifier(n_estimators=50, random_state=2) # Initializing XGBoost Classifier

In [15]:
clfs={
    'SVC':svm,
    'KNeighborsClassifier':kn,
    'MultinomialNB':mnb,
    'DecisionTreeClassifier':dft,
    'LogisticRegression':lr,
    'RandomForestClassifier':rfc,
    'AdaBoostClassifier':abc,
    'BaggingClassifier':bgc,
    'ExtraTreesClassifier':ec,
    'GradientBoostingClassifier':gbc,
    'XGBClassifier':xgb
}

# Model Evaluation

In [16]:
accuracy_scores=[]
precision_scores=[]
def train_classifier(clf,X_train,y_train,X_test,y_test):
    clf.fit(X_train,y_train) # Training the classifier
    y_pred=clf.predict(X_test) # Making predictions on the test set
    accuracy=accuracy_score(y_test,y_pred) # Calculating accuracy
    precision=precision_score(y_test,y_pred) # Calculating precision
    return accuracy, precision

In [18]:
for name,clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf,X_train,y_train,X_test,y_test)
    print(" ")
    print("for:",name)
    print("Accuracy:",current_accuracy)
    print("Precision:",current_precision)

    accuracy_scores.append(current_accuracy) # Storing accuracy score
    precision_scores.append(current_precision) # Storing precision score

 
for: SVC
Accuracy: 0.9671179883945842
Precision: 0.9333333333333333
 
for: KNeighborsClassifier
Accuracy: 0.9187620889748549
Precision: 1.0
 
for: MultinomialNB
Accuracy: 0.9709864603481625
Precision: 0.9655172413793104
 
for: DecisionTreeClassifier
Accuracy: 0.9381044487427466
Precision: 0.9111111111111111




 
for: LogisticRegression
Accuracy: 0.9632495164410058
Precision: 0.9629629629629629
 
for: RandomForestClassifier
Accuracy: 0.9700193423597679
Precision: 0.9421487603305785
 
for: AdaBoostClassifier
Accuracy: 0.9235976789168279
Precision: 0.8734177215189873
 
for: BaggingClassifier
Accuracy: 0.9622823984526112
Precision: 0.9024390243902439
 
for: ExtraTreesClassifier
Accuracy: 0.9709864603481625
Precision: 0.921875
 
for: GradientBoostingClassifier
Accuracy: 0.9497098646034816
Precision: 0.93
 
for: XGBClassifier
Accuracy: 0.9690522243713733
Precision: 0.9568965517241379
