In [None]:
import os

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on: the stopword lists are read
# via `stopwords.words('english')` and the tokenizer models are presumably what
# `word_tokenize` needs ('punkt' / 'punkt_tab' depending on the installed NLTK
# version — TODO confirm which one your version requires).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Location of the spam CSV. Set the SPAM_DATA_PATH environment variable to run
# the notebook on another machine; the fallback is the original local path.
DATA_PATH = os.environ.get(
    'SPAM_DATA_PATH',
    r'D:\MLOPS_dvc\dvc_project\data\spam.csv',
)

# NOTE(review): 'latin' (Latin-1) is presumably needed because the file is not
# valid UTF-8 — confirm against the dataset.
data = pd.read_csv(DATA_PATH, encoding='latin')
data.head()

In [None]:
# Inspect the column labels of the loaded frame.
data.columns

In [None]:
# Discard the three 'Unnamed' columns. Rebinding (instead of inplace=True) plus
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
data = data.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)
data.head()

In [None]:
# Give the two remaining columns descriptive names.
column_names = {'v1': 'target', 'v2': 'text'}
data = data.rename(columns=column_names)
data.head()

## EDA

In [None]:
# Summary statistics for the frame's columns.
data.describe()

In [None]:
# Bar chart of how many rows fall into each target class.
target_counts = data['target'].value_counts()
target_counts.plot.bar(color=['#FA745A', '#F7B7AD'], legend=True)

In [None]:
# Count missing values per column and print them under a heading.
null_counts = data.isnull().sum()
print(f'Nulls in data : \n{null_counts}')

In [None]:
# Print pandas' concise summary of the frame (DataFrame.info).
data.info()

In [None]:
# Number of fully duplicated rows, followed by the current (rows, columns) shape.
duplicate_count = data.duplicated().sum()
print(f"Duplicates {duplicate_count}")
data.shape

In [None]:
# Keep only the first occurrence of each duplicated row; the index is left
# as-is (not reset). The shape shows how many rows remain.
data = data.drop_duplicates(keep="first")
data.shape

## Feature Engineering

In [None]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# WordNet data backs the lemmatizer created below.
nltk.download('wordnet')

# Shared helpers used by clean_text: one lemmatizer instance and the English
# stopword list held as a set for membership tests.
lemmatizer = WordNetLemmatizer()
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def clean_text(text):
    """Normalise one raw message into a space-separated string of lemmas.

    Pipeline: lowercase -> strip URLs / @mentions / #hashtags -> replace every
    character that is not a-z or whitespace with a space -> tokenize -> drop
    English stopwords -> lemmatize -> join with single spaces.

    Parameters
    ----------
    text : str
        Raw message. Anything that is not a string (for example the NaN that
        pandas uses for a missing cell) is treated as an empty message.

    Returns
    -------
    str
        The cleaned text; '' when nothing survives or the input is not a string.
    """
    # Missing values reach `.apply` as float NaN / None; `.lower()` would raise
    # AttributeError on them, so short-circuit to an empty document.
    if not isinstance(text, str):
        return ''

    # 1. Lowercase
    text = text.lower()

    # 2. Remove URLs, mentions, hashtags
    text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text)

    # 3. Replace numbers and punctuation with a space. Substituting '' would
    #    glue neighbours together ("free!!!call" -> "freecall") and turn
    #    contractions such as "don't" into "dont", which is not a stopword.
    text = re.sub(r'[^a-z\s]', ' ', text)

    # 4. Tokenize
    tokens = word_tokenize(text)

    # 5. Remove stopwords, 6. Lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # 7. Join back to string
    return ' '.join(tokens)

In [None]:
# Run every message through clean_text and store the result in a new column.
cleaned_messages = data['text'].apply(clean_text)
data['transformed_text'] = cleaned_messages

In [None]:
# Preview the first rows, which now include the `transformed_text` column.
data.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Replace the string labels with the integer codes produced by LabelEncoder.
encoder = LabelEncoder()
encoded_target = encoder.fit_transform(data['target'])
data['target'] = encoded_target

# TF-IDF features capped at 500 terms, densified into a NumPy array.
# NOTE(review): the vectorizer is fitted on every row here, i.e. before any
# train/test split — vocabulary and IDF weights therefore see the test rows.
tfid = TfidfVectorizer(max_features=500)
tfidf_matrix = tfid.fit_transform(data['transformed_text'])

X = tfidf_matrix.toarray()
y = data['target'].to_numpy()

In [None]:
from sklearn.model_selection  import   train_test_split

# Split the cleaned text (not a pre-computed feature matrix) so the TF-IDF
# vocabulary and IDF weights are learned from the training rows only; the test
# rows are merely transformed. Fitting the vectorizer on the full corpus before
# splitting leaks test-set information into the features.
text_train, text_test, y_train, y_test = train_test_split(
    data['transformed_text'], y, test_size=0.2, random_state=2
)
x_train = tfid.fit_transform(text_train).toarray()
x_test = tfid.transform(text_test).toarray()

In [None]:
from sklearn.linear_model import   LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# Candidate models for the comparison. Hyper-parameters are unchanged; the two
# estimators that were left unseeded (decision tree, liblinear logistic
# regression) now get the same seed as the ensembles so repeated runs of the
# notebook report the same scores.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
# Tree induction permutes the candidate features at each split.
dtc = DecisionTreeClassifier(max_depth=5, random_state=2)
# The liblinear solver shuffles the data internally.
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=2)
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)


In [None]:
# Display name -> estimator instance, in the order they will be evaluated.
clfs = dict([
    ('SVC', svc),
    ('KNN', knc),
    ('NB', mnb),
    ('DT', dtc),
    ('LR', lrc),
    ('RF', rfc),
    ('Adaboost', abc),
    ('Bgc', bc),
    ('ETC', etc),
    ('GBDT', gbdt),
    ('xgb', xgb),
])

In [None]:
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report,ConfusionMatrixDisplay
def train_classifier (clfs,x_train,y_train,x_test,y_test):
    """Fit one estimator and score it on the held-out split.

    Parameters
    ----------
    clfs : estimator
        A single classifier exposing ``fit``, ``predict`` and ``classes_``
        (despite the plural name, this is one model, not a collection).
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test
        Evaluation features and the labels the predictions are compared with.

    Returns
    -------
    tuple
        ``(accuracy, precision, confusion_matrix, classification_report,
        classes_)`` in that order.
    """
    model = clfs
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    acc = accuracy_score(y_test, predictions)
    prec = precision_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)

    return acc, prec, matrix, report, model.classes_

In [None]:
# Per-model results, appended in the iteration order of `clfs`.
accuracy_scores = []
precision_scores = []
cm_scores = []
cfr_scores = []

# The loop variable is deliberately NOT named `clfs`: reusing that name rebinds
# the dict to the last estimator, so a second run of this cell would fail on
# `.items()` and the model registry would be lost for later cells.
for name, clf in clfs.items():
    current_accuracy, current_precision, c_cm, c_cfr, classes = train_classifier(
        clf, x_train, y_train, x_test, y_test
    )

    # Text report for this model.
    print()
    print('For : ',name)
    print('Accuracy : ' , current_accuracy)
    print('Precision : ',current_precision)
    print("Classification_report : \n", c_cfr )

    # Confusion-matrix figure for this model.
    disp = ConfusionMatrixDisplay(confusion_matrix=c_cm, display_labels=classes)
    disp.plot(cmap='Blues', values_format='d')
    plt.title(f"Confusion Matrix of {name}")
    plt.show()

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)
    cm_scores.append(c_cm)
    cfr_scores.append(c_cfr)

    