<a href="https://colab.research.google.com/github/yashyaks/ML-playground/blob/main/ACL_Lab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [159]:
import pandas as pd
import string
import re

In [160]:
# Read the spam/ham message CSV from the repository, decoding it as latin1 and
# telling pandas not to use the first column as the index.
df = pd.read_csv(
    "https://raw.githubusercontent.com/yashyaks/ML-playground/main/spam.csv",
    encoding='latin1',
    index_col=False,
)

In [161]:
# Keep only the label and message columns and give them readable names.
# The drop is chained (no inplace=True) and uses errors='ignore' so that
# re-running this cell after the columns are already gone does not raise a
# KeyError; rename already ignores keys that are no longer present.
df = (
    df
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

In [162]:
# Display the frame after the unused columns were dropped and v1/v2 renamed.
df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [163]:
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Characters are removed outright (not replaced by a space), so words that
    were separated only by punctuation end up joined together.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# Strip punctuation from every message, overwriting the text column.
df['text'] = df['text'].map(remove_punctuation)

In [164]:
def re_breakline(text_list):
    """Return a new list where each newline or carriage return in every string
    of ``text_list`` is replaced by a single space."""
    line_break = re.compile('[\n\r]')
    return [line_break.sub(' ', entry) for entry in text_list]

# Pull the messages out as a plain list, replace line breaks with spaces,
# and write the cleaned strings back into the text column.
text = df['text'].tolist()
text_breakline = re_breakline(text)
df['text'] = text_breakline

In [165]:
def re_numbers(text_list):
    """Return a new list where each run of digits in every string of
    ``text_list`` is replaced by the literal word ``number``."""
    replaced = []
    for entry in text_list:
        replaced.append(re.sub('[0-9]+', 'number', entry))
    return replaced

# Pull the messages out as a plain list, collapse every digit run into the
# token "number", and write the result back into the text column.
text = df['text'].tolist()
text_numbers = re_numbers(text)
df['text'] = text_numbers

In [167]:
# Fetch the 'punkt' tokenizer data before importing word_tokenize; the download
# call reports the package as up to date when it is already installed.
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def tokenize_text(text):
    """Split one message string into tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(text)

# Tokenize every cleaned message into a new 'tokens' column (one list per row).
df['tokens'] = df['text'].map(tokenize_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [168]:
from nltk.corpus import stopwords
# Fetch the stop-word corpus; the call reports it as up to date when already installed.
nltk.download('stopwords')

# English stop words held in a set for fast membership checks in remove_stop_words.
stop_words = set(stopwords.words('english'))

def remove_stop_words(tokens):
    """Return the tokens whose lowercase form is not in ``stop_words``.

    The comparison is case-insensitive, but the surviving tokens keep their
    original casing.
    """
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

# Drop English stop words from every token list.
df['tokens'] = df['tokens'].map(remove_stop_words)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [169]:
def lowercase_tokens(tokens):
    """Return a new list containing the lowercase form of each token."""
    return [token.lower() for token in tokens]

# Normalise the casing of every remaining token.
df['tokens'] = df['tokens'].map(lowercase_tokens)

In [170]:
from nltk.stem import PorterStemmer
# Single shared stemmer instance, used by stem_tokens for every token.
stemmer = PorterStemmer()

def stem_tokens(tokens):
    """Return a new list with each token passed through the shared ``stemmer``."""
    return list(map(stemmer.stem, tokens))

# Reduce every token to its stem.
df['tokens'] = df['tokens'].map(stem_tokens)

In [171]:
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_fun(doc):
    """Identity function: return ``doc`` unchanged.

    Passed to TfidfVectorizer as both ``tokenizer`` and ``preprocessor`` so the
    token lists already stored in the frame are used as they are.
    """
    return doc

# The documents are already lists of tokens, so the vectorizer's own
# preprocessing and tokenization are replaced by the identity function and the
# default token regex is disabled.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)

# NOTE(review): the vectorizer is fitted on the whole corpus here, before the
# train/test split done in a later cell, so the vocabulary and IDF weights are
# influenced by the test messages. Consider fitting on the training split only.
X = tfidf_vectorizer.fit_transform(df['tokens'])
# Vocabulary terms, in the column order of X.
features = tfidf_vectorizer.get_feature_names_out()

In [172]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Replace the string labels with integer category codes. pandas orders string
# categories lexically, so presumably ham -> 0 and spam -> 1 (consistent with
# the class supports reported below) -- confirm if the label set changes.
df['label'] = df['label'].astype('category').cat.codes
y = df['label']
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Baseline multinomial Naive Bayes with its default settings.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [173]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
# Score the baseline classifier on the held-out test split.
y_pred = nb_classifier.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

# Per-class precision, recall and F1 alongside the overall accuracy.
print(classification_report(y_test, y_pred))

Accuracy: 0.96
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1453
           1       1.00      0.70      0.83       219

    accuracy                           0.96      1672
   macro avg       0.98      0.85      0.90      1672
weighted avg       0.96      0.96      0.96      1672



In [174]:
from sklearn import metrics

# Confusion matrix for the baseline predictions: rows are the true classes,
# columns are the predicted classes.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[1453    0]
 [  65  154]]


In [175]:
from sklearn.model_selection import GridSearchCV

# Candidate smoothing strengths for MultinomialNB. The smallest candidate is a
# tiny positive value instead of 0: alpha=0 means no smoothing, which on
# scikit-learn releases that do not clip alpha leads to log(0) / divide-by-zero
# warnings for terms unseen in a class. 1e-10 is the floor older releases
# clipped to, so the search behaves the same across versions.
param_grid = {
    'alpha': [1e-10, 0.5, 1.0, 1.5, 2.0]
}

# 5-fold cross-validated search over alpha, ranked by accuracy, using only the
# training split.
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.2f}")

# GridSearchCV refits the best configuration on all of X_train by default
# (refit=True); that refitted model is what gets evaluated afterwards.
best_nb_classifier = grid_search.best_estimator_

Best parameters: {'alpha': 0.5}
Best cross-validation score: 0.97




In [176]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Score the grid-search winner on the same held-out test split.
y_pred = best_nb_classifier.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.98
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1453
           1       0.98      0.84      0.90       219

    accuracy                           0.98      1672
   macro avg       0.98      0.92      0.95      1672
weighted avg       0.98      0.98      0.98      1672



In [177]:
from sklearn import metrics

# Confusion matrix for the tuned model's predictions: rows are the true
# classes, columns are the predicted classes.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)

In [178]:
# Show the tuned model's confusion matrix as the cell output.
confusion_matrix


array([[1449,    4],
       [  35,  184]])