In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb 

In [2]:
dataset=pd.read_csv('SPAM text message 20170820 - Data.csv')

In [3]:
dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
dataset.isnull().sum()

Category    0
Message     0
dtype: int64

In [5]:
Unique_Category=dataset['Category'].unique()
Unique_Cat={}
i=0
for cat in Unique_Category:
    Unique_Cat[cat]=i
    i+=1
dataset['Category']=dataset['Category'].map(Unique_Cat)

In [6]:
dataset.head(2)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...


In [7]:
dataset['Message']=dataset['Message'].apply(lambda x:x.lower())

### Remove Punctuation

In [8]:
import string
def remove_pun(txt):
    return txt.translate(str.maketrans("","",string.punctuation))

In [9]:
dataset['Message']=dataset['Message'].apply(remove_pun)

### Remove Emojis

In [10]:
def remove_emojis(txt):
    new=""
    for i in txt:
        if i.isascii():
            new=new +i
    return new    
dataset['Message']=dataset['Message'].apply(remove_emojis)

## Remove Stock words using NLTK

In [11]:
import nltk

In [12]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [13]:
nltk.download("punkt") # Words od libarry
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zesha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zesha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
stop_words=set(stopwords.words('english'))

In [15]:
len(stop_words)

198

In [16]:
def Remove_stopwords(txt):
    words=txt.split()
    Clean_words=[]
    for i in  words:
        if not i in stop_words:
            Clean_words.append(i)
    return ' '.join(Clean_words)  
dataset['Message']=dataset['Message'].apply(Remove_stopwords)   

In [17]:
dataset.loc[1]

Category                          0
Message     ok lar joking wif u oni
Name: 1, dtype: object

## Balancing Dataset and Dividing into Training and Testing

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [38]:
X_train,X_test,y_train,y_test=train_test_split(dataset['Message'] ,dataset['Category'])

###  Define the new pipeline for MultinomialNB

In [39]:
pipeline_parameter=[
    ('Tf_vectorizer',TfidfVectorizer()),
    ('smote',SMOTE(random_state=42)),
    ('Mbn',MultinomialNB())
]

In [40]:
pipeline_mbn=Pipeline(pipeline_parameter)

In [41]:
pipeline_mbn.fit(X_train,y_train)

0,1,2
,steps,"[('Tf_vectorizer', ...), ('smote', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [42]:
pipeline_mbn.score(X_test,y_test)

0.9698492462311558

In [43]:
pipe_model=pipeline_mbn.predict(X_test)

In [44]:
parameters = {
    'Mbn__alpha': [1.0, 0.5, 0.1] # This name matches your pipeline step
}

In [60]:
grid_search_nb= GridSearchCV(pipeline_mbn, parameters, cv=5, scoring='f1_weighted', n_jobs=1)
grid_search_nb.fit(X_train, y_train)
print("Best alpha:", grid_search_nb.best_params_)

Best alpha: {'Mbn__alpha': 0.1}


In [61]:
print(classification_report(y_test, grid_search_nb.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1205
           1       0.91      0.92      0.91       188

    accuracy                           0.98      1393
   macro avg       0.95      0.95      0.95      1393
weighted avg       0.98      0.98      0.98      1393



###  Define the new pipeline for Random Forest

In [47]:
from sklearn.ensemble import RandomForestClassifier

In [48]:
pipeline_rf = Pipeline([
    ('Tf_vectorizer', TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9)),
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)) 
])

In [49]:
params_rf = {
    'rf__n_estimators': [50, 100, 200], 
    'rf__max_depth': [10, 20, None]
}

In [51]:
grid_search_rf = GridSearchCV(pipeline_rf, params_rf, cv=5, scoring='f1_weighted', n_jobs=1)
grid_search_rf.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'rf__max_depth': [10, 20, ...], 'rf__n_estimators': [50, 100, ...]}"
,scoring,'f1_weighted'
,n_jobs,1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [57]:
print(classification_report(y_test, grid_search_rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1205
           1       0.99      0.88      0.93       188

    accuracy                           0.98      1393
   macro avg       0.98      0.94      0.96      1393
weighted avg       0.98      0.98      0.98      1393



### Define the new pipeline for Logistic Regression

In [62]:
pipeline_lr = Pipeline([
    ('Tf_vectorizer', TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9)),
    ('smote', SMOTE(random_state=42)),
    # Increase max_iter to ensure the model has enough iterations to converge
    ('lr', LogisticRegression(random_state=42, solver='saga', max_iter=5000))
])

In [63]:
params_lr = {
    'lr__C': [0.1, 1, 10],
    'lr__penalty': ['l1', 'l2']
}

In [64]:
grid_search_lr = GridSearchCV(pipeline_lr, params_lr, cv=5, scoring='f1_weighted', n_jobs=1)
grid_search_lr.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...ver='saga'))])
,param_grid,"{'lr__C': [0.1, 1, ...], 'lr__penalty': ['l1', 'l2']}"
,scoring,'f1_weighted'
,n_jobs,1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'saga'
,max_iter,5000


In [65]:
print("Best parameters for Logistic Regression:", grid_search_lr.best_params_)
print("\nClassification Report:")
print(classification_report(y_test, grid_search_lr.predict(X_test)))

Best parameters for Logistic Regression: {'lr__C': 1, 'lr__penalty': 'l2'}

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1205
           1       0.90      0.90      0.90       188

    accuracy                           0.97      1393
   macro avg       0.94      0.94      0.94      1393
weighted avg       0.97      0.97      0.97      1393

