In [124]:
#import dependencies
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [85]:
#import dataset
data=pd.read_csv('./spam dataset/spam.csv',encoding='latin1')

In [86]:
#explore data
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [87]:
data.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [88]:
data.shape

(5572, 5)

In [89]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [90]:
data.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [91]:
#data preprocessing
data=data.dropna(axis=1)


In [92]:
data=data.rename(columns={
    'v1':'type',
    'v2':'message',
})

In [93]:
data.head()

Unnamed: 0,type,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [94]:
# text cleaning
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return " ".join(words)

data['message'] = data['message'].apply(clean_text)


In [102]:
data.head()

Unnamed: 0,type,message
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah dont think goe usf live around though


In [103]:
#label encoding
le =LabelEncoder()
data['type']=le.fit_transform(data['type'])
data.head()

Unnamed: 0,type,message
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri 2 wkli comp win fa cup final tkt 21...
3,0,u dun say earli hor u c alreadi say
4,0,nah dont think goe usf live around though


In [105]:
#TF-IDF vectorization
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(data['message'])
y = data['type']


In [106]:
#data splitting
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [108]:
#model training
model=LogisticRegression()
model.fit(X_train,y_train)

In [123]:
#prediction
prediction=model.predict(X_test)
prediction.shape

(1115,)

In [127]:
#model evaluation
accuracy=accuracy_score(y_test,prediction)
print('accuracy',accuracy)

accuracy 0.9443946188340807


In [129]:
matrix=confusion_matrix(y_test,prediction)
print(matrix)

[[961   4]
 [ 58  92]]


In [131]:
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       965
           1       0.96      0.61      0.75       150

    accuracy                           0.94      1115
   macro avg       0.95      0.80      0.86      1115
weighted avg       0.95      0.94      0.94      1115



In [132]:
#tester sur de nouveaux SMS 
def predict_spam(sms):
    sms_clean = clean_text(sms)
    sms_vect = tfidf.transform([sms_clean])
    pred = model.predict(sms_vect)
    return "Spam" if pred[0] == 1 else "Not Spam"

print(predict_spam("Congratulations! You've won a $1000 Walmart gift card. Call now."))

Not Spam
