In [41]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords

In [42]:
import chardet

def detect_encoding(file_path):
    """Return chardet's guess for the text encoding of the file at ``file_path``.

    The whole file is read in binary mode and handed to ``chardet.detect``;
    the value stored under the ``'encoding'`` key of its result is returned.
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']
# Dataset location, given as a relative path.
csv_file_path = 'spam.csv'

# Ask chardet (via detect_encoding) for the encoding of the raw file bytes.
csv_file_encoding = detect_encoding(csv_file_path)

# Load the CSV using whatever encoding was detected above.
df = pd.read_csv(csv_file_path, encoding=csv_file_encoding)

In [43]:
csv_file_path = 'spam.csv'

# NOTE(review): this hard-coded encoding replaces the value detected in the
# previous cell, and the read below rebuilds df from scratch — confirm which
# of the two loading cells is meant to be kept.
csv_file_encoding = 'latin-1'

df = pd.read_csv(csv_file_path, encoding=csv_file_encoding)


In [44]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,ham,"Go until jurong point, crazy.. Available only ...",,,,,,,,,,,,,,,,,
1,ham,Ok lar... Joking wif u oni...,,,,,,,,,,,,,,,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,,,,,,,,,,,,,,
3,ham,U dun say so early hor... U c already then say...,,,,,,,,,,,,,,,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,,,,,,,,,,,,,,


In [45]:
print(df.columns)

# Keep only the message text ('v2') and its label ('v1'); every other column
# is dropped by this selection.
try:
    df = df[['v2', 'v1']]
except KeyError as e:
    # Reached when 'v2'/'v1' are absent, e.g. if this cell is re-run after the
    # rename below has already replaced them. Say so instead of printing an
    # empty line, then continue with the frame as it is.
    print(f"Could not select columns 'v2' and 'v1' ({e}); skipping column selection.")
# Give the two columns descriptive names (a no-op for names that are not present).
df = df.rename(columns={'v2': 'messages', 'v1': 'label'})
print(df.head())


Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14',
       'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18'],
      dtype='object')
                                            messages label
0  Go until jurong point, crazy.. Available only ...   ham
1                      Ok lar... Joking wif u oni...   ham
2  Free entry in 2 a wkly comp to win FA Cup fina...  spam
3  U dun say so early hor... U c already then say...   ham
4  Nah I don't think he goes to usf, he lives aro...   ham


In [46]:
# Count missing values per column.
df.isnull().sum()

messages    0
label       0
dtype: int64

In [56]:
import re
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is installed before reading it, so the
# notebook also runs on a fresh environment where it has not been downloaded.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# English stop words as a set, used for membership tests in clean_text.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text, stop_words=None):
    """Normalize a raw message into a space-separated string of lowercase tokens.

    The text is lowercased, every character other than an ASCII letter or
    digit is replaced by a space, and tokens found in the stop-word
    collection are removed.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : collection of str, optional
        Tokens to drop (compared against the lowercased tokens). When omitted,
        the module-level ``STOPWORDS`` set is used, which is what this
        function always did before the parameter existed.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; empty if nothing is left.
    """
    if stop_words is None:
        stop_words = STOPWORDS

    text = text.lower()

    # Punctuation, symbols and every kind of whitespace become plain spaces.
    text = re.sub(r'[^0-9a-zA-Z]', ' ', text)

    # str.split() with no argument already collapses runs of spaces, so no
    # separate whitespace-normalizing substitution is needed before it.
    return " ".join(word for word in text.split() if word not in stop_words)


In [57]:
# Store the cleaned version of every message (see clean_text) in a new column
# alongside the raw text, then preview the result.
df['clean_text'] = df['messages'].apply(clean_text)
df.head()

Unnamed: 0,messages,label,clean_text
0,"Go until jurong point, crazy.. Available only ...",ham,go jurong point crazy available bugis n great ...
1,Ok lar... Joking wif u oni...,ham,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,U dun say so early hor... U c already then say...,ham,u dun say early hor u c already say
4,"Nah I don't think he goes to usf, he lives aro...",ham,nah think goes usf lives around though


In [58]:
# 'clean_text' is already populated by the preceding cell, so it is reused
# here instead of re-running the cleaning pass over every message.
X = df['clean_text']  # model input: cleaned message text
y = df['label']       # target: 'ham' / 'spam' label


In [59]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

def classify(model, X, y, test_size=0.25, random_state=42):
    """Fit ``model`` in a count -> TF-IDF -> classifier pipeline and print hold-out metrics.

    The data is split into train and test parts with a shuffled split
    stratified on ``y``. The pipeline is fitted on the training part, and the
    accuracy (as a percentage) and the classification report for the test
    part are printed.

    Parameters
    ----------
    model : estimator
        Classifier used as the final pipeline step; it is fitted as part of
        the pipeline.
    X : sequence of str
        Text documents fed to ``CountVectorizer``.
    y : sequence
        Class labels aligned with ``X``.
    test_size : float, default 0.25
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    None
        Results are printed rather than returned.
    """
    # Function-scope import: keeps this block self-contained without touching
    # the import lines elsewhere in the notebook.
    from sklearn.metrics import accuracy_score

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=True, stratify=y
    )

    pipeline_model = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', model),
    ])
    pipeline_model.fit(x_train, y_train)

    # Predict the test set once and reuse the result for both metrics;
    # calling pipeline_model.score would run a second prediction pass.
    y_pred = pipeline_model.predict(x_test)
    print('Accuracy:', accuracy_score(y_test, y_pred) * 100)
    print(classification_report(y_test, y_pred))

In [60]:
from sklearn.linear_model import LogisticRegression
# Evaluate logistic regression, constructed with its default settings.
model = LogisticRegression()
classify(model, X, y)

Accuracy: 96.26704953338118
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1207
        spam       0.99      0.73      0.84       186

    accuracy                           0.96      1393
   macro avg       0.98      0.86      0.91      1393
weighted avg       0.96      0.96      0.96      1393



In [61]:
from sklearn.naive_bayes import MultinomialNB
# Evaluate multinomial naive Bayes, constructed with its default settings.
model = MultinomialNB()
classify(model, X, y)

Accuracy: 96.19526202440775
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1207
        spam       1.00      0.72      0.83       186

    accuracy                           0.96      1393
   macro avg       0.98      0.86      0.91      1393
weighted avg       0.96      0.96      0.96      1393



In [62]:
from sklearn.svm import SVC
# Evaluate a support vector classifier with the regularization parameter C set to 3.
model = SVC(C=3)
classify(model, X, y)

Accuracy: 98.20531227566404
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1207
        spam       1.00      0.87      0.93       186

    accuracy                           0.98      1393
   macro avg       0.99      0.93      0.96      1393
weighted avg       0.98      0.98      0.98      1393



In [63]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the printed metrics can be reproduced on a re-run; an
# unseeded forest is built differently each time. 42 matches the seed that
# classify uses for its train/test split.
model = RandomForestClassifier(random_state=42)
classify(model, X, y)

Accuracy: 97.48743718592965
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      1207
        spam       1.00      0.81      0.90       186

    accuracy                           0.97      1393
   macro avg       0.99      0.91      0.94      1393
weighted avg       0.98      0.97      0.97      1393

