In [None]:
import numpy as np
import pandas as pd

Data exploration & cleaning

In [None]:
# Read the raw SMS dataset (decoded as latin-1) and preview the first rows.
data = pd.read_csv(filepath_or_buffer="spam.csv", encoding="latin-1")
data.head()

In [None]:
# Count the missing values in every column.
data.isna().sum()

In [None]:
# List the distinct values present in the "Unnamed: 2" column.
data["Unnamed: 2"].unique()


In [None]:
# List the distinct values present in the "Unnamed: 3" column.
data["Unnamed: 3"].unique()

In [None]:
# The columns after the first two (Unnamed: 2, 3, 4) are mostly empty, but for
# some rows part of the SMS text seems to have overflowed into them.
# Append those fragments to the message in "v2", then drop the extra columns.

overflow_cols = data.columns[2:]
overflow_rows = data[overflow_cols].notna().any(axis=1)

for index, row in data.loc[overflow_rows, overflow_cols].iterrows():
    # Keep only the populated fragments of this row, in column order.
    overflow_sms = row.dropna().astype(str).tolist()
    # Join with a space so the last word of v2 and the first overflow word
    # are not glued together into a single token.
    data.at[index, "v2"] = " ".join([data.at[index, "v2"], *overflow_sms])

data.drop(columns=overflow_cols, inplace=True)

data.head()

Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Replace the ham/spam labels in "v1" with integer codes.
encoder = LabelEncoder()
data["v1"] = encoder.fit_transform(data["v1"])
# Lookup table from each integer code back to the label it stands for.
class_mappings = dict(enumerate(encoder.classes_))
class_mappings

Tokenization & stemming & parsing

In [None]:
#NLP libraries 
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
# Show the dtype of every column.
print(data.dtypes)

In [None]:
# Helpers for cutting non-alphabetic characters, tokenizing and stemming.
# The stemmer and the regex are built once here instead of once per message.
_STEMMER = PorterStemmer()
_NON_ALPHA = re.compile(r'[^a-zA-Z\s]')


def processSms(text):
    """Lower-case an SMS, drop non-alphabetic characters, tokenize and stem it.

    Parameters
    ----------
    text : str
        Raw message text. A missing value (None/NaN) is treated as an empty
        message; any other non-string value is converted with ``str``.

    Returns
    -------
    list of str
        The Porter-stemmed tokens, in their original order.
    """
    if not isinstance(text, str):
        # A NaN cell would otherwise fail on .lower().
        text = "" if pd.isna(text) else str(text)
    text = _NON_ALPHA.sub('', text.lower())
    return [_STEMMER.stem(word) for word in word_tokenize(text)]


data["processed_sms"] = data["v2"].apply(processSms)

data.head()

In [None]:

# Collapse each token list back into one space-separated string.
# Rows that are already strings are left untouched, so re-running this cell
# does not break previously joined text apart into single characters.
data['processed_sms'] = data['processed_sms'].apply(
    lambda x: x if isinstance(x, str) else ' '.join(map(str, x))
)


In [None]:
# Find out whether any links or e-mail addresses are shared in the SMS texts.
def detect_links_emails(text):
    """Return every URL and e-mail address found in ``text``.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(links, emails)``; either list is empty when nothing matched.
    """
    # Regular expressions for links and email addresses
    link_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # Inside a character class '|' is a literal character, not alternation, so
    # the top-level-domain class is written [A-Za-z] to accept letters only.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

    links = re.findall(link_pattern, text)
    emails = re.findall(email_pattern, text)

    return links, emails

# Scan the raw messages in v2: processed_sms has already had every
# non-alphabetic character removed, so "://", "@" and "." can no longer match there.
links_emails = data['v2'].apply(detect_links_emails)


# Each entry is a (links, emails) tuple, which is truthy even when both lists
# are empty, so the lists themselves have to be tested.
any_links_emails = any(bool(links or emails) for links, emails in links_emails)
print("Any links or email addresses in the 'v2' column:", any_links_emails)


In [None]:
# Mask links and e-mail addresses as "httpaddr" and "mailaddr".
def mask_links_emails(text):
    """Replace every URL in ``text`` with ``httpaddr`` and every e-mail address with ``mailaddr``.

    Parameters
    ----------
    text : str
        Text to mask.

    Returns
    -------
    str
        The text with links masked first and e-mail addresses second.
    """
    # Regular expressions for links and email addresses
    link_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # Inside a character class '|' is a literal character, not alternation, so
    # the top-level-domain class is written [A-Za-z] to accept letters only.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

    text = re.sub(link_pattern, 'httpaddr', text)
    text = re.sub(email_pattern, 'mailaddr', text)

    return text
# Mask the raw text in v2 and then redo the cleaning. processSms removes the
# punctuation the link/e-mail patterns rely on, so masking text that has already
# been processed cannot match anything. The placeholders are purely alphabetic,
# so they pass the non-alphabetic filter.
# NOTE(review): confirm the stemmer leaves "httpaddr"/"mailaddr" unchanged.
data["processed_sms"] = (
    data["v2"]
    .apply(mask_links_emails)
    .apply(processSms)
    .apply(lambda tokens: ' '.join(tokens))
)


In [None]:
# Turn a sequence of tokens into a presence/absence vector over the vocabulary.
def getFeatureVector(tokens, vocab):
    """Build a 0/1 vector marking which vocabulary entries occur in ``tokens``.

    Parameters
    ----------
    tokens : iterable
        Items to look up; anything not in ``vocab`` is ignored.
    vocab : dict
        Maps each known token to its position in the vector.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(vocab)`` with 1 at the position of every
        token that was found and 0 elsewhere.
    """
    known_positions = [vocab[tok] for tok in tokens if tok in vocab]
    feature_vec = np.zeros(len(vocab))
    # An explicit integer dtype keeps an empty position list valid as an index.
    feature_vec[np.array(known_positions, dtype=np.intp)] = 1
    return feature_vec
# Build the vocabulary from whole words.
# processed_sms holds space-joined strings at this point, so iterating a row
# directly would yield single characters; split each row back into tokens first.
# Rows that are still token lists are used as they are.
token_lists = [
    sms.split() if isinstance(sms, str) else list(sms)
    for sms in data['processed_sms']
]
all_words = [word for tokens in token_lists for word in tokens]
# Sorted so that every word gets the same column index on every run.
unique_words = sorted(set(all_words))
vocab = {word: index for index, word in enumerate(unique_words)}

# Prepare X and y data: one row per message, one column per vocabulary word.
X = np.array([getFeatureVector(tokens, vocab) for tokens in token_lists])
y = data["v1"]


Train/test split

In [None]:
from sklearn.model_selection import train_test_split

# test_size is the fraction held out for evaluation; 0.8 would leave only 20%
# of the messages for training, so hold out 20% instead.
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
#Modeling - Training
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

# Train a linear-kernel SVM
svm = SVC(kernel="linear")
svm.fit(X_train, y_train)

# Predict once and reuse the predictions for both metrics; calling svm.score
# would run the prediction over the test set a second time.
y_pred = svm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

f1_score(y_test, y_pred)