In [2]:
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [3]:
# Load the SMS spam collection into a DataFrame (relative path to the CSV file)
csv_path = '../csv/SMSSpamCollection.csv'
sms_data = pd.read_csv(csv_path)

In [4]:
# Define a function that takes the message string as input and does the following:
# 1. Convert all characters to lower case
# 2. Remove all punctuation ("string.punctuation" contains a list of punctuations)
# 3. Remove all digits ("string.digits" contains a list of numbers)
# 4. Returns a string of the processed text
def text_process(message):
    """Lower-case ``message`` and strip punctuation and digits from it.

    Characters found in ``string.punctuation`` or ``string.digits`` are
    dropped; every remaining character (whitespace included) is kept in its
    original order and lower-cased. The result is returned as one string.
    """
    kept_chars = []
    for char in message:
        # Skip anything that is a punctuation mark or a digit
        if char in string.punctuation or char in string.digits:
            continue
        kept_chars.append(char.lower())
    return ''.join(kept_chars)

In [5]:
# Clean every message in the SMS column with text_process and store the
# cleaned text in a new column next to the raw text
processed_messages = sms_data['SMS'].apply(text_process)
sms_data['SMS_processed'] = processed_messages

In [6]:
# Target variable: the label column of the dataset
y = sms_data['Label']
# Bag-of-words vectorizer that drops English stop words
vectorizer = CountVectorizer(stop_words="english")
# Learn the vocabulary from the processed SMS column and build the feature matrix X
# NOTE(review): the vocabulary is learned from all rows, i.e. before the data
# is split into training and testing sets.
processed_text = sms_data['SMS_processed']
X = vectorizer.fit_transform(processed_text)

In [7]:
# Hold out 25% of the samples for testing; stratify=y keeps the label
# proportions the same in both splits, random_state=0 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

In [8]:
# Instantiate an SVC classifier with default hyper-parameters and fit it on the training data.
# NOTE(review): this rebinds `vectorizer`, which until now held the fitted
# CountVectorizer, so the text vectorizer is no longer reachable by that name
# after this cell; a dedicated name for the classifier would avoid the clash.
vectorizer = SVC()
vectorizer.fit(X_train, y_train)


In [9]:
# Calculate the accuracy of the fitted classifier on the held-out test split
# (`vectorizer` is the fitted SVC at this point, not the CountVectorizer)
accuracy = vectorizer.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9634146341463414


In [10]:
# Calculate the specificity (True Negative Rate)
def specificity(model, X_test, y_test, negative_label='spam'):
    """Return the specificity (true negative rate) of ``model`` on a test set.

    The negative class is ``negative_label`` ('spam' by default), so the
    result is the fraction of samples whose actual label is the negative
    label that the model also predicts as the negative label.

    Parameters
    ----------
    model : object
        Anything with a ``predict(X)`` method returning one label per sample.
    X_test : object
        Features, passed unchanged to ``model.predict``.
    y_test : iterable
        Actual labels, in the same order as the predictions.
    negative_label : object, default 'spam'
        Label treated as the negative class.

    Returns
    -------
    float
        true negatives / actual negatives, or ``nan`` when ``y_test``
        contains no negative sample (instead of raising ZeroDivisionError).

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of labels.
    """
    # Calculate the predictions of the model and materialise both label
    # sequences as plain lists, so they are compared strictly by position
    # regardless of any index y_test may carry.
    y_pred = list(model.predict(X_test))
    y_true = list(y_test)
    if len(y_pred) != len(y_true):
        raise ValueError(
            "model.predict returned %d labels but y_test has %d"
            % (len(y_pred), len(y_true))
        )

    # Count the actual negatives and those of them predicted as negative
    actual_negative = sum(1 for actual in y_true if actual == negative_label)
    true_negative = sum(
        1
        for actual, predicted in zip(y_true, y_pred)
        if actual == negative_label and predicted == negative_label
    )

    # Specificity is undefined when there is no negative sample at all
    if actual_negative == 0:
        return float("nan")
    # Divide the numbers to get the specificity and return the value
    return true_negative / actual_negative

# Specificity on the test split of the classifier currently bound to `vectorizer`
specificity_ = specificity(vectorizer, X_test, y_test)
print("Specificity:", specificity_)


Specificity: 0.7272727272727273


In [11]:
# Dictionary mapping each tried combination (kernel, C, gamma) to its accuracy
params = {}
# Using for loops, find the combination of parameters that leads to the maximum accuracy.
# NOTE(review): every combination is scored on the test split, so the best
# score is an optimistic estimate; consider selecting the parameters with
# cross-validation on the training data instead.
# `candidate` and `score` are loop-local names, so the classifier and the
# `accuracy` computed in the earlier cells are not overwritten by the search.
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    for C in [0.1, 1, 10, 100]:
        score = None
        for gamma in [0.1, 1, 10, 100]:
            # The linear kernel does not use gamma, so its score is the same
            # for every gamma: fit it once per C and reuse the result.
            if kernel != 'linear' or score is None:
                candidate = SVC(kernel=kernel, C=C, gamma=gamma)
                candidate.fit(X_train, y_train)
                score = candidate.score(X_test, y_test)
            params[(kernel, C, gamma)] = score
# Print your results (the first combination reaching the highest score wins ties)
best_params, best_score = max(params.items(), key=lambda item: item[1])
print("Highest accuracy score: ", best_score)
print("Parameters used: ", best_params)

Highest accuracy score:  0.9777618364418939
Parameters used:  ('linear', 1, 0.1)


In [12]:
# Dictionary mapping each tried combination (kernel, C, gamma) to its specificity
params = {}

# Using for loops, find the combination of parameters that leads to the maximum specificity.
# NOTE(review): every combination is scored on the test split, so the best
# score is an optimistic estimate; consider selecting the parameters with
# cross-validation on the training data instead.
# `candidate` and `score` are loop-local names, so the classifier and the
# `specificity_` computed in the earlier cells are not overwritten by the search.
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    for C in [0.1, 1, 10, 100]:
        score = None
        for gamma in [0.1, 1, 10, 100]:
            # The linear kernel does not use gamma, so its score is the same
            # for every gamma: fit it once per C and reuse the result.
            if kernel != 'linear' or score is None:
                candidate = SVC(kernel=kernel, C=C, gamma=gamma)
                candidate.fit(X_train, y_train)
                score = specificity(candidate, X_test, y_test)
            params[(kernel, C, gamma)] = score

# Print your results (the first combination reaching the highest score wins ties)
best_params, best_score = max(params.items(), key=lambda item: item[1])
print("Highest specificity score: ", best_score)
print("Parameters used: ", best_params)

Highest specificity score:  0.839572192513369
Parameters used:  ('linear', 1, 0.1)
