In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset into a DataFrame; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='spam.csv')

In [3]:
# Sanity check: confirm that read_csv produced a DataFrame.
type(df)

pandas.core.frame.DataFrame

## Data Cleaning and Preprocessing

In [4]:
# Keep only the label and message columns, under descriptive names.
# Selecting them explicitly (instead of dropping every column that happens to
# contain a NaN) means a missing value in the label or text can never remove
# the column itself; incomplete rows are dropped instead. Renaming before
# selecting keeps the cell safe to re-run: on a second run the rename is a no-op.
df = (
    df.rename(columns={'v1': 'spam', 'v2': 'text'})
    .loc[:, ['spam', 'text']]
    .dropna()
)
df.head()

Unnamed: 0,spam,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Label encoding: 1 = spam, 0 = everything else (ham).
# Both the raw string 'spam' and an already-encoded 1 count as spam, so
# re-running this cell leaves the labels unchanged. Comparing only against
# 'spam' would reset every label to 0 on a second run, because the column
# then holds integers rather than strings.
df['spam'] = ((df['spam'] == 'spam') | (df['spam'] == 1)).astype(int)

In [6]:
# Preview the first five rows to confirm the labels are now numeric.
df.head(n=5)

Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Inputs are the raw message strings; targets are the encoded spam labels.
x, y = df['text'], df['spam']

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=45,
    test_size=0.25,
)


## Feature Extraction

Transform the text data into numerical feature vectors so that machine learning models can be applied to it.

In [8]:
# TF-IDF features over lowercased text with English stop words removed.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Fit on the training messages only, then apply the already-fitted
# vectorizer to the test messages (transform, not fit_transform).
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

# Make sure both label vectors are integer-typed.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

## Training the Model

In [9]:
# Logistic-regression classifier, constructed without overriding any settings.
model = LogisticRegression()

In [10]:
# Train the classifier on the vectorized training messages and their labels.
model.fit(X=x_train_vec, y=y_train)

## Evaluating the Model

In [11]:
# Accuracy on the data the model was fitted on.
prediction_train = model.predict(X=x_train_vec)
accuracy_train = accuracy_score(y_true=y_train, y_pred=prediction_train)


In [12]:
# Show the accuracy measured on the training split.
print(accuracy_train)

0.9683983720373474


In [13]:
# Accuracy on the held-out test split.
prediction_test = model.predict(X=x_test_vec)
accuracy_test = accuracy_score(y_true=y_test, y_pred=prediction_test)


In [14]:
# Show the accuracy measured on the held-out test split.
print(accuracy_test)

0.9641062455132807


In [15]:
# Cross-check with the estimator's own score method; it gives the same
# figure as the accuracy_score result printed for the test split.
model.score(X=x_test_vec, y=y_test)

0.9641062455132807

### F1 Score: ###

In [16]:
# Per-class F1 on the test split. average=None returns one score per label in
# sorted label order, i.e. [ham (0), spam (1)]. Reuses prediction_test from the
# test-accuracy cell rather than running model.predict a second time.
f1_score(y_test, prediction_test, average=None)

array([0.97955846, 0.85294118])

## Predictive System

In [24]:
# Classify new, unseen messages with the fitted vectorizer and model.
input_mail = ["Please don't text me anymore. I have nothing else to say."]
# Use transform (not fit_transform) so the vectorizer fitted on the training
# data is reused unchanged.
input_vec = vectorizer.transform(input_mail)

prediction = model.predict(input_vec)

# Print one verdict per message, in input order (label 1 = spam), so that
# additional entries in input_mail are reported rather than silently ignored.
for label in prediction:
    print("Spam" if label == 1 else "Ham")

Ham
