In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import nltk
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the SMS dataset from disk, decoding the file as Latin-1, and preview it.
csv_path = "spam.csv"
data = pd.read_csv(csv_path, encoding="latin1")
data.head()

In [None]:
# Tidy the raw frame in one chain: discard the three "Unnamed" columns, then
# rename v1 -> label and v2 -> text.
data = (
    data
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
    .rename(columns={"v1": "label", "v2": "text"})
)
data.head()

In [None]:
# Show the last five rows (the default for tail). With the default integer
# index, the final index label gives a quick read on how many rows were loaded.
data.tail()

In [None]:
# How many messages carry each label value?
data["label"].value_counts()

In [None]:
# Summary statistics for the frame's columns; for the text-valued columns
# present at this point pandas reports count / unique / top / freq.
data.describe()

In [None]:
# Derive a per-message character count and store it as a new "length" column.
message_lengths = data["text"].apply(len)
data["length"] = message_lengths
data.head()

In [None]:
# Distribution of message lengths across all messages (25 bins).
data["length"].plot.hist(bins=25, figsize=(5, 4))

In [None]:
# One histogram of message length per label value (25 bins each), so the
# length distributions of the two classes can be compared side by side.
data.hist(column='length', by='label', bins=25, figsize=(8,4))

In [None]:
# Encode the labels numerically: "ham" -> 0, "spam" -> 1.
label_codes = {"ham": 0, "spam": 1}

# Replace the whole column instead of writing through `data.loc[:, 'label']`:
# recent pandas versions perform the `.loc[:, col] = ...` assignment in place,
# which keeps the column's original object dtype, and scikit-learn may reject
# an object-dtype target as an unknown label type.
# Values that are already 0/1 pass through unchanged so the cell can be re-run
# safely; any unexpected label makes `astype(int)` raise instead of silently
# turning into NaN.
data["label"] = (
    data["label"]
    .map(lambda value: label_codes.get(value, value))
    .astype(int)
)
data.head()

In [None]:
# Turn every message into bag-of-words counts, then split the features and
# labels 80/20 into training and test sets (fixed seed for reproducibility).
count=CountVectorizer()

# Only the documents are passed: the second positional parameter of
# fit_transform is `y`, which CountVectorizer ignores, so passing extra text
# there never adds it to the matrix. New messages must be vectorized later
# with count.transform(...).
# NOTE(review): the vocabulary is learned from all rows, including those that
# end up in the test split; consider fitting on the training texts only.
text=count.fit_transform(data['text'])

x_train, x_test, y_train, y_test= train_test_split(text,data['label'], test_size=0.20, random_state=1)
text

In [None]:
# Report the (rows, features) shape of the training and test feature matrices.
print(x_train.shape)
print(x_test.shape)

In [None]:
# Create a logistic-regression classifier with default settings and fit it on
# the training split; fit() returns the estimator itself, which is displayed.
model = LogisticRegression().fit(x_train, y_train)
model

In [None]:
# Predict a label for every row of the held-out test features and print the
# resulting array; `prediction` is consumed by the metrics cell.
prediction=model.predict(x_test)
print(prediction)

In [None]:
# Evaluate the test-set predictions with four standard classification metrics,
# printing one line per metric.
for template, metric in (
    ("Accuracy score: {}", accuracy_score),
    ("precision score: {}", precision_score),
    ("recall score: {}", recall_score),
    ("f1 score: {}", f1_score),
):
    print(template.format(metric(y_test, prediction)))

In [None]:
# Take an independent copy of the message column so that new messages can be
# appended to it without touching the DataFrame's own "text" column.
result = data['text'].copy()
result

In [None]:
# Append a new message to the series of texts.
# The label used is the DataFrame's row count (one past the last label of its
# default integer index), so re-running this cell overwrites the same entry
# instead of adding yet another row each time.
result.loc[len(data.index)]="Congratulations! You've been selected for a special promotion. Claim your prize now."
result

In [None]:
# Vectorize the texts with the ALREADY FITTED vectorizer.
# transform() (not fit_transform()) keeps the vocabulary and column order the
# model was trained on; refitting here could change the feature columns, which
# would make the model's predictions meaningless or fail outright, and would
# also alter `count` for every later cell that uses it.
text=count.transform(result)
text


In [None]:
# Predict the label of the most recently appended message, i.e. the last row
# of the feature matrix. Slicing with [-1:] avoids a hard-coded row number and
# always yields a 2-D, single-row matrix as predict() expects.
model.predict(text[-1:])

In [None]:
# Classify a single new message with the trained model (expected: not spam).
# The result is stored under its own name so the test-set `prediction` array
# used by the metrics cell is not overwritten.
input_text = ["Reminder: Your appointment with Dr. Smith is scheduled for tomorrow at 3 PM."]
input_transformed = count.transform(input_text)
sample_prediction = model.predict(input_transformed)
print(sample_prediction)

In [None]:
# Classify a single new message with the trained model (expected: spam).
# The result is stored under its own name so the test-set `prediction` array
# used by the metrics cell is not overwritten.
input_text = ["Congratulations! You've won a free cruise. Call now to claim your prize."]
input_transformed = count.transform(input_text)
sample_prediction = model.predict(input_transformed)
print(sample_prediction)