In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [9]:
# Load the raw message dataset; the path is relative, so the CSV must sit in
# the notebook's working directory.
df = pd.read_csv('spam_detection.csv')

In [10]:
# Peek at the first five rows of the freshly loaded frame to see which
# columns the raw file provides.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [12]:
# Discard the three unnamed columns, which are empty in the preview.
# `errors='ignore'` keeps this cell safe to re-run: once the columns are gone
# a plain drop would raise KeyError. Reassigning instead of `inplace=True`
# follows the pandas recommendation and makes the data lineage explicit.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [13]:
# Give the two remaining columns descriptive names.
df.rename(
    columns={
        'v1': 'category',
        'v2': 'text',
    },
    inplace=True,
)

In [14]:
# Confirm that only the renamed `category` and `text` columns remain.
df.head(n=5)

Unnamed: 0,category,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
# Missing-value count per column (summing the boolean NA mask down the rows).
df.isna().sum(axis=0)

category    0
text        0
dtype: int64

In [16]:
# Number of non-missing entries in each column.
df.count(axis=0)

category    5572
text        5572
dtype: int64

In [17]:
# Encode the labels numerically: 'ham' -> 0, anything else (i.e. spam) -> 1.
#
# The guard makes the cell idempotent. Without it, re-running the cell would
# compare the already-encoded 0/1 values against 'ham', find no match, and
# silently relabel every message as spam (1).
# Bracket assignment is used because attribute-style assignment only works
# for columns that already exist and is easy to get wrong.
if not df['category'].isin([0, 1]).all():
    df['category'] = (df['category'] != 'ham').astype(int)

In [18]:
# Check that `category` now holds the numeric 0/1 labels.
df.head(n=5)

Unnamed: 0,category,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
# Hold out 25% of the messages for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['category'],
    test_size=0.25,
    random_state=41,
)

In [20]:
# TF-IDF text vectorizer: English stop words are removed and a term only
# needs to appear in a single document (min_df=1) to enter the vocabulary.
feature_extraction = TfidfVectorizer(
    stop_words='english',
    min_df=1,
)

In [29]:
# Fit the vectorizer on the training texts only and transform them in one step.
X_train_features = feature_extraction.fit_transform(X_train)
# Reuse the already-fitted vectorizer for the test texts (transform, not
# fit_transform), so the held-out messages never influence the learned features.
X_test_features = feature_extraction.transform(X_test)

In [22]:
# Cast both label vectors to an integer dtype before handing them to the model.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [23]:
# Logistic-regression classifier; no arguments are passed, so scikit-learn's
# default hyperparameters are used.
model = LogisticRegression()

In [24]:
# Train the classifier on the TF-IDF features of the training messages.
model.fit(X=X_train_features, y=y_train)

LogisticRegression()

In [27]:
# Accuracy on the data the model was trained on (an optimistic upper bound).
train_pred = model.predict(X_train_features)
train_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)
print("training accuracy is: ", train_accuracy)

training accuracy is:  0.9703278296243121


In [30]:
# Accuracy on the held-out test messages.
pred = model.predict(X_test_features)
test_accuracy = accuracy_score(y_true=y_test, y_pred=pred)
print("test accuracy is: ", test_accuracy)

test accuracy is:  0.9533381191672649


In [34]:
# Classify one or more new messages with the trained pipeline.
input_mail = ["Are you willing to go for aptitude class"]
# Use the vectorizer fitted on the training data so the features line up
# with what the model was trained on.
input_mail_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_mail_features)
# `prediction` is an array with one label per message. Testing the whole array
# in an `if` only works for exactly one message and raises ValueError as soon
# as `input_mail` holds several, so report each label individually.
for label in prediction:
    if label == 0:
        print("this is ham mail")
    else:
        print("this is spam mail")

this is ham mail
