In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [None]:
# Location of the labelled mail dataset. This is a Google Drive path as seen
# from Colab -- presumably Drive is mounted before this cell runs; confirm.
MAIL_DATA_CSV = '/content/drive/MyDrive/Colab Notebooks/mail_data.csv'

df = pd.read_csv(MAIL_DATA_CSV)
df.head()

In [None]:
# Dataset dimensions as (rows, columns).
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

**Label Encoding**

In [None]:
# Encoder that maps each distinct class label to an integer code.
label = LabelEncoder()

In [None]:
# Fit the encoder on the Category column and get the integer code for every row.
# NOTE(review): the name is a misspelling of "labels"; it is left as-is because
# the following cell reads it under this spelling.
lables = label.fit_transform(df['Category'])

In [None]:
# Store the encoded labels in a new 'target' column next to the original data.
df['target'] = lables

In [None]:
# Preview the frame, now including the 'target' column.
df.head()

**Text Preprocessing and Feature Extraction**

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
# Fetch the NLTK stopwords corpus that stemming() reads its English stopwords from.
nltk.download('stopwords')

In [None]:
# Single Porter stemmer instance shared by every stemming() call.
port_stem = PorterStemmer()

In [None]:
# Compiled once: matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile('[^a-zA-Z]')

# Lazily-built set of English stopwords (filled on the first stemming() call).
_stopword_set = None


def _english_stopwords():
  """Return the English stopwords as a frozenset, building it only once."""
  global _stopword_set
  if _stopword_set is None:
    _stopword_set = frozenset(stopwords.words('english'))
  return _stopword_set


def stemming(content):
  """Normalise a message into a space-joined string of stemmed tokens.

  Steps: replace every character that is not an ASCII letter with a space
  (this drops digits as well as punctuation), lower-case the text, split it on
  whitespace, discard English stopwords, and Porter-stem what remains.

  Args:
    content: Raw message text (str).

  Returns:
    The surviving stems joined by single spaces; an empty string when no
    token survives.
  """
  # The stopword list used to be rebuilt by stopwords.words('english') and
  # scanned linearly for every single token; a cached set gives the same
  # membership answers with a constant-time lookup.
  stop_set = _english_stopwords()
  words = _NON_LETTERS.sub(' ', content).lower().split()
  return ' '.join(port_stem.stem(word) for word in words if word not in stop_set)

In [None]:
# Clean and stem every message. This overwrites the raw 'Message' column, so
# re-running the cell stems text that has already been stemmed.
df['Message'] = df['Message'].apply(stemming)

In [None]:
# Model inputs: the preprocessed message texts (X) and their encoded labels (Y).
X = df['Message'].values
Y = df['target'].values

In [None]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

In [None]:
X = vectorizer.transform(X)

In [None]:
X_train , X_test , Y_train , Y_test = train_test_split(X,Y,test_size=0.2,random_state=3)

In [None]:
# Logistic-regression classifier constructed with scikit-learn's default settings.
model = LogisticRegression()

In [None]:
# Train the classifier on the training split.
model.fit(X_train,Y_train)

In [None]:
# Accuracy on the training data.
# accuracy_score is documented as (y_true, y_pred), so the ground-truth labels
# go first. The value is the same either way because accuracy is symmetric.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the fraction of training messages the model classifies correctly.
print('Accuracy on training data : ',training_data_accuracy)

In [None]:
# Accuracy on the held-out test data.
# accuracy_score is documented as (y_true, y_pred), so the ground-truth labels
# go first. The value is the same either way because accuracy is symmetric.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the fraction of held-out test messages the model classifies correctly.
print('Accuracy on test data : ',test_data_accuracy)

**Predictive System**

In [None]:
input_mail = ["Subject: Meeting Rescheduled to 3 PM Today Hi team,Just a quick note that today’s project sync has been moved from 2 PM to 3 PM to accommodate John's availability.Let me know if you have any conflicts.Thanks,Rachel"]

# The vectorizer was fitted on messages that had been passed through
# stemming(); apply the same cleaning here so the tokens line up with the
# fitted vocabulary instead of being silently dropped as unknown words.
input_clean = [stemming(mail) for mail in input_mail]
input_feature = vectorizer.transform(input_clean)

prediction = model.predict(input_feature)

# LabelEncoder numbers classes in sorted order, so 1 means spam provided the
# Category column holds exactly 'ham' and 'spam' -- TODO confirm against the data.
if prediction[0] == 1:
  print('spam Mail')
else:
  print('ham Mail')