In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the dataset

In [4]:
# Read the CSV into a DataFrame; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="email-spam.csv")

# Handling missing values

In [6]:
# A row without a label cannot be used for supervised training. Filling a
# missing 'Category' with " " would leave a value that the 'spam'/'ham'
# mapping does not recognise (it becomes NaN and breaks the later integer
# conversion), so such rows are dropped instead.
# Missing values in the remaining columns are still replaced with a single
# space, exactly as before. The result is reassigned rather than mutated
# with inplace=True.
df = df.dropna(subset=['Category']).fillna(" ")

# Mapping 'spam' to 0 and 'ham' to 1

In [7]:
# Encode the text labels as integers: 'spam' -> 0, 'ham' -> 1.
label_map = {'spam': 0, 'ham': 1}

# Series.map turns every value that is not a key of the dict into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# wipe out all labels. Only map while the column is not yet encoded.
if not df['Category'].isin(list(label_map.values())).all():
    encoded = df['Category'].map(label_map)
    # Fail here, naming the offending values, instead of letting NaN labels
    # surface later as an opaque integer-conversion error.
    if encoded.isna().any():
        unexpected = df.loc[encoded.isna(), 'Category'].unique().tolist()
        raise ValueError(f"Unexpected values in 'Category': {unexpected}")
    df['Category'] = encoded

# Separate the data into features (x) and target (y)

In [8]:
# Features are the raw message text; the target is the 'Category' column.
x, y = df['Message'], df['Category']

# Split the data into training and testing sets

In [9]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible between runs.
(x_train, x_test,
 y_train, y_test) = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

# Feature extraction using TF-IDF

In [10]:
# TF-IDF vectorizer: lower-cases the text, removes English stop words and
# keeps every term that occurs in at least one document (min_df=1).
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vectorizer.
x_train_feature = feature_extraction.fit_transform(x_train)
x_test_feature = feature_extraction.transform(x_test)

# Convert labels to integers

In [11]:
# Cast both label Series to an integer dtype in one step.
y_train, y_test = (labels.astype('int') for labels in (y_train, y_test))

# Build and train the Logistic Regression model

In [12]:
# Logistic Regression with scikit-learn's default settings, fitted on the
# TF-IDF features of the training messages.
model = LogisticRegression()
model.fit(X=x_train_feature, y=y_train)

# Evaluate the model on the training and testing data

In [14]:
# Predict once per split so the predictions can be inspected or reused.
train_predictions = model.predict(x_train_feature)
test_predictions = model.predict(x_test_feature)

# Fraction of correctly classified messages on each split.
accuracy_on_training_data = accuracy_score(y_train, train_predictions)
accuracy_on_test_data = accuracy_score(y_test, test_predictions)

# Report both scores: a large gap between them indicates overfitting.
# Previously the values were computed but never shown.
print(f"Accuracy on training data: {accuracy_on_training_data:.4f}")
print(f"Accuracy on test data:     {accuracy_on_test_data:.4f}")

# Example for classifying a new email

In [15]:
# A single example message, wrapped in a list because the vectorizer expects
# an iterable of documents.
input_your_mail = [
    "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press to copy your friends Callertune"
]

# Reuse the vectorizer fitted on the training data, then classify.
input_data_features = feature_extraction.transform(input_your_mail)
prediction = model.predict(input_data_features)

# Label 1 stands for 'ham' (a normal mail); anything else is reported as spam.
print('Normal Mail' if prediction[0] == 1 else 'Spam Mail')


Normal Mail
