**Loading Libraries**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

**Loading Dataset**



In [None]:
from google.colab import drive

# Mount Google Drive so files stored there are reachable from this runtime.
drive.mount("/content/drive")

# Read the mail CSV from the mounted drive into a DataFrame.
data = pd.read_csv(filepath_or_buffer="/content/drive/My Drive/data/mail_data.csv")

Mounted at /content/drive


In [None]:
# Display the loaded DataFrame; the rendered output is truncated to the first and last rows.
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# Number of messages in each category.
group = data.groupby("Category").size()
print(group)

# The same counts expressed as a percentage of all rows (shown as the cell output).
group / len(data) * 100

Category
ham     4825
spam     747
dtype: int64


Category
ham     86.593683
spam    13.406317
dtype: float64

**Missing Values**

*`The results below show that the dataset has no missing values.`*

In [None]:
# Count the missing entries in each column.
data.isna().sum()

Category    0
Message     0
dtype: int64

In [None]:
# (rows, columns) of the loaded DataFrame.
data.shape

(5572, 2)

**Classification**

1.   Splitting the Dataset
2.   Create the Model
3.   Evaluating the Model



***`1. Splitting the Dataset`***

In [None]:
# X holds the message text (model input); Y holds the category label (target).
X, Y = data["Message"], data["Category"]

In [None]:
# Hold out 20% of the messages for testing.
# - random_state pins the shuffle so the split, and every score computed from
#   it, is repeatable after a kernel restart.
# - stratify=Y keeps the ham/spam proportions the same in the train and test
#   subsets, which matters because the classes are imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [None]:
# Transform the message text into TF-IDF vectors that can be used as model input.
# `lowercase` must be a real boolean: the string "True" only worked by being
# truthy, and recent scikit-learn releases validate constructor parameters and
# reject a string here.
feature_exatraction = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)

# Learn the vocabulary and IDF weights from the training messages only, then
# apply that same fitted transformation to the test messages.
x_train_features = feature_exatraction.fit_transform(x_train)
x_test_features = feature_exatraction.transform(x_test)

***2. Create the Model***


In [None]:
# Create a logistic-regression classifier with default settings and fit it on
# the training features; the fitted estimator is displayed as the cell output.
model = LogisticRegression()
model.fit(X=x_train_features, y=y_train)

LogisticRegression()

***3. Evaluating the Model***

In [None]:
# Accuracy on the same data the model was fitted on.
prediction_on_training = model.predict(X=x_train_features)
acc_train = accuracy_score(y_true=y_train, y_pred=prediction_on_training)
print(acc_train)

0.9690374691496523


In [None]:
# Accuracy on the held-out test messages.
prediction_on_test = model.predict(X=x_test_features)
acc_test = accuracy_score(y_true=y_test, y_pred=prediction_on_test)
print(acc_test)

0.9605381165919282


In [None]:
# Classify one example message: vectorise it with the already-fitted
# TF-IDF transformer, then ask the model for its label.
input_mail = ["U dun say so early hor... U c already then say..."]
input_feature = feature_exatraction.transform(raw_documents=input_mail)
prediction = model.predict(X=input_feature)
print(prediction)

['ham']
