# Importing Required Libraries

In [1]:
import pandas as pd
import charset_normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Initial attempt at loading the data, left commented out; the next cell notes
# that importing the file directly caused problems.
# df = pd.read_csv('_6_1_spam.csv')
# df

In [3]:
# Importing the CSV directly caused problems, so detect the file's character
# encoding from a sample of its raw bytes first.
with open('_6_1_spam.csv', mode='rb') as raw_file:
    sample_bytes = raw_file.read(1_000_000)  # read at most 1,000,000 bytes
result = charset_normalizer.detect(sample_bytes)
print(result)

{'encoding': 'Windows-1252', 'language': 'English', 'confidence': 0.9981666666666666}


In [4]:
df = pd.read_csv(
    '_6_1_spam.csv',
    encoding='Windows-1252',  # encoding reported by the detection step
    usecols=['v1', 'v2'],     # keep only the label and message-text columns
)
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
# Give the two columns descriptive names: v1 -> category, v2 -> mail
df = df.rename(columns={'v1': 'category', 'v2': 'mail'})

# Checking for missing (NA) values

In [6]:
# Count the missing entries in each column
df.isna().sum()

category    0
mail        0
dtype: int64

# Checking for duplicated values

In [7]:
# Number of rows that pandas flags as duplicates of another row
df.duplicated().sum()

403

In [8]:
# Remove the duplicated rows, then rebuild the index
# (drop=True discards the old index instead of keeping it as a column)
df = df.drop_duplicates().reset_index(drop=True)

# Data Encoding

In [9]:
# Encode the text labels as integers: spam -> 1, ham -> 0.
# `Series.replace` on this object column relied on pandas silently downcasting
# the result to integers, which is deprecated since pandas 2.2 (FutureWarning).
# Map the labels explicitly and cast to int64 instead.
# Values that are not in the mapping pass through unchanged, so re-running this
# cell on already-encoded 0/1 labels is harmless, while an unexpected label
# makes the integer cast fail loudly.
label_codes = {'spam': 1, 'ham': 0}
df['category'] = (
    df['category']
    .map(lambda label: label_codes.get(label, label))
    .astype('int64')
)

# Splitting into input and output features

In [10]:
# Input feature (message text) and target (encoded category)
x, y = df['mail'], df['category']

# Splitting into train and test data

In [11]:
# Hold out 20% of the rows for testing; random_state is fixed at 0
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Feature extraction

In [12]:
# Turn the raw message text into TF-IDF features (lower-cased, English stop
# words removed).
feature_extraction = TfidfVectorizer(
    min_df=1,
    lowercase=True,
    stop_words='english',
)
# Fit the vectorizer on the training messages only, then apply the same fitted
# transform to the test messages.
# NOTE(review): x_train / x_test are overwritten with the feature matrices, so
# re-run the split cell before re-running this one.
x_train = feature_extraction.fit_transform(x_train)
x_test = feature_extraction.transform(x_test)

# Model training

In [13]:
# Fit a logistic-regression classifier (constructed with no arguments) on the
# TF-IDF training features.
lr = LogisticRegression()
lr.fit(X=x_train, y=y_train)

# Model Evaluation

In [14]:
# Score the fitted model on the held-out test split
accuracy = lr.score(x_test, y_test)
print('Accuracy: ', accuracy)

Accuracy:  0.9555125725338491


# Spam Mail Prediction

In [15]:
# Read one message from the user and classify it with the fitted model.
# The single message is wrapped in a list before being passed to the
# vectorizer.
input_mail = [input('Enter Mail: ')]
input_mail_transformed = feature_extraction.transform(input_mail)

mail_prediction = lr.predict(input_mail_transformed)

# Index the single prediction explicitly instead of relying on the truthiness
# of a one-element array, and use a plain if/else rather than a conditional
# expression evaluated only for its print side effect.
# Labels were encoded earlier as 0 = ham, 1 = spam.
if mail_prediction[0] == 0:
    print('\nHam mail')
else:
    print('\nSpam mail')

Enter Mail: Subject: Congratulations! You have won a free trip to Hawaii!  Body:  Dear Valued Customer,  You are one of the lucky winners of our exclusive sweepstakes! You have been selected to receive a free 7-day vacation package to Hawaii, including airfare, hotel, and car rental. All you have to do is reply to this email with your full name, address, phone number, and credit card details to claim your prize.  This offer is valid for a limited time only, so hurry up and don’t miss this amazing opportunity. You deserve to treat yourself to a relaxing and fun-filled getaway in paradise. Imagine yourself enjoying the sun, the sand, and the surf with your loved ones. This is your chance to make your dream come true!  Don’t wait any longer. Reply now and start packing your bags. Hawaii awaits you!  Sincerely,  The Travel Zone Team

Spam mail
