# Import Libraries

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

 # Load Data

In [None]:
# Read the mail dataset CSV into a DataFrame
# NOTE(review): absolute '/content/...' path — looks like a Colab upload location; adjust when running elsewhere
df = pd.read_csv(filepath_or_buffer='/content/mail_data.csv')

In [None]:
# Show the loaded DataFrame as the cell's output
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# Swap every missing cell for an empty string, leaving all other values untouched
data = df.where(df.notna(), other='')

# Data Exploration

In [None]:
# Preview the first five rows of the cleaned data
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# Show the DataFrame's dimensions as a (rows, columns) tuple
data.shape

(5572, 2)

In [None]:
# Encode the target column as integers: spam -> 0, ham -> 1.
# Series.map builds the encoded column in one pass and yields a numeric dtype,
# whereas assigning ints into the text column with .loc leaves a mixed
# object column (and raises on pandas' dedicated string dtype).
# The identity entries (0 -> 0, 1 -> 1) keep this cell safe to re-run after the
# column has already been encoded.
label_codes = {'spam': 0, 'ham': 1, 0: 0, 1: 1}
encoded_category = data['Category'].map(label_codes)

# map() yields NaN for anything outside the mapping; stop here with a clear
# message rather than failing later when the labels are cast to int.
unmapped = encoded_category.isna()
if unmapped.any():
    raise ValueError(
        f"Unexpected Category values: {data.loc[unmapped, 'Category'].unique().tolist()}"
    )

data['Category'] = encoded_category.astype('int64')


# Separate Features and Labels

In [None]:
# Features (X) are the message texts; labels (Y) are the values of the Category column
X, Y = data['Message'], data['Category']

In [None]:
# Print the feature Series (the message text column)
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [None]:
# Print the label Series (the Category column)
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


# Splitting Data

In [None]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Report the number of labels in the full set, the training split and the test split (one per line)
print(Y.shape, Y_train.shape, Y_test.shape, sep='\n')


(5572,)
(4457,)
(1115,)


# Feature Extraction

In [None]:
# Build the TF-IDF vectorizer: lowercase the text, drop English stop words,
# and keep every term that occurs in at least one document
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer to transform the test messages
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast both label Series to integers before they are handed to the classifier
Y_train, Y_test = Y_train.astype(int), Y_test.astype(int)


In [None]:
# Print the training messages; the index shows which original rows landed in the training split
print(X_train)

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object


In [None]:
# Print the TF-IDF training matrix; each printed entry is "(row, column)  weight" for one stored value
print(X_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

# Model Training

In [None]:
# Create a logistic-regression classifier with default hyperparameters
# and fit it on the TF-IDF training features
model = LogisticRegression()
model.fit(X=X_train_features, y=Y_train)

# Model Evaluation on Training Data

In [None]:
# Run the fitted model over the same features it was trained on
prediction_on_training_data = model.predict(X=X_train_features)


In [None]:
# Fraction of training messages whose predicted label matches the true label
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)
print('Acc on training data : ', accuracy_on_training_data)

Acc on training data :  0.9670181736594121


# Model Evaluation on Test Data

In [None]:
# Run the fitted model over the held-out test features
prediction_on_test_data = model.predict(X=X_test_features)


In [None]:
# Fraction of held-out test messages whose predicted label matches the true label
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)
print("Accuracy on test data:", accuracy_on_test_data)

Accuracy on test data: 0.9659192825112107


 # Predicting New Data

In [None]:
# A single example message, wrapped in a list because the vectorizer expects an iterable of documents
input_mail = ["Hi there! You have won a free ticket to Hawaii. Claim now!"]

# Vectorize the message with the already-fitted TF-IDF vectorizer (transform only, no refit)
input_data_features = feature_extraction.transform(input_mail)

# Classify the vectorized message with the trained model
prediction = model.predict(input_data_features)

# Show the raw predicted label array
print(prediction)

# Translate the numeric label into text: 1 is ham, anything else is reported as spam
print('Ham mail' if prediction[0] == 1 else 'Spam mail')


[0]
Spam mail
