In [156]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import pickle


In [157]:
spam_mail_1=pd.read_csv("spam_ham_dataset.csv")
spam_mail_1.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [158]:
columns_to_delete = ['Unnamed: 0', 'label_num']
spam_mail_1 = spam_mail_1.drop(columns=columns_to_delete)
string_to_delete = 'Subject:'
spam_mail_1['text'] = spam_mail_1['text'].str.replace(string_to_delete, '')

In [159]:
spam_mail_2 = pd.read_csv("spam_and_ham_file_2.csv",encoding='ISO-8859-1')
columns_delete = ['Unnamed: 2'	,'Unnamed: 3'	,'Unnamed: 4']
spam_mail_2= spam_mail_2.drop(columns=columns_delete)
spam_mail_2.rename(columns={'v1':'label','v2':'text'},inplace=True)
spam_mail_2.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [160]:
mail_spam = pd.concat([spam_mail_1, spam_mail_2], ignore_index=True)

# Save the merged dataframe to a new CSV file
mail_spam.to_csv('merged_file.csv', index=False)
mail_spam.head(10)

Unnamed: 0,label,text
0,ham,enron methanol ; meter # : 988291\r\nthis is ...
1,ham,"hpl nom for january 9 , 2001\r\n( see attache..."
2,ham,"neon retreat\r\nho ho ho , we ' re around to ..."
3,spam,"photoshop , windows , office . cheap . main t..."
4,ham,re : indian springs\r\nthis deal is to book t...
5,ham,ehronline web address change\r\nthis message ...
6,ham,spring savings certificate - take 30 % off\r\...
7,spam,looking for medication ? we ` re the best sou...
8,ham,noms / actual flow for 2 / 26\r\nwe agree\r\n...
9,ham,"nominations for oct . 21 - 23 , 2000\r\n( see..."


In [161]:
print(mail_spam.shape)


(10743, 2)


In [162]:
mail_spam.loc[mail_spam['label'] == 'spam', 'label'] = 0
mail_spam.loc[mail_spam['label'] == 'ham', 'label'] = 1


In [163]:
X = mail_spam['text']
Y = mail_spam['label']


In [164]:
print(X)
print(Y)


0         enron methanol ; meter # : 988291\r\nthis is ...
1         hpl nom for january 9 , 2001\r\n( see attache...
2         neon retreat\r\nho ho ho , we ' re around to ...
3         photoshop , windows , office . cheap . main t...
4         re : indian springs\r\nthis deal is to book t...
                               ...                        
10738    This is the 2nd time we have tried 2 contact u...
10739                Will Ì_ b going to esplanade fr home?
10740    Pity, * was in mood for that. So...any other s...
10741    The guy did some bitching but I acted like i'd...
10742                           Rofl. Its true to its name
Name: text, Length: 10743, dtype: object
0        1
1        1
2        1
3        0
4        1
        ..
10738    0
10739    1
10740    1
10741    1
10742    1
Name: label, Length: 10743, dtype: object


In [165]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)
print(X.shape)
print(X_train.shape)
print(X_test.shape)


(10743,)
(8594,)
(2149,)


In [166]:
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)


In [167]:
print(X_train_features)


  (0, 10873)	0.38639297850007875
  (0, 40245)	0.5488891460253681
  (0, 28152)	0.41220216197916565
  (0, 7227)	0.3328909671109456
  (0, 14638)	0.5183571676142048
  (1, 43034)	0.7152493988479114
  (1, 34916)	0.5547664531127566
  (1, 47188)	0.4250323281215124
  (2, 3091)	0.12038608795771522
  (2, 40879)	0.09774696804359528
  (2, 42490)	0.0460021610718902
  (2, 41951)	0.09479730699682565
  (2, 22577)	0.06493981871494267
  (2, 18540)	0.10185525321978332
  (2, 1030)	0.1453661254089218
  (2, 1031)	0.1453661254089218
  (2, 44517)	0.07856286049883575
  (2, 31626)	0.06455231251416364
  (2, 31067)	0.05424703767368687
  (2, 968)	0.1453661254089218
  (2, 7456)	0.10823248441839244
  (2, 14308)	0.07192395583851699
  (2, 35438)	0.11409287620505981
  (2, 45355)	0.06502726103036724
  (2, 39650)	0.39670139531127985
  :	:
  (8592, 23603)	0.06850589345454271
  (8592, 45578)	0.05893273398125851
  (8592, 39685)	0.04991478553044328
  (8592, 17851)	0.06599907063306798
  (8592, 29391)	0.033728512544364334
  (85

In [168]:
label_encoder = LabelEncoder()
Y_train = label_encoder.fit_transform(Y_train)
Y_test = label_encoder.transform(Y_test)


In [169]:
model = LogisticRegression()
model.fit(X_train_features, Y_train)


In [170]:
prediction_on_training_data = model.predict(X_train_features)
accuracy_score_of_data = accuracy_score(Y_train, prediction_on_training_data)
print('Accuracy on training data = ', accuracy_score_of_data)


Accuracy on training data =  0.9547358622294624


In [171]:
# Use the already trained model for predictions on the test set
prediction_on_test_data = model.predict(X_test_features)
accuracy_score_on_test_data = accuracy_score(Y_test, prediction_on_test_data)
print('Accuracy on test data = ', accuracy_score_on_test_data)


Accuracy on test data =  0.937645416472778


In [172]:
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
    
# Save the TfidfVectorizer instance
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(feature_extraction, vectorizer_file)    

In [173]:
'''# Assuming you have already defined the 'feature_extraction' and 'model' objects

# Example input mail
input_mail = "ehronline web address change this message is intended for ehronline users only  "
input_data_feature = feature_extraction.transform([input_mail])

# Make a prediction using the trained model
prediction = model.predict(input_data_feature)

# Display the prediction
print("Predicted class:", prediction)

if prediction[0] == 0:
    print("This mail is a spam mail.")
else:
    print("This is not a spam mail.")
    '''


'# Assuming you have already defined the \'feature_extraction\' and \'model\' objects\n\n# Example input mail\ninput_mail = "ehronline web address change this message is intended for ehronline users only  "\ninput_data_feature = feature_extraction.transform([input_mail])\n\n# Make a prediction using the trained model\nprediction = model.predict(input_data_feature)\n\n# Display the prediction\nprint("Predicted class:", prediction)\n\nif prediction[0] == 0:\n    print("This mail is a spam mail.")\nelse:\n    print("This is not a spam mail.")\n    '