<a href="https://colab.research.google.com/github/ying2212/Spam-Mail-Prediction/blob/main/ScamMailDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Dependencies

In [42]:
import numpy as np
import pandas as pd
import chardet
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection & Pre-Processing

In [74]:
# Read the email CSV into a pandas DataFrame.
# NOTE(review): '/content/...' looks like a Colab session path — confirm before running elsewhere.
DATA_PATH = '/content/fradulentEmails.csv'
raw_mail_data = pd.read_csv(DATA_PATH)

In [75]:
# Show the loaded frame; pandas abbreviates long frames to the first and last rows.
print(raw_mail_data)

     Category                                               Body
0        Scam         Return-Path: <james_ngola2002@maktoob.com>
1        Scam                             X-Sieve: cmu-sieve 2.0
2        Scam         Return-Path: <james_ngola2002@maktoob.com>
3        Scam    Message-Id: <200210310241.g9V2fNm6028281@cs.CU>
4        Scam  From: "MR. JAMES NGOLA." <james_ngola2002@makt...
...       ...                                                ...
3856     Scam  My name is Mr=2E Jonathan Mokoena=2C the Under...
3857     Scam  Intergration at the Specialized Technical Comm...
3858     Scam  Union =28AU=29=2C formerly Organization of Afr...
3859     Scam  aware of the transformation of the OAU to AU=2...
3860     Scam  build a new united Africa modelled on the patt...

[3861 rows x 2 columns]


In [76]:
# Replace missing values (NaN/None) with an empty string so that every cell
# holds a value the text vectorizer can process. fillna returns a new frame,
# so raw_mail_data itself is left untouched.
mail_data = raw_mail_data.fillna('')

In [77]:
# Preview the first five rows of the cleaned frame.
mail_data.head(n=5)

Unnamed: 0,Category,Body
0,Scam,Return-Path: <james_ngola2002@maktoob.com>
1,Scam,X-Sieve: cmu-sieve 2.0
2,Scam,Return-Path: <james_ngola2002@maktoob.com>
3,Scam,Message-Id: <200210310241.g9V2fNm6028281@cs.CU>
4,Scam,"From: ""MR. JAMES NGOLA."" <james_ngola2002@makt..."


In [78]:
# checking the number of rows and columns in the dataframe
# (.shape is a (rows, columns) tuple)
mail_data.shape

(3861, 2)

Label Encoding

In [87]:
# Encode the 'Scam' category as the integer label 0.
# NOTE(review): no other category value is encoded here — confirm how
# non-scam rows (if any) should be labelled before the later int cast.
is_scam = mail_data['Category'] == 'Scam'
mail_data.loc[is_scam, 'Category'] = 0

Scam mail is encoded as label 0.

In [88]:
# Split the frame into model inputs (email text) and targets (encoded category).
X, Y = mail_data['Body'], mail_data['Category']

In [89]:
# Inspect the feature Series (the email text from the 'Body' column).
print(X)

0              Return-Path: <james_ngola2002@maktoob.com>
1                                  X-Sieve: cmu-sieve 2.0
2              Return-Path: <james_ngola2002@maktoob.com>
3         Message-Id: <200210310241.g9V2fNm6028281@cs.CU>
4       From: "MR. JAMES NGOLA." <james_ngola2002@makt...
                              ...                        
3856    My name is Mr=2E Jonathan Mokoena=2C the Under...
3857    Intergration at the Specialized Technical Comm...
3858    Union =28AU=29=2C formerly Organization of Afr...
3859    aware of the transformation of the OAU to AU=2...
3860    build a new united Africa modelled on the patt...
Name: Body, Length: 3861, dtype: object


In [90]:
# Inspect the label Series (the 'Category' column after encoding).
print(Y)

0       0
1       0
2       0
3       0
4       0
       ..
3856    0
3857    0
3858    0
3859    0
3860    0
Name: Category, Length: 3861, dtype: object


Splitting the data into training data & test data

In [91]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [92]:
# Report the sizes of the full, training and test text sets, in that order.
for split in (X, X_train, X_test):
    print(split.shape)

(3861,)
(3088,)
(773,)


Feature Extraction

In [93]:
# Cast the labels to integers first; this is independent of the vectorizer below.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

# Build TF-IDF features for the Logistic Regression input. The vectorizer is
# fitted on the training texts only and then reused, unchanged, on the test texts.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [94]:
# Inspect the training texts; the row order was shuffled by train_test_split.
print(X_train)

3855                                                     
3021    from the Nigeria Chambers of Commerce and Indu...
1061    the Government so far is in the tune of $700. ...
2527                      Date: Tue, 03 Dec 2002 19:47:42
1460    I wait in anticipation of your fullest co-oper...
                              ...                        
3000       contractor has long been paid,but the contract
1667                                                     
3321    As I need a next of kin/beneficiary and financ...
1688    RE: TRANSFER OF US$21.5 MILLION (TWENTY ONE MI...
1898    CHANGED  MOST OF MY HUSBAND'S BILLIONS OF DOLLARS
Name: Body, Length: 3088, dtype: object


In [95]:
# Printing the TF-IDF matrix lists its stored entries as "(row, column)  weight".
print(X_train_features)

  (1, 242)	0.5071738419994881
  (1, 1489)	0.4533257305544532
  (1, 719)	0.4473832426525597
  (1, 647)	0.48419429306437706
  (1, 1888)	0.3204287701913148
  (2, 1795)	0.32687335528795597
  (2, 212)	0.5461989657160664
  (2, 2729)	0.4862073901777291
  (2, 1158)	0.5008170907333359
  (2, 1348)	0.3280321856456346
  (3, 171)	0.39854824429100505
  (3, 178)	0.4254767637967573
  (3, 60)	0.37531741288207565
  (3, 70)	0.2499619867514085
  (3, 886)	0.3035412606694858
  (3, 6)	0.39179995007419305
  (3, 2726)	0.3553614316364887
  (3, 866)	0.29130468637956086
  (4, 2692)	0.2031725414756447
  (4, 679)	0.3652018565267974
  (4, 747)	0.3171086725918778
  (4, 2192)	0.3652018565267974
  (4, 1080)	0.3474520932804034
  (4, 490)	0.28001710510480055
  (4, 1959)	0.28001710510480055
  :	:
  (3085, 2149)	0.1967585801200048
  (3085, 2162)	0.22009122935675227
  (3085, 2316)	0.4079791479325433
  (3085, 833)	0.18848119107469719
  (3085, 1603)	0.16514854183794975
  (3085, 1189)	0.19098474304292704
  (3085, 600)	0.132836

Training the Model

Logistic Regression

In [96]:
# Logistic regression classifier created with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [None]:
# training the Logistic Regression model with the training data
# NOTE(review): every label visible in the printed outputs is 0 (Scam), and
# LogisticRegression needs at least two classes to fit — confirm the dataset
# also contains legitimate mail before relying on this cell.
model.fit(X_train_features, Y_train)

Evaluating the trained model

In [None]:
# Score the fitted model on the same rows it was trained on.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [None]:
# Report accuracy on both splits. Training accuracy alone overstates how well
# the model generalises, so the held-out 20% test split is scored as well.
print('Accuracy on training data : ', accuracy_on_training_data)

# prediction on test data (rows the model never saw during fitting)
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)
print('Accuracy on test data : ', accuracy_on_test_data)

Building a Predictive System

In [None]:
# Sample message to classify, wrapped in a list because the vectorizer takes a collection of texts.
input_mail = [
    "This business is 100% risk free for you so please treat this matter with utmost confidentiality .If you indicate your interest to assist us please just e-mail me for more Explanation on how we plan to execute the transaction."
]

# Vectorise with the already-fitted TF-IDF vocabulary, then classify.
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)
print(prediction)

# Label 0 is the scam class; any other predicted label is reported as legitimate.
print('Scam mail' if prediction[0] == 0 else 'Legitimate mail')