# OASIS INFOBYTE  **TASK-04**

# EMAIL SPAM DETECTION


### Importing required libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix as cfmt, classification_report as clrt

### Reading and Preprocessing the given SPAM dataset

In [None]:
# Load the spam dataset from the CSV file (read with the "latin" codec)
# and display it.
df = pd.read_csv(
    "/content/spam.csv",
    encoding="latin",
)
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


Here the last 3 columns have no data.




In [None]:
# Drop the three "Unnamed" columns that are not needed, keeping the rest.
df = df.drop(
    ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
    axis="columns",
)
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


Checking for NULL values in the dataset

In [None]:
# Count the missing (NULL/NaN) values in each column of the dataset.
df.isna().sum()

v1    0
v2    0
dtype: int64

Checking for duplicate rows in the dataset

In [None]:
# Total number of rows that repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep="first").sum()

403

In [None]:
# Remove duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep="first")

In [None]:
# Dimensions of the dataset as (rows, columns), checked after the duplicate rows were dropped.
df.shape

(5169, 2)

In [None]:
# Preview the first 5 rows of the dataset.
df.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Preview the last 5 rows of the dataset.
df.tail(n=5)

Unnamed: 0,v1,v2
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


Replacing the label names (spam, ham) with 0 and 1, respectively

In [None]:
# Encode the labels numerically: ham -> 1, spam -> 0.
# The replaced column is assigned back to the frame instead of calling
# `df["v1"].replace(..., inplace=True)`. That chained in-place call operates
# on an intermediate Series, which triggers SettingWithCopyWarning and leaves
# `df` unchanged when pandas Copy-on-Write is active.
# `replace` leaves values that are already 0/1 untouched, so re-running this
# cell is harmless.
df = df.assign(v1=df["v1"].replace({"ham": 1, "spam": 0}))
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["v1"].replace({"ham":1, "spam":0},inplace = True)


Unnamed: 0,v1,v2
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will Ì_ b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


### Dividing the whole dataset into Independent and Dependent variables

In [None]:
#Independent variables
x=df["v2"]
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5169, dtype: object

In [None]:
#Dependent variables
y=df["v1"]
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: v1, Length: 5169, dtype: int64

### Dividing both independent and dependent variables into Train and Test datasets




In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

Train dataset


In [None]:
# Display the training portion of the message text produced by the split.
x_train

4163                  Its ok, called mom instead have fun
2253                         Lol enjoy role playing much?
1309              Ok, be careful ! Don't text and drive !
1046    Do 1 thing! Change that sentence into: \Becaus...
3521    Im sorry bout last nite it wasnåÕt ur fault it...
                              ...                        
5307                              What you did in  leave.
3455                         Ok. I.ll do you right later.
1708    How's my loverboy doing ? What does he do that...
2730                         I havent lei.. Next mon can?
2871                                      See you there! 
Name: v2, Length: 3618, dtype: object

In [None]:
# Display the training portion of the encoded labels produced by the split.
y_train

4163    1
2253    1
1309    1
1046    1
3521    1
       ..
5307    1
3455    1
1708    1
2730    1
2871    1
Name: v1, Length: 3618, dtype: int64

Dimensions of Train dataset

In [None]:
# Report the dimensions of the training inputs and labels.
print(f"Size of x_train is {x_train.shape}")
print(f"Size of y_train is {y_train.shape}")

Size of x_train is (3618,)
Size of y_train is (3618,)


Test dataset

In [None]:
# Display the test portion of the message text produced by the split.
x_test

1851    Dunno da next show aft 6 is 850. Toa payoh got...
2375    Thanx 4 2day! U r a goodmate I THINK UR RITE S...
2743    But my family not responding for anything. Now...
3324                         Nope... Juz off from work...
3995    We'll you pay over like  &lt;#&gt; yrs so its ...
                              ...                        
1429    For sale - arsenal dartboard. Good condition b...
1423                     Lol great now im getting hungry.
4780    Yup... Hey then one day on fri we can ask miwa...
4085                          Lemme know when you're here
1500                   Host-based IDPS for linux systems.
Name: v2, Length: 1551, dtype: object

In [None]:
# Display the test portion of the encoded labels produced by the split.
y_test

1851    1
2375    1
2743    1
3324    1
3995    1
       ..
1429    0
1423    1
4780    1
4085    1
1500    1
Name: v1, Length: 1551, dtype: int64

Dimensions of Test dataset

In [None]:
# Report the dimensions of the test inputs and labels.
print(f"Size of x_test is {x_test.shape}")
print(f"Size of y_test is {y_test.shape}")

Size of x_test is (1551,)
Size of y_test is (1551,)


### Use TfidfVectorizer to convert the input data (which is of string datatype) into a sparse feature matrix of float datatype.

In [None]:
# Build TF-IDF features with English stop words removed.
# The vectorizer is fitted on the training text only; the test text is then
# transformed with that same fitted vectorizer, so both matrices share the
# same feature columns.
tvec = TfidfVectorizer(stop_words="english", min_df=1)
x_train_feat = tvec.fit_transform(x_train)
x_test_feat = tvec.transform(x_test)

In [None]:
# Display the summary of the TF-IDF matrix built from the training text.
x_train_feat

<3618x6766 sparse matrix of type '<class 'numpy.float64'>'
	with 27519 stored elements in Compressed Sparse Row format>

In [None]:
# Display the summary of the TF-IDF matrix built from the test text.
x_test_feat

<1551x6766 sparse matrix of type '<class 'numpy.float64'>'
	with 10269 stored elements in Compressed Sparse Row format>

In [None]:
# Make sure both label Series are stored as integers before model fitting.
y_train = y_train.astype(int)
y_test = y_test.astype(int)

### Training the model with LogisticRegression using Train data

In [None]:
# Create a logistic-regression classifier with its default settings.
model=LogisticRegression()
# Train it on the TF-IDF training features and the integer training labels.
model.fit(x_train_feat,y_train)

Giving Test data to the trained model to predict output

In [None]:
# Predict labels for the held-out test features and print them next to the
# actual labels for a quick visual comparison.
y_pred = model.predict(x_test_feat)
print("Predicted output for test input", y_pred)
print("Actual output for test input", y_test.to_numpy())

Predicted output for test input [1 1 1 ... 1 1 1]
Actual output for test input [1 1 1 ... 1 1 1]


Calculating accuracy of the model

In [None]:
# Share of test messages classified correctly, expressed as a percentage.
acc = 100 * accuracy_score(y_test, y_pred)
print("Accuracy for predicted output:", acc)

Accuracy for predicted output: 95.55125725338492


### Getting Confusion matrix and Classification report

In [None]:
# confusion_matrix (imported as cfmt) expects its arguments in the order
# (y_true, y_pred): rows are the actual classes and columns the predicted
# ones. The call previously passed them reversed, which produced the
# transpose of that layout.
cmat = cfmt(y_test, y_pred)
cmat

array([[ 141,    1],
       [  68, 1341]])

In [None]:
# classification_report (imported as clrt) expects (y_true, y_pred). The call
# previously passed them reversed, which swaps precision and recall for each
# class and reports support counts of the predictions instead of the true
# labels.
# target_names follows the label encoding used in this notebook:
# 0 = spam, 1 = ham.
crt = clrt(y_test, y_pred, target_names=["spam", "ham"])
print(crt)

              precision    recall  f1-score   support

           0       0.67      0.99      0.80       142
           1       1.00      0.95      0.97      1409

    accuracy                           0.96      1551
   macro avg       0.84      0.97      0.89      1551
weighted avg       0.97      0.96      0.96      1551

