In [1]:
from IPython.display import Image
Image(url='https://miro.medium.com/v2/resize:fit:720/format:webp/1*nBgCTU_hAVG00eYkcRf6Mw.png', height=100, width=400)

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score,recall_score,precision_score

In [5]:
# Load dataset:
df = pd.read_csv("C:\\Users\\91785\\Desktop\\New folder (3)\\jupyter\\spam.csv", encoding="latin1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
# Check columns in dataframe:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [7]:
# Check descriptive statistics
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [8]:
# Check the number of rows and columns present in df:
print('Number of Rows:',df.shape[0])
print('Number of Columns:',df.shape[1])

Number of Rows: 5572
Number of Columns: 5


In [9]:
# Null value count in dataframe:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [10]:
df.isnull().mean()*100  # Check the percentage of null v

v1             0.000000
v2             0.000000
Unnamed: 2    99.102656
Unnamed: 3    99.784637
Unnamed: 4    99.892319
dtype: float64

In [11]:
# Dropping useless columns:
df.drop(columns=df[['Unnamed: 2','Unnamed: 3','Unnamed: 4']],axis=1,inplace=True)

In [12]:
# Checking dataset again:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
# Checking shape of dataframe:
df.shape

(5572, 2)

In [14]:
# Rename columns names for easy to understand, we can also use df.rename
df.columns=['spam/ham','sms']

In [15]:
# Convert the text data into numerical form
df.loc[df['spam/ham'] == 'spam', 'spam/ham',] = 0
df.loc[df['spam/ham'] == 'ham', 'spam/ham',] = 1

In [16]:
# Checking first five rows:
df.head()

Unnamed: 0,spam/ham,sms
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
x = df.sms
x.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: sms, dtype: object

In [18]:
y = df['spam/ham']
y.head()

0    1
1    1
2    0
3    1
4    1
Name: spam/ham, dtype: object

In [19]:
# Divide the whole dataset into training and testing set for model training
from sklearn.model_selection import train_test_split

In [20]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2,random_state=42)

In [21]:
# Checking shape of train and test dataset:
print(x.shape)
print(xtrain.shape)
print(xtest.shape)

(5572,)
(4457,)
(1115,)


In [22]:
xtrain,xtest

(1978    No I'm in the same boat. Still here at my moms...
 3989    (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
 3935       They r giving a second chance to rahul dengra.
 4078       O i played smash bros  &lt;#&gt;  religiously.
 4086    PRIVATE! Your 2003 Account Statement for 07973...
                               ...                        
 3772    I came hostel. I m going to sleep. Plz call me...
 5191                               Sorry, I'll call later
 5226        Prabha..i'm soryda..realy..frm heart i'm sory
 5390                           Nt joking seriously i told
 860                   In work now. Going have in few min.
 Name: sms, Length: 4457, dtype: object,
 3245    Funny fact Nobody teaches volcanoes 2 erupt, t...
 944     I sent my scores to sophas and i had to do sec...
 1044    We know someone who you know that fancies you....
 2484    Only if you promise your getting out as SOON a...
 812     Congratulations ur awarded either å£500 of CD ...
               

In [23]:
ytrain,ytest

(1978    1
 3989    0
 3935    1
 4078    1
 4086    0
        ..
 3772    1
 5191    1
 5226    1
 5390    1
 860     1
 Name: spam/ham, Length: 4457, dtype: object,
 3245    1
 944     1
 1044    0
 2484    1
 812     0
        ..
 4264    1
 2439    1
 5556    1
 4205    1
 4293    0
 Name: spam/ham, Length: 1115, dtype: object)

In [24]:
feat_vect = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
feat_vect

In [25]:
ytrain = ytrain.astype('int')
ytest = ytest.astype('int')

In [26]:
xtrain_vec = feat_vect.fit_transform(xtrain)

In [27]:
xtest_vec = feat_vect.transform(xtest)

In [28]:
print(xtrain)

1978    No I'm in the same boat. Still here at my moms...
3989    (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3935       They r giving a second chance to rahul dengra.
4078       O i played smash bros  &lt;#&gt;  religiously.
4086    PRIVATE! Your 2003 Account Statement for 07973...
                              ...                        
3772    I came hostel. I m going to sleep. Plz call me...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860                   In work now. Going have in few min.
Name: sms, Length: 4457, dtype: object


In [29]:
xtrain_vec

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34794 stored elements and shape (4457, 7472)>

In [30]:
print(xtrain_vec)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34794 stored elements and shape (4457, 7472)>
  Coords	Values
  (0, 1371)	0.4658046386365619
  (0, 4416)	0.4528381701109944
  (0, 1706)	0.3431839629173582
  (0, 7415)	0.348722265231364
  (0, 3210)	0.348722265231364
  (0, 4520)	0.4658046386365619
  (1, 1187)	0.26161139982801973
  (1, 3140)	0.3059746053542906
  (1, 3631)	0.2804339696184593
  (1, 6296)	0.269833648032668
  (1, 1533)	0.2015782058421696
  (1, 2661)	0.3059746053542906
  (1, 5005)	0.1937920260229529
  (1, 4292)	0.2953742837684993
  (1, 419)	0.28715203556385105
  (1, 4533)	0.3059746053542906
  (1, 6440)	0.2953742837684993
  (1, 1649)	0.3059746053542906
  (1, 0)	0.2654936554684193
  (2, 3061)	0.44585171875646595
  (2, 5770)	0.3962151014046925
  (2, 1674)	0.35156722029872034
  (2, 5351)	0.5102109014477275
  (2, 2190)	0.5102109014477275
  (3, 5048)	0.4444794309161828
  :	:
  (4451, 5002)	0.36484607066812064
  (4451, 4939)	0.36484607066812064
  (4452, 3085)	0.25923599228

In [31]:
print(xtest_vec)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7693 stored elements and shape (1115, 7472)>
  Coords	Values
  (0, 1756)	0.31111329907426943
  (0, 2679)	0.3500886226408095
  (0, 2974)	0.34299776014114036
  (0, 3239)	0.34299776014114036
  (0, 3457)	0.3500886226408095
  (0, 3752)	0.1718556592061185
  (0, 4543)	0.38197308370768035
  (0, 6816)	0.4006242977875035
  (0, 7229)	0.2947064107791228
  (1, 1934)	0.22392171769600464
  (1, 1970)	0.2461378627103295
  (1, 2651)	0.3269309971271071
  (1, 3716)	0.3178303138520559
  (1, 4760)	0.29866169283344046
  (1, 5532)	0.33866381848750327
  (1, 5738)	0.25559165628741076
  (1, 5739)	0.35520030142077386
  (1, 5744)	0.35520030142077386
  (1, 5812)	0.22078293973996208
  (1, 6604)	0.19484478334547534
  (1, 6607)	0.27039238853977376
  (2, 305)	0.31938690318093343
  (2, 2707)	0.4882288103453306
  (2, 3835)	0.4855870501823455
  (2, 4106)	0.5120683436791948
  :	:
  (1110, 1533)	0.2553888613819584
  (1110, 1599)	0.27571108916401915
  (1110, 1745)

In [32]:
lr = LogisticRegression()

In [33]:
lr.fit(xtrain_vec,ytrain)

In [34]:
lr.score(xtrain_vec,ytrain)

0.9692618353152345

In [35]:

lr.score(xtest_vec,ytest)

0.9524663677130045

In [36]:

pred_lr=lr.predict(xtest_vec)
pred_lr

array([1, 1, 1, ..., 1, 1, 1])

In [37]:

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [38]:
accuracy_score(ytest,pred_lr)

0.9524663677130045

In [39]:
confusion_matrix(ytest,pred_lr)

array([[100,  50],
       [  3, 962]])

In [40]:
print(classification_report(ytest,pred_lr))

              precision    recall  f1-score   support

           0       0.97      0.67      0.79       150
           1       0.95      1.00      0.97       965

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115

