**LIBRARY DECLARATION**

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import nltk
import string

In [2]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

True

In [3]:
data=pd.read_csv('/content/drive/MyDrive/Wise ML/spam.csv', encoding='latin-1', usecols=['v1', 'v2'])


**DATASET:**

In [4]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


**RENAMING COLUMNS:**

In [6]:
data.rename(columns={'v1':'Status','v2':'Message'},inplace=True)


In [7]:
string.punctuation


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
data.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**Removal of Punctuation and StopWords: **

In [10]:
def text_process(mess):
    nopunc =[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

**Applying the function for the whole dataset: **

In [11]:
features=data['Message'].apply(text_process)

In [12]:
features

0       [Go, jurong, point, crazy, Available, bugis, n...
1                          [Ok, lar, Joking, wif, u, oni]
2       [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3           [U, dun, say, early, hor, U, c, already, say]
4       [Nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, U, å£750, Po...
5568                   [Ì, b, going, esplanade, fr, home]
5569                     [Pity, mood, Soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [Rofl, true, name]
Name: Message, Length: 5572, dtype: object

**STEMMING FUNCTION**

In [13]:
def stemming(text):
    snow=SnowballStemmer("english")
    words=""
    for word in text:
        words+=snow.stem(word)+" "
    return words


In [14]:
features = features.apply(stemming)

In [15]:
features

0       go jurong point crazi avail bugi n great world...
1                                  ok lar joke wif u oni 
2       free entri 2 wkli comp win fa cup final tkts 2...
3                    u dun say earli hor u c alreadi say 
4              nah dont think goe usf live around though 
                              ...                        
5567    2nd time tri 2 contact u u å£750 pound prize 2...
5568                             ì b go esplanad fr home 
5569                             piti mood soani suggest 
5570    guy bitch act like id interest buy someth els ...
5571                                      rofl true name 
Name: Message, Length: 5572, dtype: object

**Converting Text to Vector using Tfidf Vectorizer:**

In [16]:
vec_features=TfidfVectorizer('english').fit_transform(features)

In [17]:
print(vec_features)

  (0, 7574)	0.19449555011263384
  (0, 1138)	0.3501112803835596
  (0, 3342)	0.164475816719243
  (0, 2006)	0.29577542446330046
  (0, 1724)	0.33421906130459
  (0, 4204)	0.29577542446330046
  (0, 7776)	0.2379272168522122
  (0, 3380)	0.19449555011263384
  (0, 1726)	0.29577542446330046
  (0, 1327)	0.2662313123328933
  (0, 2223)	0.2711370860940892
  (0, 5539)	0.23906338437276378
  (0, 4063)	0.3501112803835596
  (0, 3289)	0.14081218308843002
  (1, 5200)	0.5633086751818669
  (1, 7688)	0.44480400570972006
  (1, 4029)	0.47731294876998304
  (1, 4239)	0.42078899608869724
  (1, 5170)	0.2827396376113674
  (2, 71)	0.23416578360011273
  (2, 1210)	0.16749632565308856
  (2, 5814)	0.23416578360011273
  (2, 7274)	0.12583555023089024
  (2, 5770)	0.23416578360011273
  (2, 5855)	0.16104729275511298
  :	:
  (5568, 2797)	0.6620015064495088
  (5568, 3097)	0.5769730291899227
  (5568, 3634)	0.3752780670596876
  (5568, 3289)	0.29668586321213597
  (5569, 6454)	0.5375461940984255
  (5569, 5475)	0.5375461940984255
  (


**Label Encoding for Labels:**

In [18]:
data['Status']=data['Status'].map({'ham':0,'spam':1})


In [19]:
data


Unnamed: 0,Status,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


**Train Test Split:**

In [20]:
x_train,x_test,y_train,y_test=train_test_split(vec_features,data['Status'],test_size=0.3,random_state=0)


**Decision Tree Classifier:**

In [21]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=10)
dt.fit(x_train,y_train)
pre = dt.predict(x_test)
acc = metrics.accuracy_score(y_test,pre)
print("Score:",acc)

Score: 0.9491626794258373


**Confusion Matrix:**

In [22]:
y_test.value_counts()


0    1434
1     238
Name: Status, dtype: int64

In [23]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pre))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1434
           1       0.88      0.74      0.81       238

    accuracy                           0.95      1672
   macro avg       0.92      0.86      0.89      1672
weighted avg       0.95      0.95      0.95      1672



In [24]:
metrics.confusion_matrix(y_test,pre)

array([[1410,   24],
       [  61,  177]])

**ROC Curve:**

In [25]:
metrics.roc_auc_score(y_test,pre)


0.8634805386589781

**Cross Validation**

In [26]:
scores = cross_val_score(dt,x_train,y_train,cv=10)
print(scores)
cv=scores.mean()
print('Cross_val Mean:',cv)

[0.95641026 0.95128205 0.95384615 0.96666667 0.97435897 0.94358974
 0.95128205 0.96666667 0.95128205 0.93589744]
Cross_val Mean: 0.9551282051282051
