<a href="https://colab.research.google.com/github/yashmittal2/machine-learning-/blob/main/Email%20spam%20filter/Email_spam_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
df = pd.read_csv('mails.csv')

In [None]:
df.sample(5)

Unnamed: 0,category,message
747,ham,"I promise to take good care of you, princess. ..."
459,ham,1.20 that call cost. Which i guess isnt bad. M...
4497,ham,"In case you wake up wondering where I am, I fo..."
4383,ham,yeah sure thing mate haunt got all my stuff so...
2163,ham,1) Go to write msg 2) Put on Dictionary mode 3...


In [None]:
df.shape

(5572, 2)

PREPROCESSING


In [None]:
encoder = LabelEncoder()

In [None]:
df['category'] = encoder.fit_transform(df['category'])

In [None]:
def transform_text(message):
    message = message.lower()
    message = nltk.word_tokenize(message)
    
    y = []
    for i in message:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in message:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
    message = y[:]
    y.clear()
    ps=PorterStemmer()
    for i in message:
        y.append(ps.stem(i))
    
            
    return " ".join(y)

In [None]:
transform_text("OH YEAH,AND HAV A GREAT TIME IN NEWQUAY-SEND ME A POSTCARD !1 LOOK AFTER ALL THE GIRLS WHILE IM GONE(U KNOW THE 1IM TALKIN BOUT!)xx")

'oh yeah hav great time newquay-send postcard 1 look girl im gone u know 1im talkin bout xx'

In [None]:
df['transformed_text'] = df['message'].apply(transform_text)

In [None]:
df.head()

Unnamed: 0,category,message,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi .. avail bugi n great wo...
1,0,Ok lar... Joking wif u oni...,ok lar ... joke wif u oni ...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor ... u c alreadi say ...
4,0,"Nah I don't think he goes to usf, he lives aro...",nah n't think goe usf live around though


MODEL BUILDING

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)

In [None]:
tfidf

In [None]:
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [None]:
X.shape

(5572, 3000)

In [None]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
y = df['category'].values

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
gnb.fit(X_train,y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))

0.852914798206278


In [None]:
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))

0.9704035874439462


In [None]:
bnb.fit(X_train,y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))

0.9775784753363229


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)

In [None]:
clfs = {
    'SVC' : svc,
    'KN' : knc, 
    'NB': bnb, 
    'DT': dtc, 
    'LR': lrc, 
    'RF': rfc, 
    'AdaBoost': abc, 
    'BgC': bc, 
    'ETC': etc,
    'GBDT':gbdt,
    'xgb':xgb
}

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    
    return accuracy

In [None]:
train_classifier(svc,X_train,y_train,X_test,y_test)

0.9739910313901345

In [None]:
accuracy_scores = []

for name,clf in clfs.items():
    
    current_accuracy = train_classifier(clf, X_train,y_train,X_test,y_test)
    
    print("For ",name)
    print("Accuracy - ",current_accuracy)
    
    accuracy_scores.append(current_accuracy)

For  SVC
Accuracy -  0.9739910313901345
For  KN
Accuracy -  0.9022421524663677
For  NB
Accuracy -  0.9775784753363229
For  DT
Accuracy -  0.9336322869955157
For  LR
Accuracy -  0.95695067264574
For  RF
Accuracy -  0.97847533632287
For  AdaBoost
Accuracy -  0.9748878923766816
For  BgC
Accuracy -  0.9650224215246637
For  ETC
Accuracy -  0.9820627802690582
For  GBDT
Accuracy -  0.9587443946188341
For  xgb
Accuracy -  0.9766816143497757


In [None]:
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores}).sort_values('Accuracy',ascending=False)

In [None]:
performance_df

Unnamed: 0,Algorithm,Accuracy
8,ETC,0.982063
5,RF,0.978475
2,NB,0.977578
10,xgb,0.976682
6,AdaBoost,0.974888
0,SVC,0.973991
7,BgC,0.965022
9,GBDT,0.958744
4,LR,0.956951
3,DT,0.933632


In [None]:
estimators=[('rf', rfc), ('nb', bnb), ('et', etc)]
final_estimator=SVC()
from sklearn.ensemble import StackingClassifier
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Accuracy -",accuracy_score(y_test,y_pred))

Accuracy - 0.9847533632286996


In [None]:
!pip install -q streamlit

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 KB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 KB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.8/164.8 KB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 KB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for validators (setup.py) ... [?25l[?25hdone


In [None]:
!./ngrok authtokens 2NaORM0YmXMeC1zQrCxAxzA3cQo_7EwhxSCRz3M2AiwxEr67T

/bin/bash: ./ngrok: No such file or directory


In [None]:
!pip install pyngrok

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyngrok
  Downloading pyngrok-5.2.1.tar.gz (761 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/761.3 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m358.4/761.3 KB[0m [31m10.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m761.3/761.3 KB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyngrok
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone
  Created wheel for pyngrok: filename=pyngrok-5.2.1-py3-none-any.whl size=19790 sha256=bf43e78a21c86441d5a4a369bbcc9f3d3097af3369002b6581b6e7d646358c48
  Stored in directory: /root/.cache/pip/wheels/f6/89/59/49d4249e00957e94813ac136a335d10ed2e09a856c5096f95c
Successfully built pyngrok
Installin

In [None]:
from pyngrok import ngrok 
!streamlit run streamlit_app.py&>/dev/null&
public_url = ngrok.connect(port='8501')



In [None]:
import pickle
pickle.dump(tfidf,open('vectorizes.pkl','wb'))
pickle.dump(clf,open('models.pkl','wb'))
tfidff = pickle.load(open('vectorizes.pkl','rb'))
mode_l = pickle.load(open('models.pkl','rb'))

In [None]:
mode_l

In [None]:
%%writefile streamlit_app.py 
import streamlit as st 
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
import pickle
st.markdown(""" This is a Streamlit App """)
tfidff = pickle.load(open('vectorizes.pkl','rb'))
mode_l = pickle.load(open('models.pkl','rb'))
st.title("Email Spam Classifier")
mail_text = st.text_input("Input the email")
def transform_text(message):
    message = message.lower()
    message = nltk.word_tokenize(message)
    
    y = []
    for i in message:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in message:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
    message = y[:]
    y.clear()
    ps=PorterStemmer()
    for i in message:
        y.append(ps.stem(i))
    
            
    return " ".join(y)
if st.button('predict'):
   transform_mail = transform_text(mail_text) 

   vector_input = tfidff.transform([transform_mail])

   result = mode_l.predict(vector_input)
   if result == 1:
       st.header("spam")
   else:
       st.header("not spam ")

Writing streamlit_app.py


In [None]:
!streamlit run /content/streamlit_app.py & npx localtunnel  --port 8501

[..................] \ rollbackFailedOptional: verb npm-session b8de4cad3b7dac1[0m[K
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8502[0m
[34m  External URL: [0m[1mhttp://34.133.129.180:8502[0m
[0m
[K[?25hnpx: installed 22 in 5.41s
your url is: https://hot-islands-look-34-133-129-180.loca.lt
