In [4]:
from google.colab import drive

# Mount Google Drive under /content/gdrive/ so the saved models and dataset used below can be read.
drive.mount('/content/gdrive/',force_remount=True)

Mounted at /content/gdrive/


In [2]:
import joblib
import string
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def preprocess_text(text):
  """Normalise a raw message into space-separated stemmed tokens.

  Steps: lower-case the text, replace every character that is not an ASCII
  letter with a space, drop NLTK English stop words, and Porter-stem the
  remaining words.

  Args:
    text: the raw message as a string.

  Returns:
    A single string with the surviving stems joined by single spaces
    (an empty string if nothing survives).
  """
  # After this substitution only letters and spaces remain, so a separate
  # string.punctuation strip would have nothing left to remove.
  text = re.sub('[^A-Za-z]', ' ', text.lower())
  # Build the stop-word set once per call; looking the list up again for
  # every word re-reads the corpus and scans it linearly each time.
  stop_words = set(stopwords.words('english'))
  ps = PorterStemmer()
  return ' '.join(ps.stem(word) for word in text.split() if word not in stop_words)

In [4]:
# Load the previously saved classifier and vectorizer (names taken from the file names) from Google Drive.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code; only load artifacts you trust.
model = joblib.load('/content/gdrive/MyDrive/Colab Notebooks/Email Spam Classifier/naive_bayes_model.sav')
tfidf = joblib.load('/content/gdrive/MyDrive/Colab Notebooks/Email Spam Classifier/TfIdfVectorizer.sav')

In [5]:
text = '''

'''
def predict(text):
  """Return the loaded model's predicted label for one raw message."""
  cleaned = preprocess_text(text)
  # The vectorizer is given a one-element list, so the result has a single row.
  features = tfidf.transform([cleaned]).toarray()
  label = model.predict(features)[0]
  return label

predict(text)

0

In [6]:
import pandas as pd
import numpy as np
# Tab-separated file; the two columns are named here: the label first, then the message text.
# NOTE(review): 5,572 rows are reported further down; presumably the default quote handling
# merges a few lines that contain unbalanced quotes (the corpus may hold slightly more
# messages). Consider quoting=csv.QUOTE_NONE after confirming against the raw file.
data = pd.read_csv(
    '/content/gdrive/MyDrive/Colab Notebooks/Email Spam Classifier' + '/smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=['class', 'message'],
)

In [7]:
from sklearn.metrics import classification_report

# Score every message in the dataset with predict(), one message at a time.
predictions = np.array([predict(message) for message in data['message']])

In [21]:
# Tally how many messages were assigned to each predicted label (one row per label: label, count).
unique, counts = np.unique(predictions, return_counts=True)
print(np.column_stack((unique, counts)))

[[   0 4887]
 [   1  685]]


In [8]:
# Encode the ground truth numerically: 'spam' -> 1, every other class -> 0.
y = np.where(data['class'].eq('spam'), 1, 0)
print(classification_report(y, predictions))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      4825
           1       0.98      0.90      0.94       747

    accuracy                           0.98      5572
   macro avg       0.98      0.95      0.97      5572
weighted avg       0.98      0.98      0.98      5572



In [9]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes and columns the predicted classes (the row sums equal the class counts of y).
confusion_matrix(y,predictions)

array([[4813,   12],
       [  74,  673]])

In [10]:
# Class distribution of the ground-truth labels, one row per class: (label, count).
np.column_stack(np.unique(y, return_counts=True))

array([[   0, 4825],
       [   1,  747]])

In [11]:
from sklearn.utils import class_weight

# 'balanced' class weights for the label distribution in y (the result is only displayed, not stored).
# `classes` is passed as a NumPy array: newer scikit-learn releases validate the parameter type
# and reject a plain Python list.
class_weight.compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=y)

array([0.57740933, 3.72958501])

In [13]:
# from transformers import AutoTokenizer, AutoModelForSequenceClassification

# bert_tokenizer = AutoTokenizer.from_pretrained("mrm8488/bert-tiny-finetuned-sms-spam-detection")
# bert_model = AutoModelForSequenceClassification.from_pretrained("mrm8488/bert-tiny-finetuned-sms-spam-detection")

In [38]:
# from transformers import pipeline
# pipe = pipeline('text-classification',model = bert_model,tokenizer=bert_tokenizer)
# Load the saved text-classification pipeline instead of rebuilding it from the commented-out code.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code; only load artifacts you trust.
pipe = joblib.load('/content/gdrive/MyDrive/Colab Notebooks/Email Spam Classifier/pipeline.sav')
# Smoke test: the pipeline returns a list of results, so take the first (only) one.
pipe('hi do you need any help call 9954945954945')[0]

{'label': 'LABEL_1', 'score': 0.8984149098396301}

In [15]:
# LABEL_1 is mapped to 1; any other label is mapped to 0. Messages are scored one at a time.
hugging_predictions = np.array([int(pipe(message)[0]['label'] == 'LABEL_1') for message in data['message']])

In [16]:
# Confusion matrix of the true labels y against the predictions obtained from `pipe`.
confusion_matrix(y,hugging_predictions)

array([[4801,   24],
       [  69,  678]])

In [17]:
# Per-class precision/recall/F1 for the predictions obtained from `pipe`.
print(classification_report(y,hugging_predictions))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4825
           1       0.97      0.91      0.94       747

    accuracy                           0.98      5572
   macro avg       0.98      0.95      0.96      5572
weighted avg       0.98      0.98      0.98      5572



# Build Gradio Application

In [1]:
!pip install gradio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gradio
  Downloading gradio-3.27.0-py3-none-any.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
Collecting semantic-version
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)
Collecting ffmpy
  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gradio-client>=0.1.3
  Downloading gradio_client-0.1.3-py3-none-any.whl (286 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.2/286.2 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn
  Downloading uvicorn-0.

In [7]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.13.3 transformers-4.28.1


In [8]:
import joblib
import string
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import gradio as gr

# Load the saved classifier, vectorizer and text-classification pipeline used by the app below.
# NOTE(review): joblib.load unpickles these files, which can execute arbitrary code; only load artifacts you trust.
model = joblib.load('/content/gdrive/MyDrive/Colab Notebooks/Email Spam Classifier/naive_bayes_model.sav')
tfidf = joblib.load('/content/gdrive/MyDrive/Colab Notebooks/Email Spam Classifier/TfIdfVectorizer.sav')
pipe = joblib.load('/content/gdrive/MyDrive/Colab Notebooks/Email Spam Classifier/pipeline.sav')

def predict_NB(text):
  """Classify one message with the loaded `model` and `tfidf` vectorizer.

  The text is lower-cased, every non-letter is replaced by a space, English
  stop words are removed and the remaining words are Porter-stemmed before
  being vectorized.

  Args:
    text: the raw message as a string.

  Returns:
    'spam' when the model predicts class 1, otherwise 'not spam'. (The
    negative case used to return the integer 0, which mixed return types
    and was shown to the user as "0".)
  """
  # Only letters and spaces survive this substitution, so no separate
  # punctuation strip is needed afterwards.
  text = re.sub('[^A-Za-z]', ' ', text.lower())
  # Build the stop-word set once per call instead of reloading the list for every word.
  stop_words = set(stopwords.words('english'))
  ps = PorterStemmer()
  text = ' '.join(ps.stem(word) for word in text.split() if word not in stop_words)
  X = tfidf.transform([text]).toarray()
  return 'spam' if model.predict(X)[0] == 1 else 'not spam'

def predict_PIPE(text):
  """Classify one message with the loaded `pipe` and report its score.

  Args:
    text: the raw message as a string.

  Returns:
    A two-line string: the label ('spam' for LABEL_1, otherwise 'not spam')
    followed by "confidence : <score>". (The negative case used to be
    rendered as the bare digit "0".)
  """
  result = pipe(text)[0]
  label = 'spam' if result['label'] == 'LABEL_1' else 'not spam'
  return f"{label}\nconfidence : {result['score']}"

def fn(model_choice, input):
  """Route the input text to the classifier selected in the UI.

  Returns the chosen classifier's result, or None when `model_choice`
  matches neither known option.
  """
  if model_choice == "tiny-bert":
    return predict_PIPE(input)
  if model_choice == "naive-bayes":
    return predict_NB(input)
  return None

# Build and launch the UI: a model selector plus a free-text box in, plain text out.
# gr.inputs.* is the legacy component namespace (removed in Gradio 4); the top-level
# gr.Dropdown component takes its initial selection via `value` instead of `default`.
gr.Interface(
    fn,
    inputs=[gr.Dropdown(choices=["naive-bayes", "tiny-bert"], value='naive-bayes'), 'text'],
    outputs="text",
    title='Spam Classifier',
).launch()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



In [12]:
import transformers
import sklearn
import torch
# Print the version of each library this notebook ran against, one per line, in this order.
for _module in (re, nltk, joblib, transformers, sklearn, torch):
  print(_module.__version__)

2.2.1
3.8.1
1.2.0
4.28.1
1.2.2
2.0.0+cu118
