In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw dataset from the working directory.
# NOTE(review): encoding='latin1' is presumably needed because the file is not
# valid UTF-8 — confirm against the source file.
df = pd.read_csv('spam.csv', encoding='latin1')

In [None]:
df.sample(5)

In [None]:
df.shape

In [None]:
# 1. Data Cleaning
# 2. EDA
# 3. Text Processing
# 4. Model Building
# 5. Evaluation
# 6. Deploy


***1. DATA CLEANING***

In [None]:
df.info()

In [None]:
# Drop the three trailing 'Unnamed' columns.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it once the columns are gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'], errors='ignore')

In [None]:
df.sample(5)

In [None]:
# Give the two remaining columns descriptive names, then preview a few rows
column_names = {'v1': 'target', 'v2': 'text'}
df.rename(columns=column_names, inplace=True)
df.sample(5)

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

In [None]:
# Replace the string labels with integer codes. LabelEncoder numbers classes in
# sorted order, so 'ham'/'spam' labels should become ham -> 0 and spam -> 1 (the
# EDA cells below filter on exactly those values) — TODO confirm via encoder.classes_.
df['target'] = encoder.fit_transform(df['target'])

In [None]:
!pip install streamlit

In [None]:
import streamlit as st
import pickle
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps: lower-case, tokenize with nltk.word_tokenize, keep only alphanumeric
    tokens, drop English stop words, then stem what is left with the
    module-level PorterStemmer `ps`.

    Args:
        text: the raw message string.

    Returns:
        The surviving stemmed tokens joined by single spaces ("" if none survive).
    """
    text = text.lower()
    # Download punkt_tab if not already downloaded
    try:
        nltk.data.find('tokenizers/punkt_tab')
    except LookupError:
        nltk.download('punkt_tab')
    tokens = nltk.word_tokenize(text)

    # Download stopwords if not already downloaded.
    # The list is loaded once per call and turned into a set, rather than being
    # re-read from the corpus for every single token.
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    # isalnum() already rejects punctuation-only tokens, so no separate
    # string.punctuation test is needed after this filter.
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    )


# Load the fitted TF-IDF vectorizer and classifier. Both files are produced by
# the pickle.dump cell later in this notebook, so they must already exist on
# disk when this cell runs.
# NOTE: pickle.load can execute arbitrary code — only load files you created.
with open('vectorizer.pkl', 'rb') as f:
    tfidf = pickle.load(f)

with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

st.title("Email/SMS Spam Classifyer")

input_sms = st.text_input("Enter the email message")

# Only classify once the user has typed something; otherwise an empty message
# would be scored and a verdict displayed before any input was given.
if input_sms.strip():
    # Preprocess
    transform_sms = transform_text(input_sms)
    # Vectorize
    vector_input = tfidf.transform([transform_sms])
    # Predict
    result = model.predict(vector_input)
    # Display — predict() returns one label per row and a single row was passed
    if result[0] == 1:
        st.header("Spam")
    else:
        st.header("Not Spam")

In [None]:
df.head()

In [None]:
#finding missing values
df.isnull().sum()

In [None]:
#finding the duplicated values
df.duplicated().sum()

In [None]:
#removing duplicates
df = df.drop_duplicates(keep='first')

In [None]:
df.duplicated().sum()

In [None]:
df.shape

***2. EDA(EXPLORATORY DATA ANALYSIS)***

In [None]:
df.head()

In [None]:
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
# Share of each class. Sorting by label (0, 1) ties the slices to the
# ['ham', 'spam'] labels regardless of which class happens to be more frequent.
class_counts = df['target'].value_counts().sort_index()
plt.pie(class_counts,labels=['ham','spam'],autopct="%0.2f")
plt.show()  # must be called: a bare `plt.show` only echoes the function object

Here, the data is imbalanced.

In [None]:
import nltk

In [None]:
nltk.download('punkt')

In [None]:
df['num_charachters'] = df['text'].apply(len)

In [None]:
df.head()

Fetching the number of words


In [None]:
# word_tokenize relies on the 'punkt_tab' resource, so fetch it first
nltk.download('punkt_tab')


def _count_words(message):
    """Return how many tokens nltk.word_tokenize finds in `message`."""
    return len(nltk.word_tokenize(message))


df['num_words'] = df['text'].apply(_count_words)

In [None]:
df.head()

In [None]:
df['num_sentences'] = df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))

In [None]:
df.head()

In [None]:
df[['num_charachters','num_words','num_sentences']].describe()

In [None]:
#ham
df[df['target'] == 0][['num_charachters','num_words','num_sentences']].describe()

In [None]:
#spam
df[df['target'] == 1][['num_charachters','num_words','num_sentences']].describe()

In [None]:
import seaborn as sns

In [None]:
# Overlay the character-count distributions of both classes on one wide figure
plt.figure(figsize=(15, 5))
ham_mask = df['target'] == 0
spam_mask = df['target'] == 1
# ham (default colour)
sns.histplot(df.loc[ham_mask, 'num_charachters'])
# spam (red)
sns.histplot(df.loc[spam_mask, 'num_charachters'], color='red')

In [None]:
# Overlay the word-count distributions of both classes on one wide figure
plt.figure(figsize=(15, 5))
ham_mask = df['target'] == 0
spam_mask = df['target'] == 1
# ham (default colour)
sns.histplot(df.loc[ham_mask, 'num_words'])
# spam (red)
sns.histplot(df.loc[spam_mask, 'num_words'], color='red')

In [None]:
# Overlay the sentence-count distributions of both classes on one wide figure
plt.figure(figsize=(15, 5))
ham_mask = df['target'] == 0
spam_mask = df['target'] == 1
# ham (default colour)
sns.histplot(df.loc[ham_mask, 'num_sentences'])
# spam (red)
sns.histplot(df.loc[spam_mask, 'num_sentences'], color='red')

In [None]:
sns.pairplot(df,hue='target')

In [None]:
sns.heatmap(df[['target','num_charachters','num_words','num_sentences']].corr(), annot=True)



```
DATA PREPROCESSING
. LOWER CASE
. TOKENIZATION
. REMOVING SPECIAL CHARACTERS
. REMOVING STOP WORDS AND PUNCTUATION
. STEMMING

```





In [None]:
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def transform_text(text):
  """Normalise a raw message into a space-separated string of stemmed tokens.

  Steps: lower-case, tokenize with nltk.word_tokenize, keep only alphanumeric
  tokens, drop English stop words, then stem what is left with the
  module-level PorterStemmer `ps`.

  Args:
    text: the raw message string.

  Returns:
    The surviving stemmed tokens joined by single spaces ("" if none survive).
  """
  tokens = nltk.word_tokenize(text.lower())

  # Download stopwords if not already downloaded.
  # The list is loaded once per call and turned into a set, rather than being
  # re-read from the corpus for every single token.
  try:
    stop_words = set(stopwords.words('english'))
  except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

  # isalnum() already rejects punctuation-only tokens, so no separate
  # string.punctuation test is needed after this filter.
  return " ".join(
    ps.stem(token)
    for token in tokens
    if token.isalnum() and token not in stop_words
  )

In [None]:
transform_text("Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...")

In [None]:
df['text'][0]

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
ps.stem('loving')

In [None]:
df['transformed_text'] = df['text'].apply(transform_text)

In [None]:
df.head()

In [None]:
from wordcloud import WordCloud
wc = WordCloud(width= 500,height=500,min_font_size= 10,background_color='white')

In [None]:
spam_wc = wc.generate(df[df['target'] == 1]['transformed_text'].str.cat(sep=" "))

In [None]:
plt.figure(figsize=(15,6))
plt.imshow(spam_wc)

In [None]:
# Build the ham cloud on its own WordCloud instance: generate() hands back the
# object it was called on, so reusing `wc` would make ham_wc and spam_wc the
# same object and re-plotting spam_wc later would show the ham cloud instead.
ham_cloud = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
ham_wc = ham_cloud.generate(df[df['target'] == 0]['transformed_text'].str.cat(sep=" "))

In [None]:
plt.figure(figsize=(15,6))
plt.imshow(ham_wc)

In [None]:
# Flatten every spam message into one flat list of its whitespace-separated tokens
spam_messages = df[df['target'] == 1]['transformed_text'].tolist()
spam_corpus = [token for message in spam_messages for token in message.split()]

In [None]:
len(spam_corpus)

In [None]:
from collections import Counter
# 30 most frequent spam tokens as a two-column frame: column 0 = token, column 1 = count
temp_df = pd.DataFrame(Counter(spam_corpus).most_common(30))
# Custom VIBGYOR color palette
# NOTE(review): 7 colours for up to 30 hue levels (hue is the token column) —
# presumably seaborn cycles the list; confirm the repetition is intended.
vibgyor_colors = ['#8A2BE2', '#4B0082', '#0000FF', '#008000', '#FFFF00', '#FFA500', '#FF0000']
sns.barplot(x=temp_df[0], y=temp_df[1], hue=temp_df[0], palette=vibgyor_colors, legend=False)
# Rotate the token labels on the x axis
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Flatten every ham message into one flat list of its whitespace-separated tokens
ham_messages = df[df['target'] == 0]['transformed_text'].tolist()
ham_corpus = [token for message in ham_messages for token in message.split()]

In [None]:
len(ham_corpus)

In [None]:
from collections import Counter
# 30 most frequent ham tokens as a two-column frame: column 0 = token, column 1 = count
temp_df = pd.DataFrame(Counter(ham_corpus).most_common(30))
# Custom VIBGYOR color palette
# NOTE(review): 7 colours for up to 30 hue levels (hue is the token column) —
# presumably seaborn cycles the list; confirm the repetition is intended.
vibgyor_colors = ['#8A2BE2', '#4B0082', '#0000FF', '#008000', '#FFFF00', '#FFA500', '#FF0000']
sns.barplot(x=temp_df[0], y=temp_df[1], hue=temp_df[0], palette=vibgyor_colors, legend=False)
# Rotate the token labels on the x axis
plt.xticks(rotation='vertical')
plt.show()

In [None]:
df.head()


***MODEL BUILDING***

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)

In [None]:
# Fit the TF-IDF vectorizer on the cleaned text and densify the result (.toarray()).
# NOTE(review): the vectorizer is fitted on every row of df before
# train_test_split is called further down, so the held-out rows also shape the
# vocabulary / IDF weights — consider fitting on the training split only.
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [None]:
#from sklearn.preprocessing import MinMaxScaler
#scaler = MinMaxScaler()
#X = scaler.fit_transform(X)

In [None]:
# appending the num_charachters col to X (column name spelled as created in the EDA section)
#X = np.hstack((X,df['num_charachters'].values.reshape(-1,1)))

In [None]:
X.shape

In [None]:
Y = df['target'].values

In [None]:
Y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2, random_state=2)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
# Gaussian NB baseline: fit, predict, then print accuracy, confusion matrix, precision
gnb.fit(X_train, Y_train)
y_pred1 = gnb.predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(Y_test, y_pred1))

In [None]:
# Multinomial NB baseline: fit, predict, then print accuracy, confusion matrix, precision
mnb.fit(X_train, Y_train)
y_pred2 = mnb.predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(Y_test, y_pred2))

In [None]:
# Bernoulli NB baseline: fit, predict, then print accuracy, confusion matrix, precision
bnb.fit(X_train, Y_train)
y_pred3 = bnb.predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(Y_test, y_pred3))

In [None]:
# tfidf --> MNB

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# Candidate classifiers for the comparison below. Estimators whose fitting
# involves randomness are pinned to random_state=2 so that a Restart & Run All
# reproduces the same scores.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
# Tree building permutes the features at every split, so ties need a seed.
dtc = DecisionTreeClassifier(max_depth=5, random_state=2)
# liblinear shuffles the data internally; seed it as well.
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=2)
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)

In [None]:
# Short display name -> estimator instance. The evaluation loop iterates this
# dict, and its keys become the 'Algorithm' column of the results table.
clfs = {
    'SVC' : svc,
    'KN' : knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'AdaBoost': abc,
    'BgC': bc,
    'ETC': etc,
    'GBDT':gbdt,
    'xgb':xgb
}

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit `clf` on the training split and score it on the test split.

    Args:
        clf: estimator exposing fit(X, y) and predict(X).
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        (accuracy, precision) as computed by accuracy_score and precision_score
        on the test-split predictions.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    return accuracy_score(y_test, predicted), precision_score(y_test, predicted)

In [None]:
train_classifier(svc,X_train,Y_train,X_test,Y_test)

In [None]:
# Train and score every candidate on the same split, printing each result as it
# arrives and collecting the scores in dict order for the comparison table.
accuracy_scores, precision_scores = [], []

for name, clf in clfs.items():
    scores = train_classifier(clf, X_train, Y_train, X_test, Y_test)
    current_accuracy, current_precision = scores

    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)

    accuracy_scores += [current_accuracy]
    precision_scores += [current_precision]

In [None]:
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)

In [None]:
performance_df

In [None]:
performance_df1 = pd.melt(performance_df, id_vars = "Algorithm")

In [None]:
performance_df1

In [None]:
sns.catplot(x = 'Algorithm', y='value',
               hue = 'variable',data=performance_df1, kind='bar',height=5)
plt.ylim(0.5,1.0)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# model improve
# 1. Change the max_features parameter of TfIdf

In [None]:
# Scores relabelled as the max_features=3000 run.
# NOTE(review): the next cell reassigns temp_df before anything reads this
# frame, so the *_max_ft_3000 columns built here never reach a merge.
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_max_ft_3000':accuracy_scores,'Precision_max_ft_3000':precision_scores}).sort_values('Precision_max_ft_3000',ascending=False)

In [None]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_scaling':accuracy_scores,'Precision_scaling':precision_scores}).sort_values('Precision_scaling',ascending=False)

In [None]:
new_df = performance_df.merge(temp_df,on='Algorithm')

In [None]:
# NOTE(review): temp_df is still the *_scaling frame that the previous cell
# already merged into new_df, so this second merge duplicates those two columns
# (pandas disambiguates the clashing names with its _x / _y suffixes).
new_df_scaled = new_df.merge(temp_df,on='Algorithm')

In [None]:
# Scores relabelled as the "num_chars" experiment.
# NOTE(review): accuracy_scores / precision_scores are the same lists that built
# performance_df; no visible cell recomputes them in between, so these columns
# repeat the earlier numbers unless the training loop was re-run by hand.
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_num_chars':accuracy_scores,'Precision_num_chars':precision_scores}).sort_values('Precision_num_chars',ascending=False)

In [None]:
new_df_scaled.merge(temp_df,on='Algorithm')

In [None]:
# Voting Classifier
# Fresh (unfitted) base estimators for the ensemble. probability=True is set on
# the SVC because the ensemble below is built with voting='soft', which
# averages class probabilities and therefore needs predict_proba.
svc = SVC(kernel='sigmoid', gamma=1.0,probability=True)
mnb = MultinomialNB()
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

from sklearn.ensemble import VotingClassifier

In [None]:
voting = VotingClassifier(estimators=[('svm', svc), ('nb', mnb), ('et', etc)],voting='soft')

In [None]:
voting.fit(X_train,Y_train)

In [None]:
y_pred = voting.predict(X_test)
print("Accuracy",accuracy_score(Y_test,y_pred))
print("Precision",precision_score(Y_test,y_pred))

In [None]:
# Applying stacking
# Base learners feed their predictions to a random-forest meta-learner.
estimators=[('svm', svc), ('nb', mnb), ('et', etc)]
# Seed the meta-learner like the other forests in this notebook
# (random_state=2); unseeded, the stacking scores change on every run.
final_estimator=RandomForestClassifier(random_state=2)

In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)

In [None]:
clf.fit(X_train,Y_train)
y_pred = clf.predict(X_test)
print("Accuracy",accuracy_score(Y_test,y_pred))
print("Precision",precision_score(Y_test,y_pred))

In [None]:
import pickle
from sklearn.naive_bayes import MultinomialNB

# Re-initialize and fit MultinomialNB to ensure the saved model is fitted
mnb_fitted = MultinomialNB()
mnb_fitted.fit(X_train, Y_train)

# Persist the fitted vectorizer and model. The `with` blocks flush and close
# each file deterministically instead of leaking the handle from a bare open().
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

with open('model.pkl', 'wb') as f:
    pickle.dump(mnb_fitted, f)