In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the spam dataset from the CSV file; the file is decoded as ISO-8859-1 (Latin-1).
df=pd.read_csv('/content/sample_data/spam.csv', encoding='ISO-8859-1')

In [None]:
# Display the loaded DataFrame.
df

In [None]:
# (rows, columns) of the raw data.
df.shape

***1. Data Cleaning***

In [None]:
 df.info() # checking wether columns are needed or not.

In [None]:
# in 2,3,4 columns values are empty
'''So we will drop last 3 columns'''
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [None]:
df.head()

In [None]:
# renaming the columns for better understanding
df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [None]:
df['target']=encoder.fit_transform(df['target']) # replace the string labels with integer codes (ham -> 0, spam -> 1, assuming those are the only two labels)

In [None]:
# Confirm that 'target' now holds integer codes.
df.head()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Count fully duplicated rows.
df.duplicated().sum()

In [None]:
# Keep the first occurrence of each duplicated row and drop the rest.
df=df.drop_duplicates(keep='first')

In [None]:
# Re-count duplicates after the drop above.
df.duplicated().sum()

In [None]:
# Shape after removing duplicates.
df.shape

***2. EDA***

In [None]:
#Checking percentage of ham and spam
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
plt.pie(df['target'].value_counts(),labels=['ham','spam'],autopct="%0.2f",colors=['green','red'])
plt.show()

In [None]:
# Data is not balanced

In [None]:
import nltk
# Tokenizer models used by word_tokenize / sent_tokenize below.
# Older NLTK releases load 'punkt'; newer releases look for 'punkt_tab' instead,
# so request both (a resource the installed version cannot fetch is only reported, not raised).
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Number of characters in each message.
df['num_characters']=df['text'].apply(len)

In [None]:
df.head()

In [None]:
# Number of tokens in each message.
df['num_words']=df['text'].apply(lambda x:len(nltk.word_tokenize(x)))  # word_tokenize splits the message into word-level tokens

In [None]:
# Snapshot of df taken at this point: it contains 'text', 'num_characters' and 'num_words'
# but not the 'num_sentences' column added below. Later cells work on this copy.
main_df=df.copy()

In [None]:
df['num_sentences']=df['text'].apply(lambda x:len(nltk.sent_tokenize(x))) # sent_tokenize splits the message into sentences

In [None]:
# Summary statistics of the three length features over all messages.
df[['num_characters', 'num_words' , 'num_sentences']].describe()

In [None]:
# Same statistics restricted to ham messages (target == 0).
df[df['target']==0][['num_characters','num_words', 'num_sentences']].describe()

In [None]:
# Same statistics restricted to spam messages (target == 1).
df[df['target']==1][['num_characters','num_words', 'num_sentences']].describe()

In [None]:
import seaborn as sns

In [None]:
sns.histplot(df[df['target']==0]['num_characters'],color='green')
sns.histplot(df[df['target']==1]['num_characters'],color='red')

In [None]:
sns.histplot(df[df['target']==0]['num_sentences'],color='green')
sns.histplot(df[df['target']==1]['num_sentences'],color='red')

In [None]:
# Checking relationship of number of words with sentences
sns.pairplot(df,hue='target',vars=['num_characters','num_words','num_sentences'],palette='Set1')

In [None]:
# it tells data has outlier

In [None]:
df.drop(columns=['text'],inplace=True) # for applying correlation

In [None]:
sns.heatmap(df.corr(),annot=True,cmap='cividis')

***3. Data Preprocessing***: Lowercase > Tokenization > Remove special characters > Remove stopwords (like "is", "of", "the") > Stemming/Lemmatization (e.g. converting "danced" and "dancing" to "dance")

In [None]:
import nltk
import string
from nltk.corpus import stopwords
nltk.download('stopwords')

_STOPWORDS_CACHE = None  # filled on first use with a frozenset of English stopwords


def _english_stopwords():
  """Return the NLTK English stopwords as a frozenset, loading the list only once."""
  global _STOPWORDS_CACHE
  if _STOPWORDS_CACHE is None:
    _STOPWORDS_CACHE = frozenset(stopwords.words('english'))
  return _STOPWORDS_CACHE


def text_transform(text):
  """Normalize one message into a space-separated string of stemmed tokens.

  Steps: lowercase -> word-tokenize -> keep alphanumeric tokens only ->
  drop English stopwords -> stem each remaining token.

  Relies on the module-level stemmer `ps`, which must be defined before this
  function is called.

  Args:
    text: the raw message string.

  Returns:
    The stemmed tokens joined by single spaces (empty string if nothing survives).
  """
  tokens = nltk.word_tokenize(text.lower())

  # Look the stopword set up once per call instead of reloading the list for every token.
  stop_words = _english_stopwords()

  # isalnum() already rejects punctuation-only tokens, so no separate
  # string.punctuation check is needed after it.
  kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

  return " ".join(ps.stem(tok) for tok in kept)

In [None]:
# Stemming: create the module-level stemmer `ps` that text_transform uses.
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
ps.stem('dancing')  # quick demo of the stemmer on a single word

In [None]:
# Display the working copy of the data.
main_df

In [None]:
# Raw text of the row whose index label is 0.
main_df['text'][0]

In [None]:
# Apply the preprocessing function to every message and store the result in a new column.
main_df['transformed_text']=main_df['text'].apply(text_transform)

In [None]:
main_df.head()

In [None]:
 # Word-cloud generator configured with a red background (used for the spam cloud below).
 from wordcloud import WordCloud
 wc=WordCloud(width=500,height=500,min_font_size=10,background_color='red')

In [None]:
# For Spam messages: join all transformed spam messages into one string and build the cloud from it.
spam_wc= wc.generate(main_df[main_df['target']==1]['transformed_text'].str.cat(sep=" "))
plt.imshow(spam_wc)

In [None]:
# For Ham messages: `wc` is rebound to a new generator with a green background.
wc=WordCloud(width=500,height=500,min_font_size=10,background_color='green')
ham_wc= wc.generate(main_df[main_df['target']==0]['transformed_text'].str.cat(sep=" "))
plt.imshow(ham_wc)

In [None]:
main_df.head()

In [None]:
# Finding top 30 words of spam messages
spam_corpus=[]  # for spam messages
for msg in main_df[main_df['target']==1]['transformed_text'].tolist():
    for word in msg.split():
        spam_corpus.append(word)

In [None]:
len(spam_corpus)

In [None]:
# Before plotting
from collections import Counter
fig, ax = plt.subplots()
ax.set_facecolor('red')
sns.barplot(x=pd.DataFrame(Counter(spam_corpus).most_common(30))[0],y=pd.DataFrame(Counter(spam_corpus).most_common(30))[1], palette="viridis" ,color='red')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Finding top 30 words of ham messages
ham_corpus = []  # for ham messages
for msg in main_df[main_df['target']==0]['transformed_text'].tolist():
    for word in msg.split():
        ham_corpus.append(word)

In [None]:
len(ham_corpus)

In [None]:
fig, ax = plt.subplots()
ax.set_facecolor('green')
sns.barplot(x=pd.DataFrame(Counter(ham_corpus).most_common(30))[0],y=pd.DataFrame(Counter(ham_corpus).most_common(30))[1], palette="viridis" )
plt.xticks(rotation='vertical')
plt.show()

***4. Model Building***

In [None]:
main_df.head()

In [None]:
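# We will vectorize the text: both a bag-of-words (CountVectorizer) and a TF-IDF vectorizer are created below; TF-IDF is the one used for X.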

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv=CountVectorizer()
tfidf=TfidfVectorizer()

In [None]:
X = tfidf.fit_transform(main_df['transformed_text']).toarray() # we need only numerical data

In [None]:
X.shape #5169->sms , 6708 words

In [None]:
y=df['target'].values
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle so the split is repeatable.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
# One instance of each Naive Bayes variant, all with default settings.
gnb=GaussianNB()
mnb=MultinomialNB()
bnb=BernoulliNB()

In [None]:
# For Gaussian
gnb.fit(X_train,y_train)
y_pred1=gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))

In [None]:
# For Multinnomial
mnb.fit(X_train,y_train)
y_pred2=mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

In [None]:
# For Bernoulli: fit, predict on the test split, then print accuracy, confusion matrix and precision.
bnb.fit(X_train,y_train)
# Predict with the Bernoulli model that was just fitted (not the multinomial one),
# otherwise the metrics below would simply repeat the MultinomialNB results.
y_pred3=bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))

In [None]:
# Chosen combination: TF-IDF features with the Multinomial Naive Bayes model (mnb)

In [None]:
# Imported all Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# Instantiate every classifier to compare and set its hyperparameters.
svc=SVC(kernel='sigmoid',gamma=1.0)
knc=KNeighborsClassifier()
# NOTE: this rebinds `mnb` to a fresh, unfitted MultinomialNB, replacing the instance fitted earlier.
mnb=MultinomialNB()
dtc=DecisionTreeClassifier(max_depth=5)
lrc=LogisticRegression(solver='liblinear',penalty='l1')
# The ensemble models below all use 50 estimators and a fixed random_state.
rfc=RandomForestClassifier(n_estimators=50,random_state=2)
abc=AdaBoostClassifier(n_estimators=50,random_state=2)
etc=ExtraTreesClassifier(n_estimators=50,random_state=2)
gbdt=GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb=XGBClassifier(n_estimators=50,random_state=2)

In [None]:
# Map a short display name to each classifier instance created above.
clf ={
    'SVC':svc,
    'KNN':knc,
    'NB':mnb,
    'DT':dtc,
    'LR':lrc,
    'RF':rfc,
    'AdaBoost':abc,
    'ETC':etc,
    'GBDT':gbdt,
    'XGB':xgb
}

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
  """Fit `clf` on the training split and score it on the test split.

  Args:
    clf: an estimator exposing fit(X, y) and predict(X).
    X_train, y_train: training features and labels.
    X_test, y_test: held-out features and labels.

  Returns:
    A tuple (accuracy, precision) computed with accuracy_score and
    precision_score on the test-split predictions.
  """
  clf.fit(X_train,y_train)
  predictions=clf.predict(X_test)
  return (
    accuracy_score(y_test,predictions),
    precision_score(y_test,predictions),
  )

In [None]:
# Example: train and score a single classifier (here the decision tree).
train_classifier(dtc,X_train,y_train,X_test,y_test) # output is the tuple (accuracy, precision)

In [None]:
# Train and evaluate every classifier in the `clf` dict, storing the accuracy and
# precision of each one in the same order as the dict.
accuracy_scores=[]
precision_scores=[]

# The loop variable is deliberately NOT called `clf`: reusing that name would rebind
# the dict to the last estimator and break any later use of `clf.keys()` / `clf.items()`.
for name,model in clf.items():
  current_accuracy,current_precision=train_classifier(model,X_train,y_train,X_test,y_test)
  print("For ",name)
  print("Accuracy - ",current_accuracy)
  print("Precision - ",current_precision)

  accuracy_scores.append(current_accuracy)
  precision_scores.append(current_precision)

In [None]:
performance_df=pd.DataFrame({'Algorithm':clf.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)

In [None]:
performance_df

In [None]:
sns.barplot(x='Algorithm',y='Accuracy',data=performance_df,palette='viridis')

Now converting this into a website

In [None]:
import pickle
# Persist the fitted TF-IDF vectorizer and the MultinomialNB model for reuse outside the notebook.
# The `with` blocks guarantee each file is flushed and closed once the dump finishes.
with open('vectorizer.pkl','wb') as vectorizer_file:
  pickle.dump(tfidf,vectorizer_file)
with open('model.pkl','wb') as model_file:
  pickle.dump(mnb,model_file)