In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Load the raw dataset from spam.csv, decoding the file as latin-1.
df = pd.read_csv('spam.csv',encoding='latin-1')

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Summary statistics of the raw frame.
df.describe()

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Drop the three 'Unnamed' columns. Rebinding `df` (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second
# execution no longer raises KeyError for columns that are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Re-check the frame after dropping the unnamed columns.
df.head()

In [None]:
# Give the label and message columns descriptive names.
column_names = {'v1': 'target', 'v2': 'text'}
df.rename(columns=column_names, inplace=True)

In [None]:
# Display the frame with its renamed columns.
df

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder used to turn the string labels in `target` into integer codes.
encoder = LabelEncoder()

In [None]:
# Replace the string labels with integer codes.
# Later cells treat target == 0 as ham and target == 1 as spam.
df['target']=encoder.fit_transform(df['target'])

In [None]:
# Check the encoded target column.
df.head()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove duplicate rows

In [None]:
# Keep the first occurrence of each duplicated row and drop the repeats.
# The index is not reset here, so it keeps gaps where rows were removed.
df = df.drop_duplicates(keep='first')

In [None]:
# (rows, columns) after de-duplication.
df.shape

In [None]:
# Exploratory data analysis (EDA)

In [None]:
# Preview the cleaned frame before exploring it.
df.head()

In [None]:
# Class balance: number of messages per target value.
df['target'].value_counts()

In [None]:
# Pie chart of the class balance. Labels are derived from the index of
# value_counts() rather than hard-coded in ['ham', 'spam'] order, so each
# wedge is labelled correctly whichever class happens to be more frequent.
target_names = {0: 'ham', 1: 'spam'}  # integer codes used throughout this notebook
target_counts = df['target'].value_counts()
plt.pie(
    target_counts,
    labels=[target_names[code] for code in target_counts.index],
    autopct="%0.2f",
)
plt.show()

In [None]:
import nltk

In [None]:
# Fetch the 'punkt' resource; presumably required by the NLTK tokenizers
# used below (confirm for the installed NLTK version).
nltk.download('punkt')

In [None]:
# Also fetch 'punkt_tab'; NOTE(review): this looks like the resource name
# newer NLTK releases expect for tokenization (confirm).
nltk.download('punkt_tab')

In [None]:
# Message length in characters.
df['num_characters'] = df['text'].apply(len)

In [None]:
def _count_words(message):
    """Return the number of NLTK word tokens in ``message``."""
    return len(nltk.word_tokenize(message))

# Word count per message, then show the new column.
df['num_words'] = df['text'].apply(_count_words)
df['num_words']

In [None]:
def _count_sentences(message):
    """Return the number of NLTK sentence tokens in ``message``."""
    return len(nltk.sent_tokenize(message))

# Sentence count per message.
df['num_sentences'] = df['text'].apply(_count_sentences)

In [None]:
# Summary statistics of the three length features over all messages.
df[['num_characters','num_words','num_sentences']].describe()

In [None]:
# Length statistics for ham messages only (target == 0).
length_columns = ['num_characters', 'num_words', 'num_sentences']
df.loc[df['target'] == 0, length_columns].describe()

In [None]:
# Length statistics for spam messages only (target == 1).
length_columns = ['num_characters', 'num_words', 'num_sentences']
df.loc[df['target'] == 1, length_columns].describe()

In [None]:
# Overlay the character-count distributions of ham and spam on one axes.
ham_characters = df.loc[df['target'] == 0, 'num_characters']
spam_characters = df.loc[df['target'] == 1, 'num_characters']
sns.histplot(ham_characters)
sns.histplot(spam_characters, color='red')
plt.legend(['ham', 'spam'])
plt.show()

In [None]:
# Overlay the word-count distributions of ham and spam on one axes.
ham_words = df.loc[df['target'] == 0, 'num_words']
spam_words = df.loc[df['target'] == 1, 'num_words']
sns.histplot(ham_words)
sns.histplot(spam_words, color='red')
plt.legend(['ham', 'spam'])
plt.show()

In [None]:
# Pairwise relationships between the columns of df, coloured by target class.
sns.pairplot(df,hue='target')

In [None]:
# Drop the raw text column, then show the correlation matrix of what remains
# (the target and the length features at this point in the notebook).
df_numerical = df.drop(columns=['text'])
display(df_numerical.corr())

In [None]:
# Preview the frame including the length features added above.
df.head()

In [None]:
# Heatmap of the same correlation matrix, with each coefficient annotated.
sns.heatmap(df_numerical.corr(), annot=True)
plt.show()

In [None]:
from nltk.stem.porter import PorterStemmer
# Porter stemmer instance; transform_text looks up the name `ps` when called.
ps = PorterStemmer()

In [None]:
def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps: lower-case the text, tokenize it with ``nltk.word_tokenize``, keep
    only alphanumeric tokens, drop English stop words and punctuation, then
    apply the Porter stemmer (``ps``) to what is left.

    Relies on the notebook-level names ``nltk``, ``stopwords``, ``string`` and
    ``ps`` being defined before the first call.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    # Build the stop-word set once per call. The original evaluated
    # stopwords.words('english') for every token and then scanned the
    # resulting list linearly.
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text.lower())
    kept_tokens = [
        token
        for token in tokens
        if token.isalnum()
        and token not in stop_words
        and token not in string.punctuation
    ]
    return " ".join(ps.stem(token) for token in kept_tokens)

In [None]:
# Download the stop-word corpus and import its reader. transform_text looks
# up `stopwords` at call time, so this cell must run before the first call.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Display the English stop-word list for inspection.
stopwords.words('english')

In [None]:
# transform_text looks up `string` at call time, so this import must run
# before the first call.
import string
# Display the punctuation characters that transform_text filters against.
string.punctuation

In [None]:
# Try transform_text on a sample sentence with mixed case, a number and '%' signs.
transform_text('Hi How are You the running above 20% eg %%')

In [None]:
# Apply the text normalisation to every message and store the result.
df['transformed_text'] = df['text'].apply(transform_text)

In [None]:
# Compare the raw text with its transformed version.
df.head()

In [None]:
from wordcloud import WordCloud
# Word-cloud generator (500x500, white background); the same instance is
# used for both the spam and the ham cloud below.
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')

In [None]:
# Build the word cloud from all transformed spam messages joined by spaces.
# NOTE(review): generate() appears to return the `wc` instance itself, in which
# case spam_wc and the later ham_wc alias the same object and this cloud must
# be plotted before the ham cloud is generated (confirm).
spam_wc = wc.generate(df[df['target']==1]['transformed_text'].str.cat(sep=" "))

In [None]:
# Render the spam word cloud.
plt.imshow(spam_wc)

In [None]:
# Build the word cloud from all transformed ham messages joined by spaces.
# NOTE(review): this reuses the same `wc` instance as spam_wc; if generate()
# returns `wc` itself, spam_wc now refers to this ham cloud as well (confirm).
ham_wc = wc.generate(df[df['target']==0]['transformed_text'].str.cat(sep=" "))

In [None]:
# Render the ham word cloud.
plt.imshow(ham_wc)

In [None]:
# Preview the frame before building the per-class token lists.
df.head()

In [None]:
# Flat list of every whitespace-separated token in the transformed spam messages.
spam_corpus = [
    token
    for message in df.loc[df['target'] == 1, 'transformed_text']
    for token in message.split()
]

In [None]:
# Total number of tokens across all spam messages.
len(spam_corpus)

In [None]:
# Flat list of every whitespace-separated token in the transformed ham messages.
ham_corpus = [
    token
    for message in df.loc[df['target'] == 0, 'transformed_text']
    for token in message.split()
]

In [None]:
# Total number of tokens across all ham messages.
len(ham_corpus)

In [None]:
from collections import Counter

# 30 most frequent tokens in spam messages. The counts are computed once and
# given named columns; the original built the Counter and the DataFrame twice
# and plotted the integer-named columns 0 and 1.
top_spam_words = pd.DataFrame(
    Counter(spam_corpus).most_common(30), columns=['word', 'count']
)
sns.barplot(data=top_spam_words, x='word', y='count')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# Bag-of-words vectorizer; not referenced by any other cell shown here.
cv = CountVectorizer()
# TF-IDF vectorizer capped at 3000 features; this is the vectorizer that is
# fitted on the transformed text and pickled at the end of the notebook.
tfidf = TfidfVectorizer(max_features = 3000)

In [None]:
# Fit TF-IDF on the transformed text and convert the result to a dense array.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so test messages influence the learned vocabulary and
# IDF weights; consider fitting on the training split only.
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [None]:
# Disabled experiment: min-max scaling of the feature matrix X (left commented out).
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# X = scaler.fit_transform(X)

In [None]:
# Inspect the feature matrix.
X

In [None]:
# Target vector as a NumPy array, in the same row order as X.
y = df['target'].values

In [None]:
# Inspect the target vector.
y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=2 fixes the split so it
# is reproducible. No stratify argument is passed.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
# One instance of each Naive Bayes variant, all with default settings, to be
# fitted and compared on the same split.
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
# Gaussian Naive Bayes: fit on the training split, predict the test split,
# then print accuracy, confusion matrix and precision in that order.
gnb.fit(X_train, y_train)
y_pred1 = gnb.predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred1))

In [None]:
# Multinomial Naive Bayes: fit on the training split and score on the test
# split. Predictions now come from `mnb` itself; the original called
# gnb.predict here, so this cell re-reported the GaussianNB metrics.
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

In [None]:
# Bernoulli Naive Bayes: fit on the training split, predict the test split,
# then print accuracy, confusion matrix and precision in that order.
bnb.fit(X_train, y_train)
y_pred3 = bnb.predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred3))

In [None]:
# Selected pipeline: TF-IDF vectorizer (tfidf) --> Bernoulli Naive Bayes (bnb)

In [None]:
import pickle

# Persist the fitted vectorizer and the BernoulliNB model. The `with` blocks
# close each file as soon as it is written; the original passed bare open()
# handles to pickle.dump and never closed them.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(bnb, model_file)