# Import the required libraries

In [None]:
# Regular expressions to remove unwanted characters
import re

# Pandas to store the dataset and make the necessary operations for EDA
import pandas as pd

# Matplotlib to visualize our data
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# NLTK tools to help clean the string for the model to train
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import string

# TF-IDF vectorizer to convert the words into vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# To split the data for train and test
from sklearn.model_selection import train_test_split

# Metrics to measure the performance and accuracy for the model
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score

# Calculate the model running time
import time

# Use Counter to calculate the most common words
from collections import Counter

# Exploratory Data Analysis

### Load the data

In [None]:
# Read the raw dataset from 'spam.csv'; the file is decoded as latin-1
# instead of pandas' default UTF-8.
spam_df = pd.read_csv('spam.csv', encoding='latin-1')
# Preview the first rows of the freshly loaded frame.
spam_df.head()

### Drop the unwanted columns

In [None]:
# List the column labels so the unwanted ones can be identified.
spam_df.columns

In [None]:
# Build the working frame without the three 'Unnamed' columns; spam_df itself
# is left untouched because drop() returns a new frame.
df = spam_df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis='columns')
df.head()

### Rename the columns

In [None]:
# Give the two remaining columns descriptive names: v1 -> target, v2 -> text.
df = df.rename(columns={'v1': 'target', 'v2': 'text'})
df.head()

### Check for duplicate values

In [None]:
# Number of rows that are exact repeats of an earlier row.
df.duplicated().sum()

### Remove the duplicates

In [None]:
# Keep only the first occurrence of every row; later repeats are masked out.
df = df[~df.duplicated(keep='first')]
df.head()

### Convert the target column into numerical values

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the string labels with integer codes. LabelEncoder numbers the
# classes in sorted order; the rest of the notebook treats 0 as "not spam"
# and 1 as "spam" (assumes the raw labels sort that way — verify on the data).
# The encoder instance is not kept, so the mapping cannot be inverted later.
df['target'] = LabelEncoder().fit_transform(df['target'])
df.head()

### Count the total number of spam and not spam text

In [None]:
# Rows per encoded class, most frequent class first.
df['target'].value_counts()

### Let's visualize the spam count in a pie chart

In [None]:
# value_counts() orders by frequency, so pairing it with fixed labels only works
# while class 0 happens to be the majority. Reindexing by the encoded class
# (0 = not spam, 1 = spam, as used elsewhere in this notebook) ties every slice
# to its label regardless of the class balance; a missing class becomes 0.
plt.pie(df['target'].value_counts().reindex([0, 1], fill_value=0),
        labels=['not spam', 'spam'], autopct='%0.2f')

### Let's create a new column to keep the number of characters in each text

In [None]:
# Length of every message in characters.
df['num_characters'] = df['text'].map(len)
df.head()

### Let's create a new column to keep the number of words in each text

In [None]:
# Token count per message, using NLTK's word tokenizer.
df['num_words'] = [len(nltk.word_tokenize(message)) for message in df['text']]
df.head()

### Let's create a new column to keep the number of sentences in each text

In [None]:
# Sentence count per message, using NLTK's sentence tokenizer.
df['num_sentence'] = [len(nltk.sent_tokenize(message)) for message in df['text']]
df.head()

In [None]:
class SpamClassifier():
    """Clean message text, vectorize it with TF-IDF and benchmark classifiers.

    On construction the text is normalised (Snowball stemming), converted to a
    TF-IDF matrix and split into train/test partitions. Any estimator exposing
    ``fit``/``predict`` can then be evaluated through ``run_classifier``.

    Attributes set by the constructor:
        max_features: vocabulary size limit handed to ``TfidfVectorizer``.
        y: the labels passed in (0 = not spam, 1 = spam in the plots).
        transformed_text: the cleaned text, one string per input row.
        vectorizer: the fitted ``TfidfVectorizer``.
        X_train, X_test, y_train, y_test: the train/test partitions.

    NOTE(review): the vectorizer is fitted on the full corpus before the
    train/test split, so IDF statistics of the test rows leak into training.
    This is left unchanged so previously reported metrics stay comparable.
    """

    # Compiled once: every character that is not an ASCII letter becomes a space.
    _NON_LETTERS = re.compile(r'[^a-zA-Z]')

    def __init__(self, X, y, max_features, *, test_size=0.2, random_state=42, plot=True):
        """Prepare the data set.

        Args:
            X: pandas Series of raw message strings.
            y: labels aligned with ``X``.
            max_features: maximum TF-IDF vocabulary size.
            test_size: fraction of rows held out for testing.
            random_state: seed for the train/test split.
            plot: when true, show the most-common-word bar charts.
        """
        self.max_features = max_features
        self.y = y

        X_vec = self.transform_and_vectorize(X)
        if plot:
            self.plot_most_common_words()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X_vec, y, test_size=test_size, random_state=random_state)

    def _english_stop_words(self):
        """Return the English stop words as a frozenset, loading them only once.

        ``stopwords.words`` builds a fresh list on every call, so the result is
        cached on the instance and turned into a set for O(1) membership tests.
        """
        cached = getattr(self, '_stop_words', None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            self._stop_words = cached
        return cached

    # Clean the feature column
    def transform_text(self, text, method, method_name):
        """Lower-case ``text``, drop non-letters and stop words, normalise tokens.

        Args:
            text: the raw message.
            method: object providing ``stem`` (when ``method_name`` is
                ``'stemmer'``) or ``lemmatize`` (any other ``method_name``).
            method_name: ``'stemmer'`` selects ``method.stem``; anything else
                selects ``method.lemmatize``.

        Returns:
            The surviving normalised tokens joined by single spaces.
        """
        tokens = self._NON_LETTERS.sub(' ', text).lower().split()
        stop_words = self._english_stop_words()
        normalise = method.stem if method_name == 'stemmer' else method.lemmatize
        # No separate punctuation filter is needed: after the substitution above
        # every token consists of ASCII letters only.
        return ' '.join(normalise(token) for token in tokens if token not in stop_words)

    # Convert the feature column into vectors
    def data_into_vector(self, X):
        """Fit a TF-IDF vectorizer on ``X`` and return the resulting matrix.

        The fitted vectorizer is kept as ``self.vectorizer`` so unseen text can
        later be transformed with the same vocabulary.
        """
        self.vectorizer = TfidfVectorizer(max_features=self.max_features)
        return self.vectorizer.fit_transform(X)

    # Clean the feature using Porter stemming
    def using_porter_stemming(self, X):
        """Return ``X`` cleaned with NLTK's Porter stemmer."""
        stemmer = PorterStemmer()
        return X.apply(lambda x: self.transform_text(x, stemmer, 'stemmer'))

    # Clean the feature using Snowball stemming
    def using_snowball_stemming(self, X):
        """Return ``X`` cleaned with NLTK's English Snowball stemmer."""
        stemmer = SnowballStemmer('english')
        return X.apply(lambda x: self.transform_text(x, stemmer, 'stemmer'))

    # Default stemming strategy
    def using_stemming(self, X):
        """Return ``X`` cleaned with the default (Snowball) stemmer.

        Two methods used to share this name, and the Snowball one, being
        defined last, was the one that actually ran; that behaviour is kept.
        """
        return self.using_snowball_stemming(X)

    # Clean the feature using lemmatization
    def using_lemmatization(self, X):
        """Return ``X`` cleaned with the WordNet lemmatizer."""
        lemma = WordNetLemmatizer()
        return X.apply(lambda x: self.transform_text(x, lemma, 'lemma'))

    # Transform and vectorize the text
    def transform_and_vectorize(self, X):
        """Clean ``X``, remember the cleaned text and return its TF-IDF matrix."""
        self.transformed_text = self.using_stemming(X)

        # Backward compatibility: the notebook inspects a module-level ``df``
        # after construction, so mirror the cleaned text onto it when such a
        # frame exists and lines up with X. Nothing in the class reads it back.
        notebook_frame = globals().get('df')
        if (isinstance(notebook_frame, pd.DataFrame)
                and notebook_frame.index.equals(self.transformed_text.index)):
            notebook_frame['transform_text'] = self.transformed_text

        return self.data_into_vector(self.transformed_text)

    # Collect every word of the cleaned texts that carry the given label
    def calculate_the_total_words(self, target):
        """Return a flat list of all words in the cleaned texts labelled ``target``."""
        is_target = self.y == target
        return [word
                for text in self.transformed_text[is_target]
                for word in text.split()]

    # Plot barplot from corpus
    def plot_barplot(self, corpus, x_label, top_n=30):
        """Draw a bar chart of the ``top_n`` most frequent words in ``corpus``.

        Args:
            corpus: iterable of words.
            x_label: label for the x axis.
            top_n: how many of the most common words to show.
        """
        top_words = pd.DataFrame(Counter(corpus).most_common(top_n),
                                 columns=['word', 'count'])
        if top_words.empty:
            # Nothing to draw for an empty corpus.
            return

        plt.figure(figsize=(18, 6))
        # x/y are passed by keyword: recent seaborn releases no longer accept
        # them positionally.
        sns.barplot(x='word', y='count', data=top_words)
        plt.xticks(rotation='vertical')
        plt.xlabel(x_label)
        plt.ylabel('Number of words')
        plt.show()

    # Plot the graph for most common words in the data for spam and not spam text
    def plot_most_common_words(self):
        """Show the most common words for label 0 (not spam) and label 1 (spam)."""
        corpus_not_spam = self.calculate_the_total_words(0)
        corpus_spam = self.calculate_the_total_words(1)

        self.plot_barplot(corpus_not_spam, 'Most common non spam words')
        self.plot_barplot(corpus_spam, 'Most common spam words')

    # Training the model
    def model_train(self, X_train, y_train, classifier):
        """Print ``classifier``, fit it on the given data and return the result of ``fit``."""
        print(classifier)
        return classifier.fit(X_train, y_train)

    # Prediction from model
    def model_prediction(self, model, X_test):
        """Return ``model``'s predictions for ``X_test``."""
        return model.predict(X_test)

    # Count number of prediction are true or false using confusion matrix
    def model_confusion_matrix(self, y_test, y_pred):
        """Return the confusion matrix of true versus predicted labels."""
        return confusion_matrix(y_test, y_pred)

    # Model accuracy score
    def model_accuracy(self, y_test, y_pred):
        """Return the accuracy of ``y_pred`` against ``y_test``."""
        return accuracy_score(y_test, y_pred)

    # Model precision score
    def model_precision(self, y_test, y_pred):
        """Return the precision of ``y_pred`` against ``y_test``."""
        return precision_score(y_test, y_pred)

    # Train the model and check the performance
    def train_check_performance(self, classifier):
        """Fit ``classifier`` on the training split and score it on the test split.

        Returns:
            Tuple ``(model, y_pred, confusion_mat, accuracy, precision)``.
        """
        model = self.model_train(self.X_train, self.y_train, classifier)
        y_pred = self.model_prediction(model, self.X_test)

        confusion_mat = self.model_confusion_matrix(self.y_test, y_pred)
        accuracy = self.model_accuracy(self.y_test, y_pred)
        precision = self.model_precision(self.y_test, y_pred)

        return (model, y_pred, confusion_mat, accuracy, precision)

    # Call this to train model
    def run_classifier(self, classifier):
        """Public entry point: train and evaluate ``classifier``.

        Returns the same tuple as ``train_check_performance``.
        """
        return self.train_check_performance(classifier)

In [None]:
# Accumulates one summary row per evaluated classifier.
performance_matrix = []


def run_classifier_check_performance(spam_classifier, classifier):
    """Evaluate ``classifier`` via ``spam_classifier`` and report the scores.

    A dict holding the classifier, its accuracy and its precision is appended
    to the module-level ``performance_matrix``; the confusion matrix, accuracy
    and precision are printed between separator rules.
    """
    rule = "=============================================================="
    _model, _predictions, confusion_mat, accuracy, precision = (
        spam_classifier.run_classifier(classifier)
    )
    performance_matrix.append(
        dict(Classifier=classifier, Accuracy=accuracy, Precision=precision)
    )

    report = (
        ("Confusion Matrix:\n", confusion_mat),
        ("Accuracy:", accuracy),
        ("Precision:", precision),
    )
    print(rule)
    for caption, value in report:
        print(caption, value)
        print(rule)

In [None]:
# Prepare the data once: clean the text, build a TF-IDF matrix capped at
# 5000 features and split it into train/test partitions.
spam_classifier = SpamClassifier(X=df['text'], y=df['target'], max_features=5000)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
# from xgboost import XGBClassifier

In [None]:
# One instance of every candidate model to benchmark. The five ensemble models
# (rfc, abc, bc, etc, gbdt) all use n_estimators=50 and random_state=2.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
bnb = BernoulliNB()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
# XGBoost is disabled here, matching the commented-out xgboost import.
# xgb = XGBClassifier(n_estimators=50,random_state=2)

In [None]:
# GaussianNB is deliberately left out: it needs dense input, whereas the
# TF-IDF features are a sparse matrix made up mostly of zeros.
classifiers = [svc, knc, bnb, mnb, dtc, lrc, rfc, abc, bc, etc, gbdt]

# Evaluate each model in turn, leaving three blank lines between reports.
for classifier in classifiers:
    run_classifier_check_performance(spam_classifier, classifier)
    print("\n\n")

In [None]:
# Preview the frame again; constructing SpamClassifier assigns a
# 'transform_text' column to df.
df.head()

In [None]:
# Tabulate the collected results: one row per evaluated classifier with its
# accuracy and precision.
performance_df = pd.DataFrame(performance_matrix)
performance_df