In [1]:
import pandas as pd
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
df = pd.read_csv('email_data.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.tail()

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [4]:
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
df.isna()
null_values = df.isnull().sum()

# Print the null values for each column
print("Null Values in Each Column:")
print(null_values)

Null Values in Each Column:
Category    0
Message     0
dtype: int64


In [7]:
ham_count = df['Category'].value_counts().get('ham', 0)
spam_count = df['Category'].value_counts().get('spam', 0)

ham_count

4825

In [8]:
spam_count


747

# Implementing Multinomial Naive Bayse


In [9]:

from sklearn.naive_bayes import MultinomialNB

# Extract features (X) and labels (Y)
X = df['Message']
Y = df['Category']

# Split the data into training (70%) and testing (30%) sets with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, stratify=Y, test_size=0.3, random_state=42
)

# Convert the text data into numerical features using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Initialize the Naive Bayes classifier (MultinomialNB)
classifier_nb = MultinomialNB()

# Measure training time
start_time = time.time()
classifier_nb.fit(X_train_vectorized, y_train)
training_time = time.time() - start_time

# Make predictions on the test set
predictions_nb = classifier_nb.predict(X_test_vectorized)

# Evaluate the performance
accuracy_nb = accuracy_score(y_test, predictions_nb)
conf_matrix_nb = confusion_matrix(y_test, predictions_nb)
classification_rep_nb = classification_report(y_test, predictions_nb)

# Calculate error rate
error_rate_nb = 1 - accuracy_nb

# Print the results
print("Multinomial Naive Bayes Classifier:")
print("Training Time: {:.4f} seconds".format(training_time))
print("Accuracy:", accuracy_nb)
print("Error Rate:", error_rate_nb)
print("\nConfusion Matrix:\n", conf_matrix_nb)
print("\nClassification Report:\n", classification_rep_nb)

Multinomial Naive Bayes Classifier:
Training Time: 0.0130 seconds
Accuracy: 0.9844497607655502
Error Rate: 0.015550239234449759

Confusion Matrix:
 [[1444    4]
 [  22  202]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1448
        spam       0.98      0.90      0.94       224

    accuracy                           0.98      1672
   macro avg       0.98      0.95      0.97      1672
weighted avg       0.98      0.98      0.98      1672



# Implementing Gaussian Naive Bayes

In [10]:

from sklearn.naive_bayes import GaussianNB

# Extract features (X) and labels (Y)
X = df['Message']
Y = df['Category']

# Split the data into training (70%) and testing (30%) sets with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, stratify=Y, test_size=0.3, random_state=42
)

# Convert the text data into numerical features using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Initialize the Naive Bayes classifier (GaussianNB)
classifier_gaussian = GaussianNB()

# Measure training time
start_time = time.time()
classifier_gaussian.fit(X_train_vectorized.toarray(), y_train)
training_time = time.time() - start_time

# Make predictions on the test set
predictions_gaussian = classifier_gaussian.predict(X_test_vectorized.toarray())

# Evaluate the performance
accuracy_gaussian = accuracy_score(y_test, predictions_gaussian)
conf_matrix_gaussian = confusion_matrix(y_test, predictions_gaussian)
classification_rep_gaussian = classification_report(y_test, predictions_gaussian)

# Calculate error rate
error_rate_gaussian = 1 - accuracy_gaussian

# Print the results
print("Gaussian Naive Bayes Classifier:")
print("Training Time: {:.4f} seconds".format(training_time))
print("Accuracy:", accuracy_gaussian)
print("Error Rate:", error_rate_gaussian)
print("\nConfusion Matrix:\n", conf_matrix_gaussian)
print("\nClassification Report:\n", classification_rep_gaussian)

Gaussian Naive Bayes Classifier:
Training Time: 0.8585 seconds
Accuracy: 0.8869617224880383
Error Rate: 0.11303827751196172

Confusion Matrix:
 [[1284  164]
 [  25  199]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.98      0.89      0.93      1448
        spam       0.55      0.89      0.68       224

    accuracy                           0.89      1672
   macro avg       0.76      0.89      0.80      1672
weighted avg       0.92      0.89      0.90      1672



# Implementing DecisionTreeClassifier(J48)

In [11]:
from sklearn.tree import DecisionTreeClassifier

# Split the data into training (70%) and testing (30%) sets with stratification
X_train, X_test, y_train, y_test = train_test_split(
    df['Message'], df['Category'], stratify=df['Category'], test_size=0.3, random_state=42
)

# Convert the text data into numerical features using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Initialize the Decision Tree classifier
classifier_dt = DecisionTreeClassifier(random_state=42)

# Measure training time
start_time = time.time()
classifier_dt.fit(X_train_vectorized, y_train)
training_time = time.time() - start_time

# Make predictions on the test set
predictions_dt = classifier_dt.predict(X_test_vectorized)

# Evaluate the performance
accuracy_dt = accuracy_score(y_test, predictions_dt)
conf_matrix_dt = confusion_matrix(y_test, predictions_dt)
classification_rep_dt = classification_report(y_test, predictions_dt)

# Calculate error rate
error_rate_dt = 1 - accuracy_dt

# Print the results
print("Decision Tree Classifier:")
print("Training Time: {:.4f} seconds".format(training_time))
print("Accuracy:", accuracy_dt)
print("Error Rate:", error_rate_dt)
print("\nConfusion Matrix:\n", conf_matrix_dt)
print("\nClassification Report:\n", classification_rep_dt)


Decision Tree Classifier:
Training Time: 0.0966 seconds
Accuracy: 0.9641148325358851
Error Rate: 0.03588516746411485

Confusion Matrix:
 [[1427   21]
 [  39  185]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.97      0.99      0.98      1448
        spam       0.90      0.83      0.86       224

    accuracy                           0.96      1672
   macro avg       0.94      0.91      0.92      1672
weighted avg       0.96      0.96      0.96      1672



# Multinomial Naive Bayes GUI

In [12]:
from PyQt5.QtCore import Qt
import sys
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QLabel, QPushButton, QTextEdit
from PyQt5.QtCore import Qt  # Add this import line
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
import time

# ... (rest of your code)


class NaiveBayesGUI(QWidget):
    def __init__(self):
        super().__init__()

        self.init_ui()

    def init_ui(self):
        self.setWindowTitle('Multinomial Naive Bayes Classifier GUI')
        self.setGeometry(100, 100, 600, 400)

        df= pd.read_csv('email_data.csv')

        # Extract features (X) and labels (Y)
        X = df['Message']
        Y = df['Category']

        # Split the data into training (70%) and testing (30%) sets with stratification
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, stratify=Y, test_size=0.3, random_state=42
        )

        # Convert the text data into numerical features using CountVectorizer
        self.vectorizer = CountVectorizer()
        X_train_vectorized = self.vectorizer.fit_transform(X_train)
        X_test_vectorized = self.vectorizer.transform(X_test)

        # Initialize the Naive Bayes classifier (MultinomialNB)
        self.classifier_nb = MultinomialNB()

        # Measure training time
        start_time = time.time()
        self.classifier_nb.fit(X_train_vectorized, y_train)
        self.training_time = time.time() - start_time

        # Create GUI components
        self.text_input = QTextEdit(self)
        self.text_input.setPlaceholderText('Enter a message for classification...')
        
        self.classify_button = QPushButton('Classify', self)
        self.classify_button.clicked.connect(self.classify_message)

        self.result_label = QLabel(self)
        self.result_label.setTextInteractionFlags(Qt.TextSelectableByMouse)

        # Set up the layout
        layout = QVBoxLayout()
        layout.addWidget(self.text_input)
        layout.addWidget(self.classify_button)
        layout.addWidget(self.result_label)

        self.setLayout(layout)

    def classify_message(self):
        # Get the input text from the user
        user_input = self.text_input.toPlainText()

        # Vectorize the input text
        input_vectorized = self.vectorizer.transform([user_input])

        # Make predictions
        prediction = self.classifier_nb.predict(input_vectorized)

        # Display the result
        self.result_label.setText(f'Predicted Category: {prediction[0]}\nTraining Time: {self.training_time:.4f} seconds')


def main():
    app = QApplication(sys.argv)
    window = NaiveBayesGUI()
    window.show()
    sys.exit(app.exec_())

if __name__ == '__main__':
    main()


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


# Gaussian NaiveBayes GUI

In [None]:
from PyQt5.QtCore import Qt
import sys
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QLabel, QPushButton, QTextEdit
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
import time

class NaiveBayesGUI(QWidget):
    def __init__(self):
        super().__init__()

        self.init_ui()

    def init_ui(self):
        self.setWindowTitle('Gaussian Naive Bayes Classifier GUI')
        self.setGeometry(100, 100, 600, 400)

        df = pd.read_csv('email_data.csv')

        # Extract features (X) and labels (Y)
        X = df['Message']
        Y = df['Category']

        # Split the data into training (70%) and testing (30%) sets with stratification
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, stratify=Y, test_size=0.3, random_state=42
        )

        # Convert the text data into numerical features using CountVectorizer
        self.vectorizer = CountVectorizer()
        X_train_vectorized = self.vectorizer.fit_transform(X_train)
        X_test_vectorized = self.vectorizer.transform(X_test)

        # Measure training time
        start_time = time.time()

        # Convert the sparse matrix to a dense array for training
        X_train_vectorized_dense = X_train_vectorized.toarray()

        # Initialize the Naive Bayes classifier (GaussianNB)
        self.classifier_nb = GaussianNB()

        # Fit the classifier using the dense array
        self.classifier_nb.fit(X_train_vectorized_dense, y_train)

        self.training_time = time.time() - start_time

        # Create GUI components
        self.text_input = QTextEdit(self)
        self.text_input.setPlaceholderText('Enter a message for classification...')
        
        self.classify_button = QPushButton('Classify', self)
        self.classify_button.clicked.connect(self.classify_message)

        self.result_label = QLabel(self)
        self.result_label.setTextInteractionFlags(Qt.TextSelectableByMouse)

        # Set up the layout
        layout = QVBoxLayout()
        layout.addWidget(self.text_input)
        layout.addWidget(self.classify_button)
        layout.addWidget(self.result_label)

        self.setLayout(layout)

    def classify_message(self):
        # Get the input text from the user
        user_input = self.text_input.toPlainText()

        # Vectorize the input text
        input_vectorized = self.vectorizer.transform([user_input])

        # Convert the sparse matrix to a dense array for prediction
        input_vectorized_dense = input_vectorized.toarray()

        # Make predictions
        prediction = self.classifier_nb.predict(input_vectorized_dense)

        # Display the result
        self.result_label.setText(f'Predicted Category: {prediction[0]}\nTraining Time: {self.training_time:.4f} seconds')


def main():
    app = QApplication(sys.argv)
    window = NaiveBayesGUI()
    window.show()
    sys.exit(app.exec_())

if __name__ == '__main__':
    main()


# Decision Tree GUI



In [None]:

class DecisionTreeGUI(QWidget):
    def __init__(self):
        super().__init__()

        self.init_ui()

    def init_ui(self):
        self.setWindowTitle('Decision Tree Classifier GUI')
        self.setGeometry(100, 100, 600, 400)

        df= pd.read_csv('email_data.csv')

        # Extract features (X) and labels (Y)
        X = df['Message']
        Y = df['Category']

        # Split the data into training (70%) and testing (30%) sets with stratification
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, stratify=Y, test_size=0.3, random_state=42
        )

        # Convert the text data into numerical features using CountVectorizer
        self.vectorizer = CountVectorizer()
        X_train_vectorized = self.vectorizer.fit_transform(X_train)
        X_test_vectorized = self.vectorizer.transform(X_test)

        # Initialize the Naive Bayes classifier (MultinomialNB)
        self.classifier_nb = DecisionTreeClassifier(random_state=42)
        # Measure training time
        start_time = time.time()
        self.classifier_nb.fit(X_train_vectorized, y_train)
        self.training_time = time.time() - start_time

        # Create GUI components
        self.text_input = QTextEdit(self)
        self.text_input.setPlaceholderText('Enter a message for classification...')
        
        self.classify_button = QPushButton('Classify', self)
        self.classify_button.clicked.connect(self.classify_message)

        self.result_label = QLabel(self)
        self.result_label.setTextInteractionFlags(Qt.TextSelectableByMouse)

        # Set up the layout
        layout = QVBoxLayout()
        layout.addWidget(self.text_input)
        layout.addWidget(self.classify_button)
        layout.addWidget(self.result_label)

        self.setLayout(layout)

    def classify_message(self):
        # Get the input text from the user
        user_input = self.text_input.toPlainText()

        # Vectorize the input text
        input_vectorized = self.vectorizer.transform([user_input])

        # Make predictions
        prediction = self.classifier_nb.predict(input_vectorized)

        # Display the result
        self.result_label.setText(f'Predicted Category: {prediction[0]}\nTraining Time: {self.training_time:.4f} seconds')


def main():
    app = QApplication(sys.argv)
    window = DecisionTreeGUI()
    window.show()
    sys.exit(app.exec_())

if __name__ == '__main__':
    main()
