In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import csv
from datetime import datetime
import re

In [None]:
# Name of the dataset column that holds the class label to predict.
label_modularity = "NewClass"
# Key handed to select_model(): "NB", "SVM", "RF", "KNN", "LR", or "All" for every classifier.
model = "All"
# Fraction of the rows that train_test_split holds out as the validation set.
validation_size = 0.1

In [None]:
def select_model(model):
    """Build the classifiers requested by ``model``.

    Args:
        model: One of ``"NB"``, ``"SVM"``, ``"RF"``, ``"KNN"``, ``"LR"`` to get a
            single classifier, or ``"All"`` to get all five.

    Returns:
        A list of dicts, each with an unfitted estimator under ``"m"`` and its
        display name under ``"name"``. The order is always Naive Bayes, SVM,
        Random Forest, KNN, LR. An unrecognised key yields a single
        Naive Bayes entry.
    """
    # (selection key, display name, factory). Factories are only called for the
    # entries that are actually requested, so every call returns fresh estimators.
    catalogue = (
        ("NB", "Naive Bayes", lambda: MultinomialNB()),
        ("SVM", "SVM", lambda: SVC(kernel="linear", random_state=42)),
        ("RF", "Random Forest", lambda: RandomForestClassifier(n_estimators=100, random_state=42)),
        ("KNN", "KNN", lambda: KNeighborsClassifier(n_neighbors=5)),
        ("LR", "LR", lambda: LogisticRegression()),
    )

    chosen = [entry for entry in catalogue if model == entry[0] or model == "All"]

    # Unknown key: fall back to Naive Bayes (the first catalogue entry).
    if not chosen:
        chosen = [catalogue[0]]

    return [{"m": build(), "name": label} for _key, label, build in chosen]

In [None]:
# CSV file the experiment reads its texts and labels from.
filename = 'FinalDataset/polished_dataset_nosub_67.csv'
# Tag written as the last field of every row appended to the results CSV.
name_to_save = "nosub"

data = pd.read_csv(filename)

# Hold out `validation_size` of the rows for validation; the fixed seed keeps the split reproducible.
train_data, validation_data = train_test_split(data, test_size=validation_size, random_state=42)

In [None]:
# Distinct class labels in the training split: `original_labels` lists them in
# order of first appearance, `num_labels` is how many distinct labels there are.
original_labels = train_data[label_modularity].unique().tolist()
num_labels = train_data[label_modularity].nunique()
print(num_labels)

In [None]:
# Number of training texts (one text per row of the training split).
num_texts = len(train_data)
print(num_texts)

In [None]:
# Total number of whitespace-separated words across all training texts.
# Assumes every 'Text' value is a string -- a missing value would make join() fail (TODO confirm the column has no NaN).
all_text = ' '.join(train_data['Text'].values)
num_words = len(all_text.split())
print(num_words)

In [None]:
# Z = number of training texts divided by the average number of words per text.
z = num_texts / (num_words/num_texts)
print(z)

In [None]:
# Start of the preprocessing timer; the elapsed seconds are printed once the
# TF-IDF features have been built. The measurement spans several cells, so it is
# only meaningful when those cells are executed back-to-back (e.g. "Run All").
token_start = time.time()

In [None]:
# Encode the string class labels as integer ids.
#
# The encoder is fitted exactly once, on the labels of the complete dataset, and
# then only *applied* to each split. Re-fitting it on the validation labels (as a
# second fit_transform would do) re-learns the label -> id mapping from whatever
# classes happen to be present there, so validation ids would stop matching the
# ids the models were trained on. Fitting on all labels also means a class that
# only occurs in the validation split does not make transform() fail.
label_encoder = LabelEncoder()
label_encoder.fit(data[label_modularity])

# The split frames are deliberately not overwritten: the label columns keep their
# original names, which makes this cell safe to re-run.
labels = label_encoder.transform(train_data[label_modularity]).tolist()
val_labels = label_encoder.transform(validation_data[label_modularity]).tolist()

In [None]:
# TF-IDF features: lower-cased text, English stop words removed, vocabulary capped at 1000 terms.
# The vocabulary and IDF weights are learned from the training texts only and then
# applied unchanged to the validation texts.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english', lowercase=True)
train_features = vectorizer.fit_transform(train_data['Text'])
validation_features = vectorizer.transform(validation_data['Text'])

In [None]:
# End of the preprocessing timer started with `token_start`.
token_finish = time.time()

# Elapsed wall-clock seconds between the two timer cells.
print(token_finish - token_start)

In [None]:
# Train and evaluate every selected classifier, appending one result row per
# model to the shared results CSV.
classifiers = select_model(model)

# Output settings do not change between models, so they are defined once.
csv_file = "Results/ModelResults.csv"
# One header per value written below. The trailing "Dataset" column names the
# `name_to_save` field, which every row carries as its 12th value.
column_names = ["Model Name", "Num Classes", "Total Texts", "Total Words", "Accuracy", "Precision", "F1 Score", "Recall", "Train Time", "Validation Time", "Date", "Dataset"]

for c in classifiers:
    # --- training ---
    train_start = time.time()
    c["m"].fit(train_features, labels)
    train_finish = time.time()
    print("Train Time "+c["name"]+": "+ str(train_finish - train_start))

    # --- validation (the timing covers prediction, metric computation and printing) ---
    val_start = time.time()

    # `val_predictions` is overwritten on every iteration; after the loop it holds
    # the predictions of the last classifier only.
    val_predictions = c["m"].predict(validation_features)
    accuracy = accuracy_score(val_labels, val_predictions)
    precision = precision_score(val_labels, val_predictions, average='weighted')
    recall = recall_score(val_labels, val_predictions, average='weighted')
    f1 = f1_score(val_labels, val_predictions, average='weighted')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

    val_finish = time.time()
    print("Validation Time "+c["name"]+": "+ str(val_finish - val_start))

    # --- save results ---
    current_date = datetime.now().strftime('%Y-%m-%d')

    with open(csv_file, mode='a', newline='') as file:
        writer = csv.writer(file)

        # In append mode the position starts at the end of the file, so 0 means
        # the file is new/empty and still needs its header row.
        if file.tell() == 0:
            writer.writerow(column_names)

        writer.writerow([c["name"], num_labels, num_texts, num_words, accuracy, precision, f1, recall, train_finish - train_start, val_finish - val_start, current_date, name_to_save])

In [None]:
# Confusion matrix for the predictions currently held in `val_predictions`.
# `labels` is passed explicitly so the matrix always has one row/column per
# encoder class, in encoder order. Without it sklearn only includes the ids that
# occur in y_true or y_pred, and row i would no longer be guaranteed to
# correspond to label_encoder.classes_[i].
conf_matrix = confusion_matrix(val_labels, val_predictions, labels=np.arange(len(label_encoder.classes_)))

In [None]:
# Per-class accuracy: for each true class (one confusion-matrix row), the share
# of its validation samples that sit on the diagonal, i.e. were predicted correctly.
class_accuracy = {}
for idx, class_name in enumerate(label_encoder.classes_):
    row = conf_matrix[idx]
    total = row.sum()
    correct = row[idx]
    if total > 0:
        accuracy = correct / total
    else:
        # No validation samples of this class: report 0.0 instead of dividing by zero.
        accuracy = 0.0
    class_accuracy[class_name] = accuracy

In [None]:
# Per-class accuracy, listed from the best class to the worst.
print("Accuracy by class:")
sorted_class_accuracy = sorted(
    class_accuracy.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
for class_name, acc in sorted_class_accuracy:
    print(f"{class_name}: {acc}")

# Overall accuracy across all validation samples.
accuracy = accuracy_score(val_labels, val_predictions)
print("General Accuracy:", accuracy)

In [None]:
# Heat-map of the validation confusion matrix.
normalize = False  # not read anywhere in this cell; kept so the name stays defined
cmap = plt.cm.Blues

# Size ticks and grid from the matrix itself so they always match what is drawn.
n_classes = conf_matrix.shape[0]

# Matrix rows/columns follow the encoder's integer ids, i.e. the order of
# label_encoder.classes_ -- not the first-appearance order of `original_labels`.
class_names = list(label_encoder.classes_)
if len(class_names) != n_classes:
    # The matrix was built over a different label set than the encoder holds;
    # show plain indices rather than attach names to the wrong rows.
    class_names = [str(idx) for idx in range(n_classes)]

plt.figure(figsize=(14, 10))
plt.title("")

plt.imshow(conf_matrix, interpolation='nearest', cmap=cmap)
plt.colorbar()
tick_marks = np.arange(n_classes)
plt.xticks(tick_marks, class_names, rotation=90)
plt.yticks(tick_marks, class_names)

# Thin black outline around every cell so neighbouring values are easy to tell apart.
for i in range(n_classes):
    for j in range(n_classes):
        plt.gca().add_patch(plt.Rectangle((j - 0.5, i - 0.5), 1, 1, color='black', fill=None, linewidth=0.5))

plt.xlabel('Predicted Label')
plt.ylabel('Real Label')
plt.show()