In [None]:
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [None]:
# Setting hyperparameters
# Single seed for the run: passed to seed_everything() and to
# create_models(random_state=...) in the main cell.
# NOTE(review): seed_everything() defaults to 2023 while this value is 202 --
# confirm 202 is intentional and not a truncated 2023.
seed = 202

In [None]:
def seed_everything(seed=2023):
    """Seed the global random number generators used by this notebook.

    Args:
        seed: Value handed first to Python's ``random.seed`` and then to
            ``np.random.seed``.
    """
    import random as py_random

    # Same order as before: Python's generator first, then NumPy's global one.
    for apply_seed in (py_random.seed, np.random.seed):
        apply_seed(seed)

In [None]:
def load_data(path="train_v2_drcat_02.csv"):
    """Load the training data from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to the file name that was
            previously hard-coded, so existing ``load_data()`` calls behave
            exactly as before.

    Returns:
        pandas.DataFrame holding the parsed contents of the file.
    """
    return pd.read_csv(path)

In [None]:
def vectorize_text(train_data):
    """Fit a TF-IDF vectorizer on the given text and transform that text.

    Args:
        train_data: Collection of raw text documents to fit on.

    Returns:
        Tuple ``(X, vectorizer)``: the TF-IDF matrix produced for
        ``train_data`` and the fitted ``TfidfVectorizer``.
    """
    # n-gram sizes 3 through 4, with sublinear term-frequency scaling enabled.
    ngram_bounds = (3, 4)
    tfidf = TfidfVectorizer(ngram_range=ngram_bounds, sublinear_tf=True)
    features = tfidf.fit_transform(train_data)
    return features, tfidf

In [None]:
def save_vectorizer(vectorizer, path='Models/vectorizer.pkl'):
    """Pickle the fitted vectorizer to disk.

    Args:
        vectorizer: Object to serialize with ``pickle``.
        path: Destination file. Defaults to the location that was previously
            hard-coded, so existing one-argument calls are unchanged.
    """
    import os

    # open(..., 'wb') does not create missing parent directories, so a fresh
    # checkout without the target folder used to fail with FileNotFoundError.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, 'wb') as file:
        pickle.dump(vectorizer, file)

In [None]:
def create_models(random_state=None):
    """Build the unfitted soft-voting ensemble of SGD and naive Bayes models.

    Args:
        random_state: Base seed for the SGD classifiers. When it is not
            ``None`` the second and third SGD models are seeded with
            ``random_state + 1000`` and ``random_state + 2000``; when it is
            ``None`` all three SGD models are left unseeded.

    Returns:
        A ``VotingClassifier`` (``voting='soft'``) over three
        ``SGDClassifier`` and three ``MultinomialNB`` estimators.
    """
    def shifted_seed(offset):
        # Give each SGD variant its own seed, or stay unseeded without a base.
        return None if random_state is None else random_state + offset

    # SGD members: all use the modified Huber loss; they differ in iteration
    # budget, tolerance, seed, class weighting and early stopping.
    sgd_members = [
        ('sgd1', SGDClassifier(
            max_iter=5000, tol=1e-3, loss="modified_huber",
            random_state=random_state,
        )),
        ('sgd2', SGDClassifier(
            max_iter=5000, tol=1e-3, loss="modified_huber",
            random_state=shifted_seed(1000), class_weight="balanced",
        )),
        ('sgd3', SGDClassifier(
            max_iter=10000, tol=5e-4, loss="modified_huber",
            random_state=shifted_seed(2000), early_stopping=True,
        )),
    ]

    # Multinomial naive Bayes members: identical apart from the alpha value.
    mnb_members = [
        (label, MultinomialNB(alpha=alpha))
        for label, alpha in (('mnb1', 0.02), ('mnb2', 0.1), ('mnb3', 0.2))
    ]

    # Combine everything into one soft-voting ensemble.
    return VotingClassifier(
        estimators=sgd_members + mnb_members,
        voting='soft',
        verbose=0,
    )

In [None]:
def train_model(model, X, y):
    """Fit ``model`` on the given data and hand the same object back.

    Args:
        model: Any object exposing ``fit(X, y)``.
        X: Training features forwarded to ``fit``.
        y: Training targets forwarded to ``fit``.

    Returns:
        The ``model`` object that was passed in (not the value returned by
        ``fit``), now fitted.
    """
    model.fit(X, y)  # fit's own return value is intentionally ignored
    return model

In [None]:
def evaluate_model(model, X, y):
    """Print classification metrics and plot the confusion matrix and ROC curve.

    Accuracy, precision, recall and F1 are computed from the model's hard
    predictions. The ROC curve and its AUC are computed from continuous
    scores whenever the model can provide them.

    Args:
        model: Fitted classifier exposing ``predict`` and, ideally,
            ``predict_proba``.
        X: Feature matrix to evaluate on.
        y: True labels for ``X``.
            # presumably binary 0/1 with 1 = AI-generated (classify_input
            # treats probability column 1 as "AI") -- confirm against the data.
    """
    predictions = model.predict(X)

    # A ROC curve needs graded scores. Feeding it hard 0/1 predictions
    # collapses the curve to a single operating point and makes the reported
    # AUC misleading. Use column 1 of predict_proba (the same column
    # classify_input thresholds) and only fall back to the hard predictions
    # when the model exposes no probabilities.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X)[:, 1]
    else:
        y_prob = predictions.ravel()

    accuracy = accuracy_score(y, predictions)
    precision = precision_score(y, predictions)
    recall = recall_score(y, predictions)
    f1 = f1_score(y, predictions)
    cm = confusion_matrix(y, predictions)
    fpr, tpr, thresholds = roc_curve(y, y_prob)
    roc_auc = auc(fpr, tpr)

    print("Test Accuracy: {:.2f}%".format(accuracy*100))
    print("Test Precision: {:.2f}%".format(precision*100))
    print("Test Recall: {:.2f}%".format(recall*100))
    print("Test F1-score: {:.2f}%".format(f1*100))

    # Confusion matrix heatmap.
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", cbar=False)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

    # ROC curve with the chance diagonal for reference.
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
def save_model(saved_model, path='Models/VotingClassifier.pkl'):
    """Pickle the trained model to disk.

    Args:
        saved_model: Object to serialize with ``pickle``.
        path: Destination file. Defaults to the location that was previously
            hard-coded, so existing one-argument calls are unchanged.
    """
    import os

    # open(..., 'wb') does not create missing parent directories; without
    # this the save fails with FileNotFoundError after training has finished.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, 'wb') as file:
        pickle.dump(saved_model, file)

In [None]:
def classify_input(model, vectorizer):
    """Interactively classify user-entered text until the user quits.

    Reads lines from ``input()``. Typing ``exit`` (any case, surrounding
    whitespace ignored) or closing stdin ends the loop. Blank and
    whitespace-only lines are skipped. Prints ``END`` when the loop finishes.

    Args:
        model: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is compared against 0.5 to decide "AI" versus "human".
        vectorizer: Fitted vectorizer exposing ``transform``.
    """
    while True:
        try:
            user_input = input('Enter a text string to classify (or "exit" to quit): ')
        except EOFError:
            # stdin was closed (e.g. a non-interactive run): stop cleanly
            # instead of crashing with a traceback.
            break

        # Strip first so "  exit " quits and whitespace-only lines are skipped
        # rather than being classified as if they were real text.
        text = user_input.strip()
        if text.lower() == 'exit':
            break
        if not text:
            continue

        features = vectorizer.transform([text])
        ai_probability = model.predict_proba(features)[0][1]
        if ai_probability > 0.5:
            print("Text classified as AI.")
        else:
            print("Text classified as human.")
    print("END")

In [None]:
if __name__ == '__main__':
    # End-to-end run: seed, load, split, vectorize, train, evaluate, save,
    # then open the interactive classification prompt.
    seed_everything(seed)
    data = load_data()

    # Split data into training and testing sets.
    # random_state is passed explicitly so the split is reproducible on its
    # own, rather than depending on how much of NumPy's global random stream
    # was consumed before this line.
    train_text, test_text, train_labels, test_labels = train_test_split(
        data['text'], data['label'], test_size=0.2, random_state=seed
    )

    # Vectorize text data: fit on the training text only, then reuse the
    # fitted vectorizer for the held-out text.
    X_train_vectorized, vectorizer = vectorize_text(train_text)
    save_vectorizer(vectorizer)
    X_test_vectorized = vectorizer.transform(test_text)

    # Create and train the model
    model = create_models(random_state=seed)
    trained_model = train_model(model, X_train_vectorized, train_labels)

    # Evaluate the model on the held-out split
    evaluate_model(trained_model, X_test_vectorized, test_labels)

    # Save the model
    save_model(trained_model)

    # Allow interactive classification
    classify_input(trained_model, vectorizer)