In [None]:
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Hyperparameters
# Random seed handed to seed_everything() by the __main__ cell.
# NOTE(review): seed_everything() defaults to 2023 — confirm 202 is the
# intended value and not a truncated 2023.
seed = 202

In [None]:
def seed_everything(seed=2023):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module and NumPy's global generator, and, when
    TensorFlow can be imported, TensorFlow's global generator as well so that
    Keras weight initialisation is repeatable between runs.

    Args:
        seed: Integer seed applied to all generators. Defaults to 2023.
    """
    import random

    random.seed(seed)
    np.random.seed(seed)

    # Keras draws its initial weights from TensorFlow's own generator, which
    # the two calls above do not touch.
    try:
        import tensorflow as tf
    except ImportError:
        # TensorFlow is not installed: there is nothing further to seed.
        return
    tf.random.set_seed(seed)

In [None]:
def load_data(path="train_v2_drcat_02.csv"):
    """Read the training CSV into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to the file name this
            notebook originally hard-coded, resolved relative to the current
            working directory.

    Returns:
        pandas.DataFrame holding the file's contents. The ``__main__`` cell
        reads the ``text`` and ``label`` columns from it.
    """
    return pd.read_csv(path)

In [None]:
def tokenize_data(train_data, test_data):
    """Fit a tokenizer on the training texts and encode both splits.

    The tokenizer is created with ``num_words=5000`` and fitted on
    ``train_data`` only; the test texts are encoded with that same fitted
    tokenizer. Every encoded text is passed through ``pad_sequences`` with
    ``maxlen=100``.

    Args:
        train_data: Iterable of training texts.
        test_data: Iterable of test texts.

    Returns:
        Tuple ``(tokenizer, train_sequences, test_sequences)``.
    """
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(train_data)

    def _encode(texts):
        # Word-index encoding followed by padding to the fixed model length.
        encoded = tokenizer.texts_to_sequences(texts)
        return pad_sequences(encoded, maxlen=100)

    padded_train = _encode(train_data)
    padded_test = _encode(test_data)
    return tokenizer, padded_train, padded_test

In [None]:
def save_tokenizer(tokenizer, path='Models/tokenizer.pkl'):
    """Pickle the fitted tokenizer to disk.

    Args:
        tokenizer: Object to serialise with ``pickle``.
        path: Destination file. Defaults to ``'Models/tokenizer.pkl'``
            relative to the current working directory.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    import os

    # A fresh checkout has no output folder yet; without this, open() would
    # raise FileNotFoundError.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, 'wb') as file:
        pickle.dump(tokenizer, file)

In [None]:
def define_model():
    """Build and compile the 1-D convolutional binary classifier.

    Layers, in order: input of shape ``(100,)`` -> ``Embedding`` with
    ``input_dim=5000`` and ``output_dim=50`` -> ``Conv1D`` with 128 filters,
    kernel size 5 and ReLU activation -> ``MaxPooling1D`` with pool size 5 ->
    ``Flatten`` -> ``Dense`` with one sigmoid unit.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    tokens = Input(shape=(100,))

    # Feature extractor: embed, convolve, pool, then flatten to one vector.
    x = Embedding(input_dim=5000, output_dim=50)(tokens)
    x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
    x = MaxPooling1D(pool_size=5)(x)
    x = Flatten()(x)

    # Single sigmoid unit produces the score in [0, 1].
    score = Dense(units=1, activation='sigmoid')(x)

    cnn = Model(inputs=tokens, outputs=score)
    cnn.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return cnn

In [None]:
def train_model(model, sequences_train, labels_train, sequences_test, labels_test,
                epochs=5, batch_size=4):
    """Fit the model on the training split and return it.

    The arrays given as ``sequences_test`` / ``labels_test`` are forwarded to
    ``fit`` as ``validation_data``, so the per-epoch validation metrics are
    computed on that same split.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        sequences_train: Training inputs.
        labels_train: Training targets.
        sequences_test: Inputs used as validation data.
        labels_test: Targets used as validation data.
        epochs: Number of passes over the training data. Defaults to 5.
        batch_size: Samples per gradient update. Defaults to 4.

    Returns:
        The same ``model`` object after fitting.
    """
    model.fit(
        sequences_train,
        labels_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(sequences_test, labels_test),
    )
    return model

In [None]:
def evaluate_model_cnn(model, sequences_test, labels_test):
    """Print test metrics and plot the confusion matrix and ROC curve.

    Accuracy comes from ``model.evaluate``. Precision, recall, F1 and the
    confusion matrix are computed from the rounded (0/1) predictions, while
    the ROC curve and its AUC are computed from the raw sigmoid scores.

    Args:
        model: Compiled Keras model whose ``evaluate`` returns
            ``(loss, accuracy)`` and whose ``predict`` returns one score per
            sample.
        sequences_test: Test inputs.
        labels_test: True binary labels for ``sequences_test``.
    """
    # evaluate() also returns the loss, which is not reported here.
    _, accuracy = model.evaluate(sequences_test, labels_test)

    # Keep the continuous scores: roc_curve needs them to sweep thresholds.
    # Feeding it already-rounded labels collapses the curve to a single
    # operating point and misreports the AUC.
    y_prob = model.predict(sequences_test).ravel()
    predictions = y_prob.round().astype(int)  # Hard 0/1 labels for threshold metrics

    # Calculate performance metrics
    precision = precision_score(labels_test, predictions)
    recall = recall_score(labels_test, predictions)
    f1 = f1_score(labels_test, predictions)
    cm = confusion_matrix(labels_test, predictions)
    fpr, tpr, _ = roc_curve(labels_test, y_prob)
    roc_auc = auc(fpr, tpr)

    # Print performance metrics
    print("Test Accuracy: {:.2f}%".format(accuracy*100))
    print("Test Precision: {:.2f}%".format(precision*100))
    print("Test Recall: {:.2f}%".format(recall*100))
    print("Test F1-score: {:.2f}%".format(f1*100))

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", cbar=False)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

    # Plot ROC curve; the dashed diagonal is the chance-level reference.
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
def save_model(saved_model, path='Models/CNN.pkl'):
    """Pickle the trained model to disk.

    Args:
        saved_model: Object to serialise with ``pickle``.
        path: Destination file. Defaults to ``'Models/CNN.pkl'`` relative to
            the current working directory.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    import os

    # A fresh checkout has no output folder yet; without this, open() would
    # raise FileNotFoundError.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # NOTE(review): whether a Keras model pickles cleanly depends on the
    # installed Keras/TensorFlow version; Keras' own model.save() is the
    # documented format — confirm before relying on this file elsewhere.
    with open(path, 'wb') as file:
        pickle.dump(saved_model, file)

In [None]:
def classify_input(model, tokenizer):
    """Interactively classify typed text until the user quits.

    Each non-blank line is encoded with ``tokenizer``, passed through
    ``pad_sequences`` with ``maxlen=100`` and scored by ``model``; a score
    above 0.5 is reported as AI, anything else as human. Typing ``exit``
    (any case, surrounding whitespace ignored) or closing stdin ends the
    loop, after which ``END`` is printed.

    Args:
        model: Trained model exposing ``predict``.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
    """
    while True:
        try:
            user_input = input('Enter a text string to classify (or "exit" to quit): ')
        except EOFError:
            # stdin was closed (e.g. piped input ran out): finish like "exit".
            break

        # Strip first so " exit " still quits and whitespace-only lines are
        # skipped instead of being sent to the model as an all-padding input.
        text = user_input.strip()
        if text.lower() == 'exit':
            break
        if not text:
            continue

        sequence = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=100)
        prediction = model.predict(sequence)[0][0]
        if prediction > 0.5:
            print("Text classified as AI.")
        else:
            print("Text classified as human.")
    print("END")

In [None]:
# End-to-end driver: load data, split, tokenize, train, evaluate, persist,
# then start the interactive classifier.
if __name__ == '__main__':
    # Seed the random generators before any splitting or training happens.
    seed_everything(seed)
    data = load_data()
    # Hold out 20% of the rows as the test split. No random_state is passed,
    # so the split's repeatability relies on the NumPy seed set above.
    train_text, test_text, train_labels, test_labels = train_test_split(data['text'], data['label'], test_size=0.2)
    # The tokenizer is fitted on the training texts only, then applied to both splits.
    tokenizer, train_sequences, test_sequences = tokenize_data(train_text, test_text)
    save_tokenizer(tokenizer)
    model = define_model()
    # The test split is also passed to training, where it serves as validation data.
    trained_model = train_model(model, train_sequences, train_labels, test_sequences, test_labels)
    evaluate_model_cnn(trained_model, test_sequences, test_labels)
    save_model(trained_model)
    # Blocks on stdin until the user types "exit".
    classify_input(trained_model, tokenizer)