In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [None]:
import matplotlib.pyplot as plt

## Load the dataset and check that it was read correctly

In [None]:
# Read the preprocessed essay CSV from the Kaggle input mount, then preview the
# first rows to confirm it loaded.
data_set = pd.read_csv(
    "/kaggle/input/ai-generated-text-preprocessed/preprocessed_data.csv"
)
data_set.head()

In [None]:
# Keep only the rows whose 'generated' label equals 1 and preview them.
only_one_rows = data_set.loc[data_set['generated'] == 1]
only_one_rows.head()

In [None]:
# Keep only the rows whose 'generated' label equals 0 and preview them.
only_zero_rows = data_set.loc[data_set['generated'] == 0]
only_zero_rows.head()

In [None]:
# Number of rows taken from each class for the train/test pool.
n = 10000
# Take the first n rows of each class by position (.iloc makes the positional
# slice explicit). A class with fewer than n rows just yields a shorter slice,
# so the value_counts() at the end of this cell is the balance check.
equal_one_rows = only_one_rows.iloc[:n]
equal_zero_rows = only_zero_rows.iloc[:n]

# Create a combined dataset with up to n 1s and n 0s
combined_dataset = pd.concat([equal_one_rows, equal_zero_rows])
combined_dataset['generated'].value_counts()

In [None]:
# Validation rows are positions 50000-50999 of each class, so they do not
# overlap the leading rows used for the train/test pool. If a class has fewer
# than 51000 rows the slice is silently shorter (or empty); the value_counts()
# at the end of this cell shows what was actually selected.
equal_one_rows_val = only_one_rows.iloc[50000:51000]
equal_zero_rows_val = only_zero_rows.iloc[50000:51000]

# Create a combined validation dataset with up to 1000 1s and 1000 0s
combined_dataset_val = pd.concat([equal_one_rows_val, equal_zero_rows_val])
combined_dataset_val['generated'].value_counts()

In [None]:
# Separate the validation frame into inputs ('essay') and labels ('generated').
validation_set_x, validation_set_y = (
    combined_dataset_val['essay'],
    combined_dataset_val['generated'],
)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the pool into train and test parts (sklearn's default test size).
# random_state pins the shuffle so a Restart & Run All reproduces the same
# split; stratify keeps the 1/0 label ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    combined_dataset['essay'],
    combined_dataset['generated'],
    stratify=combined_dataset['generated'],
    random_state=42,
)

In [None]:
# Show how many training rows carry each label after the split.
y_train.value_counts()

# Data set

In [None]:
# Bar chart of the number of rows per label in the train/test pool.
ax = combined_dataset['generated'].value_counts().plot(
    kind='bar', rot=0, color=['blue', 'orange']
)
ax.set_xlabel('Values')
ax.set_ylabel('Count')
ax.set_title('Bar Plot of 1s and 0s')

plt.show()

In [None]:
# Bar chart of the number of rows per label in combined_dataset_val.
val_label_counts = combined_dataset_val['generated'].value_counts()
ax = val_label_counts.plot(kind='bar', rot=0, color=['blue', 'orange'])
ax.set(xlabel='Values', ylabel='Count', title='Bar Plot of 1s and 0s')

plt.show()

# New Model

In [None]:
from tensorflow.keras import activations, optimizers, losses
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

In [None]:
# Pretrained checkpoint name; passed to from_pretrained() for both the
# tokenizer and the sequence-classification model below.
MODEL_NAME = 'distilbert-base-uncased'

In [None]:
def construct_encodings(x, tkzr, trucation=True, padding=True):
    """Tokenize ``x`` by calling ``tkzr`` and return whatever it returns.

    Args:
        x: Input handed to the tokenizer as its first positional argument.
        tkzr: Callable tokenizer that accepts ``truncation`` and ``padding``
            keyword arguments.
        trucation: Forwarded as the tokenizer's ``truncation`` keyword. The
            misspelled parameter name is kept so existing callers keep working.
        padding: Forwarded as the tokenizer's ``padding`` keyword.

    Returns:
        The tokenizer's output, unchanged.
    """
    tokenizer_options = {'truncation': trucation, 'padding': padding}
    return tkzr(x, **tokenizer_options)

def construct_tfdataset(encodings, y=None):
    """Wrap tokenizer encodings (and optional labels) in a ``tf.data.Dataset``.

    Args:
        encodings: Mapping-like tokenizer output; converted with ``dict()``
            before being sliced.
        y: Optional labels, one per encoded example. ``None`` means "no
            labels", which is the case used when predicting on unseen samples.

    Returns:
        A dataset of ``(features, label)`` pairs when ``y`` is given, otherwise
        a dataset of feature dicts only.
    """
    features = dict(encodings)
    # Compare against None instead of truth-testing y: `if y:` raises
    # ValueError for a NumPy array / pandas Series and would silently drop the
    # labels when y is an empty list.
    if y is None:
        return tf.data.Dataset.from_tensor_slices(features)
    return tf.data.Dataset.from_tensor_slices((features, y))

In [None]:
## Convert every pandas Series to a plain Python list before it is passed to
## the DistilBERT tokenizer / dataset helpers.
X_train_list, y_train_list = X_train.tolist(), y_train.tolist()
X_test_list, y_test_list = X_test.tolist(), y_test.tolist()
validation_set_x_list, validation_set_y_list = (
    validation_set_x.tolist(),
    validation_set_y.tolist(),
)

In [None]:
## it will create X,y
def create_tf_set(dataX, dataY, tkzr):
    """Tokenize ``dataX`` with ``tkzr`` and pair the encodings with ``dataY``.

    Args:
        dataX: Inputs passed to ``construct_encodings``.
        dataY: Labels passed to ``construct_tfdataset``.
        tkzr: Tokenizer passed to ``construct_encodings``.

    Returns:
        Whatever ``construct_tfdataset`` returns for the encodings and labels.
    """
    return construct_tfdataset(construct_encodings(dataX, tkzr), dataY)
    

In [None]:
# Load the tokenizer that matches MODEL_NAME, then build the train, test and
# validation datasets (in that order) from the text/label lists.
tkzr = DistilBertTokenizer.from_pretrained(MODEL_NAME)
tf_train, tf_test, tf_validation = (
    create_tf_set(texts, labels, tkzr)
    for texts, labels in (
        (X_train_list, y_train_list),
        (X_test_list, y_test_list),
        (validation_set_x_list, validation_set_y_list),
    )
)

In [None]:
BATCH_SIZE = 16
# Shuffle the training examples before batching so each epoch sees the data in
# a new order (Dataset.shuffle reshuffles on every iteration by default). The
# buffer covers the whole training set, giving a full shuffle.
tfdataset_train = tf_train.shuffle(len(X_train_list)).batch(BATCH_SIZE)
# Test and validation data stay in their original order: the true test labels
# are later collected in a separate pass over tfdataset_test and must line up
# with the predictions.
tfdataset_test = tf_test.batch(BATCH_SIZE)
tfdataset_validation = tf_validation.batch(BATCH_SIZE)

In [None]:
# Number of passes over the training data used by fit().
N_EPOCHS = 10

# Training configuration: Adam with a small learning rate, and a sparse
# categorical cross-entropy applied to raw logits (from_logits=True).
optimizer = optimizers.Adam(learning_rate=3e-5)
loss_fn = losses.SparseCategoricalCrossentropy(from_logits=True)

# Load the pretrained DistilBERT sequence classifier and compile it, tracking
# accuracy as the metric.
model_distilbert = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)
model_distilbert.compile(
    optimizer=optimizer,
    loss=loss_fn,
    metrics=['accuracy'],
)


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
# Checkpoint callback: after an epoch, write the model weights (weights only)
# to checkpoint_filepath, but only when 'val_accuracy' is the best seen so far
# (mode='max' means higher is better). verbose=1 logs each save.
checkpoint_filepath = 'model_checkpoint.h5'
model_checkpoint_callback = ModelCheckpoint(
    checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [None]:
# Train for N_EPOCHS, validating on tfdataset_validation after each epoch and
# checkpointing the best weights via the callback.
# batch_size is deliberately not passed: tfdataset_train is a tf.data.Dataset
# that is already batched, and Keras documents that batch_size must not be
# specified for dataset inputs.
history = model_distilbert.fit(
    tfdataset_train,
    epochs=N_EPOCHS,
    validation_data=tfdataset_validation,
    callbacks=[model_checkpoint_callback],
)

In [None]:
# Display the parameters recorded on the History object returned by fit().
history.params

In [None]:
# List the per-epoch series recorded during training; the plots below read
# 'accuracy', 'val_accuracy', 'loss' and 'val_loss' from this dict.
history.history.keys()

In [None]:
# Plot training and validation accuracy per epoch
accuracy_curves = (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
)
for history_key, curve_label in accuracy_curves:
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Epoch vs Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Plot training and validation loss per epoch
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set(title='Epoch vs Loss', xlabel='Epoch', ylabel='Loss')
ax.legend()
plt.show()

In [None]:
# Save the model's current weights (its state after fit() finished) to a
# separate file from the best-val_accuracy checkpoint 'model_checkpoint.h5'.
model_distilbert.save_weights('new_distilbert_model_weights.h5')

In [None]:
# Display the model's current metric values.
# NOTE(review): presumably these reflect the most recent fit/evaluate pass — confirm.
model_distilbert.get_metrics_result()

In [None]:
# Evaluate on the held-out test split; return_dict=True yields a dict of
# metric values. batch_size is not passed because tfdataset_test is an
# already batched tf.data.Dataset and Keras documents that batch_size must not
# be specified for dataset inputs.
testSet = model_distilbert.evaluate(tfdataset_test, return_dict=True)
print(testSet)

In [None]:
# Run the model over the test split. batch_size is not passed because
# tfdataset_test is an already batched tf.data.Dataset.
# Despite its name, y_pred_probs is the model output object whose .logits
# attribute holds raw scores, not probabilities.
y_pred_probs = model_distilbert.predict(tfdataset_test)

# Predicted class = index of the largest logit along axis 1
y_pred = np.argmax(y_pred_probs.logits, axis=1)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# Collect the true labels by iterating the batched test dataset; each element
# is a (features, labels) pair, and the labels of every batch are flattened
# into one list in dataset order.
y_true = [
    label
    for _features, batch_labels in tfdataset_test
    for label in batch_labels.numpy()
]

# Compute confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred)

In [None]:
import seaborn as sns

In [None]:
# Plot confusion matrix using seaborn
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=range(2), yticklabels=range(2))
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# tf_val = create_tf_set(val_X_list,val_y_list,tkzr)

In [None]:
# tfdataset_val = tf_val.batch(BATCH_SIZE)
# val = model_distilbert.evaluate(tfdataset_val, return_dict=True, batch_size=BATCH_SIZE)
# print(val)

# Evaluation

In [None]:
# tkzr = DistilBertTokenizer.from_pretrained(MODEL_NAME)
# validation_set_x_list = validation_set_x.tolist()
# validation_set_y_list = validation_set_y.tolist()
# tf_validation = create_tf_set(validation_set_x_list,validation_set_y_list,tkzr)

In [None]:
# model_load = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)

# # Load the saved weights
# model_load.load_weights('/kaggle/input/distilbert-ml-project/model_checkpoint.h5')

# # Compile the model (you need to compile it before evaluation)
# optimizer = optimizers.Adam(learning_rate=0.005)
# loss_fn = losses.SparseCategoricalCrossentropy(from_logits=True)
# model_load.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])


In [None]:
# tfdataset_val = tf_validation.batch(16)
# val_pred = model_load.predict(tfdataset_val, batch_size=16)
# y_pred = np.argmax(val_pred.logits, axis=1)

In [None]:
# from sklearn.metrics import confusion_matrix, classification_report
# y_true = []  # Populate y_true with your actual labels

# # Assuming tfdataset_test is an iterable dataset
# for batch in tfdataset_val:
#     labels = batch[1]  # Adjust this based on your dataset structure
#     y_true.extend(labels.numpy())

# # Compute confusion matrix
# conf_matrix = confusion_matrix(y_true, y_pred)

In [None]:
# # Plot confusion matrix using seaborn
# plt.figure(figsize=(10, 8))
# sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=range(2), yticklabels=range(2))
# plt.title('Confusion Matrix')
# plt.xlabel('Predicted Label')
# plt.ylabel('True Label')
# plt.show()