In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory, then echo each one
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv
/kaggle/input/bert/tensorflow2/en-uncased-preprocess/3/saved_model.pb
/kaggle/input/bert/tensorflow2/en-uncased-preprocess/3/keras_metadata.pb
/kaggle/input/bert/tensorflow2/en-uncased-preprocess/3/assets/vocab.txt
/kaggle/input/bert/tensorflow2/en-uncased-preprocess/3/variables/variables.index
/kaggle/input/bert/tensorflow2/en-uncased-preprocess/3/variables/variables.data-00000-of-00001
/kaggle/input/bert/tensorflow2/bert-en-uncased-l-12-h-768-a-12/2/saved_model.pb
/kaggle/input/bert/tensorflow2/bert-en-uncased-l-12-h-768-a-12/2/keras_metadata.pb
/kaggle/input/bert/tensorflow2/bert-en-uncased-l-12-h-768-a-12/2/assets/vocab.txt
/kaggle/input/bert/tensorflow2/bert-en-uncased-l-12-h-768-a-12/2/variables/variables.index
/kaggle/input/bert/tensorflo

In [2]:
import tensorflow_text as text
import tensorflow_hub as hub
import tensorflow as tf
import random
import tqdm
from tqdm import tqdm
import copy



Prepare data from source: llm-7-prompt-training-dataset

In [3]:
# Load the 7-prompt training essays; the cells below rely on its 'text' and 'label' columns.
train_essays_01_source = pd.read_csv('/kaggle/input/llm-7-prompt-training-dataset/train_essays_7_prompts_v2.csv')

# Slice the dataset by label
train_essays_01_source_label_1 = train_essays_01_source[train_essays_01_source['label'] == 1].reset_index(drop = True)
train_essays_01_source_label_0 = train_essays_01_source[train_essays_01_source['label'] == 0].reset_index(drop = True)

# Number of rows to draw from each label (the author's notes: 1638 / 13712 rows available).
# Clamped to the rows actually present so a different dataset version cannot make
# .sample(n=...) raise "Cannot take a larger sample than population".
label_1_count = min(1638, len(train_essays_01_source_label_1))
label_0_count = min(13712, len(train_essays_01_source_label_0))

# One seed for both the per-label draws and the final shuffle, so the resulting row order is
# identical after every Restart & Run All (the final shuffle was previously unseeded).
SAMPLING_SEED = 36

essay_dataset_raw = pd.concat(
    [
        train_essays_01_source_label_1.sample(n = label_1_count, random_state = SAMPLING_SEED, axis = 0),
        train_essays_01_source_label_0.sample(n = label_0_count, random_state = SAMPLING_SEED, axis = 0),
    ],
    axis = 0, sort = True,
).sample(frac = 1, random_state = SAMPLING_SEED)

In [4]:
from nltk.tokenize import word_tokenize
from nltk import sent_tokenize


# Function to split text into sequences of `seq_len` words (128 by default)
def split_text_into_sequences(text, seq_len=128):
    """Split ``text`` into consecutive chunks of at most ``seq_len`` word tokens.

    The text is tokenized with ``word_tokenize``; the tokens are regrouped, in order,
    into chunks of ``seq_len`` tokens (the last chunk may be shorter) and each chunk is
    re-joined with single spaces.

    Args:
        text: The string to split.
        seq_len: Maximum number of tokens per chunk. Defaults to 128, the value that was
            previously hard-coded, so existing one-argument callers are unaffected.

    Returns:
        A list of strings, one per chunk. Text that yields no tokens returns ``[]``.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.

    NOTE(review): chunk length is counted in NLTK word tokens. The model summary in this
    notebook shows a 128-step BERT sequence output, so a 128-word chunk presumably exceeds
    the encoder's WordPiece budget and gets truncated -- confirm whether a smaller
    ``seq_len`` is wanted.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")

    tokens = word_tokenize(text)
    return [
        ' '.join(tokens[start:start + seq_len])
        for start in range(0, len(tokens), seq_len)
    ]


def get_df_with_128_length_sequences(essay_dataset_raw):
    """Return a new frame with one row per text chunk instead of one row per essay.

    Every value of the ``'text'`` column is passed through ``split_text_into_sequences``
    and the resulting list is exploded, so all other columns are repeated for each chunk
    of the same essay (the original index label is repeated as well).

    The input frame is NOT modified: the previous implementation added and dropped
    columns on the caller's object in place, which silently changed any frame passed in.

    Args:
        essay_dataset_raw: DataFrame with a ``'text'`` column of strings.

    Returns:
        A new DataFrame in which ``'text'`` holds a single chunk per row. As before,
        ``'text'`` ends up as the LAST column; later cells index columns by position,
        so that ordering is preserved deliberately.
    """
    # assign() works on a copy and appends the new column at the end, which reproduces
    # the column order of the old "add column, then drop 'text'" sequence.
    with_chunks = essay_dataset_raw.assign(
        text_seq_list=essay_dataset_raw['text'].map(split_text_into_sequences)
    )

    return (
        with_chunks
        .drop(columns=['text'])
        .explode('text_seq_list')
        .rename(columns={'text_seq_list': 'text'})
    )

In [5]:
# BERT encoder: raw string -> BERT preprocessing -> frozen (non-trainable) BERT encoder

BERT_PREPROCESS_HANDLE = "https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3"
BERT_ENCODER_HANDLE = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-12-h-768-a-12/versions/2"

# One scalar string per example
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)

# Text -> encoder input features
preprocessor = hub.KerasLayer(BERT_PREPROCESS_HANDLE)
encoder_inputs = preprocessor(text_input)

# Encoder weights stay fixed during training (trainable=False)
encoder = hub.KerasLayer(BERT_ENCODER_HANDLE, trainable=False)
outputs = encoder(encoder_inputs)

sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 768].
pooled_output = outputs["pooled_output"]      # [batch_size, 768].

In [6]:
# Wrap string input -> preprocessing -> frozen encoder into a single reusable model that
# maps raw text to BERT's per-token `sequence_output`.
encoder_model = tf.keras.Model(inputs=text_input, outputs=sequence_output)

In [7]:
#del model

In [8]:
import tensorflow as tf
from tensorflow.keras import layers


# Classifier head on top of the frozen BERT encoder.
# Shapes quoted below come from the model summary printed further down:
# encoder output (None, 128, 768), max-pooled vector (None, 768).

# Input text: one scalar string per example
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)

# Per-token text embeddings generated with BERT (via encoder_model)
input_embeddings = encoder_model(text_input)

# Max-pool over the token axis to get a single summary vector per example
pooled_input = layers.GlobalMaxPooling1D()(input_embeddings)

# Add a singleton sequence axis so the pooled vector can act as a length-1 attention query
query = tf.expand_dims(pooled_input, axis=1)

# Multi-Head Attention: the pooled vector is passed as the QUERY (first argument) and the
# token embeddings as the second argument (the sequence being attended over).
# NOTE(review): no attention mask is passed, so padded token positions presumably take part
# in both the max-pool and the attention -- confirm that this is intended.
num_heads = 12
key_dim = 64  # Adjust based on experimentation
attention_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(query, input_embeddings)

# Global average pooling over the (length-1) query axis, i.e. drops the singleton axis
global_pool = layers.GlobalAveragePooling1D()(attention_output)

# Dropout (rate 0.2) to reduce overfitting
dropout = layers.Dropout(0.2)(global_pool)

# Dense output layer with 1 unit and sigmoid activation (for binary classification)
output = layers.Dense(1, activation="sigmoid")(dropout)

# Create the end-to-end model: raw string in, single probability out
model = tf.keras.Model(inputs=text_input, outputs=output)

In [9]:
# Print the classifier's layer table (output shapes, parameter counts, connectivity).
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 model (Functional)          (None, 128, 768)             1094822   ['input_2[0][0]']             
                                                          41                                      
                                                                                                  
 global_max_pooling1d (Glob  (None, 768)                  0         ['model[0][0]']               
 alMaxPooling1D)                                                                                  
                                                                                            

In [10]:
# Replace every essay by its text chunks (one row per chunk), then shuffle the rows.
# The shuffle is seeded so the training chunks built from this frame are identical on every
# Restart & Run All, and the index is rebuilt because explode() repeats the essay's index
# label on each of its chunks.
# NOTE: this cell rebinds essay_dataset_raw, so it is meant to be run once per kernel session.
essay_dataset_raw = get_df_with_128_length_sequences(essay_dataset_raw)
essay_dataset_raw = essay_dataset_raw.sample(frac = 1, random_state = 36).reset_index(drop = True)

# essay_dataset_test = pd.read_csv('/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv')
# essay_dataset_test = essay_dataset_test.sample(frac = 0.1)

# essay_dataset_test = get_df_with_128_length_sequences(essay_dataset_test)

In [11]:
# essay_test, label_test = essay_dataset_test.iloc[:,5], essay_dataset_test.iloc[:,1]

from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam

def get_optimizer(batch_no, lr_rate, num_train_samples=None, batch_size=None, epochs=None):
    """Build an Adam optimizer with a polynomially decaying learning rate for one training round.

    The learning rate starts at ``lr_rate * 2**-(batch_no + 1)`` and decays to
    ``lr_rate * 2**-(batch_no + 2)`` over the number of optimizer steps of the round,
    i.e. ``ceil(num_train_samples / batch_size) * epochs``.

    Args:
        batch_no: Zero-based index of the training round; each round halves the range.
        lr_rate: Base learning rate the powers of two are applied to.
        num_train_samples: Number of samples passed to ``fit`` as training data.
            Defaults to the row count of the notebook global ``essay_train``.
            (Previously the size of ``essay_batch`` was used, which also counts the
            validation rows, so the schedule was stretched past the steps actually run
            and the end learning rate was never reached.)
        batch_size: Mini-batch size used by ``fit``. Defaults to the notebook global
            ``batch_batch_size``.
        epochs: Number of epochs requested from ``fit``. Defaults to the notebook global
            ``num_epochs``.

    Returns:
        An ``Adam`` optimizer whose ``learning_rate`` is the ``PolynomialDecay`` schedule.
    """
    # Fall back to the notebook-level training configuration when not given explicitly.
    if num_train_samples is None:
        num_train_samples = essay_train.shape[0]
    if batch_size is None:
        batch_size = batch_batch_size
    if epochs is None:
        epochs = num_epochs

    # A partial final mini-batch still costs one optimizer step, hence ceil-division.
    steps_per_epoch = -(-int(num_train_samples) // batch_size)
    # decay_steps must be positive; guard against an empty training split.
    num_train_steps = max(1, steps_per_epoch * epochs)

    lr_scheduler = PolynomialDecay(
        initial_learning_rate=lr_rate*pow(2, -batch_no-1),
        end_learning_rate=lr_rate*pow(2, -batch_no-2),
        decay_steps=num_train_steps,
    )

    return Adam(learning_rate=lr_scheduler)

In [12]:
%%time
# Rows of chunked text per training round (one round = one iteration of the outer loop)
batch_size = 15000
# Always run at least one round: with fewer than batch_size rows the integer division gives 0
# and np.array_split raises on zero sections.
num_batches = max(1, len(essay_dataset_raw)//batch_size)

batch_batch_size = 32   # mini-batch size handed to model.fit
num_epochs = 5          # upper bound per round; EarlyStopping may end a round sooner
learning_rate = 5e-5    # base rate, shrunk by 5% after every round

# Split row POSITIONS rather than the pandas objects themselves (np.array_split on a pandas
# object appears to go through a deprecated pandas code path -- see the stray warning line in
# this cell's output), and pick the columns by name instead of by position.
batch_positions = np.array_split(np.arange(len(essay_dataset_raw)), num_batches)
essay_dataset_raw_batches = [essay_dataset_raw['text'].iloc[rows] for rows in batch_positions]
essay_label_batches = [essay_dataset_raw['label'].iloc[rows] for rows in batch_positions]

test_evaluation_results = []

for batch_no in tqdm(range(num_batches)):

    # Train:CV :: i:j  (first i/(i+j) of the round is trained on, the remainder validates)
    i, j = 40, 30
    essay_batch = essay_dataset_raw_batches[batch_no]
    label_batch = essay_label_batches[batch_no]

    fraction_size = len(essay_batch)//(i+j)
    train_size = fraction_size*i
    essay_train, essay_CV = tf.split(essay_batch, [train_size, len(essay_batch)-train_size], 0)
    label_train, label_CV = tf.split(label_batch, [train_size, len(label_batch)-train_size], 0)

    # Re-compile every round to install a fresh optimizer with this round's LR schedule.
    # The AUC metric is created with an explicit name so the 'val_auc' key that
    # EarlyStopping monitors cannot drift if the model is compiled repeatedly.
    model.compile(
        loss = 'binary_crossentropy',
        optimizer = get_optimizer(batch_no, learning_rate),
        metrics = ['accuracy', tf.keras.metrics.AUC(name = 'auc')])

    h = model.fit(essay_train, label_train,
                    validation_data = (essay_CV, label_CV),
                    batch_size = batch_batch_size,
                    epochs = num_epochs,
                    callbacks = [tf.keras.callbacks.EarlyStopping(monitor = 'val_auc', patience = 0, mode = 'max',
                                 restore_best_weights = True, min_delta = 0.005, start_from_epoch = 1)])

    # Shrink the base learning rate for the next round
    learning_rate = learning_rate*0.95
    
#     print('Evaluation of model after: ',batch_no)
#     _ = model.evaluate(essay_test, label_test)
#     test_evaluation_results.append(_)
#     print('\n\n')
    

# h = model.fit(
#    encoded_essay_train, label_train,
#    validation_data = (encoded_essay_CV, label_CV),
#    epochs = 20,
#     callbacks = [
#       tf.keras.callbacks.EarlyStopping(monitor = 'val_auc', patience = 1, mode = 'max', 
#                                       restore_best_weights = True, min_delta = 0.0, start_from_epoch = 2)
#    ]
# )

  return bound(*args, **kwds)
  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1/5
Epoch 2/5
Epoch 3/5


 25%|██▌       | 1/4 [05:22<16:07, 322.52s/it]

Epoch 1/5
Epoch 2/5
Epoch 3/5


 50%|█████     | 2/4 [10:37<10:36, 318.05s/it]

Epoch 1/5
Epoch 2/5
Epoch 3/5


 75%|███████▌  | 3/4 [15:52<05:16, 316.48s/it]

Epoch 1/5
Epoch 2/5
Epoch 3/5


100%|██████████| 4/4 [21:06<00:00, 316.64s/it]

CPU times: user 19min 47s, sys: 1min 45s, total: 21min 32s
Wall time: 21min 6s





In [13]:
# test_data = pd.read_csv('/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv')
# #test_data.drop(columns = 'generated', inplace = True)

In [14]:
# test_data.head()
# print(test_data.shape)

# test_data_subset = test_data.sample(frac = 0.02)
# print(test_data_subset.shape)
# test_data_subset.head()

In [15]:
# model.evaluate(test_data_subset['text'],test_data_subset['label'])

# #[0.2413991540670395, 0.9253393411636353, 0.9713884592056274] with lr schedule 01
# #[0.14819669723510742, 0.935520350933075, 0.9877243041992188] with lr schedule 02
# #[0.21204924583435059, 0.9151583909988403, 0.9826033115386963] with lr schedule 03

In [16]:
#model.export('ai_text_det_kaggle.pkl')
#tf.saved_model.save(model, 'ai_text_det_kaggle_saved_model_02.pkl')
# Persist the trained classifier as a .keras file (relative path, i.e. the current working directory).
model.save('ai_text_det_kaggle_saved_model_03.keras')

In [17]:
# Load the competition test essays to score; later cells use its 'id', 'prompt_id' and 'text' columns.
test_data = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')

In [18]:
# Chunk every test essay with split_text_into_sequences and keep one row per chunk in a
# 'text_sequences' column (the original 'text' column is dropped).
# The guard makes the cell safe to re-run: after the first run test_data no longer has a
# 'text' column, and running the transformation again used to fail with a KeyError.
if 'text' in test_data.columns:
    test_data = (
        test_data
        .assign(text_sequences=test_data['text'].apply(split_text_into_sequences))
        .drop(columns='text')
        .explode('text_sequences')
        # An essay that yields no chunks explodes to NaN; score it as an empty string so
        # every essay id still gets a prediction and the model only ever sees strings.
        .fillna({'text_sequences': ''})
    )



In [19]:
%%time
# encoded_test_data = encoder_model(train_essays_01_source.iloc[:10000, 0])
# test_set_predictions = model.predict(encoded_test_data)


# Score every text chunk in a single call. model.predict already walks its input in
# mini-batches, so the previous hand-rolled 100-row chunking (np.array_split on the frame plus
# one predict call per chunk) only added per-call overhead, and it overwrote the
# `batch_size` / `num_batches` values used by the training cell.
# The chunk column is selected by name rather than by position.
test_set_predictions = model.predict(test_data['text_sequences'])

CPU times: user 864 ms, sys: 12.3 ms, total: 877 ms
Wall time: 868 ms


In [20]:
# Attach one score per text chunk, then average the chunk scores of each essay.
test_data['prediction'] = test_set_predictions.ravel()

# One row per (id, prompt_id) with the mean chunk score. The score column is stored under
# the spelling 'genarated', which is the name the submission cell reads.
test_data_agg = (
    test_data
    .groupby(['id', 'prompt_id'])
    .aggregate({'prediction': 'mean'})
    .reset_index()
    .rename(columns = {'prediction': 'genarated'})
)

In [21]:
# Round the per-essay scores to 4 decimals (kept on test_data_agg, as before).
test_data_agg['genarated'] = np.round(test_data_agg['genarated'], decimals = 4)

# The competition's submission format is `id,generated`. The working frame spells the score
# column 'genarated', so it is renamed here; writing the misspelled header would produce a
# file that does not match the required format.
submission = test_data_agg[['id', 'genarated']].rename(columns = {'genarated': 'generated'})
submission.to_csv('/kaggle/working/submission.csv', index = False)

# EDA

In [22]:
# import matplotlib.pyplot as plt
# import numpy
# from sklearn import metrics

# def plot_confusion_matrix(label_test, test_set_predictions, threashold = 0.5):
    
#     if not test_set_pred_labels:
#         test_set_pred_labels = [prob >= threashold for prob in test_set_predictions.flatten()]

#     confusion_matrix = metrics.confusion_matrix(label_test, test_set_pred_labels)

#     cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])

#     cm_display.plot()
#     plt.show()

In [23]:
# plot_confusion_matrix(label_test, test_set_predictions, 0.5)

In [24]:
# import numpy as np
# from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
# import matplotlib.pyplot as plt
# import plotly.graph_objects as go
# from sklearn.metrics import roc_curve

# def plot_roc_curve(true_y, y_prob):
#     """
#     Plots an interactive ROC curve based on the probabilities and displays thresholds on hover.
#     """
    
#     fpr, tpr, thresholds = roc_curve(true_y, y_prob, drop_intermediate=False)

#     fig = go.Figure()

#     fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC Curve', hovertext=[f'Threshold: {threshold:.2f}' for threshold in thresholds]))

#     fig.update_layout(
#         title='ROC Curve',
#         xaxis=dict(title='False Positive Rate'),
#         yaxis=dict(title='True Positive Rate'),
#         hovermode='closest'
#     )

#     fig.show()

In [25]:
# plot_roc_curve(label_test, test_set_predictions.flatten())
#print('ROC AUC Score: ',roc_auc_score(label_test, test_set_predictions.flatten()))

# TEST Data Predictions, a different approach!

In [26]:
# test_data.info()

In [27]:
# test_data = pd.read_csv('/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv')
# #test_data.drop(columns = 'generated', inplace = True)

In [28]:
# test_data_label = test_data[['essay_id','text']].copy()
# test_data.drop(columns = ['source', 'prompt','fold'], inplace = True)
# test_data.rename(columns = {'essay_id':'id'}, inplace = True)
# test_data['prompt_id'] = test_data['id']

In [29]:
# len(test_data.iloc[6,1].split())
# #test_data.iloc[6,1]

In [30]:
# test_data.info()
# test_data = test_data.sample(frac = 0.25)

In [31]:
# test_data.info()

In [32]:
# from nltk.tokenize import word_tokenize
# from nltk import sent_tokenize


# # Function to split text into sequences of 128 words
# def split_text_into_sequences(text):
#     words = word_tokenize(text)
#     sequences = [words[i:i + 128] for i in range(0, len(words), 128)]
#     return [' '.join(seq) for seq in sequences]


# # Apply the function to the 'text' column
# test_data['text_sequences'] = test_data['text'].apply(split_text_into_sequences)

# # Explode the sequences to create rows for each sequence
# df_exploded = test_data.explode('text_sequences')

# # Drop the original 'text' column
# df_exploded = df_exploded.drop('text', axis=1)

# #Resetting test_data
# test_data = df_exploded.copy()
# del df_exploded





In [33]:
# test_data = test_data.sample(frac = 8, replace = True)

In [34]:
# %%time
# tqdm
# # Apply the function to the 'text' column
# test_data['text_sequences'] = test_data['text'].apply(split_text_into_sequences)

# # Explode the sequences to create rows for each sequence
# df_exploded = test_data.explode('text_sequences')

# # Drop the original 'text' column
# df_exploded = df_exploded.drop('text', axis=1)

# # Display the resulting dataframe
# print(df_exploded)

In [35]:
# test_data = df_exploded

In [36]:
# %%time
# # encoded_test_data = encoder_model(train_essays_01_source.iloc[:10000, 0])
# # test_set_predictions = model.predict(encoded_test_data)


# if len(test_data)<100:
#     encoded_test_data = encoder_model(test_data.iloc[:, 3])
#     test_set_predictions = model.predict(encoded_test_data)

# else:
#     batch_size = 100
#     num_batches = len(test_data)//batch_size

#     test_data_batches = np.array_split(test_data, num_batches)
#     test_set_predictions_batch_list = []

#     for batch in test_data_batches:
#         encoded_test_data_batch = encoder_model(batch.iloc[:, 3])
#         test_set_predictions_batch = model.predict(encoded_test_data_batch)
#         test_set_predictions_batch_list.append(test_set_predictions_batch)

#     test_set_predictions = np.concatenate(test_set_predictions_batch_list, axis = 0)

In [37]:
# batch.iloc[:, 3]

In [38]:
# test_set_predictions.flatten()

In [39]:
# test_data['prediction'] = test_set_predictions.flatten()

In [40]:
# test_data.head()

In [41]:
# test_data['prediction'] = test_set_predictions.flatten()

# test_data_agg = test_data.groupby(['id','prompt_id','label']).aggregate({'prediction':['max', 'min', 'mean', 'first']})

# test_data_agg.reset_index(inplace = True)

In [42]:
# test_data_agg.reset_index(inplace = True)

In [43]:
# pred_custom = []
# for index,row in test_data_agg.iterrows():
    
#     if row['prediction']['first'] >= 0.5:
#         pred_custom.append(row['prediction']['max'])
#     else:
#         pred_custom.append(row['prediction']['min'])
    

In [44]:
# test_data_agg['custom'] = pred_custom

In [45]:
# test_data_agg.head()

In [46]:
#print('ROC AUC Score: ',roc_auc_score(label_test, test_set_predictions.flatten()))

In [47]:
# for i in range(3,8):
#     print('ROC with metric:',test_data_agg.columns[i])
#     print('ROC AUC Score: ',roc_auc_score(test_data_agg.iloc[:,2], test_data_agg.iloc[:,i]))

In [48]:
# (0.9824285407738714 - 0.9667853753352559)*100