In [1]:
!pip install -q transformers nltk tensorflow tensorflow-hub tensorflow_text

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25h

## Import libraries

In [2]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [3]:
# Paths of the competition CSV files under /kaggle/input.
train_prompt_file = "/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv"
train_essay_file = "/kaggle/input/llm-detect-ai-generated-text/train_essays.csv"
test_essay_file = "/kaggle/input/llm-detect-ai-generated-text/test_essays.csv"
sub_file = "/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv"

# Read every CSV into its own DataFrame: prompts, training essays,
# test essays and the sample submission.
df_train_prompt = pd.read_csv(train_prompt_file)
df_train_essay = pd.read_csv(train_essay_file)
df_test_essay = pd.read_csv(test_essay_file)
df_sub = pd.read_csv(sub_file)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv'

In [None]:
# Preview the first rows of the training essays.
df_train_essay.head()

In [None]:
# Preview the first rows of the prompts table.
df_train_prompt.head()

In [None]:
# Summary of the training essays frame (columns, dtypes, non-null counts).
df_train_essay.info()

In [None]:
# Number of training essays per value of the "generated" label.
df_train_essay["generated"].value_counts()

In [None]:
# Bar chart of how many training essays fall under each "generated" label.
sns.countplot(x=df_train_essay['generated'])
# plt.show must be called; a bare `plt.show` only echoes the function object.
plt.show()

In [None]:
# Number of training essays per prompt_id.
df_train_essay["prompt_id"].value_counts()

In [None]:
# Bar chart of how many training essays belong to each prompt_id.
sns.countplot(x=df_train_essay['prompt_id'])
# plt.show must be called; a bare `plt.show` only echoes the function object.
plt.show()

Since the `generated` classes are not evenly distributed, let's add an external dataset.

In [None]:
# Load the external essay dataset from its Kaggle input path.
train_essay_ex_file = "/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv"
df_train_essay_ex = pd.read_csv(train_essay_ex_file)

In [None]:
# Rename the external dataset's "label" column to "generated" so it matches the
# competition data. Reassigning the result avoids mutating the frame in place.
df_train_essay_ex = df_train_essay_ex.rename(columns={"label": "generated"})
df_train_essay_ex['generated'].value_counts()

In [None]:
# Bar chart of the "generated" label counts in the external dataset.
sns.countplot(x=df_train_essay_ex['generated'])
# plt.show must be called; a bare `plt.show` only echoes the function object.
plt.show()

### Merge Dataset

In [None]:
# Stack the external essays on top of the competition essays, keeping only the
# two columns both frames share. ignore_index=True gives the merged frame a
# fresh 0..n-1 index; without it the row labels of both source frames are
# carried over and repeat.
df_train = pd.concat(
    [df_train_essay_ex[["text", "generated"]],
     df_train_essay[["text", "generated"]]],
    ignore_index=True,
)

In [None]:
# Summary of the merged training frame (columns, dtypes, non-null counts).
df_train.info()

In [None]:
# Number of rows per "generated" label after merging both datasets.
df_train['generated'].value_counts()

In [None]:
# Bar chart of the "generated" label counts in the merged training frame.
sns.countplot(x=df_train['generated'])
# plt.show must be called; a bare `plt.show` only echoes the function object.
plt.show()

### Balancing Dataset

In [None]:
# NOTE(review): this cell only re-plots the label counts of df_train; no
# rebalancing step is applied to the data here.
sns.countplot(x=df_train['generated'])
# plt.show must be called; a bare `plt.show` only echoes the function object.
plt.show()

## Average Len

In [None]:
# Length of every essay, measured in whitespace-separated words.
df_train["text_len"] = [len(essay.split()) for essay in df_train["text"]]

In [None]:
# Histogram (20 bins, with a KDE curve) of the per-essay word counts.
ax = sns.histplot(df_train['text_len'], bins=20, kde=True)
ax.set(title='Histogram of Text Length',
       xlabel='Text Length',
       ylabel='Frequency')
plt.show()

In [None]:
# Mean and maximum essay length (in words), one value per line.
print(df_train["text_len"].mean(), df_train["text_len"].max(), sep="\n")

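Since the average sequence length is about 377 words, BERT with a maximum sequence length of 512 should be enough. Note that this length is counted in whitespace-separated words, not in BERT tokens.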

## Data pre-process

In [None]:
from functools import lru_cache

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download the stopwords and tokenizer data if they are not installed yet
# (you can move these lines to the main part of your code if you prefer).
# nltk.download('stopwords')
# nltk.download('punkt')


@lru_cache(maxsize=None)
def _stopword_set(language):
    """Return the NLTK stopwords of `language` as a frozenset, loaded once per language."""
    return frozenset(stopwords.words(language))


def remove_stopwords(input_text, language='english'):
    """
    Remove stopwords from a text.

    Parameters:
    - input_text: Either a row-like mapping (e.g. a DataFrame row) with a
      'text' entry holding the text and an integer 'index_col' entry used for
      progress reporting, or a plain string containing the text itself.
    - language (str): The language of the stopwords. Default is 'english'.

    Returns:
    - str: The remaining tokens joined by single spaces.
    """
    is_plain_text = isinstance(input_text, str)
    text = input_text if is_plain_text else input_text['text']

    # Tokenize the text
    words = word_tokenize(text)

    # Progress trace for row inputs: report every 10,000th index.
    if not is_plain_text and input_text['index_col'] % 10000 == 0:
        print(f"Index {input_text['index_col']}")

    # Look the stopwords up once (cached set) instead of re-reading the list
    # for every token.
    stop_words = _stopword_set(language)
    filtered_words = [word for word in words if word.lower() not in stop_words]

    # Join the filtered words to form the cleaned text
    return ' '.join(filtered_words)

### Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the merged essays. Stratifying on the label keeps the ratio
# of the `generated` classes the same in both splits; random_state fixes the
# shuffle so the split is reproducible.
x, y = df_train['text'], df_train['generated']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=2529, stratify=y)


# Model

In [None]:
# Import libraries
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub


In [None]:
# Kaggle input paths of the BERT encoder and of the text preprocessor that are
# loaded through hub.KerasLayer.
model_path = "/kaggle/input/bert/tensorflow2/bert-en-uncased-l-12-h-128-a-2/2"
preprocess_path = "/kaggle/input/bert/tensorflow2/en-uncased-preprocess/3/"

In [None]:
# Raw essay strings go in; the hub preprocessor converts them into the inputs
# expected by the encoder.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(preprocess_path)
# NOTE(review): the preprocessor is used with its default settings, so the
# sequence length is whatever that layer defaults to — confirm it matches the
# 512 sequence length assumed earlier in the notebook.
encoder_inputs = preprocessor(text_input)
encoder = hub.KerasLayer(
    model_path,
    trainable=True)  # the encoder weights are fine-tuned together with the head
outputs = encoder(encoder_inputs)
# hidden_size is set by the encoder (the "h-128" in model_path suggests 128 — TODO confirm).
pooled_output = outputs["pooled_output"]      # [batch_size, hidden_size].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, hidden_size].

# Classification head: Dense(128) -> Dropout -> Dense(64) -> Dropout -> sigmoid.
dense_1 = tf.keras.layers.Dense(128 , activation='relu')(pooled_output)
# The dropout must consume dense_1 (not pooled_output); otherwise the first
# dense layer is never connected to the model output.
dropout = tf.keras.layers.Dropout(0.7 , name="dropout1")(dense_1)
dense_2 = tf.keras.layers.Dense(64 , activation='relu')(dropout)
dropout = tf.keras.layers.Dropout(0.5 , name="dropout2")(dense_2)

# Single sigmoid unit: one score in [0, 1] per essay.
dense_out = tf.keras.layers.Dense(1 , activation='sigmoid', name='output')(dropout)


model = tf.keras.Model(inputs=text_input, outputs=dense_out)
model.summary()

In [None]:
# https://huggingface.co/google/bert_uncased_L-12_H-128_A-2
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss='binary_crossentropy',
              metrics=["acc"])

checkpoint_filepath = 'checkpoint.hdf5'
metric = 'val_accuracy'
callback_list = [tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
                                                    monitor=metric,
                                                    verbose=2,
                                                    save_best_only=True,
                                                    mode='max'),
                 tf.keras.callbacks.EarlyStopping(monitor=metric,
                                                  patience=0,
                                                  restore_best_weights=True)
                ]
history = model.fit(x_train, y_train , batch_size=8, callbacks=[callback_list],
                    epochs=5 , validation_data=(x_test, y_test))
# model.load_weights(checkpoint_filepath)
model.save("model-bert")

# Test

In [None]:
# Accuracy on the training split.
loss, acc = model.evaluate(x=x_train, y=y_train)
print("Accuracy on Train data:", acc)

# Accuracy on the held-out split (the same split that was passed to fit() as
# validation_data).
loss, acc = model.evaluate(x=x_test, y=y_test)
print("Accuracy on Test data:", acc)

In [None]:
# Score the competition test essays with the trained model and display the
# raw predictions (the model ends in a single sigmoid unit).
y_pred = model.predict(df_test_essay['text'])
y_pred

In [None]:
# Pair every test essay id with the first (only) column of the predictions.
submission_data = dict(id=df_test_essay['id'], generated=y_pred[:, 0])
submission = pd.DataFrame(data=submission_data)

# Write the submission file without the DataFrame index.
submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Read the file back from the same relative path it was written to, so the
# check does not depend on the working directory being /kaggle/working.
pd.read_csv("submission.csv")