In [1]:
import numpy as np
import pandas as pd
import nltk
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Input, SimpleRNN, LSTM , GRU, Bidirectional, Embedding
from keras.layers import Dropout
from keras.utils import to_categorical
from keras.callbacks import Callback

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


import random

# Seed every random number generator this notebook relies on, not just
# Python's: NumPy and TensorFlow keep their own RNG state, and TensorFlow's
# drives weight initialisation and batch shuffling during training.
random.seed(0)
np.random.seed(0)
tf.random.set_seed(0)


In [2]:
# Read the labelled training essays from disk and preview the first rows.
data = pd.read_csv(
    'data/train_essay_combined.csv',
    encoding='utf-8',
)
data.head(n=5)

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


In [3]:
# `astype` returns a new Series, so the result has to be assigned back for the
# cast to take effect. Without the assignment the column is left unchanged and
# any non-string value (e.g. NaN) would break `.lower()` in preprocess_text.
data['text'] = data['text'].astype(str)
data['text']

0       Cars. Cars have been around since they became ...
1       Transportation is a large necessity in most co...
2       "America's love affair with it's vehicles seem...
3       How often do you ride in a car? Do you drive a...
4       Cars are a wonderful thing. They are perhaps o...
                              ...                        
2757    Dear Senator,\n\nI am writing to you today to ...
2758    Dear Senator,\n\nI am writing to you today to ...
2759    Dear Senator,\n\nI am writing to you today to ...
2760    Dear Senator,\n\nI am writing to you today to ...
2761    Dear Senator,\n\nI am writing to you today to ...
Name: text, Length: 2762, dtype: object

In [4]:
# Hold out 20% of the labelled essays for validation; the fixed random_state
# keeps the split identical between runs.
train, val = train_test_split(
    data,
    random_state=0,
    test_size=0.2,
)

In [5]:
def preprocess_text(text):
    """Lower-case ``text`` and replace newlines, tabs and carriage returns with spaces.

    Each control character becomes exactly one space; runs of whitespace are
    not collapsed.
    """
    normalised = text.lower()
    for control_char in ('\n', '\t', '\r'):
        normalised = normalised.replace(control_char, ' ')
    return normalised

In [6]:
# Read the unlabelled essays that are scored at the end of the notebook.
test = pd.read_csv(
    "data/test_essays.csv",
    encoding='utf-8',
)
test.head(n=5)

Unnamed: 0,id,prompt_id,text
0,0000aaaa,2,Aaa bbb ccc.
1,1111bbbb,3,Bbb ccc ddd.
2,2222cccc,4,CCC ddd eee.
3,3333dddd,5,Recent advancements in large language model (L...
4,4444eeee,6,"The Kaggle competition, titled LLM - Detect AI..."


In [7]:
from spacy.lang.en import English

nlp = English()
tokenizer = nlp.tokenizer


def tokenize_essays(frame, tokenize=tokenizer):
    """Return one list of token strings per essay in ``frame["text"]``.

    Each essay is normalised with ``preprocess_text`` before being split.

    Args:
        frame: DataFrame with a ``text`` column of essay strings.
        tokenize: callable turning a string into an iterable of tokens that
            expose ``.text``. The default is bound to the spaCy tokenizer at
            definition time, so later rebinding of the global ``tokenizer``
            name does not affect this function.

    Returns:
        A list, in row order, of lists of token strings.
    """
    return [
        [token.text for token in tokenize(preprocess_text(essay))]
        for essay in frame["text"]
    ]


# One call per split instead of three copies of the same row-by-row loop.
train_essays = tokenize_essays(train)
val_essays = tokenize_essays(val)
test_essays = tokenize_essays(test)

In [8]:
# Build the word index over every labelled essay (training + validation).
# NOTE: this rebinds `tokenizer` from the spaCy tokenizer to a Keras Tokenizer.
tokenizer = keras.preprocessing.text.Tokenizer()
texts = [*train_essays, *val_essays]
tokenizer.fit_on_texts(texts)

# +1 leaves room for index 0, which the Keras Tokenizer never assigns to a word.
vocab_size = 1 + len(tokenizer.word_index)

In [9]:
# Convert each tokenised essay into a list of integer word ids.
train_sequences = tokenizer.texts_to_sequences(train_essays)
val_sequences = tokenizer.texts_to_sequences(val_essays)
test_sequences = tokenizer.texts_to_sequences(test_essays)

# The longest *training* essay fixes the input width used for every split.
max_length = max(len(sequence) for sequence in train_sequences)
print('Maximum essay length: {}'.format(max_length))


def pad_to_length(sequences, length):
    """Pad ``sequences`` at the end so each row has exactly ``length`` entries.

    ``pad_sequences`` already returns a NumPy array, so no further conversion
    is needed. Sequences longer than ``length`` are truncated according to
    ``pad_sequences``' default ``truncating`` setting.
    """
    return keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=length, padding='post')


train_padded = pad_to_length(train_sequences, max_length)
val_padded = pad_to_length(val_sequences, max_length)
test_padded = pad_to_length(test_sequences, max_length)

# Read the labels straight from the column instead of iterating row by row.
train_labels = train["generated"].to_numpy()
val_labels = val["generated"].to_numpy()

Maximum essay length: 1496


In [10]:
class ROCCallback(Callback):
    """Keras callback that prints the validation ROC AUC after every epoch.

    Args:
        validation_data: ``(x_val, y_val)`` pair; ``x_val`` is passed to
            ``model.predict`` and ``y_val`` is compared with the predictions.
    """

    def __init__(self, validation_data):
        # Let the Keras base class initialise its own state before adding ours.
        super().__init__()
        self.x_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the stored validation inputs and print the ROC AUC."""
        # Flatten the predictions so roc_auc_score receives one score per
        # sample rather than a 2-D column.
        y_pred = np.ravel(self.model.predict(self.x_val))
        auc = roc_auc_score(self.y_val, y_pred)
        print(f"\nAUC-ROC on validation data: {auc:.4f}")

In [11]:
import os

# Reuse a previously saved model when one exists; otherwise build and train.
# NOTE(review): a cached model is only valid if it was trained with the same
# tokenizer vocabulary and max_length as this run - confirm before relying on it.
if os.path.exists("models/lstm.h5"):
    model = keras.models.load_model("models/lstm.h5")

else:
    embedding_dim = 300

    # Embedding -> two stacked bidirectional LSTMs -> dense head with a single
    # sigmoid output.
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Binary cross-entropy is the matching loss for a sigmoid output trained
    # on 0/1 labels; mean squared error was used here before.
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])

    model.summary()

    roc_callback = ROCCallback(validation_data=(val_padded, val_labels))

    # restore_best_weights keeps the weights from the best validation epoch
    # instead of whatever the final epoch produced.
    early_stopping = keras.callbacks.EarlyStopping(
        patience=3, restore_best_weights=True)

    model.fit(
        train_padded,
        train_labels,
        epochs=10,
        verbose=1,
        validation_data=(val_padded, val_labels),
        batch_size=64,
        callbacks=[early_stopping, roc_callback])


In [12]:
# Create the output directory first so a freshly trained model is not lost to
# a missing-folder error at save time.
os.makedirs('models', exist_ok=True)
model.save('models/lstm.h5')

  saving_api.save_model(


In [14]:
# Score the test essays and pair each prediction with its essay id.
pred = model.predict(test_padded)

# Flatten the predictions explicitly (the model built above ends in Dense(1))
# so the column receives one value per row instead of relying on pandas to
# coerce a 2-D array.
submission = pd.DataFrame({
    "id": test["id"],
    "generated": np.ravel(pred),
})
submission




Unnamed: 0,id,generated
0,0000aaaa,0.999998
1,1111bbbb,0.999998
2,2222cccc,0.999998
3,3333dddd,0.999966
4,4444eeee,0.999985
5,5555ffff,0.999998
