In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import nltk # NLP and string preprocessing
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer # NLP preprocessing
from sklearn.model_selection import train_test_split # test train split
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Sequential # sequential neural network
from tensorflow.keras.layers import Dense # dense layer for nn
import tensorflow as tf

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv


In [2]:
df_train = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")
df_test = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
df_train_prompts = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv")

In [3]:
df_train.head()

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


In [4]:
df_test.head()

Unnamed: 0,id,prompt_id,text
0,0000aaaa,2,Aaa bbb ccc.
1,1111bbbb,3,Bbb ccc ddd.
2,2222cccc,4,CCC ddd eee.


In [5]:
df_train_prompts.head()

Unnamed: 0,prompt_id,prompt_name,instructions,source_text
0,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
1,1,Does the electoral college work?,Write a letter to your state senator in which ...,# What Is the Electoral College? by the Office...


Create a function to preprocess text for NLP:
1. Tokenize text into words
2. Remove non-alphabetic characters
3. Remove stop words (filler words without sentiment or relevence to NLP)
4. Stem words (reduce words to their basic meaning)
5. Make all characters lowercase

In [6]:
def text_process(text):
    """
    Processes text by tokenizing, removing stop words, stemming,
    and returning a cleaned string.

    Args:
        text: The string to be processed.

    Returns:
        A string containing the cleaned and processed text.
    """
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    ps = PorterStemmer()
    filtered_tokens = [ps.stem(word.lower()) for word in tokens if word.isalnum() and word.lower() not in stop_words]
    return " ".join(filtered_tokens)

### Apply NLP Preprocessing to Text

Apply preprocessing to df_trian text

In [7]:
df_train["text"] = df_train["text"].apply(text_process)

Apply preprocessing to df_test text

In [8]:
df_test['text'] = df_test["text"].apply(text_process)

Remove id column

In [9]:
df_train.drop(columns=['id'], inplace=True)
df_train.head()

Unnamed: 0,prompt_id,text,generated
0,0,car car around sinc becam famou 1900 henri for...,0
1,0,transport larg necess countri worldwid doubt c...,0
2,0,america love affair vehicl seem cool say elisa...,0
3,0,often ride car drive one motor vehicl work sto...,0
4,0,car wonder thing perhap one world greatest adv...,0


In [10]:
tfidf_vectorizer = TfidfVectorizer(max_features=500 )

X = tfidf_vectorizer.fit_transform(df_train["text"]).toarray()
y = df_train["generated"]

Create test, train split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
model = Sequential()
model.add(Dense(100, activation="relu", input_dim =500 ))
model.add(Dense(50, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

In [13]:
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [14]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
y_pred_binary[0]



array([0])

In [16]:
accuracy = accuracy_score(y_test, y_pred_binary)


print(accuracy*100)

99.63768115942028


In [17]:
X_tfidf_test = tfidf_vectorizer.transform(df_test['text']).toarray()
X_combined_tensor_test = tf.convert_to_tensor(X_tfidf_test, dtype=tf.float32)

In [18]:
test_pred = model.predict(X_combined_tensor_test)
test_pred_binary = (test_pred > 0.5).astype(float)
test_pred_binary



array([[0.],
       [0.],
       [0.]])

In [19]:
submission_df = pd.DataFrame({'id': df_test['id'], 'generated': test_pred_binary.flatten()})
submission_df.to_csv("submission.csv", index=False, quoting=3)
final=pd.read_csv('submission.csv')
final.head()

Unnamed: 0,id,generated
0,0000aaaa,0.0
1,1111bbbb,0.0
2,2222cccc,0.0
