In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [6]:
# Read the CSV of texts and their `generated` labels, then preview the
# first five rows.
df = pd.read_csv(filepath_or_buffer='data/AI_Human.csv')
df.head(n=5)

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0


In [29]:
# Number of rows currently held in df.
df.shape[0]

487235

In [30]:
# Build a balanced working set: separate the rows by their `generated` label,
# then draw the same number of rows from each label group.
df_zero = df.loc[df['generated'].eq(0)]
df_one = df.loc[df['generated'].eq(1)]

# 5000 rows per label; the fixed random_state makes the draw repeatable
# for a given input frame.
df_zero_sampled = df_zero.sample(n=5000, random_state=1)
df_one_sampled = df_one.sample(n=5000, random_state=1)

# Stack both samples (label-0 rows first); the original row index is kept.
# NOTE: this rebinds `df`, so the full frame is no longer reachable under
# that name once the cell has run.
df = pd.concat(objs=[df_zero_sampled, df_one_sampled])

In [31]:
# Separate the label column from the remaining columns, then hold out 25% of
# the rows as a test set (fixed random_state for a repeatable split).
y = df['generated']
X = df.drop('generated', axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=45,
)

In [32]:
# Longest text in df, measured in whitespace-separated words.
# Computed over every row of df, not just a training subset.
word_counts = df['text'].map(lambda text: len(text.split()))
maxlen = word_counts.max()
print(f'Max length: {maxlen}')

Max length: 1642


In [33]:
# Count the distinct tokens a default Keras Tokenizer finds across every
# text in df; the size of its word index is taken as the vocabulary size.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=df['text'])
vocab_size = len(tokenizer.word_index)

print(f'Estimated vocabulary size: {vocab_size}')

Estimated vocabulary size: 40378


In [36]:
# Setting the number of words to consider as features
max_features = vocab_size

# Pull the raw texts from the untouched feature frame `X`, selecting rows by
# the index labels of y_train / y_test. train_test_split splits X and y on the
# same rows in the same order, so this yields exactly the texts of the
# train / test partitions. Unlike reading X_train['text'], it stays valid after
# X_train / X_test are rebound to padded arrays at the end of this cell, so the
# cell can be re-run without restarting the kernel.
# Assumes the index labels of X are unique (true for a freshly read CSV that
# was sampled without replacement).
train_texts = X.loc[y_train.index, 'text'].tolist()
test_texts = X.loc[y_test.index, 'text'].tolist()

# Instantiating the tokenizer and fitting it on the training texts only;
# words it cannot map are encoded with the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Converting texts to sequences of integer token ids
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Padding (or truncating) every sequence to maxlen
X_train = pad_sequences(train_sequences, maxlen=maxlen)
X_test = pad_sequences(test_sequences, maxlen=maxlen)

In [37]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling are repeatable across fresh runs.
set_random_seed(45)

# Define the LSTM model: token ids -> 32-dimensional embeddings -> LSTM with
# 32 units -> a single sigmoid unit producing the probability of class 1.
model = Sequential([
    Embedding(max_features, 32),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='rmsprop',  # rmsprop is a common default choice for RNNs
              loss='binary_crossentropy',
              metrics=['accuracy'])

# validation_split holds out the last 20% of the training samples so loss and
# accuracy can be monitored on data the model is not fitted on. It only
# reports overfitting; it does not prevent it.
# The History object is kept so the per-epoch curves can be inspected later
# (history.history) instead of leaving a bare object repr as the cell output.
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=128,
                    validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x31098da90>

In [38]:
# Score the trained model on the held-out test split.
# evaluate() returns the loss followed by the compiled metrics.
results = model.evaluate(x=X_test, y=y_test)



In [39]:
# Answers to the assignment.
# Note! Do not edit this cell, just run it after you complete the task. 

# TODO: Put the result of the model.evaluate() function in the results variable.
# TODO: set metrics=['accuracy'] to monitor in model.compile() to see how much of the neural network's predictions is correct at different stages of training.

# results[0] is the test loss and results[1] the test accuracy (the model is
# compiled with metrics=['accuracy']); accuracy is scaled to a percentage here.
print(f"Test Loss:{results[0]} Test Accuracy:{results[1]*100}%")

Test Loss:0.08526962995529175 Test Accuracy:98.0400025844574%
