In [11]:
import io
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import sys
sys.path.append('../dataloader')
from dataloader import daigtv2_loader
sys.path.append('../part1')

In [4]:

# -------- 1. Load your data --------
# Replace with your actual data-loading code.
path_to_folder = "../../"

df = daigtv2_loader(path_to_folder)
texts = df['text'].values
labels = df['label'].values

In [6]:
# -------- 2. Tokenize and pad --------
MAX_VOCAB = 20000
MAX_LEN   = 500

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')

word_index = tokenizer.word_index
vocab_size = min(MAX_VOCAB, len(word_index)) + 1


In [7]:

# -------- 3. Load pretrained FastText vectors --------
EMBEDDING_DIM = 300
embedding_matrix = np.random.normal(
    size=(vocab_size, EMBEDDING_DIM)
).astype(np.float32)

def build_embedding_matrix(
    vec_file: str,
    word_index: dict,
    vocab_size: int,
    embedding_dim: int
) -> np.ndarray:
    """
    Reads `vec_file` (FastText .vec) line by line,
    and fills rows in embedding_matrix for words in word_index.
    """
    with io.open(vec_file, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        header = fin.readline()  # e.g. "1000000 300"
        for line in fin:
            parts = line.rstrip().split(' ')
            word = parts[0]
            if word in word_index:
                idx = word_index[word]
                if idx < vocab_size:
                    vect = np.asarray(parts[1:], dtype=np.float32)
                    if vect.shape[0] == embedding_dim:
                        embedding_matrix[idx] = vect
    return embedding_matrix

# Point this to wherever you downloaded “wiki-news-300d-1M.vec”
VEC_FILE = '../../wiki-news-300d-1M.vec'
embedding_matrix = build_embedding_matrix(
    VEC_FILE, word_index, vocab_size, EMBEDDING_DIM
)

In [20]:

# -------- 4. Define the Keras model --------
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_LEN,
        trainable=False,    # freeze pre-trained vectors
    ),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

# callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

model_checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    mode='min'
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6
)


# -------- 5. Train --------
history = model.fit(
    X,
    labels,
    batch_size=32,
    epochs=100,
    validation_split=0.1,
    callbacks=[early_stopping, model_checkpoint, reduce_lr]
)


Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 500, 300)          6000300   
                                                                 
 bidirectional_3 (Bidirectio  (None, 256)              439296    
 nal)                                                            
                                                                 
 dropout_7 (Dropout)         (None, 256)               0         
                                                                 
 dense_7 (Dense)             (None, 1)                 257       
                                                                 
Total params: 6,439,853
Trainable params: 439,553
Non-trainable params: 6,000,300
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


In [19]:

# -------- 4. Define the Keras model --------
model2 = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_LEN,
        trainable=False,    # freeze pre-trained vectors
    ),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model2.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model2.summary()

# callbacks
early_stopping2 = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

model_checkpoint2 = ModelCheckpoint(
    'best_model2.h5',
    monitor='val_accuracy',
    save_best_only=True,
    mode='min'
)

reduce_lr2 = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6
)


# -------- 5. Train --------
history = model2.fit(
    X,
    labels,
    batch_size=32,
    epochs=100,
    validation_split=0.1,
    callbacks=[early_stopping2, model_checkpoint2, reduce_lr2]
)

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 500, 300)          6000300   
                                                                 
 lstm_6 (LSTM)               (None, 128)               219648    
                                                                 
 dropout_6 (Dropout)         (None, 128)               0         
                                                                 
 dense_6 (Dense)             (None, 1)                 129       
                                                                 
Total params: 6,220,077
Trainable params: 219,777
Non-trainable params: 6,000,300
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


In [21]:
# restore best weights to each model
model.load_weights('best_model.h5')
model2.load_weights('best_model2.h5')

In [22]:
# evaluate the models
loss, accuracy = model.evaluate(X, labels, verbose=0)
print(f"Model 1 - Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")
loss2, accuracy2 = model2.evaluate(X, labels, verbose=0)
print(f"Model 2 - Loss: {loss2:.4f}, Accuracy: {accuracy2:.4f}")

Model 1 - Loss: 0.2839, Accuracy: 0.8899
Model 2 - Loss: 0.3750, Accuracy: 0.8730


In [28]:
# test specific examples (human from reddit)
human_text =  "I personally have a different argument as to why the electoral college is a good thing. My argument is more about trying to refocus what the office of the President is supposed to be about - the representative of the many states for foreign relations and treaties and as the check on Congress.\n\nThe president continues to be regarded as the \"most powerful person in the world\" which elevates the office to almost monarch-like reverence. The problem with this is it excludes Congress as to where the power is and should be. The notion to change to popular vote gives even more power to what should be a weak executive. A strong(er) executive opens the door to more authoritarian figures to just simply appeal to a cult of personality.\n\nSo, the EC is the states voting for their representative. This is it's true purpose and why it should continue IMO."

# gemini generated text
ai_text = "Dear Senator,\n\nI am writing to you today to express my strong support for abolishing the Electoral College and electing the President by popular vote. I believe that this is the only way to ensure that every American's vote counts and that our elections are truly representative of the will of the people.\n\nThe Electoral College is a system that was devised over 200 years ago, when the United States was a very different country. At the time, it was believed that the Electoral College would help to protect the interests of smaller states against the larger states. However, the Electoral College has become increasingly outdated and irrelevant in the 21st century.\n\nOne of the biggest problems with the Electoral College is that it gives too much power to a small number of states. In the 2016 election, for example, Donald Trump won the Electoral College despite losing the popular vote by nearly three million votes. This is because Trump won a majority of the electoral votes in a small number of swing states, such as Pennsylvania, Michigan, and Wisconsin.\n\nThis system is unfair to the voters in the states that Trump lost. Their votes were essentially ignored, and they had no say in who became President. This is not how a democracy should work.\n\nAnother problem with the Electoral College is that it encourages candidates to focus on a small number of swing states. In the 2016 election, for example, Trump spent very little time campaigning in states that he was sure to win, such as California and Texas. Instead, he focused all of his attention on the swing states, where he knew that the election would be decided.\n\nThis is not a good way to run a presidential election. Candidates should be campaigning all over the country, not just in a handful of swing states. This is the only way to ensure that all Americans have a voice in the election.\n\nI urge you to support legislation that would abolish the Electoral College and elect the President by popular vote. This is the only way to ensure that our elections are truly fair and representative of the will of the people.\n\nThank you for your time and consideration.\n\nSincerely,\n\n[Your Name]"

# preprocess the text
human_seq = tokenizer.texts_to_sequences([human_text])
human_seq = pad_sequences(human_seq, maxlen=MAX_LEN, padding='post', truncating='post')
ai_seq = tokenizer.texts_to_sequences([ai_text])
ai_seq = pad_sequences(ai_seq, maxlen=MAX_LEN, padding='post', truncating='post')

# make predictions
human_pred = model.predict(human_seq)
ai_pred = model.predict(ai_seq)

print(f"Human text prediction: {human_pred[0][0]:.4f}")
print(f"AI text prediction: {ai_pred[0][0]:.4f}")


Human text prediction: 0.4908
AI text prediction: 0.9684
