In [None]:
import os
import sys

import numpy as np
import pandas as pd
from gensim.models import FastText
from gensim.models import KeyedVectors
from gensim.models.fasttext import load_facebook_vectors
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

sys.path.append('../dataloader')
from dataloader import daigtv2_loader
sys.path.append('../part1')

In [4]:

# -------- 1. Load your data --------
# Folder handed to daigtv2_loader. The default is the original author's local
# OneDrive folder; set the DAIGT_DATA_DIR environment variable to point at your
# own copy instead of editing this cell.
DEFAULT_DATA_DIR = "~/OneDrive - Georgia Institute of Technology/GT - Spring 2025/ISYE 4600/Final Project/"
path_to_folder = os.environ.get("DAIGT_DATA_DIR", DEFAULT_DATA_DIR)

df = daigtv2_loader(path_to_folder)
texts = df['text'].values    # documents, used for FastText training and tokenizing
labels = df['label'].values  # targets passed to model.fit

In [None]:

# -------- 2. Train FastText on your corpus --------
# (Alternatively: load pre-trained vectors with
#  gensim.models.fasttext.load_facebook_vectors.)
#
# Sentences are tokenized with text_to_word_sequence, which applies the same
# default lower-casing and punctuation filter as the Keras Tokenizer. That way
# the words FastText learns are spelled exactly like the keys later looked up
# from tokenizer.word_index; a plain str.split() would keep case and attached
# punctuation ("Hello," vs "hello").
ft_model = FastText(
    sentences=[text_to_word_sequence(t) for t in texts],
    vector_size=300,
    window=5,
    min_count=2,   # words seen only once get no dedicated vocabulary entry
    epochs=10
)

# -------- 3. Tokenize and pad --------
MAX_VOCAB = 20000  # passed to Tokenizer as num_words
MAX_LEN = 500      # every sequence is padded / cut to this many tokens

# Learn the word -> integer index mapping from the corpus; '<OOV>' is the
# placeholder token for words outside the kept vocabulary.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=MAX_VOCAB)
tokenizer.fit_on_texts(texts)

# Encode every text, then pad or truncate at the end of each sequence.
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(
    sequences,
    padding='post',
    truncating='post',
    maxlen=MAX_LEN,
)

# -------- 4. Build embedding matrix --------
word_index = tokenizer.word_index
# Tokenizer(num_words=MAX_VOCAB) only emits indices below MAX_VOCAB (0 is the
# padding value written by pad_sequences), so MAX_VOCAB rows are enough; for a
# smaller corpus the rows are indices 0..len(word_index).
vocab_size = min(MAX_VOCAB, len(word_index) + 1)
embedding_dim = ft_model.vector_size

# Random fallback rows for any word FastText cannot supply. A seeded generator
# makes the matrix identical every time the cell is re-run.
EMBEDDING_SEED = 42
embedding_matrix = np.random.default_rng(EMBEDDING_SEED).normal(
    size=(vocab_size, embedding_dim)
).astype(np.float32)
# Index 0 is padding, not a word: give it an all-zero vector instead of noise.
embedding_matrix[0] = 0.0

for word, idx in word_index.items():
    if idx >= vocab_size:
        continue  # rank beyond the kept vocabulary; has no row in the matrix
    if word in ft_model.wv:
        embedding_matrix[idx] = ft_model.wv[word]

# -------- 5. Define the Keras model --------
# Binary classifier: frozen FastText embeddings -> LSTM -> dropout -> sigmoid.
model = Sequential([
    # Explicit input spec (replaces the legacy Embedding `input_length`
    # argument) so the model is built and model.summary() can report shapes.
    Input(shape=(MAX_LEN,), dtype='int32'),
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        # Constant initializer instead of the legacy `weights=[...]` kwarg.
        embeddings_initializer=Constant(embedding_matrix),
        # Sequences are post-padded with 0. Masking makes the LSTM skip those
        # steps, so its final state comes from the last real token rather than
        # from a long run of padding.
        mask_zero=True,
        trainable=False,    # freeze FastText weights
    ),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -------- 6. Train --------
# validation_split holds out the *last* 10% of the rows exactly as passed in,
# before Keras shuffles anything. If the loader returns rows grouped by label
# or source, that slice is not representative, so shuffle once up front with a
# fixed seed. X and labels themselves are left untouched.
SPLIT_SEED = 42
shuffle_order = np.random.default_rng(SPLIT_SEED).permutation(len(X))

history = model.fit(
    X[shuffle_order],
    np.asarray(labels)[shuffle_order],
    batch_size=32,
    epochs=5,
    validation_split=0.1
)


KeyboardInterrupt: 

In [None]:
# -------- 2. Load pre-trained FastText vectors --------
# Download cc.en.300.bin.gz from https://fasttext.cc/docs/en/crawl-vectors.html
# and decompress it to cc.en.300.bin.
FASTTEXT_BIN = 'cc.en.300.bin'
# The .bin file is Facebook's native fastText binary, not the word2vec binary
# layout, so KeyedVectors.load_word2vec_format(..., binary=True) cannot parse
# it. load_facebook_vectors reads the native format and returns keyed vectors
# supporting `word in ft_wv`, `ft_wv[word]` and `ft_wv.vector_size`.
ft_wv = load_facebook_vectors(FASTTEXT_BIN)

# -------- 3. Tokenize and pad --------
MAX_VOCAB = 20000  # passed to Tokenizer as num_words
MAX_LEN = 500      # every sequence is padded / cut to this many tokens

# Fit the word -> integer index mapping on the corpus; '<OOV>' stands in for
# words outside the kept vocabulary.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=MAX_VOCAB)
tokenizer.fit_on_texts(texts)

# Encode every text, then pad or truncate at the end of each sequence.
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(
    sequences,
    padding='post',
    truncating='post',
    maxlen=MAX_LEN,
)

# -------- 4. Build embedding matrix using pre-trained vectors --------
word_index = tokenizer.word_index
# Tokenizer(num_words=MAX_VOCAB) only emits indices below MAX_VOCAB (0 is the
# padding value written by pad_sequences), so MAX_VOCAB rows are enough; for a
# smaller corpus the rows are indices 0..len(word_index).
vocab_size = min(MAX_VOCAB, len(word_index) + 1)
embedding_dim = ft_wv.vector_size

# Start with random vectors for any token the pre-trained vectors cannot
# supply. A seeded generator makes the matrix identical on every re-run.
EMBEDDING_SEED = 42
embedding_matrix = np.random.default_rng(EMBEDDING_SEED).normal(
    size=(vocab_size, embedding_dim)
).astype(np.float32)
# Index 0 is padding, not a word: give it an all-zero vector instead of noise.
embedding_matrix[0] = 0.0

for word, idx in word_index.items():
    if idx >= vocab_size:
        continue  # rank beyond the kept vocabulary; has no row in the matrix
    if word in ft_wv:
        embedding_matrix[idx] = ft_wv[word]

# -------- 5. Define the Keras model --------
# Binary classifier: frozen pre-trained embeddings -> LSTM -> dropout -> sigmoid.
model = Sequential([
    # Explicit input spec (replaces the legacy Embedding `input_length`
    # argument) so the model is built and model.summary() can report shapes.
    Input(shape=(MAX_LEN,), dtype='int32'),
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        # Constant initializer instead of the legacy `weights=[...]` kwarg.
        embeddings_initializer=Constant(embedding_matrix),
        # Sequences are post-padded with 0. Masking makes the LSTM skip those
        # steps, so its final state comes from the last real token rather than
        # from a long run of padding.
        mask_zero=True,
        trainable=False         # freeze embeddings for speed/stability
    ),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=Adam(1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -------- 6. Train --------
# validation_split holds out the *last* 10% of the rows exactly as passed in,
# before Keras shuffles anything. If the loader returns rows grouped by label
# or source, that slice is not representative, so shuffle once up front with a
# fixed seed. X and labels themselves are left untouched.
SPLIT_SEED = 42
shuffle_order = np.random.default_rng(SPLIT_SEED).permutation(len(X))

history = model.fit(
    X[shuffle_order],
    np.asarray(labels)[shuffle_order],
    batch_size=32,
    epochs=5,
    validation_split=0.1
)
