In [16]:
import pandas as pd, numpy as np, ujson as json
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt

In [17]:
# Load claim and evidence files
with open("../data/train-claims.json") as f:
    claim = json.load(f)

with open("../data/evidence.json") as f:
    evidence = json.load(f)

# Build dataframe with evidence texts
df = pd.DataFrame([
    {
        "id": cid,
        "text": "[CLAIM] " + item["claim_text"] + " [EVIDENCE] " + " ".join([
            evidence[eid] for eid in item["evidences"] if eid in evidence
        ]),
        "label": item["claim_label"]
    }
    for cid, item in claim.items()
])


In [18]:
le = LabelEncoder()
df["label_encoded"] = le.fit_transform(df["label"])
NUM_CLASSES = len(le.classes_)

In [19]:
from sklearn.model_selection import StratifiedKFold

checkpoint_path = "../data/LSTM_model.keras"

k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

val_accuracies = []
best_model = None
best_val_acc = 0

for fold, (train_idx, val_idx) in enumerate(skf.split(df["text"], df["label_encoded"])):
    print(f"\nFold {fold+1}/{k}")
    
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    # Recreate TextVectorizer (can also cache across folds)
    vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=10000, output_sequence_length=256, pad_to_max_tokens=True
    )
    vectorizer.adapt(train_df["text"].values)

    # Create datasets
    def make_ds(d): return tf.data.Dataset.from_tensor_slices((d["text"].values, d["label_encoded"].values))\
        .batch(64).cache().prefetch(tf.data.AUTOTUNE)

    train_ds = make_ds(train_df)
    val_ds = make_ds(val_df)

    # Define model
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(), dtype=tf.string),
        vectorizer,
        tf.keras.layers.Embedding(10000, 64, mask_zero=True),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(len(df["label_encoded"].unique()), activation="softmax")
    ])

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"]
    )

    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=10,
        callbacks=[tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)],
        verbose=0
    )

    val_acc = max(history.history["val_accuracy"])
    print(f"✅ Fold {fold+1} best val_accuracy: {val_acc:.4f}")
    val_accuracies.append(val_acc)
    
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model = model
        model.save(checkpoint_path)



Fold 1/5




✅ Fold 1 best val_accuracy: 0.5163

Fold 2/5




✅ Fold 2 best val_accuracy: 0.5854

Fold 3/5




✅ Fold 3 best val_accuracy: 0.5650

Fold 4/5




✅ Fold 4 best val_accuracy: 0.5755

Fold 5/5




✅ Fold 5 best val_accuracy: 0.5469


In [20]:
print("\n📊 Cross-validation result:")
print(f"Mean val_accuracy: {np.mean(val_accuracies):.4f}")
print(f"Std dev: {np.std(val_accuracies):.4f}")


📊 Cross-validation result:
Mean val_accuracy: 0.5578
Std dev: 0.0244


In [21]:
with open("../data/dev-claims.json") as f:
    dev = json.load(f)

# Load your trained model
model = tf.keras.models.load_model(checkpoint_path)

output = {}

for cid, obj in dev.items():
    claim_text = obj["claim_text"]
    evidence_strs = [evidence.get(eid, "") for eid in obj["evidences"][:3]]  # top 3
    text_input = "[CLAIM] " + claim_text + " [EVIDENCE] " + " ".join(evidence_strs)
    
    # Predict: you must wrap in np.array and use .reshape or tf.convert
    pred = model.predict(tf.convert_to_tensor([text_input]), verbose=0)
    label = le.inverse_transform([np.argmax(pred)])[0]
    
    output[cid] = {
        "claim_text": claim_text,
        "evidences": obj["evidences"],
        "claim_label": label
    }


In [22]:
from collections import Counter

label_counts = Counter([entry["claim_label"] for entry in output.values()])

print("🔢 Label counts:")
for label, count in label_counts.items():
    print(f"{label:>20}: {count}")

🔢 Label counts:
            SUPPORTS: 123
     NOT_ENOUGH_INFO: 31


In [23]:
with open("../data/dev-predicted-lstm.json", "w") as f:
    json.dump(output, f, indent=2)


In [26]:
with open("../data/test-evidence-faiss.json") as f:
    test = json.load(f)

# Load your trained model
model = tf.keras.models.load_model(checkpoint_path)

output = {}

for cid, obj in test.items():
    claim_text = obj["claim_text"]
    evidence_strs = [evidence.get(eid, "") for eid in obj["evidences"][:3]]  # top 3
    text_input = "[CLAIM] " + claim_text + " [EVIDENCE] " + " ".join(evidence_strs)
    
    # Predict: you must wrap in np.array and use .reshape or tf.convert
    pred = model.predict(tf.convert_to_tensor([text_input]), verbose=0)
    label = le.inverse_transform([np.argmax(pred)])[0]
    
    output[cid] = {
        "claim_text": claim_text,
        "evidences": obj["evidences"],
        "claim_label": label
    }
with open("../data/test-predicted-lstm.json", "w") as f:
    json.dump(output, f, indent=2)



In [None]:
# class CFG:
#     sequence_length = 256
#     vocab_size = 10000
#     batch_size = 64
#     embed_dim = 128
#     hidden_1 = 32
#     hidden_2 = 32
#     lr = 1e-3
#     epochs = 20

In [None]:
# vectorizer = tf.keras.layers.TextVectorization(
#     max_tokens=CFG.vocab_size,
#     output_sequence_length=CFG.sequence_length,
#     pad_to_max_tokens=True
# )
# vectorizer.adapt(train_df["text"].values)

In [None]:
# def create_dataset(dataframe, shuffle=True):
#     ds = tf.data.Dataset.from_tensor_slices((dataframe["text"].values, dataframe["label_encoded"].values))
#     if shuffle:
#         ds = ds.shuffle(1024)
#     return ds.batch(CFG.batch_size).cache().prefetch(tf.data.AUTOTUNE)

# train_ds = create_dataset(train_df)
# val_ds = create_dataset(val_df, shuffle=False)

In [None]:
# model = tf.keras.Sequential([
#     tf.keras.layers.Input(shape=(), dtype=tf.string),
#     vectorizer,
#     tf.keras.layers.Embedding(CFG.vocab_size, CFG.embed_dim, mask_zero=True),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(CFG.hidden_1, return_sequences=True)),
#     tf.keras.layers.Dropout(0.3),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(CFG.hidden_2)),
#     tf.keras.layers.Dropout(0.3),
#     tf.keras.layers.Dense(32, activation="relu"),
#     tf.keras.layers.Dense(NUM_CLASSES, activation="softmax")
# ])

In [None]:
# model.compile(
#     loss=tf.keras.losses.SparseCategoricalCrossentropy(),
#     optimizer=tf.keras.optimizers.Adam(learning_rate=CFG.lr),
#     metrics=["accuracy"]
# )



In [None]:
# weights = compute_class_weight(
#     class_weight='balanced',
#     classes=np.unique(train_df["label_encoded"]),
#     y=train_df["label_encoded"]
# )
# class_weights = dict(enumerate(weights))

In [None]:
# # Train + save 
# checkpoint_path = "../data/LSTM_model.keras"

# history = model.fit(
#     train_ds,
#     validation_data=val_ds,
#     epochs=CFG.epochs,
#     class_weight=class_weights,
#     callbacks=[
#         tf.keras.callbacks.ModelCheckpoint(
#             checkpoint_path,
#             save_best_only=True,
#             monitor="val_accuracy",
#             mode="max"
#         ),
#         tf.keras.callbacks.EarlyStopping(
#             monitor='val_loss', 
#             patience=3, 
#             restore_best_weights=True
#         ),
#     ]
# )


In [None]:
# def plot_learning(history):
#     plt.figure(figsize=(12, 4))
#     for i, key in enumerate(["loss", "accuracy"]):
#         plt.subplot(1, 2, i+1)
#         plt.plot(history.history[key], label="train")
#         plt.plot(history.history[f"val_{key}"], label="val")
#         plt.title(key.capitalize())
#         plt.xlabel("Epoch")
#         plt.ylabel(key)
#         plt.legend()
#     plt.tight_layout()
#     plt.show()

# plot_learning(history)

In [None]:
# with open("../data/dev-claims.json") as f:
#     dev = json.load(f)

# # Load your trained model
# model = tf.keras.models.load_model(checkpoint_path)

# output = {}

# for cid, obj in dev.items():
#     claim_text = obj["claim_text"]
#     evidence_strs = [evidences.get(eid, "") for eid in obj["evidences"][:3]]  # top 3
#     text_input = "[CLAIM] " + claim_text + " [EVIDENCE] " + " ".join(evidence_strs)
    
#     # Predict: you must wrap in np.array and use .reshape or tf.convert
#     pred = model.predict(tf.convert_to_tensor([text_input]), verbose=0)
#     label = le.inverse_transform([np.argmax(pred)])[0]
    
#     output[cid] = {
#         "claim_text": claim_text,
#         "evidences": obj["evidences"],
#         "claim_label": label
#     }
