In [1]:
import sys
import os
from constants import ROOT_DIR
import pandas as pd
import random

# Make the project's src/ directory importable from this notebook.
# Adapted from Taras Alenin's answer on StackOverflow at:
# https://stackoverflow.com/a/55623567
src_path = os.path.join(ROOT_DIR, 'src')
if not any(entry == src_path for entry in sys.path):
    # Prepend so src/ is searched ahead of the existing sys.path entries.
    sys.path.insert(0, src_path)

# Import custom modules
from lila_dataset import LILADataset  # noqa: E402

In [2]:
def convert_pair(pid, p, v):
    """Turn one tokenized pair back into two strings plus its label.

    Args:
        pid: Identifier attached to both returned records.
        p: Indexable pair record. ``p[0]`` and ``p[1]`` are mappings whose
            ``'input_ids'`` entry supports ``.squeeze().tolist()``; ``p[2]``
            is the ground-truth label and is passed through unchanged.
        v: Lookup from token id to token string.

    Returns:
        A 2-tuple ``((pid, [anchor_text, other_text]), (pid, truth))`` where
        each text is the looked-up tokens joined by single spaces.
    """
    # Drop the first and last id of each sequence (presumably the tokenizer's
    # special start/end tokens -- confirm against LILADataset).
    id_lists = [p[side]['input_ids'].squeeze().tolist()[1:-1]
                for side in (0, 1)]
    label = p[2]

    texts = [' '.join(map(v.__getitem__, ids)) for ids in id_lists]
    return (pid, texts), (pid, label)


def convert_pairs(dataset, vocab):
    """Convert every pair in ``dataset`` with ``convert_pair``.

    Each pair's id is its position in ``dataset``.

    Args:
        dataset: Iterable of pair records accepted by ``convert_pair``.
        vocab: Lookup from token id to token string.

    Returns:
        ``(converted_pairs, truths)``: two lists in dataset order, holding the
        first and second element of each ``convert_pair`` result.
    """
    converted = [convert_pair(position, item, vocab)
                 for position, item in enumerate(dataset)]

    pair_records = [pair for pair, _ in converted]
    truth_records = [truth for _, truth in converted]
    return pair_records, truth_records


def save_pairs(ps, ts, view, fold, train, output_root="./data"):
    """Write converted pairs and their truths as two JSONL files.

    Files are written to
    ``{output_root}/{'train' or 'validate'}/{view}/k_{fold}/`` as
    ``pairs.jsonl`` and ``truth.jsonl``; existing files are overwritten.

    Args:
        ps: Records of ``(id, pair)``.
        ts: Records of ``(id, same)``.
        view: Name of the data view; used as a directory name.
        fold: Fold index; used in the ``k_{fold}`` directory name.
        train: Truthy to write under ``train``, falsy for ``validate``.
        output_root: Directory under which the output tree is created.
            Defaults to ``"./data"`` (relative to the working directory).
    """
    # Build both frames before touching the filesystem so that malformed
    # input fails before any file is created.
    df_p = pd.DataFrame(ps, columns=['id', 'pair'])
    df_t = pd.DataFrame(ts, columns=['id', 'same'])

    output_dir = f"{output_root}/{'train' if train else 'validate'}/{view}/k_{fold}"
    # exist_ok=True already makes this a no-op when the directory exists.
    os.makedirs(output_dir, exist_ok=True)

    outputs = (
        (df_p, 'pairs.jsonl', 'pairs'),
        (df_t, 'truth.jsonl', 'truths'),
    )
    for frame, filename, label in outputs:
        path = os.path.join(output_dir, filename)
        # One JSON object per line; explicit encoding so the file does not
        # depend on the platform's default.
        with open(path, 'w', encoding='utf-8') as f:
            f.write(frame.to_json(orient='records', lines=True))
        print(f"Wrote {label} to {path}")

In [3]:
distortion_dirs = [
    "undistorted",
    "DV-MA-k-300",
    "DV-MA-k-3000",
    "DV-MA-k-20000"
]

# Settings shared by every view.
metadata_path = '../data/normalized/metadata.csv'
# With 5 splits each fold is an 80/20 train/val split.
num_splits = 5

# Dedicated, seeded generator for shuffling the exported pairs, so re-running
# this cell writes them in the same order without touching the global
# `random` state. NOTE(review): pair generation inside LILADataset is not
# visible here; this only makes the shuffle step reproducible.
SHUFFLE_SEED = 42
shuffle_rng = random.Random(SHUFFLE_SEED)

for view in distortion_dirs:
    # Reset any existing splits
    LILADataset.reset_splits()

    view_path = f'../data/normalized/{view}'

    # Instantiate the full LILA dataset for this view
    full_dataset = LILADataset(view_path,
                               metadata_path,
                               cnk_size=512,
                               num_pairs=20_720,
                               num_splits=num_splits)

    # Get the model's vocabulary for converting ids back to words
    # Adapted from:
    # https://discuss.huggingface.co/t/find-the-eqivalent-for-word-index-in-bert/13170
    # Swap keys and values to be {id: word} for quicker lookups
    # Adapted from:
    # https://stackoverflow.com/a/13149770
    # Built once per view rather than once per fold: it does not depend on
    # the fold index.
    vocab = {token_id: token
             for token, token_id in full_dataset.tokenizer.vocab.items()}

    for k in range(num_splits):
        train_dataset, val_dataset = full_dataset.get_train_val_datasets(k)

        # Run the conversion and saving pipeline.
        # Sampling len(...) items without replacement yields a shuffled copy
        # and leaves the dataset's own list untouched.
        train_pairs = shuffle_rng.sample(train_dataset._pairs,
                                         len(train_dataset._pairs))
        train_pairs, train_truths = convert_pairs(train_pairs, vocab)
        save_pairs(train_pairs, train_truths, view=view, fold=k, train=True)

        val_pairs = shuffle_rng.sample(val_dataset._pairs,
                                       len(val_dataset._pairs))
        val_pairs, val_truths = convert_pairs(val_pairs, vocab)
        save_pairs(val_pairs, val_truths, view=view, fold=k, train=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (58898 > 512). Running this sequence through the model will result in indexing errors


Wrote pairs to ./data/train/DV-MA-k-300/k_0/pairs.jsonl
Wrote truths to ./data/train/DV-MA-k-300/k_0/truth.jsonl
Wrote pairs to ./data/validate/DV-MA-k-300/k_0/pairs.jsonl
Wrote truths to ./data/validate/DV-MA-k-300/k_0/truth.jsonl
Wrote pairs to ./data/train/DV-MA-k-300/k_1/pairs.jsonl
Wrote truths to ./data/train/DV-MA-k-300/k_1/truth.jsonl
Wrote pairs to ./data/validate/DV-MA-k-300/k_1/pairs.jsonl
Wrote truths to ./data/validate/DV-MA-k-300/k_1/truth.jsonl
Wrote pairs to ./data/train/DV-MA-k-300/k_2/pairs.jsonl
Wrote truths to ./data/train/DV-MA-k-300/k_2/truth.jsonl
Wrote pairs to ./data/validate/DV-MA-k-300/k_2/pairs.jsonl
Wrote truths to ./data/validate/DV-MA-k-300/k_2/truth.jsonl
Wrote pairs to ./data/train/DV-MA-k-300/k_3/pairs.jsonl
Wrote truths to ./data/train/DV-MA-k-300/k_3/truth.jsonl
Wrote pairs to ./data/validate/DV-MA-k-300/k_3/pairs.jsonl
Wrote truths to ./data/validate/DV-MA-k-300/k_3/truth.jsonl
Wrote pairs to ./data/train/DV-MA-k-300/k_4/pairs.jsonl
Wrote truths to 

Token indices sequence length is longer than the specified maximum sequence length for this model (37976 > 512). Running this sequence through the model will result in indexing errors


Wrote pairs to ./data/train/DV-MA-k-3000/k_0/pairs.jsonl
Wrote truths to ./data/train/DV-MA-k-3000/k_0/truth.jsonl
Wrote pairs to ./data/validate/DV-MA-k-3000/k_0/pairs.jsonl
Wrote truths to ./data/validate/DV-MA-k-3000/k_0/truth.jsonl
Wrote pairs to ./data/train/DV-MA-k-3000/k_1/pairs.jsonl
Wrote truths to ./data/train/DV-MA-k-3000/k_1/truth.jsonl
Wrote pairs to ./data/validate/DV-MA-k-3000/k_1/pairs.jsonl
Wrote truths to ./data/validate/DV-MA-k-3000/k_1/truth.jsonl
Wrote pairs to ./data/train/DV-MA-k-3000/k_2/pairs.jsonl
Wrote truths to ./data/train/DV-MA-k-3000/k_2/truth.jsonl
Wrote pairs to ./data/validate/DV-MA-k-3000/k_2/pairs.jsonl
Wrote truths to ./data/validate/DV-MA-k-3000/k_2/truth.jsonl
Wrote pairs to ./data/train/DV-MA-k-3000/k_3/pairs.jsonl
Wrote truths to ./data/train/DV-MA-k-3000/k_3/truth.jsonl
Wrote pairs to ./data/validate/DV-MA-k-3000/k_3/pairs.jsonl
Wrote truths to ./data/validate/DV-MA-k-3000/k_3/truth.jsonl
Wrote pairs to ./data/train/DV-MA-k-3000/k_4/pairs.jsonl

Token indices sequence length is longer than the specified maximum sequence length for this model (24306 > 512). Running this sequence through the model will result in indexing errors


Wrote pairs to ./data/train/DV-MA-k-20000/k_0/pairs.jsonl
Wrote truths to ./data/train/DV-MA-k-20000/k_0/truth.jsonl
Wrote pairs to ./data/validate/DV-MA-k-20000/k_0/pairs.jsonl
Wrote truths to ./data/validate/DV-MA-k-20000/k_0/truth.jsonl
Wrote pairs to ./data/train/DV-MA-k-20000/k_1/pairs.jsonl
Wrote truths to ./data/train/DV-MA-k-20000/k_1/truth.jsonl
Wrote pairs to ./data/validate/DV-MA-k-20000/k_1/pairs.jsonl
Wrote truths to ./data/validate/DV-MA-k-20000/k_1/truth.jsonl
Wrote pairs to ./data/train/DV-MA-k-20000/k_2/pairs.jsonl
Wrote truths to ./data/train/DV-MA-k-20000/k_2/truth.jsonl
Wrote pairs to ./data/validate/DV-MA-k-20000/k_2/pairs.jsonl
Wrote truths to ./data/validate/DV-MA-k-20000/k_2/truth.jsonl
Wrote pairs to ./data/train/DV-MA-k-20000/k_3/pairs.jsonl
Wrote truths to ./data/train/DV-MA-k-20000/k_3/truth.jsonl
Wrote pairs to ./data/validate/DV-MA-k-20000/k_3/pairs.jsonl
Wrote truths to ./data/validate/DV-MA-k-20000/k_3/truth.jsonl
Wrote pairs to ./data/train/DV-MA-k-2000

Token indices sequence length is longer than the specified maximum sequence length for this model (20656 > 512). Running this sequence through the model will result in indexing errors


AssertionError: The requested number of same pairs requires more chunks than can be generated from this population: 31 < 35

In [14]:
distortion_dirs = [
    "undistorted",
    "DV-MA-k-300",
    "DV-MA-k-3000",
    "DV-MA-k-20000"
]

# Every (view, fold) combination, views outermost, in the order the
# commands should be printed.
fold_runs = [(name, fold)
             for name in distortion_dirs
             for fold in range(num_splits)]

# Print one `cngdist.py` invocation per (view, fold). Every snippet ends in
# `&&`, so the printed text forms a single chained shell command that is
# closed off by the final echo.
for distortion_dir, k in fold_runs:
    command = f"""
echo "START: {distortion_dir}/k_{k}" && \\
python3 cngdist.py \\
        --model_dir="models/baseline/{distortion_dir}/k_{k}" \\
        -i="data/validate/{distortion_dir}/k_{k}" \\
        -num_iterations=0 \\
        -o="out/{distortion_dir}/k_{k}" \\
> evals/{distortion_dir}_k_{k}_CLIOUT.txt && \\
"""
    print(command)

print('echo "COMPLETE"')


echo "START: undistorted/k_0" && \
python3 cngdist.py \
        --model_dir="models/baseline/undistorted/k_0" \
        -i="data/validate/undistorted/k_0" \
        -num_iterations=0 \
        -o="out/undistorted/k_0" \
> evals/undistorted_k_0_CLIOUT.txt && \


echo "START: undistorted/k_1" && \
python3 cngdist.py \
        --model_dir="models/baseline/undistorted/k_1" \
        -i="data/validate/undistorted/k_1" \
        -num_iterations=0 \
        -o="out/undistorted/k_1" \
> evals/undistorted_k_1_CLIOUT.txt && \


echo "START: undistorted/k_2" && \
python3 cngdist.py \
        --model_dir="models/baseline/undistorted/k_2" \
        -i="data/validate/undistorted/k_2" \
        -num_iterations=0 \
        -o="out/undistorted/k_2" \
> evals/undistorted_k_2_CLIOUT.txt && \


echo "START: undistorted/k_3" && \
python3 cngdist.py \
        --model_dir="models/baseline/undistorted/k_3" \
        -i="data/validate/undistorted/k_3" \
        -num_iterations=0 \
        -o="out/undistorted/