In [1]:
import sys
import os
from constants import ROOT_DIR
import pandas as pd
import random

# Add src directory to sys.path
# Adapted from Taras Alenin's answer on StackOverflow at:
# https://stackoverflow.com/a/55623567
# ROOT_DIR is imported from the `constants` module above. The membership
# check keeps repeated runs of this cell from inserting duplicate entries,
# and inserting at index 0 places src/ ahead of every other sys.path entry.
src_path = os.path.join(ROOT_DIR, 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Import custom modules
from lila_dataset import LILADataset  # noqa: E402

In [2]:
def _ids_to_text(encoding, v):
    """Decode one encoding's ``input_ids`` into a space-separated string.

    Args:
        encoding: Mapping with an ``'input_ids'`` entry that supports
            ``reshape(-1)`` and ``tolist()`` (e.g. a tensor or ndarray).
        v: Mapping from token id to token string.

    Returns:
        The tokens for every id except the first and the last, joined by
        single spaces.
    """
    # reshape(-1) always produces a flat 1-D sequence. squeeze() would turn
    # a single-token encoding into a 0-d value whose tolist() is a bare
    # scalar, which cannot be sliced.
    token_ids = encoding['input_ids'].reshape(-1).tolist()
    # The first and last ids are dropped (presumably the tokenizer's
    # start/end special tokens -- TODO confirm against LILADataset).
    return ' '.join(v[token_id] for token_id in token_ids[1:-1])


def convert_pair(pid, p, v):
    """Convert one tokenized pair back into text plus its truth record.

    Args:
        pid: Identifier stored alongside both returned records.
        p: Indexable pair where ``p[0]`` and ``p[1]`` are encodings holding
            ``'input_ids'`` and ``p[2]`` is the truth label.
        v: Mapping from token id to token string.

    Returns:
        A tuple ``((pid, [anchor_text, other_text]), (pid, truth))``.

    Raises:
        KeyError: If an id is missing from ``v`` (when ``v`` is a dict).
    """
    anchor_text = _ids_to_text(p[0], v)
    other_text = _ids_to_text(p[1], v)
    truth = p[2]

    return (pid, [anchor_text, other_text]), (pid, truth)


def convert_pairs(dataset, vocab):
    """Convert every pair in ``dataset`` to text and collect the results.

    Each item is passed to ``convert_pair`` together with its position in
    ``dataset``, which serves as the pair id.

    Args:
        dataset: Iterable of tokenized pairs.
        vocab: Mapping from token id to token string.

    Returns:
        A tuple ``(converted_pairs, truths)`` of two lists, in the same
        order as ``dataset``.
    """
    results = [convert_pair(index, item, vocab)
               for index, item in enumerate(dataset)]

    converted_pairs = [pair_record for pair_record, _ in results]
    truths = [truth_record for _, truth_record in results]
    return converted_pairs, truths


def save_pairs(ps, ts, view, fold, train,
               base_dir="../data/test/bias-investigation"):
    """Write converted pairs and their truth labels to JSONL files.

    The output directory is
    ``{base_dir}/{'train' or 'validate'}/{view}/k_{fold}``. It is created
    when missing, and ``pairs.jsonl`` / ``truth.jsonl`` inside it are
    overwritten.

    Args:
        ps: Rows of ``(id, pair)`` used to build the pairs frame.
        ts: Rows of ``(id, same)`` used to build the truths frame.
        view: Name of the dataset view; used as a path component.
        fold: Fold number; used as the ``k_{fold}`` path component.
        train: Truthy selects the ``train`` subdirectory, falsy selects
            ``validate``.
        base_dir: Root of the output tree. The default keeps the location
            this function has always written to.
    """
    df_p = pd.DataFrame(ps, columns=['id', 'pair'])
    df_t = pd.DataFrame(ts, columns=['id', 'same'])

    split = 'train' if train else 'validate'
    output_dir = f"{base_dir}/{split}/{view}/k_{fold}"
    # exist_ok=True already tolerates an existing directory, so a separate
    # existence check beforehand is unnecessary.
    os.makedirs(output_dir, exist_ok=True)

    # Write the pairs to JSONL
    pairs_path = os.path.join(output_dir, 'pairs.jsonl')
    with open(pairs_path, 'w', encoding='utf-8') as f:
        f.write(df_p.to_json(orient='records', lines=True))
    print(f"Wrote pairs to {pairs_path}")

    # Write the truths to JSONL
    truths_path = os.path.join(output_dir, 'truth.jsonl')
    with open(truths_path, 'w', encoding='utf-8') as f:
        f.write(df_t.to_json(orient='records', lines=True))
    print(f"Wrote truths to {truths_path}")

In [3]:
# Reset any existing splits
LILADataset.reset_splits()

view = 'DV-MA-k-20000'
view_path = f'../data/normalized/{view}'
metadata_path = '../data/normalized/metadata.csv'

# How many folds to export. 1 exports only fold 0, i.e. a single 80/20
# train/val split (equivalent to getting the first fold in 5-fold cross
# val). Raise it (up to num_folds) to export more folds.
FOLDS_TO_EXPORT = 1
# Seed for shuffling the pairs before they are written, so the written
# order is repeatable for the same input pairs. Set to None to get a
# different shuffle on every run.
SHUFFLE_SEED = 42

# Instantiate the full LILA dataset
num_folds = 5
full_dataset = LILADataset(view_path,
                           metadata_path,
                           cnk_size=512,
                           num_pairs=20_720,
                           num_folds=num_folds)

# Get the models vocabulary for converting ids back to words
# Adapted from:
# https://discuss.huggingface.co/t/find-the-eqivalent-for-word-index-in-bert/13170
# Swap keys and values to be {id: word} for quicker lookups
# Adapted from:
# https://stackoverflow.com/a/13149770
# Built once, outside the fold loop, because it does not depend on the fold.
vocab = {token_id: token
         for token, token_id in full_dataset.tokenizer.vocab.items()}

# A dedicated generator keeps the seeded shuffle from altering the state of
# the module-level `random` functions used elsewhere in the notebook.
shuffler = random.Random(SHUFFLE_SEED)

for k in range(min(FOLDS_TO_EXPORT, num_folds)):
    train_dataset, val_dataset = full_dataset.get_train_val_datasets(k)

    # Run the conversion and saving pipeline
    # Sampling len(pairs) items without replacement yields a shuffled copy
    # and leaves the dataset's own pair list untouched.
    train_shuffled = shuffler.sample(train_dataset._pairs,
                                     len(train_dataset._pairs))
    train_pairs, train_truths = convert_pairs(train_shuffled, vocab)
    save_pairs(train_pairs, train_truths, view=view, fold=k,
               train=True)

    val_shuffled = shuffler.sample(val_dataset._pairs,
                                   len(val_dataset._pairs))
    val_pairs, val_truths = convert_pairs(val_shuffled, vocab)
    save_pairs(val_pairs, val_truths, view=view, fold=k, train=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (24306 > 512). Running this sequence through the model will result in indexing errors


Wrote pairs to ../data/test/bias-investigation/train/DV-MA-k-20000/k_0/pairs.jsonl
Wrote truths to ../data/test/bias-investigation/train/DV-MA-k-20000/k_0/truth.jsonl
Wrote pairs to ../data/test/bias-investigation/validate/DV-MA-k-20000/k_0/pairs.jsonl
Wrote truths to ../data/test/bias-investigation/validate/DV-MA-k-20000/k_0/truth.jsonl


In [4]:
# Locations of the fold-0 JSONL exports for the DV-MA-k-20000 view
_train_fold_dir = '../data/test/bias-investigation/train/DV-MA-k-20000/k_0'
_val_fold_dir = '../data/test/bias-investigation/validate/DV-MA-k-20000/k_0'

train_pairs_file = _train_fold_dir + '/pairs.jsonl'
train_truths_file = _train_fold_dir + '/truth.jsonl'
val_pairs_file = _val_fold_dir + '/pairs.jsonl'
val_truths_file = _val_fold_dir + '/truth.jsonl'

In [5]:
def _read_jsonl(jsonl_path):
    """Load a JSON Lines file (one record per line) into a DataFrame."""
    return pd.read_json(jsonl_path, lines=True)


# Train split: converted pairs and their truth labels
df_train_pairs = _read_jsonl(train_pairs_file)
df_train_truths = _read_jsonl(train_truths_file)
# Validation split: converted pairs and their truth labels
df_val_pairs = _read_jsonl(val_pairs_file)
df_val_truths = _read_jsonl(val_truths_file)

In [26]:
def _format_sample(split, number, anchor, other, label):
    """Render one pair as a markdown section.

    Args:
        split: Heading prefix for the section (``'TRAIN'`` or ``'VAL'``).
        number: 1-based sample number shown in the heading.
        anchor: Text of the first document in the pair.
        other: Text of the second document in the pair.
        label: Truth value; ``1`` is rendered as Same-Author, anything
            else as Different-Author.

    Returns:
        The markdown for the section, ending with a newline.
    """
    verdict = 'Same-Author' if label == 1 else 'Different-Author'
    return (f"### {split} SAMPLE {number}\n"
            f"- **{verdict}**\n"
            f"  - _Anchor_:\n    - >{anchor}\n"
            f"  - _Other_:\n    - >{other}\n")


# Expects df_train_pairs, df_train_truths, df_val_pairs and df_val_truths
# to have been loaded already. Pairs and truths are matched by row
# position, so each pairs frame must be in the same order as its truths.
sample_sections = []
for i in range(10):
    # Draw the train index before the val index on every iteration.
    rand_train_idx = random.randrange(0, len(df_train_pairs))
    rand_val_idx = random.randrange(0, len(df_val_pairs))

    train_pair = df_train_pairs.iloc[rand_train_idx]['pair']
    train_label = df_train_truths.iloc[rand_train_idx]['same']
    val_pair = df_val_pairs.iloc[rand_val_idx]['pair']
    val_label = df_val_truths.iloc[rand_val_idx]['same']

    sample_sections.append(
        f"## SAMPLE {i+1}\n"
        + _format_sample('TRAIN', i + 1, train_pair[0], train_pair[1],
                         train_label)
        + _format_sample('VAL', i + 1, val_pair[0], val_pair[1],
                         val_label)
        + '\n---\n'
    )

# Joining once avoids rebuilding the whole string on every append.
investigation_samples_markdown = ''.join(sample_sections)

# An explicit encoding is required: the sample texts can contain non-ASCII
# tokens, which the platform's default encoding may be unable to write.
with open('./bias_investigation_samples.md', 'w', encoding='utf-8') as f:
    f.write(investigation_samples_markdown)