# Prep files for ColBERT

In [32]:
import json
import pandas as pd
import pickle as pkl

### helpers

In [47]:
def write_tsv(df, path):
    """Write `df` to `path` as tab-separated text with no header row and no index column."""
    tsv_options = {'sep': '\t', 'header': False, 'index': False}
    df.to_csv(path, **tsv_options)
    
def get_indices(split_name, path_splits=None):
    """Return the set of integer question ids listed in one split file.

    Reads `{path_splits}/{split_name}.json`, which must hold a list of records
    that each carry an 'id' string; the integer after the last underscore of
    that string is collected.

    Args:
        split_name: file stem of the split to read (e.g. 'train').
        path_splits: directory that contains the split files. When omitted, the
            module-level `path_splits` is looked up at call time, so this
            helper can be defined before the params cell has been run.

    Returns:
        A set of ints.

    Raises:
        NameError: if `path_splits` is omitted and no global of that name exists.
    """
    if path_splits is None:
        # Resolve lazily: a def-time default would need the global to exist
        # already when this cell is executed.
        try:
            path_splits = globals()['path_splits']
        except KeyError:
            raise NameError(
                "path_splits is not defined; run the params cell or pass path_splits explicitly"
            ) from None

    # Context manager so the file handle is closed deterministically.
    with open(f'{path_splits}/{split_name}.json') as fh:
        records = json.load(fh)
    return {int(rec['id'].split('_')[-1]) for rec in records}

def paragraph_id(r) -> str:
    """Build the '<file_idx>_<article_idx>_<paragraph_idx>' key for a row-like mapping `r`."""
    key_fields = ('file_idx', 'article_idx', 'paragraph_idx')
    return '_'.join(f"{r[field]}" for field in key_fields)

def fix_ranks(rows: pd.DataFrame) -> pd.DataFrame:
    """Drop repeated passages from one candidate list and renumber the ranks.

    The first occurrence of each `pid` is kept, in the original row order, and
    the surviving rows are ranked 1, 2, 3, ... Any incoming 'rank' column is
    ignored and replaced.

    Args:
        rows: frame with at least 'qid' and 'pid' columns.

    Returns:
        A new frame with columns ['qid', 'pid', 'rank'] and a fresh 0..k-1
        index. Empty input yields an empty frame that still has those columns.
    """
    deduped = (
        rows
        .drop_duplicates(subset='pid', keep='first')
        .loc[:, ['qid', 'pid']]
        .reset_index(drop=True)
    )
    # Ranks are 1-based and contiguous after duplicates are removed.
    return deduped.assign(rank=range(1, len(deduped) + 1))

def dedup_ranking(df):
    """Remove duplicate (qid, pid) pairs from `df` and renumber ranks per question.

    For every `qid` the first occurrence of each `pid` is kept in its original
    order and ranks are reassigned 1, 2, 3, ... Questions appear in ascending
    `qid` order in the result.

    This operates on the `df` argument (not on a notebook global). It uses
    vectorised pandas operations instead of `groupby(...).apply(...)`, so the
    result does not depend on whether `apply` hands the grouping column to the
    callback, which differs between pandas versions.

    Args:
        df: frame with at least 'qid' and 'pid' columns.

    Returns:
        A new frame with columns ['qid', 'pid', 'rank'] and a fresh index.
    """
    deduped = (
        df[['qid', 'pid']]
        .drop_duplicates()
        # stable sort keeps the original within-question candidate order
        .sort_values('qid', kind='stable')
        .reset_index(drop=True)
    )
    # cumcount is 0-based within each qid; ranks are 1-based
    return deduped.assign(rank=deduped.groupby('qid').cumcount() + 1)

### params

In [66]:
# --- inputs ---
# CSV of paragraphs, loaded with pd.read_csv
path_paragraphs = 'wiki-7500-paragraphs.csv'
# directory holding the <split>.json files read by get_indices
path_splits = 'splits_7500'
# pickled results object (its matches_answers / matches_question are used below)
path_wiki_res = 'wiki_res_7500.pkl'
# --- outputs (all written through write_tsv) ---
path_out_paragraphs = 'paragraphs.tsv'
path_out_rankings_dev = 'rankings_dev.tsv'
path_out_rankings_train = 'rankings_train.tsv'
# JSON file whose 'data' list supplies the questions
path_nq_train = 'train_preprocessed.json'
path_questions = 'questions.tsv'
path_questions_dev = 'questions_dev.tsv'

# number of leading records taken from the 'data' list in path_nq_train
n = 7_500

### get results

In [36]:
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
# The context manager closes the file handle once the object is loaded.
with open(path_wiki_res, 'rb') as results_file:
    results = pkl.load(results_file)

### paragraphs

In [3]:
# Load the paragraph table; its default integer index is what later cells use as the passage id.
df_paragraph = pd.read_csv(path_paragraphs)

In [4]:
# Number of paragraph rows loaded.
len(df_paragraph)

1099066

In [7]:
# Peek at the first rows to check the column layout.
df_paragraph.head(3)

Unnamed: 0,file,file_idx,article_idx,paragraph_idx,title,paragraph_cnt,paragraph,paragraph_char_cnt
0,/data/dph-data/wikidump/20181220_concat/0000,0,12,0,Ochlodes yuma,1,"Ochlodes yuma, the Yuma skipper, is a species...",260
1,/data/dph-data/wikidump/20181220_concat/0000,0,32,2,Ocho Rios,3,"The town has restaurants, and in Margaritavil...",955
2,/data/dph-data/wikidump/20181220_concat/0000,0,34,1,Ocho apellidos vascos,4,"The weekend of its premiere, the film gathere...",604


In [13]:
# Add a sequential passage id and a TSV-safe copy of the paragraph text.
paragraphs = df_paragraph.assign(
    pid=range(len(df_paragraph)),
    # newlines and tabs are deleted (not replaced) so each passage stays on one TSV line
    text=df_paragraph['paragraph'].str.replace(r'[\n\t]', '', regex=True),
)

In [None]:
# write_tsv takes (df, path); the original call omitted the frame.
# NOTE(review): writing only `pid` and the cleaned `text` assumes the ColBERT
# collection layout `pid<TAB>passage` -- confirm no further column is expected.
write_tsv(paragraphs[['pid', 'text']], path_out_paragraphs)

In [17]:
# Line count of the written file, for comparison with len(df_paragraph).
!wc -l $path_out_paragraphs

1099066 paragraphs.tsv


In [18]:
# Size on disk of the written file.
!ls -lh $path_out_paragraphs

-rw-r--r-- 1 root root 842M Jun  2 23:25 paragraphs.tsv


In [30]:
%%time
# Map '<file_idx>_<article_idx>_<paragraph_idx>' -> row index label of df_paragraph.
# itertuples over just the three key columns avoids building a full Series
# per row (as iterrows does); the resulting mapping is the same.
# NOTE(review): if two rows share the same key the later row wins -- confirm keys are unique.
paragraphs_to_idx = {
    paragraph_id(row._asdict()): row.Index
    for row in df_paragraph[['file_idx', 'article_idx', 'paragraph_idx']].itertuples()
}

### get indices

In [23]:
# Question-id sets for each split, read from the split JSON files.
train_idxs, dev_idxs, test_idxs = (
    get_indices(split) for split in ('train', 'dev', 'test')
)

### get rankings

In [37]:
%%time
# One row per (question, candidate passage). For each question the answer
# matches come first, followed by the question matches; rank is the 1-based
# position in that concatenated list.
rankings = pd.DataFrame(
    dict(
        qid=qid,
        pid=paragraphs_to_idx[
            f"{match.article.file_idx}_{match.article.article_idx}_{match.paragraph_idx}"
        ],
        rank=position,
    )
    for qid in range(len(results.matches_answers))
    for position, match in enumerate(
        [*results.matches_answers[qid], *results.matches_question[qid]],
        start=1,
    )
)

CPU times: user 5.57 s, sys: 40 ms, total: 5.61 s
Wall time: 5.61 s


In [38]:
# Total number of (question, passage) candidate rows.
len(rankings)

1537500

In [43]:
# Random spot check of the rankings table (unseeded, so rows differ between runs).
rankings.sample(3)

Unnamed: 0,qid,pid,rank
1242124,6059,239386,30
1091145,5322,91320,136
1180975,5760,877775,176


In [44]:
# Select the ranking rows whose question id belongs to each split.
rankings_train = rankings.loc[rankings['qid'].isin(train_idxs)]
rankings_dev = rankings.loc[rankings['qid'].isin(dev_idxs)]
# Rows in neither train nor dev must number exactly as many as the dev rows.
# NOTE(review): presumably the leftover rows are the test split, expected to match dev in size -- confirm.
assert len(rankings) - len(rankings_dev) - len(rankings_train) == len(rankings_dev)

In [48]:
# Drop repeated passages per question in the dev rankings and renumber ranks from 1.
rankings_dev_dedup = dedup_ranking(rankings_dev)

In [53]:
# Check the deduplicated dev rankings: ranks should restart at 1 for each qid.
rankings_dev_dedup.head(3)

Unnamed: 0,qid,pid,rank
0,14,1020453,1
1,14,547325,2
2,14,1020392,3


In [51]:
# Write the rankings as header-less TSV (columns: qid, pid, rank).
# NOTE(review): train rankings are written as built (no per-question dedup),
# while dev uses the deduplicated frame -- confirm this asymmetry is intended.
write_tsv(rankings_train, path_out_rankings_train)
write_tsv(rankings_dev_dedup, path_out_rankings_dev)

### questions

In [65]:
# Load the first `n` records of the 'data' list; the context manager closes the file.
with open(path_nq_train) as nq_file:
    nq_records = json.load(nq_file)['data'][:n]

# Reduce each id string to the integer after its last underscore and keep
# only the two columns that are written out.
df_questions = pd.DataFrame(nq_records)
df_questions = df_questions.assign(
    id=df_questions['id'].str.split('_').str[-1].astype(int)
)[['id', 'question']]

# Select dev rows by testing the `id` column. Calling .isin on the whole frame
# yields a cell-wise mask, which blanks values to NaN instead of filtering rows.
df_questions_dev = df_questions[df_questions['id'].isin(dev_idxs)]

In [63]:
# Random spot check of the questions table (unseeded).
# NOTE(review): the saved output shows an `answers` column, which the cell above
# drops by selecting ['id', 'question'] -- the output looks stale; re-run to refresh.
df_questions.sample(3)

Unnamed: 0,id,question,answers
2014,2014,who plays teenage jamie in one tree hill,[Jackson Brundage]
340,340,who played leatherface in texas chainsaw massa...,[Andrew Bryniarski]
6922,6922,what does mono no aware mean in japanese,[the pathos of things]


In [67]:
# All questions go to the main file.
write_tsv(df_questions, path_questions)
# The dev file must contain only dev questions (the original wrote the full
# table here). Rows are selected by `id` at the point of writing so the output
# is correct regardless of how df_questions_dev was built.
write_tsv(df_questions[df_questions['id'].isin(dev_idxs)], path_questions_dev)