In [22]:
import json
import os

# Output format of the generated files:
#   'csv' -> one `"<label>","<question>"` row per line, no header
#   'tsv' -> a `sentence<TAB>label` header followed by `<question> <TAB><label>` rows
file_type = 'tsv'
assert file_type in {'csv', 'tsv'}

# expanduser('~') rather than os.getenv('HOME'): getenv returns None when HOME is
# unset, which would silently turn this into the relative path 'None/research/data'.
DATA_DIR = '{}/research/data'.format(os.path.expanduser('~'))
save_dir = f'{DATA_DIR}/hotpot-squad-q-classify'
if file_type != 'csv':
    save_dir += '-' + file_type
os.makedirs(save_dir, exist_ok=True)  # same directory for every split, so create it once

# Characters that would split one question over several columns or lines.
_FIELD_BREAKERS = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})


def _format_qline(question, label):
    """Render one question/label pair as a single output line for `file_type`.

    Args:
        question: raw question text as found in the dataset JSON.
        label: class id written next to the question (0 = HotpotQA, 1 = SQuAD).

    Returns:
        The line without its terminating newline.
    """
    # Surrounding whitespace is dropped; embedded tabs/line breaks become spaces
    # so that every question occupies exactly one field on exactly one line.
    q = question.strip().translate(_FIELD_BREAKERS)
    if file_type == 'csv':
        # RFC 4180: a double quote inside a quoted field is escaped by doubling it.
        q = q.replace('"', '""')
        return f'"{label}","{q}"'
    return f'{q} \t{label}'


for split in ['train', 'dev']:
    labeled_qlines = []
    if file_type == 'tsv':
        labeled_qlines.append('sentence\tlabel')

    # HotpotQA: a JSON list of examples, each carrying a 'question' -> label 0.
    with open(f'{DATA_DIR}/hotpot-orig/{split}.json', encoding='utf-8') as f:
        hotpot = json.load(f)
    labeled_qlines.extend(_format_qline(example['question'], 0) for example in hotpot)

    # SQuAD: questions are nested under data -> paragraphs -> qas -> label 1.
    with open(f'{DATA_DIR}/squad/{split}.json', encoding='utf-8') as f:
        squad = json.load(f)
    labeled_qlines.extend(
        _format_qline(qa['question'], 1)
        for article in squad['data']
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    )

    save_file = f'{save_dir}/{split}.{file_type}'
    print(f'Saving to {save_file}...')
    with open(save_file, 'w', encoding='utf-8') as f:
        # Terminate every line (the last one too, in both formats) so that
        # line-oriented tools downstream never see an unterminated record.
        f.writelines(line + '\n' for line in labeled_qlines)

Saving to /Users/ethanperez/research/data/hotpot-squad-q-classify-tsv/train.tsv...
Saving to /Users/ethanperez/research/data/hotpot-squad-q-classify-tsv/dev.tsv...


In [None]:
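# Run in a terminal (not in this notebook) to convert the CSV files into fastText input.
# Requires the cell above to have been run with file_type = 'csv', so that
# ~/research/data/hotpot-squad-q-classify/{train,dev}.csv exist.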
myshuf() {
  perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
}

# Filter: reads lines on stdin, writes normalized, shuffled lines on stdout.
# Stages, in order:
#   1. lowercase everything;
#   2. prefix every line with "__label__";
#   3. pad apostrophes, periods, commas, parentheses, "!" and "?" with spaces,
#      delete double quotes, and turn "<br />", ";" and ":" into a space;
#   4. squeeze runs of spaces into one;
#   5. shuffle the lines with myshuf.
normalize_text() {
  tr '[:upper:]' '[:lower:]' \
    | sed -e 's/^/__label__/g' \
          -e "s/'/ ' /g" \
          -e 's/"//g' \
          -e 's/\./ \. /g' \
          -e 's/<br \/>/ /g' \
          -e 's/,/ , /g' \
          -e 's/(/ ( /g' \
          -e 's/)/ ) /g' \
          -e 's/\!/ \! /g' \
          -e 's/\?/ \? /g' \
          -e 's/\;/ /g' \
          -e 's/\:/ /g' \
    | tr -s " " \
    | myshuf
}

# Turn each split's CSV into a fastText input file next to it (<split>.txt).
# The CSV is attached with input redirection, which is evaluated before the
# output redirection: if the CSV is missing the command is not run at all and
# an existing <split>.txt is not truncated.
qclassify_dir=~/research/data/hotpot-squad-q-classify
for split in dev train; do
  normalize_text < "${qclassify_dir}/${split}.csv" > "${qclassify_dir}/${split}.txt"
done