# Prepare subset data and sentence-transformer features

In [148]:
import os
import json
import h5py
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter, defaultdict

In [2]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 1.96MB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 264kB/s]
Downloading: 100%|██████████| 10.6k/10.6k [00:00<00:00, 15.5MB/s]
Downloading: 100%|██████████| 612/612 [00:00<00:00, 1.18MB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 185kB/s]
Downloading: 100%|██████████| 39.3k/39.3k [00:00<00:00, 6.54MB/s]
Downloading: 100%|██████████| 90.9M/90.9M [00:01<00:00, 77.3MB/s]
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 73.5kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 224kB/s]
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 14.1MB/s]
Downloading: 100%|██████████| 350/350 [00:00<00:00, 547kB/s]
Downloading: 100%|██████████| 13.2k/13.2k [00:00<00:00, 15.0MB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 9.29MB/s]
Downloading: 100%|██████████| 349/349 [00:00<00:00, 506kB/s]


In [17]:
webqa_test_path = '../../../data/WebQA_test.json'
with open(webqa_test_path, 'r') as f:
    test_data = json.load(f)

In [6]:
webqa_train_path = '../../../data/WebQA_train_val.json'
with open(webqa_train_path, 'r') as f:
    train_data = json.load(f)

In [61]:
qcate2count = dict(Counter([train_data[k]['Qcate'] for k in train_data]))

In [62]:
qcate2int = {cate:i for i,cate in enumerate(qcate2count)}

In [33]:
train_queries = '../../webqa/pos_neg_image_fact_analysis/train.tsv'
val_queries = '../../webqa/pos_neg_image_fact_analysis/val.tsv'
test_queries = '../../webqa/pos_neg_image_fact_analysis/test.tsv'

### Prepare Subset Data

In [94]:
phases = ['train', 'test', 'val']
for phase in phases:
    curr_data_path = '../../webqa/pos_neg_image_fact_analysis/%s.tsv'%phase
    phase_df = pd.read_csv(curr_data_path, header=None, sep='\t')
    phase_df = phase_df.rename(columns={0: 'Guid', 1: 'utt_id', 2: 'isPos'})
    phase_df['qcate'] = 0
    for i in tqdm(range(len(phase_df))):
        tmp_guid, tmp_uttid, tmp_ispos, _ = phase_df.iloc[i]
        # print(qcate2int[train_data[tmp_guid]['Qcate']])
        phase_df['qcate'][i] = qcate2int[train_data[tmp_guid]['Qcate']]
    phase_df.to_csv('%s.csv'%phase, index=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phase_df['qcate'][i] = qcate2int[train_data[tmp_guid]['Qcate']]
100%|██████████| 6331/6331 [00:01<00:00, 3828.47it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phase_df['qcate'][i] = qcate2int[train_data[tmp_guid]['Qcate']]
100%|██████████| 2948/2948 [00:00<00:00, 3811.96it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phase_df['qcate'][i] = qcate2int[train_data[tmp_guid]['Qcate']]
100%|██████████| 1343/1343 [00:00<00:00, 3830.99it/s]


### Prepare SBert Features

In [151]:
phases = ['train', 'test', 'val']
for phase in phases:
    curr_data_path = 'subData/%s.csv'%phase
    phase_df = pd.read_csv(curr_data_path)
    # phase_df = phase_df.rename(columns={0: 'Guid', 1: 'utt_id', 2: 'isPos'})
    phase_id = []
    phase_senteces = []
    for i in tqdm(range(len(phase_df))):
        tmp_guid, tmp_uttid, tmp_ispos, tmp_qcate = phase_df.iloc[i]
        if tmp_ispos == 1:
            tmp_pool = train_data[tmp_guid]['img_posFacts']
        else:
            tmp_pool = train_data[tmp_guid]['img_negFacts']
        for item in tmp_pool:
            if item['image_id'] == tmp_uttid:
                phase_id.append(f'{tmp_guid}-{tmp_uttid}-{tmp_ispos}-{tmp_qcate}')
                phase_senteces.append(item['caption'])
                break

    phase_h5_out_path = '%s.h5'%phase
    phase_feats = model.encode(phase_senteces)
    assert len(phase_feats) == len(phase_id)
    phase_h5 = h5py.File(phase_h5_out_path,'w')
    phase_h5.create_dataset('text_uttid', data=phase_id)
    phase_h5.create_dataset('text_sbert_feat', data=phase_feats)
    phase_h5.close()

100%|██████████| 6331/6331 [00:00<00:00, 15716.31it/s]
100%|██████████| 2948/2948 [00:00<00:00, 15402.66it/s]
100%|██████████| 1343/1343 [00:00<00:00, 14586.41it/s]
