In [4]:
# Mount Google Drive at /content/drive; the next cell changes into a project folder below it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Make the CogQA project folder the working directory, so relative paths
# used later (requirements.txt, ./models/, train.py) resolve inside it.
%cd /content/drive/MyDrive/CogQA/CogQA

/content/drive/MyDrive/CogQA/CogQA


In [6]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Training functions (in-notebook)

In [None]:
import re
import json
from tqdm import tqdm, trange
import pdb
import random
from collections import namedtuple
import numpy as np
import copy
import torch
import traceback
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.optimization import BertAdam
from model import BertForMultiHopQuestionAnswering, CognitiveGNN
from utils import warmup_linear, find_start_end_after_tokenized, find_start_end_before_tokenized, bundle_part_to_batch, judge_question_type, fuzzy_retrieve, WindowMean, fuzz
from data import convert_question_to_samples_bundle, homebrew_data_loader

In [None]:
class Checkpoint(object):
    """Save and restore a model's ``state_dict`` under a caller-supplied prefix.

    Both helpers build the file location by plain string concatenation
    (``path + name``), so ``path`` must already end with a path separator
    when it denotes a directory (for example ``'./models/'``).
    """

    @staticmethod
    def save(model, path):
        """Write ``model.state_dict()`` to ``path + '<model.name>.pt'``.

        Args:
            model: Object exposing ``state_dict()`` and a ``name`` attribute.
            path (str): Prefix prepended verbatim to the generated file name.
        """
        name = f'{model.name}.pt'
        torch.save(model.state_dict(), path + name)

    @staticmethod
    def load(model, path, name, map_location=None):
        """Load the weights stored at ``path + name`` into ``model``.

        Args:
            model: Object whose ``load_state_dict`` receives the stored weights.
            path (str): Prefix prepended verbatim to ``name``.
            name (str): File name of the checkpoint.
            map_location: Forwarded to ``torch.load``. Pass ``'cpu'`` (or a
                ``torch.device``) to restore a checkpoint that was written on
                a GPU machine when no GPU is available. The default ``None``
                keeps ``torch.load``'s own behaviour.

        Returns:
            The same ``model`` instance, after its weights were replaced.
        """
        state_dict = torch.load(path + name, map_location=map_location)
        model.load_state_dict(state_dict)
        return model

In [None]:
def train(bundles, model1, device, mode='tensors', model2=None, batch_size=4, num_epoch=1,
          gradient_accumulation_steps=1, lr1=1e-4, lr2=1e-4, alpha=0.2):
    '''Train Sys1 and Sys2 models.

    Train models by task #1 (``mode='tensors'``) or task #2 (``mode='bundle'``).

    * ``'tensors'``: every batch is moved to ``device`` and fed to ``model1``;
      the loss is ``ans_loss + hop_loss`` and only ``model1`` is updated.
    * ``'bundle'``: every batch is fed to ``model2`` together with ``model1``;
      the loss is ``ans_loss + hop_loss + alpha * final_loss``. ``model2`` is
      updated from the start, while ``model1`` is only updated once the
      running ``final_loss`` drops below a hand-tuned threshold.

    An exception raised while processing a batch is printed (with the bundle
    id in ``'bundle'`` mode) and the loop continues with the next batch.

    Args:
        bundles (list): List of bundles.
        model1 (BertForMultiHopQuestionAnswering): System 1 model.
        device (torch.device): The device which models and data are on.
        mode (str): Defaults to 'tensors'. Task identifier ('tensors' or 'bundle').
        model2 (CognitiveGNN): System 2 model. Required when ``mode == 'bundle'``;
            it is not touched in 'tensors' mode and is returned unchanged.
        batch_size (int): Defaults to 4.
        num_epoch (int): Defaults to 1.
        gradient_accumulation_steps (int): Defaults to 1. Number of batches whose
            gradients are accumulated before one optimizer update.
        lr1 (float): Defaults to 1e-4. Learning rate for Sys1.
        lr2 (float): Defaults to 1e-4. Learning rate for Sys2.
        alpha (float): Defaults to 0.2. Balance factor for loss of two systems.

    Returns:
        tuple: ``(model1, model2)`` after training.
    '''

    # Prepare optimizer for Sys1.
    # Pooler parameters are dropped because the pooler is not used.
    named_params = [(name, param) for name, param in model1.named_parameters() if 'pooler' not in name]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in named_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    num_batch, dataloader = homebrew_data_loader(bundles, mode=mode, batch_size=batch_size)
    # One optimizer update happens per `gradient_accumulation_steps` batches, so
    # the schedule length must be counted in updates, not in batches; otherwise
    # the warm-up/decay schedule never reaches its end when accumulating.
    num_steps = max(1, (num_batch * num_epoch) // gradient_accumulation_steps)
    global_step = 0
    opt1 = BertAdam(optimizer_grouped_parameters, lr=lr1, warmup=0.1, t_total=num_steps)
    model1.to(device)
    model1.train()

    # Prepare optimizer for Sys2
    if mode == 'bundle':
        opt2 = Adam(model2.parameters(), lr=lr2)
        model2.to(device)
        model2.train()
        warmed = False  # warmup for jointly training

    for epoch in trange(num_epoch, desc='Epoch'):
        # Running means that are only used for the progress-bar description.
        ans_mean, hop_mean = WindowMean(), WindowMean()
        opt1.zero_grad()
        if mode == 'bundle':
            final_mean = WindowMean()
            opt2.zero_grad()
        tqdm_obj = tqdm(dataloader, total=num_batch)

        for step, batch in enumerate(tqdm_obj):
            try:
                if mode == 'tensors':
                    batch = tuple(t.to(device) for t in batch)
                    # The third output is not needed for the task #1 loss.
                    hop_loss, ans_loss, _ = model1(*batch)
                    hop_loss, ans_loss = hop_loss.mean(), ans_loss.mean()
                    loss = ans_loss + hop_loss
                elif mode == 'bundle':
                    hop_loss, ans_loss, final_loss = model2(batch, model1, device)
                    hop_loss, ans_loss = hop_loss.mean(), ans_loss.mean()
                    loss = ans_loss + hop_loss + alpha * final_loss
                loss.backward()

                if (step + 1) % gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses. From BERT pytorch examples
                    # NOTE(review): BertAdam is also constructed with warmup/t_total, so it may apply
                    # its own schedule on top of this manual one - confirm against the installed
                    # pytorch_pretrained_bert version.
                    lr_this_step = lr1 * warmup_linear(global_step / num_steps, warmup=0.1)
                    for param_group in opt1.param_groups:
                        param_group['lr'] = lr_this_step
                    global_step += 1
                    if mode == 'bundle':
                        opt2.step()
                        opt2.zero_grad()
                        final_mean_loss = final_mean.update(final_loss.item())
                        tqdm_obj.set_description('ans_loss: {:.2f}, hop_loss: {:.2f}, final_loss: {:.2f}'.format(
                            ans_mean.update(ans_loss.item()), hop_mean.update(hop_loss.item()), final_mean_loss))
                        # During warming period, model1 is frozen and model2 is trained to normal weights
                        if final_mean_loss < 0.9 and step > 100:  # ugly manual hyperparam
                            warmed = True
                        if warmed:
                            opt1.step()
                        # Gradients are cleared even while model1 is frozen so they do not pile up.
                        opt1.zero_grad()
                    else:
                        opt1.step()
                        opt1.zero_grad()
                        tqdm_obj.set_description('ans_loss: {:.2f}, hop_loss: {:.2f}'.format(
                            ans_mean.update(ans_loss.item()), hop_mean.update(hop_loss.item())))

            except Exception as err:
                # Best effort: report the failing batch and keep training.
                traceback.print_exc()
                if mode == 'bundle':
                    print(batch._id)
    return (model1, model2)


In [None]:
# Read the JSON training file into memory. The encoding is given explicitly so
# the result does not depend on the runtime's locale default.
with open('/content/drive/MyDrive/CogQA/CogQA/examples/hotpot_train_v1.1_refined.json', 'r', encoding='utf-8') as fin:
    dataset = json.load(fin)

In [None]:
# Display one record to inspect its structure.
dataset[2]

In [None]:
# Training subset: an independent deep copy of the first 30,000 records, so
# later changes to `train_data` cannot alter the loaded `dataset`.
train_data = copy.deepcopy(dataset[:30000])

In [None]:
def main(output_model_file = './models/bert-base-uncased.bin', load = False, mode = 'tensors', batch_size = 4,
            num_epoch = 1, gradient_accumulation_steps = 1, lr1 = 1e-4, lr2 = 1e-4, alpha = 0.2, train_data=train_data):
    '''Build the sample bundles, create or restore both models, train, and save.

    Args:
        output_model_file (str): Checkpoint path. Read when ``load`` is true.
            The trained weights are written next to it, with ``'.bin'``
            replaced by ``'_2.bin'`` in the path.
        load (bool): If true, initialise both models from ``output_model_file``
            (keys ``'params1'`` and ``'params2'``); otherwise start from the
            pretrained ``'bert-base-uncased'`` weights and a new CognitiveGNN.
        mode (str): Task identifier forwarded to ``train`` ('tensors' or 'bundle').
        batch_size (int): Forwarded to ``train``.
        num_epoch (int): Forwarded to ``train``.
        gradient_accumulation_steps (int): Forwarded to ``train``.
        lr1 (float): Learning rate for Sys1, forwarded to ``train``.
        lr2 (float): Learning rate for Sys2, forwarded to ``train``.
        alpha (float): Loss balance factor, forwarded to ``train``.
        train_data (list): Records to convert into bundles. The default is the
            module-level ``train_data`` as it was when this cell was executed.
    '''
    import os  # local import: the notebook's import cell does not provide `os`

    BERT_MODEL = 'bert-base-uncased' # bert-large is too large for ordinary GPU on task #2
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)

    # Convert every record; records rejected with ValueError are skipped, but
    # counted so that the loss of data is visible instead of silent.
    bundles = []
    num_skipped = 0
    for data in tqdm(train_data):
        try:
            bundles.append(convert_question_to_samples_bundle(tokenizer, data))
        except ValueError:
            num_skipped += 1
    if num_skipped:
        print('Skipped {} samples that could not be converted'.format(num_skipped))

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    if load:
        print('Loading model from {}'.format(output_model_file))
        # Load onto the CPU first: this also works on a runtime without a GPU
        # for checkpoints written on one; train() moves the models to `device`.
        model_state_dict = torch.load(output_model_file, map_location='cpu')
        model1 = BertForMultiHopQuestionAnswering.from_pretrained(BERT_MODEL, state_dict=model_state_dict['params1'])
        model2 = CognitiveGNN(model1.config.hidden_size)
        model2.load_state_dict(model_state_dict['params2'])
    else:
        model1 = BertForMultiHopQuestionAnswering.from_pretrained(BERT_MODEL,
                cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1))
        model2 = CognitiveGNN(model1.config.hidden_size)

    # Resolve the output location before training, so that a missing output
    # directory is created up front rather than failing after the run.
    saving_output_model_file = output_model_file.replace('.bin', '_2.bin')
    saving_dir = os.path.dirname(saving_output_model_file)
    if saving_dir:
        os.makedirs(saving_dir, exist_ok=True)

    print('Start Training... on {} GPUs'.format(torch.cuda.device_count()))
    model1 = torch.nn.DataParallel(model1, device_ids = range(torch.cuda.device_count()))
    model1, model2 = train(bundles, model1=model1, device=device, mode=mode, model2=model2, # Then pass hyperparams
        batch_size=batch_size, num_epoch=num_epoch, gradient_accumulation_steps=gradient_accumulation_steps,lr1=lr1, lr2=lr2, alpha=alpha)

    print('Saving model to {}'.format(saving_output_model_file))
    # model1 is wrapped in DataParallel, so the bare weights live in `.module`.
    saved_dict = {'params1' : model1.module.state_dict()}
    saved_dict['params2'] = model2.state_dict()
    torch.save(saved_dict, saving_output_model_file)

# import fire
# if __name__ == "__main__":
#     fire.Fire(main)

In [None]:
# Task #1 run: mode='tensors', starting from the pretrained BERT weights (load=False).
main(output_model_file = './models/bert-base-uncased.bin', load = False, mode = 'tensors', batch_size = 4, 
            num_epoch = 1, gradient_accumulation_steps = 1, lr1 = 1e-4, lr2 = 1e-4, alpha = 0.2, train_data=train_data)

In [None]:
# Task #2 run: mode='bundle', with both models restored from output_model_file (load=True).
# NOTE(review): main() saves under the '.bin' -> '_2.bin' name, so this call does not
# load the file written by the previous cell - confirm which checkpoint is intended.
main(output_model_file = './models/bert-base-uncased.bin', load = True, mode = 'bundle', batch_size = 8, 
            num_epoch = 1, gradient_accumulation_steps = 1, lr1 = 1e-4, lr2 = 1e-4, alpha = 0.2, train_data=train_data)

100%|██████████| 500/500 [00:08<00:00, 56.03it/s]


Loading model from ./models/bert-base-uncased.bin


100%|██████████| 407873900/407873900 [00:33<00:00, 12124129.77B/s]
Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/499 [00:00<?, ?it/s][A

Start Training... on 1 GPUs



ans_loss: 1.16, hop_loss: 0.33, final_loss: 1.62:   0%|          | 0/499 [00:00<?, ?it/s][A
ans_loss: 1.16, hop_loss: 0.33, final_loss: 1.62:   0%|          | 1/499 [00:02<20:02,  2.42s/it][A
ans_loss: 0.70, hop_loss: 0.28, final_loss: 2.18:   0%|          | 1/499 [00:02<20:02,  2.42s/it][A
ans_loss: 0.70, hop_loss: 0.28, final_loss: 2.18:   0%|          | 2/499 [00:02<15:15,  1.84s/it][A
ans_loss: 0.69, hop_loss: 0.20, final_loss: 1.48:   0%|          | 2/499 [00:03<15:15,  1.84s/it][A
ans_loss: 0.69, hop_loss: 0.20, final_loss: 1.48:   1%|          | 3/499 [00:03<11:40,  1.41s/it][A
ans_loss: 0.56, hop_loss: 0.25, final_loss: 1.30:   1%|          | 3/499 [00:04<11:40,  1.41s/it][A
ans_loss: 0.56, hop_loss: 0.25, final_loss: 1.30:   1%|          | 4/499 [00:04<10:27,  1.27s/it][A
ans_loss: 0.56, hop_loss: 0.21, final_loss: 1.73:   1%|          | 4/499 [00:04<10:27,  1.27s/it][A
ans_loss: 0.56, hop_loss: 0.21, final_loss: 1.73:   1%|          | 5/499 [00:04<09:07,  1.11s/it]

Saving model to ./models/bert-base-uncased.bin


# Build the training data file

In [7]:
import re
import json
from tqdm import tqdm, trange
import pdb
import random
from collections import namedtuple
import numpy as np
import copy
import torch
import traceback
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.optimization import BertAdam
from model import BertForMultiHopQuestionAnswering, CognitiveGNN
from utils import warmup_linear, find_start_end_after_tokenized, find_start_end_before_tokenized, bundle_part_to_batch, judge_question_type, fuzzy_retrieve, WindowMean, fuzz
from data import convert_question_to_samples_bundle, homebrew_data_loader

In [8]:
# Read the JSON training file into memory. The encoding is given explicitly so
# the result does not depend on the runtime's locale default.
with open('/content/drive/MyDrive/CogQA/CogQA/examples/hotpot_train_v1.1_refined.json', 'r', encoding='utf-8') as fin:
    dataset = json.load(fin)

In [9]:
# Check which container type the JSON file was parsed into.
type(dataset)

list

In [10]:
# Inspect the 'Q_edge' field of the first record.
dataset[0]['Q_edge']

[["Arthur's Magazine", "Arthur's Magazine", 33, 50],
 ['First for Women', 'First for Women', 54, 69]]

In [14]:
import random

# Shuffle with a dedicated, seeded generator so that the sampled subset is the
# same on every run. `dataset` is still shuffled in place, as before.
SAMPLE_SEED = 42
random.Random(SAMPLE_SEED).shuffle(dataset)
# Deep copy so the subset is independent of `dataset`.
test = copy.deepcopy(dataset[:10000])

In [15]:
# Confirm the size of the sampled subset.
len(test)

10000

In [16]:
# Inspect the 'Q_edge' field of the first record of the shuffled subset.
test[0]['Q_edge']

[['Cherokee National Forest', 'National Forest', 24, 39],
 ['WAPK-CD', 'WAPK-CD', 0, 7]]

In [17]:
# Write the sampled subset to a new JSON file next to the original data.
# NOTE(review): the file name says "50k", but `test` is built from dataset[:10000]
# - confirm the intended name/size.
with open('/content/drive/MyDrive/CogQA/CogQA/examples/hotpot_train_v1.1_refined_50k.json', 'w') as fout:
    json.dump(test, fout)

# Train task #1 (train.py)

In [18]:
# Launch the training script with its default arguments.
!python train.py

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ans_loss: 0.21, hop_loss: 0.10:  80% 9857/12356 [1:26:39<23:14,  1.79it/s][A
ans_loss: 0.21, hop_loss: 0.10:  80% 9857/12356 [1:26:39<23:14,  1.79it/s][A
ans_loss: 0.21, hop_loss: 0.10:  80% 9858/12356 [1:26:39<25:56,  1.60it/s][A
ans_loss: 0.21, hop_loss: 0.10:  80% 9858/12356 [1:26:40<25:56,  1.60it/s][A
ans_loss: 0.21, hop_loss: 0.10:  80% 9859/12356 [1:26:40<25:25,  1.64it/s][A
ans_loss: 0.21, hop_loss: 0.10:  80% 9859/12356 [1:26:40<25:25,  1.64it/s][A
ans_loss: 0.21, hop_loss: 0.10:  80% 9860/12356 [1:26:40<21:56,  1.90it/s][A
ans_loss: 0.21, hop_loss: 0.10:  80% 9860/12356 [1:26:41<21:56,  1.90it/s][A
ans_loss: 0.21, hop_loss: 0.10:  80% 9861/12356 [1:26:41<20:52,  1.99it/s][A
ans_loss: 0.21, hop_loss: 0.10:  80% 9861/12356 [1:26:41<20:52,  1.99it/s][A
ans_loss: 0.21, hop_loss: 0.10:  80% 9862/12356 [1:26:41<20:02,  2.07it/s][A
ans_loss: 0.21, hop_loss: 0.10:  80% 9862/12356 [1:26:41<20:02,  2.07it/s][A

In [None]:
# Launch the training script again in 'bundle' mode.
# NOTE(review): --load=True presumably restores the previously saved model - confirm in train.py.
!python train.py --load=True --mode='bundle'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ans_loss: 0.15, hop_loss: 0.10, final_loss: 0.63:  64% 6395/9932 [1:11:28<37:59,  1.55it/s][A
ans_loss: 0.15, hop_loss: 0.10, final_loss: 0.63:  64% 6395/9932 [1:11:29<37:59,  1.55it/s][A
ans_loss: 0.15, hop_loss: 0.10, final_loss: 0.63:  64% 6396/9932 [1:11:29<37:26,  1.57it/s][A
ans_loss: 0.16, hop_loss: 0.10, final_loss: 0.68:  64% 6396/9932 [1:11:30<37:26,  1.57it/s][A
ans_loss: 0.16, hop_loss: 0.10, final_loss: 0.68:  64% 6397/9932 [1:11:30<38:27,  1.53it/s][A
ans_loss: 0.17, hop_loss: 0.11, final_loss: 0.68:  64% 6397/9932 [1:11:30<38:27,  1.53it/s][A
ans_loss: 0.17, hop_loss: 0.11, final_loss: 0.68:  64% 6398/9932 [1:11:30<37:13,  1.58it/s][A
ans_loss: 0.16, hop_loss: 0.11, final_loss: 0.68:  64% 6398/9932 [1:11:31<37:13,  1.58it/s][A
ans_loss: 0.16, hop_loss: 0.11, final_loss: 0.68:  64% 6399/9932 [1:11:31<37:08,  1.59it/s][A
ans_loss: 0.16, hop_loss: 0.10, final_loss: 0.66:  64% 6399/9932 [1:11:32<37:08,

In [None]:
# Run cogqa.py on the dev file.
# NOTE(review): presumably this writes hotpot_dev_fullwiki_v1_merge_pred.json, which the next cell scores - confirm.
!python cogqa.py --data_file='hotpot_dev_fullwiki_v1_merge.json'

Loading model from ./models/bert-base-uncased.bin
Start Training... on 1 GPUs
100% 500/500 [01:55<00:00,  4.04it/s]


In [None]:
# Run the evaluation script on the prediction file and the dev file.
# NOTE(review): argument order (predictions first, then gold) is inferred from the file names - confirm.
!python hotpot_evaluate_v1.py hotpot_dev_fullwiki_v1_merge_pred.json hotpot_dev_fullwiki_v1_merge.json

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
missing sp fact 5ab878ba55429916710eb06c
missing answer 5ac2f34455429921a00ab0b6
missing sp fact 5ac2f34455429921a00ab0b6
missing answer 5a82626655429966c78a6a08
missing sp fact 5a82626655429966c78a6a08
missing answer 5a7cb84a554299683c1c6350
missing sp fact 5a7cb84a554299683c1c6350
missing answer 5a7aa07d55429941d65f2703
missing sp fact 5a7aa07d55429941d65f2703
missing answer 5ac0e74d554299294b219052
missing sp fact 5ac0e74d554299294b219052
missing answer 5a7b2c9155429927d897bf51
missing sp fact 5a7b2c9155429927d897bf51
missing answer 5ae7fa5a554299540e5a56e4
missing sp fact 5ae7fa5a554299540e5a56e4
missing answer 5a84b0705542991dd0999d86
missing sp fact 5a84b0705542991dd0999d86
missing answer 5ae712fa554299572ea546b9
missing sp fact 5ae712fa554299572ea546b9
missing answer 5abc3c545542993a06baf89f
missing sp fact 5abc3c545542993a06baf89f
missing answer 5a8135cc55429903bc27b943
missing sp fact 5a8135cc55429903bc27b943
mis