In [0]:
# Install the two Hugging Face packages this notebook relies on: `tokenizers`
# (imported below) and `transformers` (its language-modeling example script is
# run at the end of the notebook).
# NOTE(review): neither package is version-pinned, so a fresh run may pull
# releases whose APIs differ from what the cells below expect — consider pinning.
!pip install transformers
!pip install tokenizers



In [0]:
import torch
import pickle
from collections import Counter
from tqdm import tqdm
import itertools
import pandas as pd
from itertools import islice
import numpy as np
from pathlib import Path

from tokenizers import BertWordPieceTokenizer
from tokenizers.processors import BertProcessing

In [0]:
# Select the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    # `torch.cuda.memory_cached` is the deprecated name of
    # `torch.cuda.memory_reserved`; prefer the new name and only fall back to
    # the old one on PyTorch builds that do not have it yet.
    _memory_reserved = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached
    print('Cached:   ', round(_memory_reserved(0)/1024**3,1), 'GB')

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Set the random seed manually for reproducibility.
torch.manual_seed(1234)

# Base directory (relative to the working directory) for all FiQA inputs/outputs.
path = "drive/My Drive/FiQA/"

Using device: cuda

Tesla K80
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [0]:
# Read the header-less, tab-separated document collection and give its first
# two positional columns the names used by the rest of the notebook.
collection = (
    pd.read_csv(
        "/content/drive/My Drive/Thesis/data/retrieval/collection_cleaned.tsv",
        sep="\t",
        header=None,
    )
    .rename(columns={0: 'docid', 1: 'doc'})
)

def load_questions(path):
    """
    Read the tab-separated question file at `path` and return a dataframe
    restricted to the `qid` and `question` columns (in that order).
    """
    wanted_cols = ['qid', 'question']
    return pd.read_csv(path, sep="\t")[wanted_cols]

# Load the training questions (qid, question) from the FiQA directory in `path`.
queries = load_questions(path + "FiQA_train_question_final.tsv")

In [0]:
def take(n, iterable):
    "Collect at most the first n items of the iterable into a list"
    head = islice(iterable, n)
    return [item for item in head]

def remove_empty(test_set, empty=None):
    """
    Drop, in place, every row of `test_set` that references an empty document.

    Each row is expected to be indexable, with `row[1]` an iterable of
    document ids. A row is removed when any of those ids is contained in
    `empty`.

    The previous implementation deleted items from the list while iterating
    over it, which skipped the row following each deletion and could delete
    unrelated rows when one row contained several empty documents. The list is
    now rebuilt in a single pass instead.

    Args:
        test_set: mutable list of rows; it is modified in place.
        empty: container of document ids to treat as empty. Defaults to the
            notebook-level `empty_docs`.

    Returns:
        The same `test_set` object, after filtering.
    """
    if empty is None:
        empty = empty_docs
    # Slice assignment keeps the caller's list object (in-place semantics).
    test_set[:] = [
        row for row in test_set
        if not any(doc in empty for doc in row[1])
    ]
    return test_set

def load_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    # NOTE: unpickling can execute arbitrary code; only use on trusted files.
    with open(path, 'rb') as source:
        payload = pickle.load(source)
    return payload

def save_pickle(path, data):
    """Pickle `data` into the file at `path` using the highest protocol."""
    with open(path, 'wb') as sink:
        pickle.dump(data, sink, pickle.HIGHEST_PROTOCOL)

In [0]:
# Lookup table: question id -> question text
qid_to_text = dict(zip(queries['qid'], queries['question']))

# Lookup table: document id -> document text
docid_to_text = dict(zip(collection['docid'], collection['doc']))

In [0]:
# Peek at the first five (qid, question) pairs to sanity-check the mapping.
take(5, qid_to_text.items())

[(0, 'What is considered a business expense on a business trip?'),
 (1, 'Claiming business expenses for a business with no income'),
 (2,
  'Transferring money from One business checking to another business checking'),
 (3,
  'Having a separate bank account for business/investing, but not a “business account?”'),
 (4,
  'Business Expense - Car Insurance Deductible For Accident That Occurred During a Business Trip')]

In [0]:
# Load the pickled collection of ids that `docid` values are checked against
# (used to filter `dataset` below and inside `remove_empty`).
# NOTE(review): unpickling runs arbitrary code — make sure this file is trusted.
empty_docs = load_pickle(path+'empty_docs.pickle')

In [0]:
# Load the (question id, document id) training pairs.
dataset = pd.read_csv(path+"FiQA_train_question_doc_final.tsv", sep="\t")
dataset = dataset[["qid", "docid"]]
# Drop pairs whose document id appears in `empty_docs`. The explicit .copy()
# makes the result an independent frame, so the column assignments below do
# not write into a slice of the original (avoids SettingWithCopyWarning).
dataset = dataset[~dataset['docid'].isin(empty_docs)].copy()
# Attach the texts; an id missing from either lookup table raises KeyError.
dataset['question'] = dataset['qid'].apply(lambda x: qid_to_text[x])
dataset['answer'] = dataset['docid'].apply(lambda x: docid_to_text[x])

In [0]:
def add_ques_token(string):
    """Return `string` followed by the literal separator " [SEP] "."""
    return string + " [SEP] "

In [0]:
# Append the separator to every question, concatenate question and answer into
# a single 'seq' column, and keep only that column.
dataset = dataset.assign(question=dataset['question'].apply(add_ques_token))
dataset = dataset.assign(seq=dataset['question'] + dataset['answer'])[['seq']]

In [0]:
# The cell that first builds 'seq' also drops 'question' and 'answer', so on a
# fresh Restart & Run All an unconditional recompute here raises
# KeyError: 'question'. Only rebuild 'seq' while its source columns still exist.
if {'question', 'answer'}.issubset(dataset.columns):
    dataset['seq'] = dataset['question'] + dataset['answer']

In [0]:
# Keep only the combined 'seq' column (an earlier cell performs the same
# selection, so this repeats it).
dataset = dataset[['seq']]

In [0]:
# Pull one question+answer sequence by index label to eyeball its format.
# NOTE(review): the name `test` is reused further down for the evaluation split.
test = dataset.at[17081, "seq"]

test

"Is it wise to switch investment strategy frequently? [SEP] My super fund and I would say many other funds give you one free switch of strategies per year.  Some suggest you should change from high growth option to a more balance option once you are say about 10 to 15 years from retirement, and then change to a more capital guaranteed option a few years from retirement. This is a more passive approach and has benefits as well as disadvantages. The benefit is that there is not much work involved, you just change your investment option based on your life stage, 2 to 3 times during your lifetime. This allows you to take more risk when you are young to aim for higher returns, take a balanced approach with moderate risk and returns during the middle part of your working life, and take less risk with lower returns (above inflation) during the latter part of your working life. A possible disadvantage of this strategy is you may be in the higher risk/ higher growth option during a market corre

In [0]:
import csv

# Write the 'seq' column as plain text, one row per record, with no header, no
# index and no quoting; this file is the tokenizer's training corpus.
# NOTE(review): with QUOTE_NONE, any newline embedded in a sequence would split
# that record across several lines of the output file — confirm the texts
# contain none, since later steps read these files line by line.
dataset.to_csv(path+'bert-lm/data.txt',index=False,header=False, sep="\t", quoting=csv.QUOTE_NONE)

In [0]:
# Train only on the corpus file written above. A recursive "**/*.txt" glob over
# bert-lm/ also matches files that later cells write into that tree (train.txt,
# eval.txt and the saved vocabulary .txt files), so re-running the notebook
# would silently train the tokenizer on those as well.
paths = [str(Path(path + "bert-lm/") / "data.txt")]

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()

# Customize training: WordPiece vocabulary of up to 52k entries, keeping every
# token seen at least once, plus the BERT special tokens.
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=1, special_tokens=[
    "[UNK]",
    "[SEP]",
    "[CLS]",
    "[MASK]",
    "[PAD]",
])

In [0]:
# Save the trained vocabulary under bert-lm/tokenizer. `path` already ends with
# "/", which is why the reported location contains a doubled slash.
tokenizer.save(path+"/bert-lm/tokenizer")

['drive/My Drive/FiQA//bert-lm/tokenizer/vocab.txt']

In [0]:
# Reload the tokenizer from the saved vocabulary file.
tokenizer = BertWordPieceTokenizer(path+"/bert-lm/tokenizer/vocab.txt")
# BertProcessing takes (sep, cls), each as a (token, token_id) pair. Every token
# must be paired with its OWN id; the ids were previously swapped, so "[SEP]"
# carried the [CLS] id and "[CLS]" carried the [SEP] id.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ("[CLS]", tokenizer.token_to_id("[CLS]")),
)
# Truncate encoded sequences to at most 512 tokens.
tokenizer.enable_truncation(max_length=512)

# Save files to disk
tokenizer.save(path+"bert-lm", "finbert-lm")


['drive/My Drive/FiQA/bert-lm/finbert-lm-vocab.txt']

In [0]:
# Sample question+answer sequence used to inspect the tokenizer output. It is
# read from the dataset (same row as the earlier preview) rather than pasted
# inline: the pasted copy spanned two physical lines inside a double-quoted
# literal, which is not valid Python, and duplicated data already in `dataset`.
test = dataset.at[17081, "seq"]

In [0]:
# Encode the sample and print its token strings to check the special tokens
# and the WordPiece splits.
print(tokenizer.encode(test).tokens)

['[CLS]', 'is', 'it', 'wise', 'to', 'switch', 'investment', 'strategy', 'frequently', '?', '[SEP]', 'my', 'super', 'fund', 'and', 'i', 'would', 'say', 'many', 'other', 'funds', 'give', 'you', 'one', 'free', 'switch', 'of', 'strategies', 'per', 'year', '.', 'some', 'suggest', 'you', 'should', 'change', 'from', 'high', 'growth', 'option', 'to', 'a', 'more', 'balance', 'option', 'once', 'you', 'are', 'say', 'about', '10', 'to', '15', 'years', 'from', 'retirement', ',', 'and', 'then', 'change', 'to', 'a', 'more', 'capital', 'guaranteed', 'option', 'a', 'few', 'years', 'from', 'retirement', '.', 'this', 'is', 'a', 'more', 'passive', 'approach', 'and', 'has', 'benefits', 'as', 'well', 'as', 'disadvantages', '.', 'the', 'benefit', 'is', 'that', 'there', 'is', 'not', 'much', 'work', 'involved', ',', 'you', 'just', 'change', 'your', 'investment', 'option', 'based', 'on', 'your', 'life', 'stage', ',', '2', 'to', '3', 'times', 'during', 'your', 'lifetime', '.', 'this', 'allows', 'you', 'to', 'tak

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the sequences for evaluation. `random_state` is fixed (same
# value as the torch seed used at the top of the notebook) so the split is
# identical on every run; without it each run shuffles differently.
train, test = train_test_split(dataset, test_size=0.1, random_state=1234)

In [0]:
# Report the number of rows in the training split (printed) and in the
# evaluation split (shown as the cell's value).
print(len(train))
len(test)

15364


1708

In [0]:
# Write each split as plain text: no header, no index, no quoting.
train.to_csv(
    path+'bert-lm/train.txt',
    sep="\t",
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
)
test.to_csv(
    path+'bert-lm/eval.txt',
    sep="\t",
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
)

In [0]:
import os

# Absolute locations of the two split files (the relative `path` prefixed with
# "/content/"). os.path.join with a single argument returns it unchanged, so
# plain concatenation yields the same strings.
train_path = "/content/" + path+'bert-lm/train.txt'
eval_path = "/content/" + path+'bert-lm/eval.txt'

In [0]:
# Fetch the transformers repository; its examples/run_language_modeling.py
# script is invoked further down.
# NOTE(review): this clones the default branch unpinned, so the script's
# location and flags may change over time, and the command fails if the
# `transformers` directory already exists — consider pinning a tag.
!git clone https://github.com/huggingface/transformers.git

Cloning into 'transformers'...
remote: Enumerating objects: 99, done.[K
remote: Counting objects: 100% (99/99), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 20817 (delta 56), reused 82 (delta 47), pack-reused 20718[K
Receiving objects: 100% (20817/20817), 12.34 MiB | 26.93 MiB/s, done.
Resolving deltas: 100% (15040/15040), done.


In [0]:
# Show the GPU model, driver/CUDA versions and current memory usage.
!nvidia-smi

Fri Feb 28 11:39:11 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.48.02    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P8     7W /  75W |     10MiB /  7611MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
import os

# Export the split locations and a CUDA debugging switch to the process
# environment in one call.
os.environ.update({
    "train_path": train_path,
    "eval_path": eval_path,
    "CUDA_LAUNCH_BLOCKING": '1',  # Makes for easier debugging (just in case)
})
# Output directory for model weights, with 'My Drive' single-quoted for shell use.
weights_dir = "/content/drive/'My Drive'/FiQA/bert-lm/weights"

In [0]:
# Echo the weights directory string to check its shell quoting.
weights_dir

"/content/drive/'My Drive'/FiQA/bert-lm/weights"

In [0]:
# Shell command that runs the masked-language-model training script from the
# cloned transformers repository.
# - Inside this (non-raw) triple-quoted string a backslash at the end of a line
#   is a line continuation, so `cmd` ends up as a single shell line.
# - 'My Drive' is single-quoted because the directory name contains a space.
# - Both --config_name and --tokenizer_name point at the saved tokenizer
#   directory, so that directory must hold a config.json next to the vocabulary.
# - The train/eval/output locations are written out literally here; the
#   `train_path` / `eval_path` environment variables and `weights_dir` defined
#   earlier are not referenced by this string.
cmd = '''python /content/transformers/examples/run_language_modeling.py \
    --output_dir /content/drive/'My Drive'/FiQA/bert-lm/weights \
    --model_type bert \
    --mlm \
    --train_data_file /content/drive/'My Drive'/FiQA/bert-lm/train.txt \
    --eval_data_file /content/drive/'My Drive'/FiQA/bert-lm/eval.txt \
    --config_name /content/drive/'My Drive'/FiQA/bert-lm/tokenizer \
    --tokenizer_name /content/drive/'My Drive'/FiQA/bert-lm/tokenizer \
    --do_train \
    --line_by_line \
    --overwrite_output_dir \
    --do_eval \
    --learning_rate 1e-4 \
    --num_train_epochs 1 \
    --save_total_limit 2 \
    --block_size 512 \
    --save_steps 2000 \
    --per_gpu_eval_batch_size 8 \
    --per_gpu_train_batch_size 8 \
    --evaluate_during_training \
    --seed 42'''

In [0]:
# Run the training command built above in a shell ({cmd} is interpolated by IPython).
!{cmd}

02/28/2020 11:39:34 - INFO - transformers.configuration_utils -   loading configuration file /content/drive/My Drive/FiQA/bert-lm/tokenizer/config.json
02/28/2020 11:39:34 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden