In [3]:
%load_ext autoreload

In [4]:
import sys
import os
import pandas as pd
# Expose only GPU index 3 to this process. This is set ahead of the
# transformers imports in the following cells -- presumably so that it takes
# effect before any CUDA-using library initialises.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

# NOTE(review): a max_colwidth of 0 is presumably meant to stop pandas from
# truncating long text cells -- confirm against the installed pandas version.
pd.set_option('display.max_colwidth', 0)
# Put the project's source directory first on the import path; the later
# imports of utils / run_pplm* presumably resolve from there.
sys.path.insert(0, "../src-py/")

In [5]:
%autoreload

import utils
from run_pplm_discrim_train import *
from run_pplm import *

from transformers import GPT2Tokenizer
from transformers.file_utils import cached_path
from transformers.modeling_gpt2 import GPT2LMHeadModel

In [6]:
import pandas as pd
import utils
import pickle

In [7]:
import json
from sklearn.model_selection import train_test_split

In [8]:
# Base directory (relative path) that the data and model paths in this
# notebook are built from.
data_folder = '../../data'
# Device string that get_model/gen_text move the model and input ids to.
device = 'cuda:0'

In [9]:
# Path under data_folder of a model directory named 'argsme_ft_new' --
# presumably the GPT-2 checkpoint fine-tuned on args.me; verify before use.
pretrained_argsme_model_path = data_folder + '/output/models/argsme_ft_new'

In [11]:
def get_model(name):
    """Load a GPT-2 LM-head model and its tokenizer, frozen and ready for inference.

    Args:
        name: Identifier or path handed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is created with
        ``output_hidden_states=True``, has ``requires_grad`` switched off on
        every parameter, is moved to the notebook-level ``device`` and is put
        into evaluation mode.
    """
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(name)
    gpt2_model = GPT2LMHeadModel.from_pretrained(name, output_hidden_states=True)

    # Freeze all weights so no gradients are tracked for the model parameters.
    for weight in gpt2_model.parameters():
        weight.requires_grad = False

    gpt2_model.to(device)
    gpt2_model.eval()
    return gpt2_model, gpt2_tokenizer

def gen_text(prefix, tokenizer, model, max_length=50):
    """Print the text that ``model.generate`` produces when conditioned on ``prefix``.

    Args:
        prefix: Context string the generation is conditioned on.
        tokenizer: Object providing ``encode`` and ``decode``.
        model: Object providing ``generate``.
        max_length: Forwarded unchanged as ``max_length`` to ``generate``;
            per the original note this limit counts the context tokens too.

    Returns:
        None. The first generated sequence is decoded (special tokens
        skipped) and printed.
    """
    # Encode the context as a PyTorch tensor and place it on the notebook-level device.
    prompt_ids = tokenizer.encode(prefix, return_tensors='pt').to(device)
    generated = model.generate(prompt_ids, max_length=max_length)
    # Only the first returned sequence is shown.
    first_sequence = generated[0]
    print(tokenizer.decode(first_sequence, skip_special_tokens=True))

### Preparing data for fine-tuning GPT-2:

In [108]:
# Load the args.me corpus. The context manager closes the file handle as soon
# as parsing finishes, and the explicit encoding avoids depending on the
# platform's default locale when decoding the JSON text.
with open(data_folder + '/args-me.json', encoding='utf8') as argsme_file:
    argsme_data = json.load(argsme_file)

In [109]:
# Unique source ids of all arguments. Sorting gives the list a stable order
# (a bare set has an arbitrary iteration order), so a seeded split downstream
# can be reproduced. Assumes the sourceId values are mutually comparable
# (e.g. all strings) -- TODO confirm.
context_ids = sorted({x['context']['sourceId'] for x in argsme_data['arguments']})

In [110]:
# Hold out 20% of the source ids for evaluation. The split is done on source
# ids rather than on individual arguments, and the fixed seed makes the
# shuffle deterministic for a given ordering of context_ids.
training_ids, test_ids = train_test_split(context_ids, test_size=0.2, random_state=42)

In [None]:
# training_ids / test_ids are lists, so testing `in` against them directly
# scans the whole list for every argument. Converting them to sets once makes
# each membership test constant-time and yields the same selections.
training_id_set = set(training_ids)
test_id_set = set(test_ids)

# Text of the first premise of every argument whose source id is in the split.
training_args = [x['premises'][0]['text'] for x in argsme_data['arguments'] if x['context']['sourceId'] in training_id_set]
test_args     = [x['premises'][0]['text'] for x in argsme_data['arguments'] if x['context']['sourceId'] in test_id_set]

In [117]:
# Discard every premise whose text is exactly the empty string.
training_args = list(filter(lambda premise: premise != '', training_args))
test_args     = list(filter(lambda premise: premise != '', test_args))

In [120]:
len(training_args)

310084

In [123]:
# Keep only premises that contain more than five whitespace-separated tokens.
training_args = [
    premise for premise in training_args
    if 5 < len(premise.split())
]
test_args = [
    premise for premise in test_args
    if 5 < len(premise.split())
]

In [122]:
len(training_args)

287990

In [124]:
# Write the training premises, one per line. The file goes directly under
# data_folder ('../../data'), which is where the fine-tuning cell's
# --train_data_file='../../data/ft_training.txt' expects it; the previous
# extra '/data' segment wrote it to '../../data/data/' instead.
with open(data_folder + '/ft_training.txt', 'w', encoding='utf8') as training_file:
    for arg in training_args:
        training_file.write(arg)
        training_file.write('\n')

In [125]:
# Write the held-out premises, one per line. The file goes directly under
# data_folder ('../../data'), which is where the fine-tuning cell's
# --eval_data_file='../../data/ft_test.txt' expects it; the previous extra
# '/data' segment wrote it to '../../data/data/' instead.
with open(data_folder + '/ft_test.txt', 'w', encoding='utf8') as test_file:
    for arg in test_args:
        test_file.write(arg)
        test_file.write('\n')

### Fine-tuning GPT-2:

In [9]:
# Environment variables for the kernel process, set before the fine-tuning run.
# MASTER_PORT / MASTER_ADDR / WORLD_SIZE / RANK are presumably intended for
# torch.distributed's environment-variable setup.
# NOTE(review): the run cell passes --local_rank=-1, so confirm whether these
# four are actually read.
%env CUDA_VISIBLE_DEVICES=3
%env MASTER_PORT=5000
%env MASTER_ADDR=localhost
%env WORLD_SIZE=3
%env RANK=0

env: CUDA_VISIBLE_DEVICES=3
env: MASTER_PORT=5000
env: MASTER_ADDR=localhost
env: WORLD_SIZE=3
env: RANK=0


#### On args.me data:

In [None]:
# Run the language-modeling script from ../src-py/ inside this kernel, starting
# from the gpt2-medium weights with both --do_train and --do_eval enabled.
# Training text is read from ../../data/ft_training.txt, evaluation text from
# ../../data/ft_test.txt, and output goes to ../../data/output/models
# (overwritten if it already exists).
# NOTE(review): the final argument line ends with a continuation backslash, so
# nothing else may follow the command in this cell.
%run ../src-py/run_language_modeling.py \
    --output_dir='../../data/output/models' \
    --model_type=gpt2 \
    --local_rank=-1 \
    --save_total_limit 10 \
    --model_name_or_path=gpt2-medium \
    --do_train \
    --overwrite_output_dir \
    --per_gpu_train_batch_size=3 \
    --gradient_accumulation_steps=3 \
    --block_size=256\
    --train_data_file='../../data/ft_training.txt' \
    --do_eval \
    --eval_data_file='../../data/ft_test.txt' \

04/01/2020 15:24:02 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json from cache at /root/.cache/torch/transformers/98aa65385e18b0efd17acd8bf64dcdf21406bb0c99c801c2d3c9f6bfd1f48f29.266bb9683aedfcb1f7006ad2e6894fce82b8dbbae8125f4fc8570b818005b83d
04/01/2020 15:24:02 - INFO - transformers.configuration_utils -   Model config GPT2Config {
  "_num_labels": 2,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "do_sample": false,
  "early_stopping": false,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "finetuning_task": null,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_epsilon": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "min_length":