In [1]:
import sys
sys.path.append('../src-py/')
from tqdm import tqdm
tqdm.pandas()
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [2]:
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus    
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

In [3]:
from project_debater_api import *

In [4]:
# Folder the corpus cell reads the column-formatted train_ds.tsv / test_ds.tsv files from
data_folder = '../data-sample/claim-target-tagger/data/ibm_ds/'
# Folder handed to trainer.fine_tune() as the output location for the fine-tuned tagger
model_folder = '../data-sample/claim-target-tagger/model'

### Train a claim-target tagger on the IBM dataset:

In [5]:
# Map each column index of the data files to the name of the annotation layer it holds
columns = {0: 'text', 1: 'pos', 2: 'ct'}

# Build the corpus from the column-formatted files in data_folder.
# Only a train and a test file are named here; no dev file is passed.
corpus: Corpus = ColumnCorpus(
    data_folder=data_folder,
    column_format=columns,
    train_file='train_ds.tsv',
    test_file='test_ds.tsv',
)

2022-07-05 17:23:12,607 Reading data from ../../data-ceph/arguana/arg-generation/claim-target-tagger/data/ibm_ds
2022-07-05 17:23:12,608 Train: ../../data-ceph/arguana/arg-generation/claim-target-tagger/data/ibm_ds/train_ds.tsv
2022-07-05 17:23:12,609 Dev: None
2022-07-05 17:23:12,609 Test: ../../data-ceph/arguana/arg-generation/claim-target-tagger/data/ibm_ds/test_ds.tsv


In [6]:
# Name of the annotation layer to learn; reused below so the label dictionary
# and the tagger can never disagree about which column is being predicted.
label_type = 'ct'

# Collect the tag vocabulary for that layer from the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

# Fine-tuneable transformer embeddings WITH document context
embeddings = TransformerWordEmbeddings(model='xlm-roberta-large',
                                       layers="-1",
                                       subtoken_pooling="first",
                                       fine_tune=True,
                                       use_context=True,
                                       )

# Bare-bones sequence tagger (no CRF, no RNN, no reprojection)
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        tag_type=label_type,
                        use_crf=False,
                        use_rnn=False,
                        reproject_embeddings=False,
                        )

# Trainer that couples the tagger with the corpus
trainer = ModelTrainer(tagger, corpus)

# Run fine-tuning; the call is the last expression of the cell so that the
# returned result dictionary is displayed as the cell output.
trainer.fine_tune(model_folder,
                  learning_rate=5.0e-6,
                  mini_batch_size=4,
                  #mini_batch_chunk_size=1,  # remove this parameter to speed up computation if you have a big GPU
                  )

2022-07-05 17:23:14,256 Computing label dictionary. Progress:


100%|██████████| 1157/1157 [00:00<00:00, 31388.44it/s]

2022-07-05 17:23:14,297 Corpus contains the labels: pos (#14127), ct (#14127)
2022-07-05 17:23:14,297 Created (for label 'ct') Dictionary with 4 tags: <unk>, B-CT, I-CT, O
Dictionary with 4 tags: <unk>, B-CT, I-CT, O





2022-07-05 17:23:34,900 ----------------------------------------------------------------------------------------------------
2022-07-05 17:23:34,926 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): XLMRobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(250002, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_feature

{'test_score': 0.8080402010050252,
 'dev_score_history': [0.5687500000000001,
  0.6789667896678967,
  0.7226277372262774,
  0.7481481481481481,
  0.7573529411764706,
  0.7680608365019012,
  0.7744360902255639,
  0.7739463601532566,
  0.7518796992481204,
  0.7575757575757576],
 'train_loss_history': [1.2713251365018412,
  0.4053837836965884,
  0.2871073011735839,
  0.24888050605821097,
  0.209907591208884,
  0.17178082751801205,
  0.16211893949897646,
  0.14497840200862383,
  0.11524690419061442,
  0.13723460010312408],
 'dev_loss_history': [tensor(0.4268, device='cuda:0'),
  tensor(0.3864, device='cuda:0'),
  tensor(0.3939, device='cuda:0'),
  tensor(0.5125, device='cuda:0'),
  tensor(0.4500, device='cuda:0'),
  tensor(0.5613, device='cuda:0'),
  tensor(0.5355, device='cuda:0'),
  tensor(0.6210, device='cuda:0'),
  tensor(0.6892, device='cuda:0'),
  tensor(0.7018, device='cuda:0')]}

### Extract targets and stances from Reddit conclusions:

In [None]:
from ca_utils import *

In [6]:
def extract_targets_and_stances(df):
    """Annotate every row with the target and stance of its conclusion.

    The conclusion text is read from the ``title`` column. Each distinct title
    is sent once to ``extract_targets`` and ``get_stances`` (helpers that must
    already be in scope; presumably provided by the star imports of this
    notebook -- verify), and the results are broadcast back onto the rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``title`` column. It is NOT modified: callers in this
        notebook pass filtered slices, and writing columns onto a slice raises
        ``SettingWithCopyWarning`` and hides state changes.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with two extra columns, ``conclusion_targets`` and
        ``conclusion_stance``.

    Raises
    ------
    ValueError
        If a helper does not return exactly one result per distinct title.
    """
    annotated = df.copy()

    unique_conclusions = annotated.title.unique().tolist()
    if not unique_conclusions:
        # Nothing to annotate: skip the helper calls and only add the (empty) columns
        annotated['conclusion_targets'] = pd.Series(dtype=object)
        annotated['conclusion_stance'] = pd.Series(dtype=float)
        return annotated

    unique_conclusions_targets = extract_targets(unique_conclusions)
    unique_conclusions_stances = get_stances(unique_conclusions_targets, unique_conclusions)

    conc_to_targets = dict(zip(unique_conclusions, unique_conclusions_targets))
    conc_to_stances = dict(zip(unique_conclusions, unique_conclusions_stances))

    # zip() truncates silently, so make a short helper result fail loudly
    # instead of leaving some rows without an annotation.
    expected = len(unique_conclusions)
    if len(conc_to_targets) != expected or len(conc_to_stances) != expected:
        raise ValueError(
            'expected %d targets and stances, got %d targets and %d stances'
            % (expected, len(conc_to_targets), len(conc_to_stances))
        )

    annotated['conclusion_targets'] = annotated.title.map(conc_to_targets)
    annotated['conclusion_stance'] = annotated.title.map(conc_to_stances)

    return annotated

----------------

In [23]:
# Validation split: load the pickle, keep rows with a non-empty title,
# annotate them, and write the result back to the same file.
dev_pickle = data_path + '/valid_conclusion_all_preprocessed.pkl'
dev_df = pd.read_pickle(dev_pickle)

dev_df = extract_targets_and_stances(
    dev_df.loc[lambda frame: frame.title.str.len() > 0]
)
dev_df.to_pickle(dev_pickle)

ProConClient: 100%|██████████| 1997/1997 [00:33<00:00, 60.30it/s]


In [7]:
# Test split: load the pickle, keep rows with a non-empty title,
# annotate them with conclusion targets and stances, and write the result back.
test_pickle = data_path + '/test_conclusion_all_preprocessed.pkl'
test_df = pd.read_pickle(test_pickle)

test_df = extract_targets_and_stances(
    test_df.loc[lambda frame: frame.title.str.len() > 0]
)
test_df.to_pickle(test_pickle)

ProConClient: 100%|██████████| 8519/8519 [02:21<00:00, 60.32it/s]


In [9]:
# Preview the first rows: the conclusion, the generated conclusion and the new annotations
test_df.loc[:, ['title', 'bart_conclusion', 'conclusion_targets', 'conclusion_stance']].head()

Unnamed: 0,title,bart_conclusion,conclusion_targets,conclusion_stance
410850,people should come with instructions,i think people should be required by law to use a cheat sheet if they meet someone they,people should come with instructions,0.997129
410858,People should not be heavily criticized for things they put on social media in the distant past,i think the internet should stop being as harsh on people for things they put on social,distant past,-0.952858
410902,We shouldn't focus on slowing climate change,joint statement:: there are other environmental issues that are a greater problem for,focus on slowing climate change,-0.997431
410910,The Australian PM was right to tell students to stop activism around global warming,I believe that activism is a terrible way to combat climate change,stop activism around global warming,0.999497
410916,Feeding cats or dogs a diet with meat is indefensible.,if a cat or dog eats her life then it's a animal killer and they should be,Feeding cats or dogs a diet with meat,-0.984038
