In [1]:
from IDRR_data import *

import os
import numpy as np
import pandas as pd
from pathlib import Path as path

from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import Dataset
from transformers import (Trainer, TrainingArguments, AutoModelForSequenceClassification, DataCollatorWithPadding, AutoTokenizer)
from transformers import TrainerCallback, TrainerState, TrainerControl

In [2]:
# Make only CUDA devices 4 and 5 visible to this process.
os.environ.update(CUDA_VISIBLE_DEVICES='4,5')

# Current working directory of the kernel, and its parent directory
# (the parent serves as the root for output files further down).
SRC_DIR = path.cwd()
ROOT_DIR = SRC_DIR.parent

# === dataset ===
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one ``(arg1, arg2)`` pair per access.

    Class ids are read from the ``label11id`` column of ``df`` and stored in
    ``self.ys`` as one-hot rows with ``len(label_list)`` entries each.

    Args:
        df: Frame providing the ``arg1``, ``arg2`` and ``label11id`` columns.
        label_list: Label names; only its length is used here.
        tokenizer: Callable invoked as ``tokenizer(arg1, arg2, **options)``
            whose result supports item assignment.
    """

    def __init__(self, df, label_list, tokenizer) -> None:
        self.df: pd.DataFrame = df
        self.tokenizer = tokenizer
        # Row i of the identity matrix is the one-hot vector of class id i,
        # so indexing it with the id column gives one encoded row per sample.
        n_classes = len(label_list)
        identity = np.eye(n_classes, n_classes)
        self.ys = identity[self.df['label11id']]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize row ``index`` and attach its one-hot label as ``'labels'``."""
        sample = self.df.iloc[index]
        encoded = self.tokenizer(
            sample['arg1'],
            sample['arg2'],
            max_length=512,
            truncation='longest_first',
            padding=True,
            add_special_tokens=True,
        )
        encoded['labels'] = self.ys[index]
        return encoded

# === metric ===
class ComputeMetrics:
    """Callable that turns ``(predictions, labels)`` into macro-F1 and accuracy.

    Args:
        label_list: Label names; only the first ``len(label_list)`` columns of
            the predictions and labels are scored.
    """

    def __init__(self, label_list:list) -> None:
        self.label_list = label_list
        self.num_labels = len(label_list)
        self.metric_names = ['Macro-F1', 'Acc']

    def __call__(self, eval_pred):
        """Score one evaluation pass.

        Args:
            eval_pred: Pair ``(pred, labels)``. ``pred`` holds per-class scores
                of shape ``[datasize, n]`` (or a tuple whose first element is
                that array); ``labels`` is an indicator array of the same
                shape where a non-zero entry ``[p][q]`` means sample ``p``
                carries label ``q``.

        Returns:
            Dict with keys ``'macro-f1'`` and ``'acc'``. A sample counts as
            correct when its highest-scoring class is one of its gold labels.
            Both values are ``0.0`` for an empty evaluation set.
        """
        pred, labels = eval_pred
        # Predictions can arrive as a tuple when the model returns more than
        # one output; the class scores are taken to be the first element.
        if isinstance(pred, tuple):
            pred = pred[0]
        pred = np.asarray(pred)
        labels = np.asarray(labels)

        n_labels = len(self.label_list)
        pred = pred[..., :n_labels]
        labels = labels[..., :n_labels]

        n_samples = pred.shape[0]
        if n_samples == 0:
            # Avoid a 0/0 division (NaN plus RuntimeWarning) on empty input.
            return {'macro-f1': 0.0, 'acc': 0.0}

        # One-hot encode the arg-max class of every sample.
        hard_pred = np.zeros_like(pred, dtype=int)
        hard_pred[np.arange(n_samples), np.argmax(pred, axis=1)] = 1
        gold = labels != 0

        return {
            'macro-f1': f1_score(gold, hard_pred, average='macro', zero_division=0),
            'acc': np.sum(hard_pred * gold) / n_samples,
        }

# === callback ===
class CustomCallback(TrainerCallback):
    """Trainer callback that appends every log record to a JSON Lines file.

    Args:
        log_filepath: Destination file. When omitted (or falsy) the file
            ``ROOT_DIR / 'output_dir' / 'log.jsonl'`` is used.
    """

    def __init__(
        self,
        log_filepath=None,
    ):
        super().__init__()
        if log_filepath:
            self.log_filepath = log_filepath
        else:
            self.log_filepath = ROOT_DIR / 'output_dir' / 'log.jsonl'

    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Append the ``logs`` keyword argument as one JSON object per line.

        Does nothing when no ``logs`` entry is passed.
        """
        import json  # local import so the block does not depend on the import cell

        logs = kwargs.get('logs')
        if logs is None:
            return

        log_file = path(self.log_filepath)
        # The target directory may not exist yet when the first record arrives.
        log_file.parent.mkdir(parents=True, exist_ok=True)
        with open(log_file, 'a', encoding='utf8') as f:
            # json.dumps (not str) so each line is valid JSON, as the default
            # ``.jsonl`` file name promises; values json cannot encode are
            # written as their string form.
            f.write(json.dumps(logs, ensure_ascii=False, default=str) + '\n')

    def on_evaluate(self, args, state, control, metrics: dict, **kwargs):
        """Evaluation hook; intentionally a no-op.

        ``metrics`` maps metric names to float values.
        """
        pass


In [3]:
# === data ===
# Build the data-frame container with the dataset options given below.
dfs = IDRRDataFrames(
    data_path='/data/sunwh/data/IDRR/used/pdtb2.p1.csv',
    data_name='pdtb2',
    data_level='top',
    data_relation='Implicit',
)
label_list = dfs.label_list

print(len(label_list))

checkpoint = '/data/sunwh/model/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Wrap the train / dev / test frames in datasets that tokenize on access.
train_dataset, dev_dataset, test_dataset = (
    CustomDataset(split_df, label_list, tokenizer)
    for split_df in (dfs.train_df, dfs.dev_df, dfs.test_df)
)

# Preview up to four encoded training samples, one field per line.
for sample_idx in range(min(4, len(train_dataset))):
    for key, value in train_dataset[sample_idx].items():
        print(key, value)




4
input_ids [0, 1121, 41, 1700, 4, 753, 1551, 9, 22, 133, 256, 22471, 26636, 2379, 113, 23, 1568, 18, 17899, 5132, 6697, 38494, 8632, 1538, 37885, 4624, 5, 11897, 11, 7247, 219, 412, 60, 1063, 25145, 359, 4455, 238, 5, 774, 9, 13230, 757, 2552, 6, 702, 30, 1636, 230, 44156, 1250, 6, 21, 26506, 9702, 7, 15990, 4936, 1073, 2, 2, 13123, 4, 4936, 1073, 1974, 1448, 9488, 118, 2]
attention_mask [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
labels [1. 0. 0. 0.]
input_ids [0, 8827, 6006, 12569, 11120, 6934, 1723, 603, 4, 1130, 63, 3472, 7, 158, 3205, 31, 707, 3205, 10, 458, 2, 2, 133, 92, 731, 40, 28, 21467, 1927, 4, 379, 2]
attention_mask [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
labels [0. 0. 1. 0.]
input_ids [0, 133, 92, 731, 40, 28, 21467, 1927, 4, 379, 2, 2, 250, 638, 1248

In [4]:
# Set TOKENIZERS_PARALLELISM explicitly, as the huggingface/tokenizers fork
# warning requests; setdefault keeps any value already exported by the user.
os.environ.setdefault('TOKENIZERS_PARALLELISM', 'false')

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(label_list))

# === args ===
# NOTE(review): `evaluation_strategy` (TrainingArguments) and `tokenizer`
# (Trainer) are kept as written; newer transformers releases may prefer
# different names for them -- confirm against the installed version.
training_args = TrainingArguments(
    # output_dir is documented as a string, so convert the Path explicitly.
    output_dir=str(ROOT_DIR/'output_dir'),
    overwrite_output_dir=True,
    run_name='',

    # strategies of evaluation, logging, save
    evaluation_strategy = "steps",
    eval_steps = 500,
    logging_strategy = 'steps',
    logging_steps = 10,
    save_strategy = 'steps',
    save_steps = 500,

    # optimizer and lr_scheduler
    optim = 'adamw_torch',
    learning_rate = 2e-5,
    weight_decay = 0.01,
    lr_scheduler_type = 'linear',
    warmup_ratio = 0.05,

    # epochs and batches
    num_train_epochs = 10,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 8,
    gradient_accumulation_steps = 1,

    # train consumption
    eval_accumulation_steps=10,
    bf16=True,
    fp16=False,
)

# === train ===
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer),
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=ComputeMetrics(dfs.label_list),
    callbacks=[CustomCallback()],
)

# Train (evaluating on the dev set every `eval_steps`), then score the test set
# with the model as it stands after the final training step.
train_result = trainer.train()
test_result = trainer.evaluate(eval_dataset=test_dataset)
print(f'> train_result:\n  {train_result}')
print(f'> test_result:\n  {test_result}')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /data/sunwh/model/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = Trainer(
Detected kernel version 3.10.0, wh

Step,Training Loss,Validation Loss,Macro-f1,Acc
500,0.3436,0.380456,0.502219,0.625528
1000,0.2528,0.379013,0.564206,0.659341
1500,0.2245,0.415397,0.570217,0.666948
2000,0.103,0.501373,0.562684,0.652578
2500,0.1027,0.57046,0.546849,0.654269
3000,0.0589,0.609507,0.554605,0.647506
3500,0.0561,0.668324,0.571225,0.656805




> train_result:
  TrainOutput(global_step=3950, training_loss=0.18411906539688483, metrics={'train_runtime': 1104.8651, 'train_samples_per_second': 114.331, 'train_steps_per_second': 3.575, 'total_flos': 6528774075203328.0, 'train_loss': 0.18411906539688483, 'epoch': 10.0})
> test_result:
  {'eval_loss': 0.6380148446288656, 'eval_macro-f1': 0.6014715506085475, 'eval_acc': 0.6739961759082218, 'eval_runtime': 5.3902, 'eval_samples_per_second': 194.057, 'eval_steps_per_second': 12.244, 'epoch': 10.0}
