In [2]:
import pandas as pd
import numpy as np
import torch
import os

from transformers import AutoTokenizer, AutoConfig, TrainingArguments, Trainer, AutoModelForSequenceClassification
from classifier import ArcFaceIBModel
from mkdataset import ConfidenceDataset, TestDataset
from datasets import load_metric, load_dataset
from sklearn.model_selection import StratifiedKFold
from utils import set_allseed
import warnings
import pickle

In [3]:
seed = 777
batch_size = 8
save_steps = 103
set_allseed(seed)
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]= "1"
gpu = torch.device("cuda")
cpu = torch.device("cpu")
warnings.filterwarnings(action='ignore')
model_name = "monologg/koelectra-base-v3-discriminator"

In [4]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [5]:
kfold_function = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
tokenizer = AutoTokenizer.from_pretrained(model_name ,max_length = 512)
config = AutoConfig.from_pretrained(model_name)
config.num_labels = len(train_df.확실성.value_counts())

In [6]:
def compute_metrics(pred):
    f1 = load_metric("f1")
    references = pred.label_ids
    predictions = pred.predictions.argmax(axis=1)
    metric = f1.compute(predictions=predictions, references=references, average="micro")
    return metric

In [7]:
training_args = TrainingArguments(
    output_dir="./output_confidence",
    seed=seed,
    save_total_limit=2,
    save_steps = save_steps,
    num_train_epochs = 5,
    learning_rate= 1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=16,
    weight_decay=1e-4,
    logging_dir="./logs",
    logging_steps = save_steps,
    evaluation_strategy = "steps",
    eval_steps = save_steps,
    load_best_model_at_end=True,
)
test_dataset = TestDataset(data=test_df, tokenizer=tokenizer)

In [8]:
logit = 0
for i, (train_index, test_index) in enumerate(kfold_function.split(train_df["문장"],train_df["확실성"])):
    model = ArcFaceIBModel.from_pretrained(model_name, config=config)
    train_corpus, valid_corpus = train_df["문장"][train_index], train_df["문장"][test_index]
    train_label, valid_label = train_df["확실성"][train_index], train_df["확실성"][test_index]
    fold_train = pd.concat([train_corpus, train_label], axis =1)
    fold_valid = pd.concat([valid_corpus, valid_label], axis =1)
    train_dataset = ConfidenceDataset(data=fold_train, tokenizer=tokenizer)
    valid_dataset = ConfidenceDataset(data=fold_valid, tokenizer=tokenizer)

    trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    )

    trainer.train()

    logit += trainer.predict(test_dataset).predictions / 5

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ArcFaceIBModel: ['electra.encoder.layer.5.attention.self.key.bias', 'electra.encoder.layer.11.attention.output.dense.weight', 'electra.encoder.layer.3.output.dense.weight', 'electra.encoder.layer.0.intermediate.dense.bias', 'electra.encoder.layer.0.attention.output.LayerNorm.weight', 'electra.encoder.layer.10.output.LayerNorm.weight', 'electra.encoder.layer.1.attention.self.value.bias', 'electra.encoder.layer.2.attention.output.dense.bias', 'electra.encoder.layer.11.output.dense.weight', 'electra.encoder.layer.10.output.LayerNorm.bias', 'discriminator_predictions.dense.weight', 'electra.encoder.layer.8.intermediate.dense.bias', 'electra.encoder.layer.9.attention.self.key.weight', 'electra.encoder.layer.3.attention.self.query.weight', 'electra.encoder.layer.0.attention.self.key.bias', 'electra.encoder.layer.1.attention.self.key.weight', 'electra.encoder.layer.5.attention.outp

Some weights of ArcFaceIBModel were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['electra.arcface.weight', 'electra.proj_fc_layer.linear.bias', 'electra.neck.1.bias', 'electra.neck.1.weight', 'electra.proj_fc_layer.linear.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
***** Running training *****
  Num examples = 13232
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 16
  Total optimization steps = 515


TypeError: forward() got an unexpected keyword argument 'token_type_ids'

In [8]:
with open("confidence_logit_de.pickle","wb") as f:
    pickle.dump(logit, f)

In [11]:
with open("confidence_logit_de.pickle","rb") as f:
    logit = pickle.load(f)

with open("confidence_logit_e.pickle","rb") as f:
    logit2 = pickle.load(f)
    
logit += logit2
logit /= 2
    
result = pd.DataFrame(logit.argmax(axis=1).tolist(), columns=["type"])
test_df_result = pd.concat([test_df,result],axis=1, ignore_index=True)
test_df_result.to_csv("result_confidence.csv")