### 기본 세팅

In [1]:
import logging
import os
import sys
import numpy as np
import pandas as pd
sys.path.insert(0, os.path.abspath('../'))

from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
from transformers import (
    AlbertConfig,
    AlbertForSequenceClassification,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
)
from tokenization_kbalbert import KbAlbertCharTokenizer
from utils_stock import InputDataset, Split, get_label, convert_examples_to_features

### 감성분석 모델 학습

In [2]:
# Load the training split (described as 15,000 rows in the original note).
# The file is tab-separated, hence the explicit separator.
train = pd.read_csv('./input_data/ratings_train.txt', sep='\t')

# Load the evaluation split (described as 1,000 rows in the original note).
# NOTE(review): neither frame is referenced by the code visible in this section.
test = pd.read_csv('./input_data/ratings_test.txt', sep='\t')

In [3]:
# Module-level logger; handlers/level are configured later inside main().
logger = logging.getLogger(__name__)


@dataclass
class ModelArguments:
    """
    Options selecting the pretrained model, config and tokenizer to fine-tune from.
    """

    # Required: declared without a default, so it must always be supplied.
    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models",
        },
    )

    # Optional override for the config location; None when not given.
    config_name: Optional[str] = field(
        metadata={
            "help": "Pretrained config name or path if not the same as model_name",
        },
        default=None,
    )

    # Optional override for the tokenizer location; None when not given.
    tokenizer_name: Optional[str] = field(
        metadata={
            "help": "Pretrained tokenizer name or path if not the same as model_name",
        },
        default=None,
    )

        
@dataclass
class DataTrainingArguments:
    """
    Options describing the data fed to the model for training and evaluation.
    """

    # Required: declared without a default, so it must always be supplied.
    data_dir: str = field(
        metadata={
            "help": "The input data dir. Should contain the .txt files for dataset.",
        },
    )

    # Optional path to a labels file; None when not given.
    labels: Optional[str] = field(
        metadata={
            "help": "Path to a file containing all labels. If not specified, labels are used.",
        },
        default=None,
    )

    # Token-length cap applied after tokenization (see help text); defaults to 128.
    max_seq_len: int = field(
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            ),
        },
        default=128,
    )

    # When True, cached datasets are to be regenerated; off by default.
    overwrite_cache: bool = field(
        metadata={
            "help": "Overwrite the cached training and evaluation sets",
        },
        default=False,
    )
        

def main(json_file: Optional[str] = None) -> Dict:
    """Fine-tune and/or evaluate an ALBERT sequence classifier.

    Reads model, data and training arguments from a JSON file, builds the
    config/tokenizer/model, optionally trains (``do_train``) and optionally
    evaluates (``do_eval``), writing evaluation metrics to
    ``<output_dir>/eval_results.txt``.

    Args:
        json_file: Path to the JSON file holding the arguments. When omitted,
            the historical hard-coded location is used so existing callers
            keep working.

    Returns:
        The metrics returned by ``trainer.evaluate()`` when ``do_eval`` is
        set, otherwise an empty dict.
    """
    if json_file is None:
        # Legacy default kept for backward compatibility; machine-specific.
        json_file = 'C:/Users/user/Desktop/ohae/OHAE_project/input_data/stock_config.json'

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_json_file(json_file=json_file)

    # Setup logging: only rank -1/0 logs at INFO, other ranks at WARNING.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARNING,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    set_seed(training_args.seed)

    labels = get_label()
    label_map = dict(enumerate(labels))
    num_labels = len(labels)
    logger.info("Labels: %s", labels)
    logger.info("Number of labels: %s", num_labels)

    # Each component falls back to model_name_or_path when its own override
    # is not given. Previously the tokenizer and the model weights were both
    # resolved through config_name, which ignored tokenizer_name and tried to
    # load weights from the config location.
    config = AlbertConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
    )
    tokenizer = KbAlbertCharTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
    )
    model = AlbertForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        config=config,
    )

    def build_dataset(mode):
        """Build the InputDataset for the given split with the shared settings."""
        return InputDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            max_seq_len=data_args.max_seq_len,
            overwrite_cache=data_args.overwrite_cache,
            mode=mode,
        )

    # Datasets are only built for the phases that are actually requested.
    train_dataset = build_dataset(Split.train) if training_args.do_train else None
    eval_dataset = build_dataset(Split.test) if training_args.do_eval else None

    def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Turn per-class scores into predicted class indices (argmax over axis 1)."""
        preds = np.argmax(predictions, axis=1)

        return preds, label_ids

    def compute_metrics(p: EvalPrediction) -> Dict:
        """Return accuracy as the mean of element-wise prediction/label matches."""
        preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
        return {
            'acc': (preds_list == out_label_list).mean()
        }

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    if training_args.do_train:
        # A local directory is passed on as model_path; a non-directory
        # identifier is replaced by None.
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model(training_args.output_dir)
        tokenizer.save_pretrained(training_args.output_dir)

    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        # Make sure the target directory exists even when no training ran.
        os.makedirs(training_args.output_dir, exist_ok=True)
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w", encoding="utf-8") as writer:
            logger.info("***** Eval results *****")
            for key, value in result.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

        results.update(result)

    return results


def _mp_fn(index):
    """Run ``main()``; the ``index`` argument is accepted but not used.

    NOTE(review): presumably the per-process entry point expected by a
    multiprocess launcher that passes a process index — confirm with the
    launcher actually used.
    """
    # Every process runs the same pipeline regardless of its index.
    main()


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()