## Prepair

In [1]:
## config.yamlの読み込み
import yaml
with open("config.yaml", "r", encoding='utf-8') as file:
    config = yaml.safe_load(file)

In [12]:
# Import
import warnings
import gc
import os
import sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
from typing import Dict, Tuple
from sklearn.metrics import cohen_kappa_score
import torch
import polars as pl
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import transformers
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import Dataset
from tokenizers import AddedToken
warnings.simplefilter('ignore')

In [3]:
# 自作関数の読み込み
repo_dir = Path().resolve().parents[0]
root_dir = Path().resolve().parents[1]
s3_dir = root_dir / "s3storage/01_public/auto_essay_scorer_lab2/data/"
sys.path.append(str(repo_dir / "scripts/"))
from utils.path import PathManager
from utils.data import CreateDataset
from utils.metafeatures import GenerateMetaFeatures
from utils.model import Trainer
from utils.qwk import quadratic_weighted_kappa, qwk_obj

## パスの設定
mode = config["model_name"]
path_to = PathManager(s3_dir, mode)

# Prepare Test-Dataset

In [4]:
## データ読み込み＆特徴量加工
create_dataset = CreateDataset(s3_dir, config)
test = create_dataset.preprocessing_test() # 返り値をpandasではなくpolars.DataFrameにすべき

test = pl.from_pandas(test)
test = test.with_columns(pl.lit(0.0).alias("label"))

---Paragraph 特徴量作成完了---
---Sentence 特徴量作成完了---
---Word 特徴量作成完了---
■ testデータ作成完了


In [5]:
def load_features(input_dir):
    with open(input_dir, "rb") as f:
        feature_select = pickle.load(f)

    return feature_select

def prepare_test_data(input_data, feature_select):
    X = input_data[feature_select].astype(np.float32).values

    return X

# Prediction

In [6]:
# モデルパラメータ
model_params = {
    'lgbm': {
        'objective': qwk_obj,  # qwk_objは事前に定義されている関数を指定
        'metrics': 'None',
        'learning_rate': 0.05,
        'max_depth': 5,
        'num_leaves': 10,
        'colsample_bytree': 0.3,
        'reg_alpha': 0.7,
        'reg_lambda': 0.1,
        'n_estimators': 700,
        'random_state': 412,
        'extra_trees': True,
        'class_weight': 'balanced',
        'device': 'gpu' if torch.cuda.is_available() else 'cpu',
        'verbosity': - 1
    },
    'xgb': {
        'objective': qwk_obj,  # qwk_objは事前に定義されている関数を指定
        'metrics': 'None',
        'learning_rate': 0.1,
        'max_depth': 5,
        'num_leaves': 10,
        'colsample_bytree': 0.5,
        'reg_alpha': 1.0,
        'reg_lambda': 0.1,
        'n_estimators': 1024,
        'random_state': 42,
        'extra_trees': True,
        'class_weight': 'balanced',
        'tree_method': "hist",
        'device': "gpu" if torch.cuda.is_available() else "cpu"
    }
}

In [7]:
class CFG:
    # n_splits = 5
    seed = 42
    max_length = 1024 # to avoid truncating majority of essays.
    lr = 1e-5
    train_batch_size = 4
    eval_batch_size = 8
    train_epochs = 4
    weight_decay = 0.01
    warmup_ratio = 0.0
    num_labels = 6

In [8]:
class Tokenize(object):
    """train, valid, (test)データに対してトークン化処理"""

    def __init__(self, train, valid, tokenizer):
        self.tokenizer = tokenizer
        self.train = train
        self.valid = valid
        
    def get_dataset(self, df):
        ds = Dataset.from_dict({
                'essay_id': [e for e in df['essay_id']],
                'full_text': [ft for ft in df['full_text']],
                'label': [s for s in df['label']],
            })
        return ds
        
    def tokenize_function(self, example):
        tokenized_inputs = self.tokenizer(
            example['full_text'], truncation=True, max_length=CFG.max_length
        )
        return tokenized_inputs
    
    def __call__(self):
        train_ds = self.get_dataset(self.train)
        valid_ds = self.get_dataset(self.valid)
        
        tokenized_train = train_ds.map(
            self.tokenize_function, batched=True
        )
        tokenized_valid = valid_ds.map(
            self.tokenize_function, batched=True
        )

        return tokenized_train, tokenized_valid, self.tokenizer

In [14]:
probabilities = []
all_pred = []
for i in range(config['n_splits']):

    ## fold数の取得
    fold_num = str(i)
    print('fold', fold_num)

    # ## メタ特徴量を追加
    # generate_meta = GenerateMetaFeatures(s3_dir, config, fold_num, None, test)
    # test_data_add = generate_meta.preprocessing_test()

    # パスの指定
    # model_fold_path: Path = path_to.models_weight_dir / f'fold_{i}'
        
    # pickle ファイルから feature_select リストを読み込む
    # save_path: Path = model_fold_path / 'feature_select.pickle'

    # with open(save_path, 'rb') as f:
    #     feature_select = pickle.load(f)

    # test_X = prepare_test_data(test_data_add, feature_select)

    ## ディレクトリの準備
    model_fold_path: Path = path_to.deberta_v3_small_finetuned_dir / f'fold_{i}/'

    ## 前処理
    # トークンナイズ化
    tokenizer = AutoTokenizer.from_pretrained(model_fold_path)
    tokenize = Tokenize(test.to_pandas(), test.to_pandas(), tokenizer)
    tokenized_test, _, _ = tokenize()

    # TrainingArgumentsのロード
    training_args = torch.load(model_fold_path / 'training_args.bin')

    # モデルのロード
    model = AutoModelForSequenceClassification.from_pretrained(model_fold_path)
    # del model
    # INFER WITH TRAINER
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    trainer = transformers.Trainer( 
        model=model,
        args=training_args,
        train_dataset=tokenized_test,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    # SAVE PREDICTIONS
    predictions = trainer.predict(tokenized_test).predictions
    all_pred.append( predictions )

    # ## 予測実施実施
    # trainer = Trainer(config, model_params)
    # trainer.initialize_models()
    
    # trainer.load_weight(model_fold_path)

    # proba = trainer.predict(test_X) + config['avg_train_score']
    # probabilities.append(proba)

# Compute the average probabilities across all models
# predictions = np.mean(probabilities, axis=0)
# predictions = np.round(predictions.clip(1, 6))

_preds = np.mean(all_pred, axis=0)
preds = _preds.clip(0,5).round(0)+1
print('Predictions shape:',preds.shape)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


fold 0


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


fold 1


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


fold 2


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


fold 3


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


fold 4


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Predictions shape: (3,)


# Create submit-file

In [15]:
# submitファイルの整形
file_path = path_to.origin_sample_submit_dir
submission = pd.read_csv(file_path)
# submission['score'] = predictions
submission['score'] = preds
submission['score'] = submission['score'].astype(int)

# 保存
if not os.path.exists(path_to.output_dir):
    path_to.output_dir.mkdir()

save_path = path_to.submit_dir
submission.to_csv(save_path, index=None)
display(submission.head())

PermissionError: [Errno 1] Operation not permitted: '/kaggle/s3storage/01_public/auto_essay_scorer_lab2/data/output/submit.csv'

In [16]:
display(submission.head())

Unnamed: 0,essay_id,score
0,000d118,2
1,000fe60,3
2,001ab80,4
