## Prepare

In [15]:
# Load the run configuration from config.yaml (read as UTF-8 text).
import yaml

with open("config.yaml", mode="r", encoding="utf-8") as config_file:
    # safe_load only builds plain Python objects from the YAML document.
    config = yaml.safe_load(config_file)

In [16]:
# Import
import gc
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
from typing import Dict, Tuple
from sklearn.metrics import cohen_kappa_score
import torch
import polars as pl
from pathlib import Path
import sys
from pathlib import Path

In [17]:
# Make the in-house helper modules importable.
# The directory layout is derived from the current working directory:
# its parent is used as the repository directory and its grandparent as the root.
notebook_dir = Path().resolve()
repo_dir = notebook_dir.parents[0]
root_dir = notebook_dir.parents[1]
s3_dir = root_dir / "s3storage/01_public/auto_essay_scorer_lab2/data/"

# The helper package `utils` is resolved through <repo_dir>/scripts/.
scripts_dir = repo_dir / "scripts/"
sys.path.append(str(scripts_dir))

from utils.path import PathManager
from utils.data import CreateDataset
from utils.metafeatures import GenerateMetaFeatures
from utils.model import Trainer
from utils.qwk import quadratic_weighted_kappa, qwk_obj

# Path settings: PathManager is built from the data directory and the
# model name taken from the configuration.
mode = config["model_name"]
path_to = PathManager(s3_dir, mode)

# Prepare Test-Dataset

In [18]:
# Load the test data and build its features.
# TODO: preprocessing_test() should return a polars.DataFrame directly
#       instead of a pandas one, which would make the conversion unnecessary.
create_dataset = CreateDataset(s3_dir, config)
test = pl.from_pandas(create_dataset.preprocessing_test())

---Paragraph 特徴量作成完了---
---Sentence 特徴量作成完了---
---Word 特徴量作成完了---
■ testデータ作成完了


In [19]:
def load_features(input_dir):
    """Unpickle and return the object stored at ``input_dir``.

    Args:
        input_dir: Location of the pickle file. Despite the name, it is
            opened as a file (binary mode), not treated as a directory.

    Returns:
        The object that was pickled into the file.
    """
    with open(input_dir, mode="rb") as pickle_file:
        return pickle.load(pickle_file)

def prepare_test_data(input_data, feature_select):
    """Build the model input array from the selected feature columns.

    Args:
        input_data: Table indexed by column name that supports ``.astype``
            and ``.values`` (presumably a pandas DataFrame - confirm
            against the caller).
        feature_select: Column names to keep, in the order they should
            appear in the result.

    Returns:
        The selected columns cast to ``np.float32``, as returned by ``.values``.
    """
    selected_columns = input_data[feature_select]
    feature_matrix = selected_columns.astype(np.float32).values
    return feature_matrix

# Prediction

In [20]:
# Model hyperparameters, keyed by model family ('lgbm' / 'xgb').
# Use the GPU when torch reports CUDA as available, otherwise the CPU;
# both model families share the same choice.
compute_device = 'gpu' if torch.cuda.is_available() else 'cpu'

lgbm_params = dict(
    objective=qwk_obj,  # custom objective imported from utils.qwk
    metrics='None',
    learning_rate=0.05,
    max_depth=5,
    num_leaves=10,
    colsample_bytree=0.3,
    reg_alpha=0.7,
    reg_lambda=0.1,
    n_estimators=700,
    random_state=412,
    extra_trees=True,
    class_weight='balanced',
    device=compute_device,
    verbosity=-1,
)

# NOTE(review): 'metrics', 'num_leaves', 'extra_trees' and 'class_weight'
# look like LightGBM-style names; how they are applied to the xgb model is
# not visible from this notebook - confirm.
xgb_params = dict(
    objective=qwk_obj,  # custom objective imported from utils.qwk
    metrics='None',
    learning_rate=0.1,
    max_depth=5,
    num_leaves=10,
    colsample_bytree=0.5,
    reg_alpha=1.0,
    reg_lambda=0.1,
    n_estimators=1024,
    random_state=42,
    extra_trees=True,
    class_weight='balanced',
    tree_method="hist",
    device=compute_device,
)

model_params = {
    'lgbm': lgbm_params,
    'xgb': xgb_params,
}

In [21]:
# Per-fold inference: collect one prediction array per fold, then average them.
probabilities = []
for i in range(config['n_splits']):

    # Fold identifier; it is passed on as a string
    fold_num = str(i)
    print('fold', fold_num)

    # Add the meta features for this fold to the test data
    test_data_add = GenerateMetaFeatures(
        s3_dir, config, fold_num, None, test
    ).preprocessing_test()

    # Directory holding everything saved for this fold
    model_fold_path: Path = path_to.models_weight_dir / f'fold_{i}'

    # Restore the pickled feature_select list and build the input matrix
    feature_select = load_features(model_fold_path / 'feature_select.pickle')
    test_X = prepare_test_data(test_data_add, feature_select)

    # Run the prediction: build the models, then load this fold's weights
    trainer = Trainer(config, model_params)
    trainer.initialize_models()
    trainer.load_weight(model_fold_path)

    # config['avg_train_score'] is added to the model output
    # (presumably the training target was centred by this value - confirm).
    fold_prediction = trainer.predict(test_X) + config['avg_train_score']
    probabilities.append(fold_prediction)

# Average across all folds, clamp into [1, 6], then round
fold_average = np.mean(probabilities, axis=0)
predictions = np.round(fold_average.clip(1, 6))


fold 0
---TfidfVectorizer 特徴量作成完了---
---CountVectorizer 特徴量作成完了---
■ testデータ作成完了
fold 1
---TfidfVectorizer 特徴量作成完了---
---CountVectorizer 特徴量作成完了---
■ testデータ作成完了
fold 2
---TfidfVectorizer 特徴量作成完了---
---CountVectorizer 特徴量作成完了---
■ testデータ作成完了
fold 3
---TfidfVectorizer 特徴量作成完了---
---CountVectorizer 特徴量作成完了---
■ testデータ作成完了
fold 4
---TfidfVectorizer 特徴量作成完了---
---CountVectorizer 特徴量作成完了---
■ testデータ作成完了
fold 5
---TfidfVectorizer 特徴量作成完了---
---CountVectorizer 特徴量作成完了---
■ testデータ作成完了
fold 6
---TfidfVectorizer 特徴量作成完了---
---CountVectorizer 特徴量作成完了---
■ testデータ作成完了
fold 7
---TfidfVectorizer 特徴量作成完了---
---CountVectorizer 特徴量作成完了---
■ testデータ作成完了
fold 8
---TfidfVectorizer 特徴量作成完了---
---CountVectorizer 特徴量作成完了---
■ testデータ作成完了
fold 9
---TfidfVectorizer 特徴量作成完了---
---CountVectorizer 特徴量作成完了---
■ testデータ作成完了
fold 10
---TfidfVectorizer 特徴量作成完了---
---CountVectorizer 特徴量作成完了---
■ testデータ作成完了
fold 11
---TfidfVectorizer 特徴量作成完了---
---CountVectorizer 特徴量作成完了---
■ testデータ作成完了
fold 12
---TfidfVectorizer

# Create submit-file

In [22]:
# Shape the submission: read the sample-submission CSV and replace its
# 'score' column with the ensembled predictions as integers.
file_path = path_to.origin_sample_submit_dir
submission = pd.read_csv(file_path)
submission['score'] = predictions
submission['score'] = submission['score'].astype(int)

# Save.
# mkdir(parents=True, exist_ok=True) also creates missing parent directories
# and does nothing when the directory already exists, so the cell can be
# re-run safely without a separate existence check.
path_to.output_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): a recorded run of this cell ended with
# "PermissionError: [Errno 1] Operation not permitted" on the submit.csv
# path - confirm that the target location is writable (and, if the file
# already exists, that it may be overwritten) before relying on this output.
save_path = path_to.submit_dir
submission.to_csv(save_path, index=False)  # do not write the row index
display(submission.head())

PermissionError: [Errno 1] Operation not permitted: '/kaggle/s3storage/01_public/auto_essay_scorer_lab2/data/output/submit.csv'