## Prepare

In [1]:
# Load the notebook settings from config.yaml
import yaml

with open("config.yaml", mode="r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

In [2]:
# Import
import gc
import os
import pickle
import random
import re
import sys
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl
import torch
from lightgbm import log_evaluation, early_stopping
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, cohen_kappa_score
from sklearn.model_selection import train_test_split, StratifiedKFold



In [3]:
# Load the project's own helper modules.
# The repository root is taken to be the parent of the current working directory.
repo_dir = Path().resolve().parents[0]
scripts_dir = str(repo_dir / "scripts/")
# Guard the append so that re-running this cell does not pile duplicate
# entries onto sys.path.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)
from utils.path import PathManager
# NOTE(review): the wildcard imports hide where later names come from
# (presumably CreateDataset, Trainer and qwk_obj — confirm and import them explicitly).
from utils.data import *
from utils.model import *

# Path setup: the model name from the config selects the PathManager mode.
mode = config["model_name"]
path_to = PathManager(repo_dir, mode)

# Prepare Test-Dataset

In [4]:
# Load the data and engineer the features.
# A CreateDataset is built from the repository root and the loaded config; the
# result of its test-set preprocessing is bound to `test` for the cells below.
create_dataset = CreateDataset(repo_dir, config)
test = create_dataset.preprocessing_test()

In [5]:
def load_features(input_dir):
    """Return the object unpickled from the file at ``input_dir``.

    Despite its name, ``input_dir`` is opened as a single file (binary read
    mode), not as a directory.

    Args:
        input_dir: Path of the pickle file to read.

    Returns:
        Whatever object was stored in the pickle file.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # files produced by this project.
    with open(input_dir, "rb") as pickle_file:
        return pickle.load(pickle_file)

def prepare_test_data(input_data, feature_select):
    """Build the float32 feature matrix for the selected columns.

    Features named in ``feature_select`` that are not columns of
    ``input_data`` are skipped, as before, but a warning now reports them:
    a silently narrower matrix no longer matches the selected feature list.

    Args:
        input_data: Table exposing ``.columns``, column-list indexing and
            ``.astype`` (the notebook passes the preprocessed test set).
        feature_select: Iterable of column names, in the desired column order.

    Returns:
        Array of dtype float32 with one column per selected feature that
        exists in ``input_data``, in ``feature_select`` order.
    """
    import warnings

    present, absent = [], []
    # Single pass so that a one-shot iterable is handled like a list.
    for name in feature_select:
        (present if name in input_data.columns else absent).append(name)

    if absent:
        warnings.warn(
            f"prepare_test_data: {len(absent)} selected feature(s) not found "
            f"in input_data and skipped (first few: {absent[:10]})",
            stacklevel=2,
        )

    X = input_data[present].astype(np.float32).values

    return X

# Path handed to load_features, which opens it as a pickle file.
load_path = path_to.aes2_cache_dir
# Selected feature names (presumably saved at training time — TODO confirm).
feature_select = load_features(load_path)
# float32 matrix restricted to the selected features that exist in `test`.
test_X = prepare_test_data(test, feature_select)

# Prediction

In [6]:
# Use the GPU when torch reports CUDA as available, otherwise the CPU.
_device = 'gpu' if torch.cuda.is_available() else 'cpu'

# Per-model hyper-parameters, keyed by model name; passed to Trainer below.
model_params = dict(
    lgbm=dict(
        objective=qwk_obj,  # custom objective; qwk_obj must already be defined
        metrics='None',
        learning_rate=0.05,
        max_depth=5,
        num_leaves=10,
        colsample_bytree=0.3,
        reg_alpha=2.,
        reg_lambda=0.1,
        n_estimators=700,
        random_state=42,
        extra_trees=True,
        class_weight='balanced',
        device=_device,
        verbosity=-1,
    ),
    xgb=dict(
        objective=qwk_obj,  # custom objective; qwk_obj must already be defined
        metrics='None',
        learning_rate=0.1,
        max_depth=5,
        num_leaves=10,
        colsample_bytree=0.5,
        reg_alpha=1.0,
        reg_lambda=0.1,
        n_estimators=1024,
        random_state=42,
        extra_trees=True,
        class_weight='balanced',
        tree_method="hist",
        device=_device,
    ),
)

In [7]:

# One prediction array per fold is collected here.
probabilities = []
for fold_idx in range(config['n_splits']):
    # Run the prediction for this fold: a fresh Trainer has its models
    # initialised and then loads the weights stored under fold_<idx>.
    trainer = Trainer(config, model_params)
    trainer.initialize_models()
    weight_path = os.path.join(path_to.models_weight, f'fold_{fold_idx}')
    trainer.load_weight(weight_path)

    # Shift the raw output by the configured offset `a`
    # (presumably undoes a target shift applied in training — TODO confirm).
    fold_pred = trainer.predict(test_X) + config['a']
    probabilities.append(fold_pred)

# Average across folds, clamp to the 1-6 range, then round to whole scores.
fold_mean = np.mean(probabilities, axis=0)
predictions = np.round(np.clip(fold_mean, 1, 6))


# Create submit-file

In [9]:
# Shape the submission file: fill the sample-submission template with the
# rounded predictions, stored as integers.
# NOTE(review): assumes the rows of the sample submission are in the same
# order as the rows of `test` — confirm.
file_path = path_to.origin_sample_submit_dir
submission = pd.read_csv(file_path)
submission['score'] = predictions
submission['score'] = submission['score'].astype(int)

# Save.
# mkdir(parents=True, exist_ok=True) replaces the exists-check + mkdir pair:
# it is free of the check-then-create race and also creates missing parents.
path_to.output_dir.mkdir(parents=True, exist_ok=True)

save_path = path_to.submit_dir
# index=False is the documented way to omit the row index from the CSV.
submission.to_csv(save_path, index=False)
display(submission.head())

Unnamed: 0,essay_id,score
0,000d118,3
1,000fe60,3
2,001ab80,3
