# src - merge_predictions

## Notebook运行提示
- 代码已拆分为多个小单元, 按顺序运行即可在每一步观察输出与中间变量。
- 涉及 `Path(__file__)` 或相对路径的脚本会自动注入 `__file__` 解析逻辑, Notebook 环境下也能引用原项目资源。
- 可在每个单元下追加说明或参数试验记录, 以跟踪核心算法和数据处理步骤。


In [None]:
from .get_oofs_filenames import get_oofs_filenames
import os
import pandas as pd
import numpy as np

In [None]:


def get_merged_predictions(train_csv_path='../data/raw/train.csv',
                           oofs_dir_path='../data/oofs/csv'):
    """Attach the predictions of every OOF file to the training table.

    The training CSV is read once; then, for each name returned by
    ``get_oofs_filenames(oofs_dir_path)``, the file ``<name>.csv`` inside
    ``oofs_dir_path`` is read and left-joined onto the result on ``text_id``.

    Per file, the six prediction columns end up named
    ``pred_<target>_<name>``:

    * For names starting with ``'exp'`` the un-prefixed target columns are
      dropped first, so such files must already contain the ``pred_<target>``
      columns (and the un-prefixed ones, otherwise the drop raises).
    * For all other names the un-prefixed target columns are renamed to
      ``pred_<target>``.

    Args:
        train_csv_path: Path of the training CSV; it needs a ``text_id``
            column for the join.
        oofs_dir_path: Directory holding the OOF CSV files.

    Returns:
        The training DataFrame with six extra prediction columns per OOF file.
    """
    target_names = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    pred_names = ['pred_' + target for target in target_names]

    merged = pd.read_csv(train_csv_path)

    # NOTE(review): names are presumably file stems without the '.csv'
    # extension, since the extension is appended here - confirm in helper.
    for name in get_oofs_filenames(oofs_dir_path):
        preds = pd.read_csv(os.path.join(oofs_dir_path, f'{name}.csv'))

        if name.startswith('exp'):
            # Remove the un-prefixed columns so only the file's own pred_*
            # columns survive the selection below.
            preds = preds.drop(columns=target_names)

        # No-op for 'exp' files: their un-prefixed columns are already gone.
        preds = preds.rename(columns=dict(zip(target_names, pred_names)))

        # Keep only the join key and the six predictions, in a fixed order.
        preds = preds[['text_id'] + pred_names]

        # Suffix each prediction column with the file name so that columns
        # coming from different OOF files do not collide after merging.
        preds.columns = ['text_id'] + [col + '_' + name for col in pred_names]

        merged = merged.merge(preds, on=['text_id'], how='left')

    return merged

In [None]:


def get_oofs_scores(dataframe, oofs_filenames, target_columns, criterion):
    """Score each OOF file's predictions against the targets, per column.

    Args:
        dataframe: Table holding the ``target_columns`` together with one
            ``pred_<target>_<filename>`` column per target and OOF file.
        oofs_filenames: Iterable of OOF file names used as column suffixes.
        target_columns: List of target column names.
        criterion: Callable invoked as ``criterion(y_true, y_pred)`` with the
            two ``.values`` arrays; it must return a pair whose second item
            is an iterable of per-target scores. The first item is ignored.

    Returns:
        ``{filename: {target: score}}`` with one inner dict per OOF file.
    """

    def _score_single_file(name):
        # Prediction columns follow the 'pred_<target>_<filename>' pattern.
        pred_columns = ['_'.join(('pred', target, name)) for target in target_columns]
        _, per_target = criterion(dataframe[target_columns].values,
                                  dataframe[pred_columns].values)
        # Pair each target with its score; zip stops at the shorter sequence.
        return dict(zip(target_columns, per_target))

    return {name: _score_single_file(name) for name in oofs_filenames}

In [None]:


def print_model_score(model, submission_cv_scores):
    """Print the score report of a single model.

    The report is a banner line with the model name, a ``CV score`` line
    holding the mean of the model's per-target scores, and one tab-indented
    ``target: score`` line per target.

    Args:
        model: Key of the model inside ``submission_cv_scores``.
        submission_cv_scores: Mapping ``{model: {target: score}}``.
    """
    per_target = submission_cv_scores[model]

    print(f'================= {model} =================')

    # The headline CV score is the unweighted mean over all targets.
    cv_score = np.mean(list(per_target.values()))
    print(f'CV score: {cv_score}')

    for target_name, target_score in per_target.items():
        print(f'\t{target_name}: {target_score}')

In [None]:


def print_scores(submission_cv_scores):
    """Print the score report of every model in ``submission_cv_scores``.

    Args:
        submission_cv_scores: Mapping ``{model: {target: score}}``; models
            are reported in the mapping's iteration order.
    """
    for model_name in submission_cv_scores:
        print_model_score(model_name, submission_cv_scores)