In [1]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
train_df = pd.read_csv("../../input/feedback-prize-english-language-learning/train.csv")
train_df.head(2)

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5


In [3]:
model_path_list = [
    '../../01_Baseline/exp/result/01_v1_27/oof_df.csv', # deberta-v3-base
    '../../01_Baseline/exp/result/01_v1_43/oof_df.csv', # deberta-v3-large
    '../../01_Baseline/exp/result/01_v1_24/oof_df.csv', # deberta-base
    '../../01_Baseline/exp/result/01_v1_25/oof_df.csv', # deberta-large
    '../../01_Baseline/exp/result/01_v1_45/oof_df.csv', # roberta-large
]

oof_df_list = [
    pd.read_csv(model_path) for model_path in model_path_list
]

for oof_df in oof_df_list:
    oof_df = train_df[['text_id']].merge(oof_df, how='left', on='text_id')
    
num_models = len(model_path_list)

TARGET_COLS = ['cohesion','syntax','vocabulary','phraseology','grammar','conventions']

preds = 0
for oof_df in oof_df_list:
    preds += oof_df[TARGET_COLS].values / num_models

In [14]:
oof_df = oof_df_list[0].copy()
for i,col in enumerate(TARGET_COLS):
    oof_df[col] = preds[:,i]

In [16]:
print('oof_df.shape = ', oof_df.shape)
oof_df.head(2)

oof_df.shape =  (3911, 14)


Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions,loss_cohesion,loss_syntax,loss_vocabulary,loss_phraseology,loss_grammar,loss_conventions,embed
0,0016926B079C,2.928521,2.907466,3.062548,3.135399,3.126118,2.885326,0.188834,0.204315,0.002587,0.002111,0.493542,0.00285,"[-0.0270579531788826, -0.0180321354418993, 0.0..."
1,0022683E9EA5,2.618682,2.491237,2.80222,2.624745,2.479205,2.517189,0.000223,0.001765,0.041647,0.183164,0.138403,0.000197,"[-0.030677301809191704, -0.02591944858431816, ..."


In [5]:
import numpy as np

def calc_metric(pred, gt):
    '''
    pred : (num_data, num_labels)
    gt : (num_data, num_labels)
    '''
    score = np.sqrt(np.mean((pred - gt)**2, axis=0))
    score = score.mean()
    return score

In [6]:
score = calc_metric(pred=oof_df[TARGET_COLS].values, gt=train_df[TARGET_COLS].values)
print('CV={:.4f}'.format(score))

CV=0.4480


In [19]:
oofs = []
for oof_df in oof_df_list:
    oofs.append(oof_df[TARGET_COLS].values)
    
ensemble_predictions = np.stack(oofs)
print('ensemble_predictions.shape = ', ensemble_predictions.shape)

ensemble_predictions.shape =  (5, 3911, 6)


In [23]:
from skopt import gp_minimize

score_list = []
best_weights_list = []
for i,col in enumerate(TARGET_COLS):
    print('*'*25)
    print(col)
    
    def ensemble_score(weights, return_pred=False):
        weights=np.array(weights)
        weights=weights.reshape(-1,1)/weights.sum()
        p=weights.reshape(-1,1)*ensemble_predictions[:,:,i]
        p=p.sum(0)
        score=calc_metric(p, train_df[col])
        if return_pred:
            return score,p
        else:
            return score 
    
    results=gp_minimize(ensemble_score, 
                        np.array([[0.1,1] for i in range(len(ensemble_predictions))]), 
                        verbose=True, random_state=2022)
    best_weights=np.array(results['x'])/sum(results['x'])
    score,ensemble_pred=ensemble_score(best_weights,True)
    best_weights_list.append(best_weights)
    score_list.append(score)

*************************
cohesion
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.0016
Function value obtained: 0.4795
Current minimum: 0.4795
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.0010
Function value obtained: 0.4791
Current minimum: 0.4791
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.0011
Function value obtained: 0.4789
Current minimum: 0.4789
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.0010
Function value obtained: 0.4792
Current minimum: 0.4789
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.0010
Function value obtained: 0.4790
Current minimum: 0.4789


Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 0.6854
Function value obtained: 0.4793
Current minimum: 0.4782
Iteration No: 42 started. Searching for the next optimal point.
Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 0.5479
Function value obtained: 0.4782
Current minimum: 0.4782
Iteration No: 43 started. Searching for the next optimal point.
Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 0.6397
Function value obtained: 0.4785
Current minimum: 0.4782
Iteration No: 44 started. Searching for the next optimal point.
Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 0.8028
Function value obtained: 0.4789
Current minimum: 0.4782
Iteration No: 45 started. Searching for the next optimal point.
Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 0.7233
Function value obtained: 0.4782
Current minimum: 0.4782
Iteration No: 46 started. Sea

Iteration No: 81 ended. Search finished for the next optimal point.
Time taken: 1.1617
Function value obtained: 0.4781
Current minimum: 0.4781
Iteration No: 82 started. Searching for the next optimal point.
Iteration No: 82 ended. Search finished for the next optimal point.
Time taken: 1.1436
Function value obtained: 0.4785
Current minimum: 0.4781
Iteration No: 83 started. Searching for the next optimal point.
Iteration No: 83 ended. Search finished for the next optimal point.
Time taken: 1.0155
Function value obtained: 0.4798
Current minimum: 0.4781
Iteration No: 84 started. Searching for the next optimal point.
Iteration No: 84 ended. Search finished for the next optimal point.
Time taken: 0.9509
Function value obtained: 0.4786
Current minimum: 0.4781
Iteration No: 85 started. Searching for the next optimal point.
Iteration No: 85 ended. Search finished for the next optimal point.
Time taken: 0.9544
Function value obtained: 0.4781
Current minimum: 0.4781
Iteration No: 86 started. Sea

Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 0.3581
Function value obtained: 0.4409
Current minimum: 0.4406
Iteration No: 23 started. Searching for the next optimal point.
Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 0.3548
Function value obtained: 0.4421
Current minimum: 0.4406
Iteration No: 24 started. Searching for the next optimal point.
Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 0.3924
Function value obtained: 0.4416
Current minimum: 0.4406
Iteration No: 25 started. Searching for the next optimal point.
Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 0.3487
Function value obtained: 0.4408
Current minimum: 0.4406
Iteration No: 26 started. Searching for the next optimal point.
Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 0.3301
Function value obtained: 0.4410
Current minimum: 0.4406
Iteration No: 27 started. Sea

Iteration No: 62 ended. Search finished for the next optimal point.
Time taken: 0.7588
Function value obtained: 0.4406
Current minimum: 0.4406
Iteration No: 63 started. Searching for the next optimal point.
Iteration No: 63 ended. Search finished for the next optimal point.
Time taken: 0.7293
Function value obtained: 0.4407
Current minimum: 0.4406
Iteration No: 64 started. Searching for the next optimal point.
Iteration No: 64 ended. Search finished for the next optimal point.
Time taken: 0.7761
Function value obtained: 0.4409
Current minimum: 0.4406
Iteration No: 65 started. Searching for the next optimal point.
Iteration No: 65 ended. Search finished for the next optimal point.
Time taken: 0.7234
Function value obtained: 0.4406
Current minimum: 0.4406
Iteration No: 66 started. Searching for the next optimal point.
Iteration No: 66 ended. Search finished for the next optimal point.
Time taken: 0.7958
Function value obtained: 0.4413
Current minimum: 0.4406
Iteration No: 67 started. Sea

Iteration No: 10 ended. Evaluation done at random point.
Time taken: 0.2466
Function value obtained: 0.4114
Current minimum: 0.4104
Iteration No: 11 started. Searching for the next optimal point.
Iteration No: 11 ended. Search finished for the next optimal point.
Time taken: 0.2482
Function value obtained: 0.4109
Current minimum: 0.4104
Iteration No: 12 started. Searching for the next optimal point.
Iteration No: 12 ended. Search finished for the next optimal point.
Time taken: 0.2169
Function value obtained: 0.4117
Current minimum: 0.4104
Iteration No: 13 started. Searching for the next optimal point.
Iteration No: 13 ended. Search finished for the next optimal point.
Time taken: 0.2653
Function value obtained: 0.4103
Current minimum: 0.4103
Iteration No: 14 started. Searching for the next optimal point.
Iteration No: 14 ended. Search finished for the next optimal point.
Time taken: 0.2556
Function value obtained: 0.4121
Current minimum: 0.4103
Iteration No: 15 started. Searching for 

Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 0.6649
Function value obtained: 0.4109
Current minimum: 0.4101
Iteration No: 51 started. Searching for the next optimal point.
Iteration No: 51 ended. Search finished for the next optimal point.
Time taken: 0.6869
Function value obtained: 0.4122
Current minimum: 0.4101
Iteration No: 52 started. Searching for the next optimal point.
Iteration No: 52 ended. Search finished for the next optimal point.
Time taken: 0.6431
Function value obtained: 0.4107
Current minimum: 0.4101
Iteration No: 53 started. Searching for the next optimal point.
Iteration No: 53 ended. Search finished for the next optimal point.
Time taken: 0.7076
Function value obtained: 0.4121
Current minimum: 0.4101
Iteration No: 54 started. Searching for the next optimal point.
Iteration No: 54 ended. Search finished for the next optimal point.
Time taken: 0.6776
Function value obtained: 0.4122
Current minimum: 0.4101
Iteration No: 55 started. Sea

Iteration No: 90 ended. Search finished for the next optimal point.
Time taken: 0.9801
Function value obtained: 0.4101
Current minimum: 0.4101
Iteration No: 91 started. Searching for the next optimal point.
Iteration No: 91 ended. Search finished for the next optimal point.
Time taken: 1.1367
Function value obtained: 0.4112
Current minimum: 0.4101
Iteration No: 92 started. Searching for the next optimal point.
Iteration No: 92 ended. Search finished for the next optimal point.
Time taken: 1.0988
Function value obtained: 0.4111
Current minimum: 0.4101
Iteration No: 93 started. Searching for the next optimal point.
Iteration No: 93 ended. Search finished for the next optimal point.
Time taken: 1.0851
Function value obtained: 0.4110
Current minimum: 0.4101
Iteration No: 94 started. Searching for the next optimal point.
Iteration No: 94 ended. Search finished for the next optimal point.
Time taken: 1.1465
Function value obtained: 0.4101
Current minimum: 0.4101
Iteration No: 95 started. Sea

Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 0.3850
Function value obtained: 0.4486
Current minimum: 0.4484
Iteration No: 32 started. Searching for the next optimal point.
Iteration No: 32 ended. Search finished for the next optimal point.
Time taken: 0.5131
Function value obtained: 0.4486
Current minimum: 0.4484
Iteration No: 33 started. Searching for the next optimal point.
Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 0.6114
Function value obtained: 0.4487
Current minimum: 0.4484
Iteration No: 34 started. Searching for the next optimal point.
Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 0.5556
Function value obtained: 0.4502
Current minimum: 0.4484
Iteration No: 35 started. Searching for the next optimal point.
Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 0.6213
Function value obtained: 0.4489
Current minimum: 0.4484
Iteration No: 36 started. Sea

Iteration No: 71 ended. Search finished for the next optimal point.
Time taken: 0.7656
Function value obtained: 0.4484
Current minimum: 0.4484
Iteration No: 72 started. Searching for the next optimal point.
Iteration No: 72 ended. Search finished for the next optimal point.
Time taken: 0.8428
Function value obtained: 0.4487
Current minimum: 0.4484
Iteration No: 73 started. Searching for the next optimal point.
Iteration No: 73 ended. Search finished for the next optimal point.
Time taken: 0.9443
Function value obtained: 0.4488
Current minimum: 0.4484
Iteration No: 74 started. Searching for the next optimal point.
Iteration No: 74 ended. Search finished for the next optimal point.
Time taken: 0.8634
Function value obtained: 0.4503
Current minimum: 0.4484
Iteration No: 75 started. Searching for the next optimal point.
Iteration No: 75 ended. Search finished for the next optimal point.
Time taken: 0.8973
Function value obtained: 0.4484
Current minimum: 0.4484
Iteration No: 76 started. Sea

Iteration No: 12 ended. Search finished for the next optimal point.
Time taken: 0.2885
Function value obtained: 0.4682
Current minimum: 0.4678
Iteration No: 13 started. Searching for the next optimal point.
Iteration No: 13 ended. Search finished for the next optimal point.
Time taken: 0.2651
Function value obtained: 0.4677
Current minimum: 0.4677
Iteration No: 14 started. Searching for the next optimal point.
Iteration No: 14 ended. Search finished for the next optimal point.
Time taken: 0.3147
Function value obtained: 0.4704
Current minimum: 0.4677
Iteration No: 15 started. Searching for the next optimal point.
Iteration No: 15 ended. Search finished for the next optimal point.
Time taken: 0.2806
Function value obtained: 0.4709
Current minimum: 0.4677
Iteration No: 16 started. Searching for the next optimal point.
Iteration No: 16 ended. Search finished for the next optimal point.
Time taken: 0.3781
Function value obtained: 0.4675
Current minimum: 0.4675
Iteration No: 17 started. Sea

Iteration No: 52 ended. Search finished for the next optimal point.
Time taken: 0.7124
Function value obtained: 0.4691
Current minimum: 0.4673
Iteration No: 53 started. Searching for the next optimal point.
Iteration No: 53 ended. Search finished for the next optimal point.
Time taken: 0.7441
Function value obtained: 0.4704
Current minimum: 0.4673
Iteration No: 54 started. Searching for the next optimal point.
Iteration No: 54 ended. Search finished for the next optimal point.
Time taken: 0.6919
Function value obtained: 0.4675
Current minimum: 0.4673
Iteration No: 55 started. Searching for the next optimal point.
Iteration No: 55 ended. Search finished for the next optimal point.
Time taken: 0.6556
Function value obtained: 0.4674
Current minimum: 0.4673
Iteration No: 56 started. Searching for the next optimal point.
Iteration No: 56 ended. Search finished for the next optimal point.
Time taken: 0.6927
Function value obtained: 0.4674
Current minimum: 0.4673
Iteration No: 57 started. Sea

Iteration No: 92 ended. Search finished for the next optimal point.
Time taken: 1.0895
Function value obtained: 0.4693
Current minimum: 0.4673
Iteration No: 93 started. Searching for the next optimal point.
Iteration No: 93 ended. Search finished for the next optimal point.
Time taken: 1.1292
Function value obtained: 0.4682
Current minimum: 0.4673
Iteration No: 94 started. Searching for the next optimal point.
Iteration No: 94 ended. Search finished for the next optimal point.
Time taken: 1.1188
Function value obtained: 0.4673
Current minimum: 0.4673
Iteration No: 95 started. Searching for the next optimal point.
Iteration No: 95 ended. Search finished for the next optimal point.
Time taken: 1.0179
Function value obtained: 0.4686
Current minimum: 0.4673
Iteration No: 96 started. Searching for the next optimal point.
Iteration No: 96 ended. Search finished for the next optimal point.
Time taken: 1.0599
Function value obtained: 0.4673
Current minimum: 0.4673
Iteration No: 97 started. Sea

Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 0.6968
Function value obtained: 0.4408
Current minimum: 0.4403
Iteration No: 34 started. Searching for the next optimal point.
Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 0.5606
Function value obtained: 0.4412
Current minimum: 0.4403
Iteration No: 35 started. Searching for the next optimal point.
Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 0.5464
Function value obtained: 0.4414
Current minimum: 0.4403
Iteration No: 36 started. Searching for the next optimal point.
Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 0.6480
Function value obtained: 0.4416
Current minimum: 0.4403
Iteration No: 37 started. Searching for the next optimal point.
Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 0.5796
Function value obtained: 0.4407
Current minimum: 0.4403
Iteration No: 38 started. Sea

Iteration No: 73 ended. Search finished for the next optimal point.
Time taken: 0.9341
Function value obtained: 0.4410
Current minimum: 0.4403
Iteration No: 74 started. Searching for the next optimal point.
Iteration No: 74 ended. Search finished for the next optimal point.
Time taken: 0.9978
Function value obtained: 0.4419
Current minimum: 0.4403
Iteration No: 75 started. Searching for the next optimal point.
Iteration No: 75 ended. Search finished for the next optimal point.
Time taken: 1.0286
Function value obtained: 0.4404
Current minimum: 0.4403
Iteration No: 76 started. Searching for the next optimal point.
Iteration No: 76 ended. Search finished for the next optimal point.
Time taken: 1.0329
Function value obtained: 0.4412
Current minimum: 0.4403
Iteration No: 77 started. Searching for the next optimal point.
Iteration No: 77 ended. Search finished for the next optimal point.
Time taken: 0.9855
Function value obtained: 0.4407
Current minimum: 0.4403
Iteration No: 78 started. Sea

In [24]:
score = sum(score_list) / len(score_list)
print('CV={:.4f}'.format(score))
print(best_weights_list)
print('\n')

CV=0.4475
[array([0.10920084, 0.30448752, 0.09567561, 0.26060934, 0.23002668]), array([0.11077406, 0.29082079, 0.09301763, 0.27732397, 0.22806355]), array([0.41951741, 0.04195174, 0.04195174, 0.24719077, 0.24938834]), array([0.23637077, 0.27549216, 0.06267783, 0.21054884, 0.21491039]), array([0.22587442, 0.37410089, 0.03741009, 0.11305696, 0.24955764]), array([0.28224474, 0.28224474, 0.06045177, 0.15533809, 0.21972066])]


