In [1]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Load the competition training set: one essay per row, with the six target scores.
train_df = pd.read_csv(
    "../../input/feedback-prize-english-language-learning/train.csv"
)
train_df.head(2)

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5


In [3]:
# (path to out-of-fold prediction CSV, relative blend weight) for every ensemble member.
model_path_list = [
    ('../../14_Baseline4/exp/result/14_v1_01/oof_df.csv',1), # deberta-v3-base
    ('../../14_Baseline4/exp/result/14_v1_08/oof_df.csv',1), # deberta-base
    ('../../14_Baseline4/exp/result/14_v1_09/oof_df.csv',1), # roberta-base
    ('../../14_Baseline4/exp/result/14_v1_10/oof_df.csv',1), # deberta-v3-large
    ('../../14_Baseline4/exp/result/14_v1_11/oof_df.csv',1), # xlm-roberta-base
    ('../../14_Baseline4/exp/result/14_v1_12/oof_df.csv',1), # deberta-large
    ('../../14_Baseline4/exp/result/14_v1_13/oof_df.csv',1), # roberta-large
    ('../../14_Baseline4/exp/result/14_v1_14/oof_df.csv',1), # xlm-roberta-large
    ('../../14_Baseline4/exp/result/14_v1_15/oof_df.csv',1), # deberta-v2-xlarge
    ('../../14_Baseline4/exp/result/14_v1_16/oof_df.csv',1), # deberta-xlarge

    ('../../14_Baseline4/exp/result/14_v2_01/oof_df.csv',1), # deberta-v3-base, seed200
    ('../../14_Baseline4/exp/result/14_v2_02/oof_df.csv',1), # deberta-v3-large, seed200
    ('../../14_Baseline4/exp/result/14_v2_03/oof_df.csv',1), # deberta-base, seed200

    #('../../17_Pseudo3/exp/result/17_v1_01/oof_df.csv',1), # deberta-v3-base
    #('../../17_Pseudo3/exp/result/17_v1_02/oof_df.csv',1), # deberta-v3-base
    #('../../17_Pseudo3/exp/result/17_v1_03/oof_df.csv',1), # deberta-v3-large
    #('../../17_Pseudo3/exp/result/17_v1_04/oof_df.csv',1), # deberta-v3-base, seed200
]

TARGET_COLS = ['cohesion','syntax','vocabulary','phraseology','grammar','conventions']


def _load_aligned_oof(model_path):
    """Read one OOF prediction file and put its rows in ``train_df`` order.

    The left merge on ``text_id`` keeps exactly the row order of ``train_df``,
    so predictions can later be compared with the labels positionally.

    Raises:
        ValueError: if the file has duplicated ``text_id`` values or lacks a
            prediction for some training essay (either would silently corrupt
            the positional comparison).
    """
    aligned = train_df[['text_id']].merge(
        pd.read_csv(model_path), how='left', on='text_id'
    )
    if len(aligned) != len(train_df):
        raise ValueError(
            'Duplicated text_id values in OOF file: {}'.format(model_path)
        )
    if aligned[TARGET_COLS].isna().any().any():
        raise ValueError(
            'Missing predictions for some text_id in OOF file: {}'.format(model_path)
        )
    return aligned


# The aligned frames must be stored back in the list: rebinding a loop variable
# (as the previous version did) leaves the list elements untouched.
oof_df_list = [_load_aligned_oof(model_path) for model_path, _ in model_path_list]

# Normalise the relative weights so that they sum to 1.
raw_weights = [w for _, w in model_path_list]
weight_total = sum(raw_weights)
weights_list = [w / weight_total for w in raw_weights]

num_models = len(model_path_list)

# Per-model prediction matrices, each of shape (num_data, num_labels).
oofs = [oof_df[TARGET_COLS].values for oof_df in oof_df_list]

# Weighted average of all models' predictions.
preds = sum(oof * w for oof, w in zip(oofs, weights_list))

In [4]:
# Start from a copy of the first model's OOF frame and overwrite each target
# column with the corresponding column of the blended predictions.
oof_df = oof_df_list[0].assign(
    **{name: preds[:, k] for k, name in enumerate(TARGET_COLS)}
)

In [5]:
import numpy as np

def calc_metric(pred, gt):
    """Mean column-wise RMSE between predictions and ground truth.

    pred : (num_data, num_labels)
    gt : (num_data, num_labels)

    The RMSE is computed separately for every label (reduction over axis 0)
    and the per-label values are then averaged into a single number.
    """
    squared_error = (pred - gt) ** 2
    per_label_rmse = np.sqrt(np.mean(squared_error, axis=0))
    return per_label_rmse.mean()

In [6]:
# Cross-validation score of the fixed-weight blend against the training labels.
score = calc_metric(
    pred=oof_df[TARGET_COLS].values,
    gt=train_df[TARGET_COLS].values,
)
print('CV={:.4f}'.format(score))

CV=0.4440


# Optimize ensemble weights per target column

In [7]:
# Stack the per-model prediction matrices along a new leading (model) axis.
ensemble_predictions = np.stack(oofs, axis=0)
ensemble_predictions.shape

(13, 3911, 6)

In [8]:
from skopt import gp_minimize

def run_optimize(i, verbose=True, random_state=2022):
    """Search for the blend weights that minimise the RMSE of one target column.

    Args:
        i: index into ``TARGET_COLS`` (and into the last axis of
            ``ensemble_predictions``) selecting the target to optimise.
        verbose: forwarded to ``gp_minimize``; prints one log entry per iteration.
        random_state: forwarded to ``gp_minimize`` for a reproducible search.

    Returns:
        ``(score, best_weights)``: the metric obtained with the best weights
        found, and those weights normalised to sum to 1 (one entry per model,
        in the order of ``ensemble_predictions``).
    """
    target_col = TARGET_COLS[i]
    banner = "*" * 50
    print(banner)
    print(target_col)
    print(banner)

    # Inputs of the objective that do not depend on the weights; sliced once
    # instead of on every optimiser evaluation.
    target_preds = ensemble_predictions[:, :, i, None]  # (num_models, num_data, 1)
    target_gt = train_df[[target_col]]

    def ensemble_score(weights, return_pred=False):
        """Score the weighted blend; weights are normalised to sum to 1 first."""
        weights = np.asarray(weights, dtype=float)
        weights = weights.reshape(-1, 1, 1) / weights.sum()
        p = (weights * target_preds).sum(0)
        score = calc_metric(p, target_gt)
        if return_pred:
            return score, p
        return score

    # One (low, high) range per model. The lower bound of 0.1 keeps every
    # raw weight strictly positive, so no model is dropped entirely.
    search_space = [(0.1, 1.0)] * len(ensemble_predictions)
    results = gp_minimize(ensemble_score, search_space,
                          verbose=verbose, random_state=random_state)
    best_weights = np.array(results['x']) / sum(results['x'])
    print(best_weights)
    # smaller is better for this metric
    score = ensemble_score(best_weights)
    # Apply the format; passing the template and the value as two print
    # arguments would emit the literal "score={:.4f}" text.
    print("score={:.4f}".format(score))
    return score, best_weights

In [9]:
# Optimise the blend weights independently for every target column and keep
# both the resulting score and the weight vector, keyed by column name.
scores, best_weights = {}, {}
for idx, name in enumerate(TARGET_COLS):
    scores[name], best_weights[name] = run_optimize(idx)

**************************************************
cohesion
**************************************************
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.0251
Function value obtained: 0.4714
Current minimum: 0.4714
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.0025
Function value obtained: 0.4730
Current minimum: 0.4714
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.0024
Function value obtained: 0.4719
Current minimum: 0.4714
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.0024
Function value obtained: 0.4715
Current minimum: 0.4714
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.

Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 0.8954
Function value obtained: 0.4711
Current minimum: 0.4708
Iteration No: 42 started. Searching for the next optimal point.
Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 0.8084
Function value obtained: 0.4723
Current minimum: 0.4708
Iteration No: 43 started. Searching for the next optimal point.
Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 0.9061
Function value obtained: 0.4729
Current minimum: 0.4708
Iteration No: 44 started. Searching for the next optimal point.
Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 0.8615
Function value obtained: 0.4721
Current minimum: 0.4708
Iteration No: 45 started. Searching for the next optimal point.
Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 0.8630
Function value obtained: 0.4723
Current minimum: 0.4708
Iteration No: 46 started. Sea

Iteration No: 81 ended. Search finished for the next optimal point.
Time taken: 1.3795
Function value obtained: 0.4728
Current minimum: 0.4706
Iteration No: 82 started. Searching for the next optimal point.
Iteration No: 82 ended. Search finished for the next optimal point.
Time taken: 1.5265
Function value obtained: 0.4718
Current minimum: 0.4706
Iteration No: 83 started. Searching for the next optimal point.
Iteration No: 83 ended. Search finished for the next optimal point.
Time taken: 1.4503
Function value obtained: 0.4720
Current minimum: 0.4706
Iteration No: 84 started. Searching for the next optimal point.
Iteration No: 84 ended. Search finished for the next optimal point.
Time taken: 1.4934
Function value obtained: 0.4705
Current minimum: 0.4705
Iteration No: 85 started. Searching for the next optimal point.
Iteration No: 85 ended. Search finished for the next optimal point.
Time taken: 1.5171
Function value obtained: 0.4723
Current minimum: 0.4705
Iteration No: 86 started. Sea

Iteration No: 20 ended. Search finished for the next optimal point.
Time taken: 0.5483
Function value obtained: 0.4379
Current minimum: 0.4372
Iteration No: 21 started. Searching for the next optimal point.
Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 0.4976
Function value obtained: 0.4378
Current minimum: 0.4372
Iteration No: 22 started. Searching for the next optimal point.
Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 0.5103
Function value obtained: 0.4378
Current minimum: 0.4372
Iteration No: 23 started. Searching for the next optimal point.
Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 0.4745
Function value obtained: 0.4381
Current minimum: 0.4372
Iteration No: 24 started. Searching for the next optimal point.
Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 0.5872
Function value obtained: 0.4384
Current minimum: 0.4372
Iteration No: 25 started. Sea

Iteration No: 60 ended. Search finished for the next optimal point.
Time taken: 1.0414
Function value obtained: 0.4379
Current minimum: 0.4367
Iteration No: 61 started. Searching for the next optimal point.
Iteration No: 61 ended. Search finished for the next optimal point.
Time taken: 1.0833
Function value obtained: 0.4373
Current minimum: 0.4367
Iteration No: 62 started. Searching for the next optimal point.
Iteration No: 62 ended. Search finished for the next optimal point.
Time taken: 1.0843
Function value obtained: 0.4381
Current minimum: 0.4367
Iteration No: 63 started. Searching for the next optimal point.
Iteration No: 63 ended. Search finished for the next optimal point.
Time taken: 1.0775
Function value obtained: 0.4376
Current minimum: 0.4367
Iteration No: 64 started. Searching for the next optimal point.
Iteration No: 64 ended. Search finished for the next optimal point.
Time taken: 0.9521
Function value obtained: 0.4372
Current minimum: 0.4367
Iteration No: 65 started. Sea

Iteration No: 100 ended. Search finished for the next optimal point.
Time taken: 1.6509
Function value obtained: 0.4393
Current minimum: 0.4366
[0.14782308 0.01607879 0.02868567 0.15991539 0.01607879 0.16078792
 0.05280733 0.13981886 0.10733949 0.01607879 0.01607879 0.12242831
 0.01607879]
score={:.4f} 0.4366061841142417
**************************************************
vocabulary
**************************************************
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.0023
Function value obtained: 0.4054
Current minimum: 0.4054
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.0022
Function value obtained: 0.4060
Current minimum: 0.4054
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.0022
Function value obtained: 0.4061
Current m

Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 1.0500
Function value obtained: 0.4053
Current minimum: 0.4046
Iteration No: 40 started. Searching for the next optimal point.
Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 1.1741
Function value obtained: 0.4059
Current minimum: 0.4046
Iteration No: 41 started. Searching for the next optimal point.
Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 1.1678
Function value obtained: 0.4046
Current minimum: 0.4046
Iteration No: 42 started. Searching for the next optimal point.
Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 0.7909
Function value obtained: 0.4062
Current minimum: 0.4046
Iteration No: 43 started. Searching for the next optimal point.
Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 0.8959
Function value obtained: 0.4060
Current minimum: 0.4046
Iteration No: 44 started. Sea

Iteration No: 79 ended. Search finished for the next optimal point.
Time taken: 1.2850
Function value obtained: 0.4057
Current minimum: 0.4044
Iteration No: 80 started. Searching for the next optimal point.
Iteration No: 80 ended. Search finished for the next optimal point.
Time taken: 1.3330
Function value obtained: 0.4043
Current minimum: 0.4043
Iteration No: 81 started. Searching for the next optimal point.
Iteration No: 81 ended. Search finished for the next optimal point.
Time taken: 1.2683
Function value obtained: 0.4056
Current minimum: 0.4043
Iteration No: 82 started. Searching for the next optimal point.
Iteration No: 82 ended. Search finished for the next optimal point.
Time taken: 1.3856
Function value obtained: 0.4054
Current minimum: 0.4043
Iteration No: 83 started. Searching for the next optimal point.
Iteration No: 83 ended. Search finished for the next optimal point.
Time taken: 1.3906
Function value obtained: 0.4053
Current minimum: 0.4043
Iteration No: 84 started. Sea

Iteration No: 18 ended. Search finished for the next optimal point.
Time taken: 0.5486
Function value obtained: 0.4469
Current minimum: 0.4465
Iteration No: 19 started. Searching for the next optimal point.
Iteration No: 19 ended. Search finished for the next optimal point.
Time taken: 0.5015
Function value obtained: 0.4474
Current minimum: 0.4465
Iteration No: 20 started. Searching for the next optimal point.
Iteration No: 20 ended. Search finished for the next optimal point.
Time taken: 0.5393
Function value obtained: 0.4470
Current minimum: 0.4465
Iteration No: 21 started. Searching for the next optimal point.
Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 0.5452
Function value obtained: 0.4475
Current minimum: 0.4465
Iteration No: 22 started. Searching for the next optimal point.
Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 0.6129
Function value obtained: 0.4473
Current minimum: 0.4465
Iteration No: 23 started. Sea

Iteration No: 58 ended. Search finished for the next optimal point.
Time taken: 1.0946
Function value obtained: 0.4469
Current minimum: 0.4459
Iteration No: 59 started. Searching for the next optimal point.
Iteration No: 59 ended. Search finished for the next optimal point.
Time taken: 1.0660
Function value obtained: 0.4460
Current minimum: 0.4459
Iteration No: 60 started. Searching for the next optimal point.
Iteration No: 60 ended. Search finished for the next optimal point.
Time taken: 1.0436
Function value obtained: 0.4477
Current minimum: 0.4459
Iteration No: 61 started. Searching for the next optimal point.
Iteration No: 61 ended. Search finished for the next optimal point.
Time taken: 1.2812
Function value obtained: 0.4473
Current minimum: 0.4459
Iteration No: 62 started. Searching for the next optimal point.
Iteration No: 62 ended. Search finished for the next optimal point.
Time taken: 1.1191
Function value obtained: 0.4474
Current minimum: 0.4459
Iteration No: 63 started. Sea

Iteration No: 98 ended. Search finished for the next optimal point.
Time taken: 1.8847
Function value obtained: 0.4459
Current minimum: 0.4458
Iteration No: 99 started. Searching for the next optimal point.
Iteration No: 99 ended. Search finished for the next optimal point.
Time taken: 1.8682
Function value obtained: 0.4459
Current minimum: 0.4458
Iteration No: 100 started. Searching for the next optimal point.
Iteration No: 100 ended. Search finished for the next optimal point.
Time taken: 1.7837
Function value obtained: 0.4495
Current minimum: 0.4458
[0.14498503 0.0144985  0.0144985  0.11654183 0.0144985  0.14498503
 0.0144985  0.0144985  0.14498503 0.07572844 0.14498503 0.09813614
 0.05716095]
score={:.4f} 0.44582971054074233
**************************************************
grammar
**************************************************
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.0029
Function value

Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 0.7309
Function value obtained: 0.4625
Current minimum: 0.4624
Iteration No: 38 started. Searching for the next optimal point.
Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 0.9222
Function value obtained: 0.4651
Current minimum: 0.4624
Iteration No: 39 started. Searching for the next optimal point.
Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 0.8649
Function value obtained: 0.4646
Current minimum: 0.4624
Iteration No: 40 started. Searching for the next optimal point.
Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 1.1282
Function value obtained: 0.4650
Current minimum: 0.4624
Iteration No: 41 started. Searching for the next optimal point.
Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 0.8868
Function value obtained: 0.4627
Current minimum: 0.4624
Iteration No: 42 started. Sea

Iteration No: 77 ended. Search finished for the next optimal point.
Time taken: 1.1294
Function value obtained: 0.4653
Current minimum: 0.4622
Iteration No: 78 started. Searching for the next optimal point.
Iteration No: 78 ended. Search finished for the next optimal point.
Time taken: 1.1931
Function value obtained: 0.4647
Current minimum: 0.4622
Iteration No: 79 started. Searching for the next optimal point.
Iteration No: 79 ended. Search finished for the next optimal point.
Time taken: 1.2553
Function value obtained: 0.4643
Current minimum: 0.4622
Iteration No: 80 started. Searching for the next optimal point.
Iteration No: 80 ended. Search finished for the next optimal point.
Time taken: 1.1865
Function value obtained: 0.4625
Current minimum: 0.4622
Iteration No: 81 started. Searching for the next optimal point.
Iteration No: 81 ended. Search finished for the next optimal point.
Time taken: 1.2388
Function value obtained: 0.4652
Current minimum: 0.4622
Iteration No: 82 started. Sea

Iteration No: 16 ended. Search finished for the next optimal point.
Time taken: 0.4510
Function value obtained: 0.4382
Current minimum: 0.4371
Iteration No: 17 started. Searching for the next optimal point.
Iteration No: 17 ended. Search finished for the next optimal point.
Time taken: 0.4084
Function value obtained: 0.4376
Current minimum: 0.4371
Iteration No: 18 started. Searching for the next optimal point.
Iteration No: 18 ended. Search finished for the next optimal point.
Time taken: 0.4846
Function value obtained: 0.4374
Current minimum: 0.4371
Iteration No: 19 started. Searching for the next optimal point.
Iteration No: 19 ended. Search finished for the next optimal point.
Time taken: 0.4822
Function value obtained: 0.4377
Current minimum: 0.4371
Iteration No: 20 started. Searching for the next optimal point.
Iteration No: 20 ended. Search finished for the next optimal point.
Time taken: 0.4603
Function value obtained: 0.4376
Current minimum: 0.4371
Iteration No: 21 started. Sea

Iteration No: 56 ended. Search finished for the next optimal point.
Time taken: 1.3748
Function value obtained: 0.4373
Current minimum: 0.4370
Iteration No: 57 started. Searching for the next optimal point.
Iteration No: 57 ended. Search finished for the next optimal point.
Time taken: 1.1329
Function value obtained: 0.4374
Current minimum: 0.4370
Iteration No: 58 started. Searching for the next optimal point.
Iteration No: 58 ended. Search finished for the next optimal point.
Time taken: 1.0683
Function value obtained: 0.4374
Current minimum: 0.4370
Iteration No: 59 started. Searching for the next optimal point.
Iteration No: 59 ended. Search finished for the next optimal point.
Time taken: 1.2897
Function value obtained: 0.4372
Current minimum: 0.4370
Iteration No: 60 started. Searching for the next optimal point.
Iteration No: 60 ended. Search finished for the next optimal point.
Time taken: 1.1591
Function value obtained: 0.4380
Current minimum: 0.4370
Iteration No: 61 started. Sea

Iteration No: 96 ended. Search finished for the next optimal point.
Time taken: 1.7088
Function value obtained: 0.4368
Current minimum: 0.4368
Iteration No: 97 started. Searching for the next optimal point.
Iteration No: 97 ended. Search finished for the next optimal point.
Time taken: 1.9025
Function value obtained: 0.4376
Current minimum: 0.4368
Iteration No: 98 started. Searching for the next optimal point.
Iteration No: 98 ended. Search finished for the next optimal point.
Time taken: 1.6617
Function value obtained: 0.4368
Current minimum: 0.4368
Iteration No: 99 started. Searching for the next optimal point.
Iteration No: 99 ended. Search finished for the next optimal point.
Time taken: 1.7808
Function value obtained: 0.4369
Current minimum: 0.4368
Iteration No: 100 started. Searching for the next optimal point.
Iteration No: 100 ended. Search finished for the next optimal point.
Time taken: 1.8661
Function value obtained: 0.4391
Current minimum: 0.4368
[0.09375696 0.01479459 0.01

In [10]:
# Normalised blend weights found for each target column
# (one entry per model, in model_path_list order).
best_weights

{'cohesion': array([0.05173881, 0.01773507, 0.01773507, 0.01773507, 0.01773507,
        0.10229667, 0.01773507, 0.01773507, 0.1087426 , 0.1773507 ,
        0.1773507 , 0.1773507 , 0.09875939]),
 'syntax': array([0.14782308, 0.01607879, 0.02868567, 0.15991539, 0.01607879,
        0.16078792, 0.05280733, 0.13981886, 0.10733949, 0.01607879,
        0.01607879, 0.12242831, 0.01607879]),
 'vocabulary': array([0.19079376, 0.02288078, 0.02288078, 0.22880783, 0.05517349,
        0.02288078, 0.04854337, 0.08770824, 0.02288078, 0.02288078,
        0.02288078, 0.22880783, 0.02288078]),
 'phraseology': array([0.14498503, 0.0144985 , 0.0144985 , 0.11654183, 0.0144985 ,
        0.14498503, 0.0144985 , 0.0144985 , 0.14498503, 0.07572844,
        0.14498503, 0.09813614, 0.05716095]),
 'grammar': array([0.08212558, 0.02184473, 0.02184473, 0.21844727, 0.02184473,
        0.02184473, 0.02184473, 0.02184473, 0.11122991, 0.02184473,
        0.19499216, 0.21844727, 0.02184473]),
 'conventions': array([0.093

In [11]:
# RMSE obtained with the optimised weights, per target column.
scores

{'cohesion': 0.4704628324490142,
 'syntax': 0.4366061841142417,
 'vocabulary': 0.4042597619172252,
 'phraseology': 0.44582971054074233,
 'grammar': 0.46222819471307947,
 'conventions': 0.43677768154739155}

In [12]:
# Overall CV: plain average of the per-target scores.
per_target_scores = list(scores.values())
CV = sum(per_target_scores) / len(per_target_scores)
print("CV={:.4f}".format(CV))

CV=0.4427


# Error Analysis - Check Correlations

In [13]:
# Pairwise correlation between the ground-truth target columns.
cols = [
    'cohesion', 'syntax', 'vocabulary',
    'phraseology', 'grammar', 'conventions',
]
train_df.loc[:, cols].corr()

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
cohesion,1.0,0.695459,0.666151,0.690058,0.638689,0.666151
syntax,0.695459,1.0,0.680562,0.725467,0.709525,0.700025
vocabulary,0.666151,0.680562,1.0,0.735261,0.654852,0.664292
phraseology,0.690058,0.725467,0.735261,1.0,0.719746,0.666842
grammar,0.638689,0.709525,0.654852,0.719746,1.0,0.673301
conventions,0.666151,0.700025,0.664292,0.666842,0.673301,1.0


In [14]:
# Pairwise correlation between the blended out-of-fold predictions.
cols = [
    'cohesion', 'syntax', 'vocabulary',
    'phraseology', 'grammar', 'conventions',
]
oof_df.loc[:, cols].corr()

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
cohesion,1.0,0.946417,0.914852,0.912021,0.826856,0.887563
syntax,0.946417,1.0,0.937344,0.960062,0.916111,0.928967
vocabulary,0.914852,0.937344,1.0,0.958614,0.874254,0.892753
phraseology,0.912021,0.960062,0.958614,1.0,0.942686,0.88126
grammar,0.826856,0.916111,0.874254,0.942686,1.0,0.855633
conventions,0.887563,0.928967,0.892753,0.88126,0.855633,1.0
