In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss

In [None]:
# Folds whose out-of-fold predictions take part in the blend.
folds = [0, 1, 2, 3, 4]

# Ground-truth table, indexed by discourse_id and restricted to the folds above.
# Its row order is the reference order every prediction file is aligned to.
orig = (
    pd.read_csv("../data/train_folded.csv")
    .set_index("discourse_id")
    .loc[lambda frame: frame["fold"].isin(folds)]
)

# Experiments to blend. The prefix decides which directory layout the loading
# loop reads: "lgb_models/..." and "yauhen/..." have their own branches, and
# every other name is looked up under ../data/philipp.
exps = [
    "lgb_models/lgb_v0",
    "yauhen/olivine-spaniel",
    "yauhen/saffron-rook",
    "yauhen/meteoric-bettong",
    "yauhen/big-ocelot",
    "yauhen/shrewd-rook-3ep",
    "yauhen/conscious-uakari",
    "valiant-degu",
    "axiomatic-vulture",
    "smart-bumblebee",
    "awesome-rose",
    "honest-apple",
    "funky-funk",
    "lame-flame",
    "pastel-frog",
]

preds = []
label_cols = ["Adequate", "Effective", "Ineffective"]


def _load_aligned(paths, from_pred_columns=False):
    """Read prediction CSVs, stack them, and order the rows like ``orig``.

    Parameters
    ----------
    paths : list of str
        CSV files to read; their rows are concatenated in the given order.
    from_pred_columns : bool, optional
        When True, each file's ``pred_discourse_effectiveness_<label>`` columns
        are copied into plain ``<label>`` columns before stacking.

    Returns
    -------
    pd.DataFrame
        The stacked frame indexed by ``discourse_id`` and selected with
        ``.loc[orig.index]``, so row i matches row i of ``orig``.

    Raises
    ------
    KeyError
        If an id in ``orig.index`` is absent from the files (raised by ``.loc``).
    """
    frames = []
    for path in paths:
        frame = pd.read_csv(path)
        if from_pred_columns:
            for col in label_cols:
                frame[col] = frame[f"pred_discourse_effectiveness_{col}"].values
        frames.append(frame)
    return pd.concat(frames).set_index("discourse_id").loc[orig.index]


for exp_name in exps:

    print(exp_name)

    run_name = exp_name.split("/")[-1]
    if "yauhen" in exp_name:
        # One CSV per seed (1..3); each file is read on its own.
        path_groups = [
            [f"../data/yauhen/{run_name}_seed_{seed}.csv"]
            for seed in range(1, 4)
        ]
        from_pred_columns = False
    elif "lgb_models" in exp_name:
        # One CSV per fold for the single seed 0; folds are stacked.
        path_groups = [
            [
                f"../data/lgb_models/{run_name}/fold{fold}/validation_predictions_seed{seed}.csv"
                for fold in folds
            ]
            for seed in range(1)
        ]
        from_pred_columns = False
    else:
        # Three runs per experiment: "<name>", "<name>.1", "<name>.2".
        # A separate name is used so the outer loop variable is not rebound.
        run_dirs = [exp_name] + [f"{exp_name}.{j}" for j in range(1, 3)]
        path_groups = [
            [
                f"../data/philipp/{run_dir}/fold{fold}/validation_predictions.csv"
                for fold in folds
            ]
            for run_dir in run_dirs
        ]
        from_pred_columns = True

    pps = []
    for paths in path_groups:
        # `v` deliberately stays at module scope: the export cell further down
        # takes its metadata columns from the last frame loaded here.
        v = _load_aligned(paths, from_pred_columns)
        pps.append(v[label_cols].values)

    # Average the seeds / repeated runs of this experiment.
    pps = np.mean(pps, axis=0)

    preds.append(pps)

In [None]:
# Column position of each row's true label, in the row order of `orig`.
label_idx = np.array(
    [label_cols.index(name) for name in orig["discourse_effectiveness"].values],
    dtype=int,
)

# One-hot label matrix with the same shape and dtype as a model's predictions.
y = np.zeros_like(preds[0])
y[np.arange(len(label_idx)), label_idx] = 1

# Snapshot of the unscaled predictions as a single stacked array.
ps = np.array(preds).copy()

def scale_probs(pp_single, target=None, n_iter=100):
    """Rescale class probabilities so their column means match a target prior.

    Repeats two steps ``n_iter`` times: multiply every column by
    ``target / current column mean``, then renormalise each row to sum to 1.

    Parameters
    ----------
    pp_single : np.ndarray
        2-D array of per-row class probabilities. It is copied, not modified.
    target : array-like, optional
        Desired mean of each column. Defaults to ``y.mean(axis=0)``, using the
        module-level one-hot label matrix ``y`` as it is at call time.
    n_iter : int, optional
        Number of scale/renormalise rounds (default 100).

    Returns
    -------
    np.ndarray
        New array with the shape of ``pp_single``. When ``n_iter`` is at least
        1 its rows sum to 1.
    """
    if target is None:
        target = y.mean(axis=0)
    # The target does not change between rounds, so build the row once.
    target = np.asarray(target).reshape(1, -1)

    pp = pp_single.copy()

    for _ in range(n_iter):
        pp = pp * (target / pp.mean(axis=0))
        pp = pp / pp.sum(axis=1, keepdims=True)

    return pp

# Replace each model's predictions, in place, with their prior-matched version.
preds[:] = [scale_probs(model_probs) for model_probs in ps]

In [None]:
def weights_tune(weights, preds):
    """Objective for the blend-weight search: log loss of the weighted blend.

    The per-model predictions are averaged with ``weights``, clipped away from
    0 and 1, renormalised per row, and then passed through 10 rounds of the
    same column-mean matching used by ``scale_probs`` before scoring.

    Parameters
    ----------
    weights : array-like
        One weight per entry of ``preds``.
    preds : sequence of np.ndarray
        Per-model probability arrays, all of the same 2-D shape.

    Returns
    -------
    float
        ``log_loss`` of the processed blend against the module-level one-hot
        label matrix ``y``.
    """
    blend = np.average(preds, axis=0, weights=weights)

    eps = 0.0001
    blend = blend.clip(eps, 1 - eps)
    blend = blend / blend.sum(axis=1, keepdims=True)

    # The class prior is constant across rounds; compute it once per call
    # because the optimiser evaluates this function many times.
    class_prior = y.mean(axis=0)
    for _ in range(10):
        blend = blend * (class_prior / blend.mean(axis=0))
        blend = blend / blend.sum(axis=1, keepdims=True)

    return log_loss(y, blend)

from scipy.optimize import minimize
# Start the search from equal weights, one per model.
weights_init = [1] * len(preds)

# `args` must be a tuple; the trailing comma makes `(preds,)` one, so `preds`
# is passed to weights_tune as a single extra argument.
res = minimize(weights_tune, weights_init, args=(preds,), method='Nelder-Mead', tol=1e-6)
if not res.success:
    # The optimiser stopped without meeting its tolerance (for example at the
    # iteration limit); the weights are still used but may not be optimal.
    print("Weight optimization did not converge: ", res.message)
print("Optimized weights: ", res.x)
weights = res.x

# Blend the per-model predictions with the tuned weights.
pp = np.average(preds, axis=0, weights=weights)

In [None]:
# Keep probabilities away from 0 and 1, then make every row sum to 1 again.
eps = 0.0001
pp = np.clip(pp, eps, 1 - eps)
pp = pp / pp.sum(axis=1, keepdims=True)

# Match the blend's column means to the label prior.
pp = scale_probs(pp)

# Rebuild the one-hot label matrix with the shape of the blend.
true_cols = np.array(
    [label_cols.index(name) for name in orig["discourse_effectiveness"].values],
    dtype=int,
)
y = np.zeros_like(pp)
y[np.arange(len(true_cols)), true_cols] = 1

print(log_loss(y, pp))

In [None]:
# `v` is the last prediction frame left over from the loading loop; it was
# selected with .loc[orig.index], so its rows line up with `pp`.
meta_cols = ["essay_id", "discourse_type", "discourse_effectiveness"]
df = v[meta_cols + label_cols].copy()

# Overwrite that model's probabilities with the blended, rescaled ones.
for col_pos, col_name in enumerate(label_cols):
    df[col_name] = pp[:, col_pos]

df.to_csv("../data/oof_151_after_scaling.csv")

In [None]:
# Append every model's rescaled probabilities as "<label>_<model index>" columns.
for model_id, model_pred in enumerate(preds):
    for col_pos, col_name in enumerate(label_cols):
        df[f"{col_name}_{model_id}"] = model_pred[:, col_pos]

df.to_csv("../data/oof_151_after_scaling_ind_models.csv")

In [None]:
# Persist the final blended, rescaled probability array `pp` in NumPy format.
np.save("../data/first_lvl_ensemble.npy", pp)

In [None]:
import pickle

# Persist the list of per-model rescaled prediction arrays (`preds`) as a pickle.
# NOTE(review): only load this file from a trusted source; unpickling can execute code.
with open("../data/first_lvl_ensemble.pkl", "wb") as f:
    pickle.dump(preds, f)