In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import torch
import numpy as np
import seaborn
from sklearn.metrics import log_loss
from tqdm.auto import tqdm
from transformers import (
    get_constant_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
)
import os
import pickle

In [None]:
# First-level artifacts consumed by the stacker below:
#   preds - iterated model-by-model and row-indexed by the dataset class
#           (presumably one out-of-fold prediction array per first-level
#           model - verify against the script that wrote the pickle).
#   pp    - array that is row-indexed and blended 50/50 with the stacker's
#           validation predictions in the training cell.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by this project.
with open("../../data/first_lvl_ensemble.pkl", "rb") as f:
    preds = pickle.load(f)

pp = np.load("../../data/first_lvl_ensemble.npy")

In [None]:
# Training frame (with a precomputed `fold` column used later for CV).
df = pd.read_csv("../../data/train_folded.csv")
label_cols = ["Adequate", "Effective", "Ineffective"]

# Position of every row's class inside label_cols; list.index raises
# ValueError on a label that is not listed.
class_positions = np.array(
    [label_cols.index(name) for name in df["discourse_effectiveness"].values],
    dtype=int,
)

# One-hot target matrix: one row per sample, one column per class.
y = np.zeros((len(df), len(label_cols)))
y[np.arange(len(df)), class_positions] = 1

# Expose the one-hot targets as float columns named after the classes.
for col_pos, col_name in enumerate(label_cols):
    df[col_name] = y[:, col_pos]

In [None]:
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
import torch

class FeedbackStackerDataset(Dataset):
    """Stacking dataset built from the predictions of the first-level models.

    Relies on two notebook-level globals:
      * ``preds``      - iterable with one prediction array per first-level
                         model; each array is indexed with the row labels of
                         ``df`` and has one column per entry of ``label_cols``
                         (so ``df`` is assumed to carry positional integer
                         labels - TODO confirm against the data loader).
      * ``label_cols`` - list of class names; also the target columns of ``df``.

    For every first-level model and every class the following features are
    produced: the raw prediction, its mean over the row's ``essay_id`` and its
    mean over the row's (``essay_id``, ``discourse_type``) group. One extra
    feature ``len`` holds the number of rows of the essay divided by 10.

    Attributes:
        df: private copy of the input frame (feature columns are added to it).
        mode: the ``mode`` argument, stored as given.
        label_cols: copy of the global ``label_cols``.
        X: float array of shape (n_rows, 3 * n_classes + 1, n_models).
        y: ``df[label_cols].values``.

    Args:
        df: frame with ``essay_id``, ``discourse_type``, ``discourse_id`` and
            the label columns.
        mode: free-form tag ("train" / "valid" in this notebook); not used
            by the dataset itself.
    """

    def __init__(self, df, mode):
        self.df = df.copy()
        self.mode = mode

        self.label_cols = label_cols.copy()

        frame = self.df  # alias: the feature columns below land on self.df
        row_ids = frame.index.values

        # (n_models, n_rows, n_classes) slice of the first-level predictions.
        model_preds = np.stack([model_pred[row_ids] for model_pred in preds])

        # Essay size does not depend on the first-level model: compute it once
        # instead of once per model.
        frame["len"] = frame.groupby("essay_id")["discourse_id"].transform("count") / 10

        per_model_features = []
        for model_idx in range(model_preds.shape[0]):
            cols = []
            for class_idx, label in enumerate(self.label_cols):
                raw_col = f"oof_{label}"

                frame[raw_col] = model_preds[model_idx, :, class_idx]
                cols.append(raw_col)

                frame[f"{raw_col}_mean"] = frame.groupby("essay_id")[raw_col].transform("mean")
                cols.append(f"{raw_col}_mean")

                frame[f"{raw_col}_t_mean"] = frame.groupby(["essay_id", "discourse_type"])[raw_col].transform("mean")
                cols.append(f"{raw_col}_t_mean")

            cols.append("len")

            per_model_features.append(frame[cols].values)

        # (n_models, n_rows, n_features) -> (n_rows, n_features, n_models)
        X = np.stack(per_model_features).transpose(1, 2, 0)
        print(X.shape)

        self.X = X
        self.y = self.df[self.label_cols].values

    def __getitem__(self, idx):
        """Return the (features, one-hot target) pair of row ``idx`` as float tensors."""
        # Index the precomputed arrays directly; a per-sample DataFrame.iloc
        # lookup is slow and its result was never used.
        return torch.FloatTensor(self.X[idx]), torch.FloatTensor(self.y[idx])

    def __len__(self):
        """Number of rows in the dataset."""
        return self.df.shape[0]
    
# Smoke test: build the dataset on the full frame and inspect its first sample
# (feature tensor shape, then the whole (features, target) pair).
ds = FeedbackStackerDataset(df.copy(), mode="train")
first_sample = ds[0]
print(first_sample[0].shape)
print(first_sample)

In [None]:
import torch.nn as nn


class FeedbackStackerModel(nn.Module):
    """Small stacking network over first-level model predictions.

    The input is passed through a stack of kernel-size-1 ``Conv1d`` layers
    (each followed by PReLU and dropout), averaged over its last dimension and
    projected to 3 class logits by a linear head.

    Args:
        n_features: number of input channels, i.e. the size of dimension 1
            of the tensor given to ``forward``.
    """

    def __init__(self, n_features):
        super().__init__()

        self.sizes = [256, 128, 64]

        # Layers are created in the same order as before so parameter names
        # (features.0, features.1, ...) match previously saved checkpoints.
        layers = []
        in_channels = n_features
        for out_channels in self.sizes:
            layers.append(nn.Conv1d(in_channels, out_channels, 1))
            layers.append(nn.PReLU())
            layers.append(nn.Dropout(0.2))
            in_channels = out_channels

        self.features = nn.Sequential(*layers)

        self.head = nn.Linear(self.sizes[-1], 3)

        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, x, y=None):
        """Compute class logits and, while training, the cross-entropy loss.

        Args:
            x: float tensor of shape (batch, n_features, length).
            y: optional one-hot (or score) target of shape (batch, 3); its
                argmax along dim 1 is used as the class index. May be omitted
                for inference.

        Returns:
            dict with ``"logits"`` of shape (batch, 3) and, only when the
            module is in training mode and ``y`` is given, ``"loss"``.
        """
        hidden = self.features(x)
        pooled = hidden.mean(dim=2)  # average over the last (length) dimension
        logits = self.head(pooled)

        output = {"logits": logits}

        if self.training and y is not None:
            output["loss"] = self.loss_fn(logits, y.argmax(dim=1))

        return output

In [None]:
# Hyper-parameters of the stacking network.
LR = 0.001
BATCH_SIZE = 8
EPOCHS = 50

exp_name = "nn_v11_blend151_ff"

# exist_ok=True replaces the racy exists()/makedirs() pair and keeps the cell
# re-runnable.
os.makedirs(f"nn_models/{exp_name}", exist_ok=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Out-of-fold predictions of the stacker, filled one validation fold at a time.
oof = np.zeros_like(y)

# Empirical class frequencies: target of the prior-matching rescaling below.
class_prior = y.mean(axis=0)
# Passed to log_loss so a fold that happens to miss a class does not crash it.
class_ids = np.arange(y.shape[1])

for FOLD in range(5):

    print()
    print(f"======FOLD {FOLD}=====")
    print()
    df_train = df[df.fold != FOLD]
    df_val = df[df.fold == FOLD]

    # Row labels are used as positions into y / oof / pp
    # (assumes df carries the default 0..n-1 index - TODO confirm).
    val_rows = df_val.index.values
    y_val = y[val_rows]

    seed_preds = []
    for seed in range(1):

        # Make weight init, batch shuffling and dropout reproducible per seed.
        torch.manual_seed(seed)

        # Fit on the training folds only. Building this from the full frame
        # would train on the very rows the fold is validated on, which leaks
        # labels into the out-of-fold predictions and validation losses.
        train_ds = FeedbackStackerDataset(df_train, mode="train")
        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0,
                                  pin_memory=False, drop_last=True)

        val_ds = FeedbackStackerDataset(df_val, mode="valid")
        val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0,
                                pin_memory=False, drop_last=False)

        model = FeedbackStackerModel(n_features=train_ds.X.shape[1])
        model.to(DEVICE)

        optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

        # Cosine decay over all optimisation steps, stepped once per batch.
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=EPOCHS * len(train_loader)
        )

        epoch_bar = tqdm(range(EPOCHS))
        for e in epoch_bar:
            tbar = tqdm(train_loader, disable=True)

            loss_list = []
            p = []
            y_train = []

            model.train()

            for idx, data in enumerate(tbar):
                data = [x.to(DEVICE) for x in data]
                inputs, target = data

                optimizer.zero_grad()
                output = model(inputs, target)

                loss = output["loss"]

                loss.backward()
                optimizer.step()

                loss_list.append(loss.detach().cpu().item())

                avg_loss = np.round(np.mean(loss_list), 4)

                tbar.set_description(f"Epoch {e+1} Loss: {avg_loss} lr: {optimizer.param_groups[0]['lr']:.5f}")

                scheduler.step()

                p.append(output["logits"].softmax(dim=1).detach().cpu().numpy())
                y_train.append(target.detach().cpu().numpy())

            y_train = np.concatenate(y_train, axis=0)
            p = np.concatenate(p, axis=0)
            train_err = log_loss(y_train.argmax(axis=1), p, labels=class_ids)

            model.eval()

            p = []
            # No gradients are needed for validation; skipping the autograd
            # graph saves memory and time.
            with torch.no_grad():
                for idx, data in enumerate(val_loader):
                    data = [x.to(DEVICE) for x in data]
                    inputs, target = data

                    output = model(inputs, target)

                    p.append(output["logits"].softmax(dim=1).detach().cpu().numpy())

            p = np.concatenate(p, axis=0)
            val_err = log_loss(y_val.argmax(axis=1), p, labels=class_ids)

            # Surface the per-epoch metrics instead of computing and dropping them.
            epoch_bar.set_postfix(train=f"{train_err:.4f}", val=f"{val_err:.4f}")

        print("-----")
        # Average the last-epoch validation predictions over the seeds run so far.
        seed_preds.append(p.copy())
        p = np.mean(seed_preds, axis=0)

        oof[val_rows] = p

        err = log_loss(y_val.argmax(axis=1), p, labels=class_ids)
        print("VAL SEED BLEND", err)

        # 50/50 blend with the first-level ensemble, then alternately rescale the
        # columns towards the global class frequencies and renormalise each row.
        ppp = 0.5 * p.copy() + 0.5 * pp[val_rows]
        for _ in range(10):
            ppp = ppp * (class_prior / ppp.mean(axis=0))
            ppp = ppp / ppp.sum(axis=1, keepdims=True)

        err = log_loss(y_val.argmax(axis=1), ppp, labels=class_ids)
        print("VAL SEED BLEND OPT", err)

        torch.save(model.state_dict(), f"nn_models/{exp_name}/checkpoint_fold{FOLD}_seed{seed}.pth")

In [None]:
# Copy of the training frame whose label columns are overwritten with the
# stacker's out-of-fold predictions (one column of `oof` per class).
df_p = df.assign(**{label: oof[:, col_pos] for col_pos, label in enumerate(label_cols)})

In [None]:
# Persist the out-of-fold predictions next to the experiment's checkpoints:
# one row per discourse_id, one column per class.
validation_path = f"nn_models/{exp_name}_validation_predictions.csv"
validation_columns = ["discourse_id"] + label_cols
df_p[validation_columns].to_csv(validation_path, index=False)