In [1]:
#!kaggle datasets download -d takamichitoda/fb3-exp200-output
#!kaggle datasets download -d takamichitoda/fb3-exp203-output
#!kaggle datasets download -d takamichitoda/fb3-round-model
#!kaggle competitions download -c feedback-prize-effectiveness

Downloading fb3-exp200-output.zip to /home/jupyter
100%|██████████████████████████████████████▉| 4.26G/4.27G [00:55<00:00, 107MB/s]
100%|██████████████████████████████████████| 4.27G/4.27G [00:55<00:00, 82.3MB/s]
Downloading fb3-exp203-output.zip to /home/jupyter
100%|██████████████████████████████████████▊| 4.25G/4.27G [00:42<00:00, 118MB/s]
100%|███████████████████████████████████████| 4.27G/4.27G [00:42<00:00, 109MB/s]
Downloading fb3-round-model.zip to /home/jupyter
100%|█████████████████████████████████████▊| 4.51G/4.52G [01:00<00:00, 45.9MB/s]
100%|██████████████████████████████████████| 4.52G/4.52G [01:01<00:00, 79.6MB/s]
Downloading feedback-prize-effectiveness.zip to /home/jupyter
 61%|███████████████████████▎              | 5.00M/8.13M [00:00<00:00, 42.7MB/s]
100%|██████████████████████████████████████| 8.13M/8.13M [00:00<00:00, 63.3MB/s]


In [7]:
#!mkdir fb3-exp200-output
#!unzip fb3-exp200-output.zip -d fb3-exp200-output/
#!rm -rf fb3-exp200-output.zip

#!mkdir fb3-exp203-output
#!unzip fb3-exp203-output.zip -d fb3-exp203-output/
#!rm -rf fb3-exp203-output.zip

#!mkdir fb3-round-model
#!unzip fb3-round-model.zip -d fb3-round-model/
#!rm -rf fb3-round-model.zip

#!mkdir feedback-prize-effectiveness
#!unzip feedback-prize-effectiveness.zip -d feedback-prize-effectiveness/
#!rm -rf feedback-prize-effectiveness.zip

In [21]:
import gc
import os
import random
import numpy as np
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")

from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModel, AutoConfig

from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=false

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=false


In [22]:
class CFG:
    BS = 1
    OUTPUT = f"/home/jupyter/distillation_data_v4/"
    
    N_WORKERS = 0
    CONFIG = './fb3-exp200-output/config.pth'
    N_FOLD = 4
    
    ROUND_MODELS = [
        {
            "max_len": None,
            "tail": False,
            "model_name": "microsoft/deberta-v3-base",
            "model_type": "WLP",
            "path":[
                f"./fb3-round-model/microsoft-deberta-v3-base_seed0_fold{f}" for f in range(4)
            ]
        },{
            "max_len": None,
            "tail": False,
            "model_name": "microsoft/deberta-v3-base",
            "model_type": "normal",
            "path":[
                f"./fb3-exp200-output/microsoft-deberta-v3-base_seed0_fold{f}" for f in range(4)
            ]
        },{
            "max_len": None,
            "tail": False,
            "model_name": "microsoft/deberta-v3-base",
            "model_type": "normal",
            "path":[
                f"./fb3-exp203-output/microsoft-deberta-v3-base_seed0_fold{f}" for f in range(4)
            ]
        },
    ]
    TARGETS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    SEED = 0
    

TOKENIZER = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

#TOKENIZER.save_pretrained(CFG.OUTPUT+'/tokenizer/')
CFG.TOKENIZER = TOKENIZER
del TOKENIZER

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [24]:
train_df = pd.read_csv(f"./feedback-prize-english-language-learning/train.csv")

cv = MultilabelStratifiedKFold(n_splits=CFG.N_FOLD, shuffle=True, random_state=CFG.SEED)
for n, (train_index, valid_index) in enumerate(cv.split(train_df, train_df[CFG.TARGETS])):
    train_df.loc[valid_index, 'fold'] = int(n)
train_df['fold'] = train_df['fold'].astype(int)

train_df.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,fold
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,2
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,3
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,0
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,2


In [26]:
fb1 = pd.read_csv(f"./feedback-prize-effectiveness/train.csv")
display(fb1.head())

unlabel_lst = []
fb1_only_ids = set(fb1['essay_id']) - set(train_df['text_id'])
for _id in fb1_only_ids:
    lst = []
    with open(f"./feedback-prize-effectiveness/train/{_id}.txt", 'r') as f:
        for line in f:
            line = line.rstrip()
            lst.append(line)
    text = "\n".join(lst)
    d = (_id, text, 0, 0, 0, 0, 0, 0)
    unlabel_lst.append(d)
    
unlabel_df = pd.DataFrame(unlabel_lst, columns=['text_id', 'full_text']+CFG.TARGETS)
del unlabel_lst, lst, d, text, _id, fb1_only_ids, fb1
unlabel_df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,301D48946CB9,Can you really trust a car to take you from po...,0,0,0,0,0,0
1,13BD2F675384,"Dear Principle,\n\nAllow students to have phon...",0,0,0,0,0,0
2,D4E3104E7591,To assure students are continuing to learn ove...,0,0,0,0,0,0
3,24BCEBE0EEF1,I believe that the school cell phone policy sh...,0,0,0,0,0,0
4,B568E9BC99FD,Dear state senetor i am writing this letter to...,0,0,0,0,0,0


In [27]:
unlabel_df['fold'] = -1
train_df['origin'] = True
unlabel_df['origin'] = False

all_data_df = pd.concat([train_df, unlabel_df], axis=0)
all_data_df

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,fold,origin
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,2,True
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,3,True
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,0,True
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,0,True
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,2,True
...,...,...,...,...,...,...,...,...,...,...
4068,B7C17E1993BA,Does the Electoral College work?\n\nThe Electo...,0.0,0.0,0.0,0.0,0.0,0.0,-1,False
4069,030F2D5F039D,"Yes, the use of this technology to read the em...",0.0,0.0,0.0,0.0,0.0,0.0,-1,False
4070,20E41EFD4FB8,The author does a very good job of proving his...,0.0,0.0,0.0,0.0,0.0,0.0,-1,False
4071,D1B97A55865A,"Dear TEACHER_NAME,\n\nI think students should ...",0.0,0.0,0.0,0.0,0.0,0.0,-1,False


In [28]:
class TestDataset(Dataset):

    def __init__(self, df, maxlen=None, tail=False):
        self.texts = df['full_text'].values
        self.maxlen = maxlen
        self.tail = tail
        
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        if self.maxlen is None:
            inputs = CFG.TOKENIZER.encode_plus(
                self.texts[item], 
                return_tensors=None, 
                add_special_tokens=True, 

            )
        else:
            if not self.tail:
                inputs = CFG.TOKENIZER.encode_plus(
                    self.texts[item], 
                    return_tensors=None, 
                    add_special_tokens=True, 
                    max_length=self.maxlen,
                    pad_to_max_length=True,
                    truncation=True
                )
            else:
                inputs = CFG.TOKENIZER.encode_plus(
                    self.texts[item], 
                    return_tensors=None, 
                    add_special_tokens=False, 
                )
                lim = self.maxlen - 2
                inputs['input_ids'] = [1] + inputs['input_ids'][-lim:] + [2]
                inputs['attention_mask'] = [1] + inputs['attention_mask'][-lim:] + [1]
                inputs['token_type_ids'] = [0] + inputs['token_type_ids'][-lim:] + [0]
        
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        
        return inputs

In [29]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    
class WeightedLayerPooling(nn.Module):
    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        self.layer_weights = layer_weights if layer_weights is not None \
            else nn.Parameter(
                torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)
            )

    def forward(self, all_hidden_states):
        all_layer_embedding = all_hidden_states[self.layer_start:, :, :, :]
        weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size())
        weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum()
        return weighted_average

class CustomModel(nn.Module):
    def __init__(self, config_path, model_name, model_type):
        super().__init__()

        self.config = torch.load(config_path)
        self.model = AutoModel.from_config(self.config)

        self.model_type = model_type

        if self.model_type in ["WLP", "WLP_scale"]:
            self.layer_pool = WeightedLayerPooling(
                self.config.num_hidden_layers, 
                layer_start=self.config.num_hidden_layers, layer_weights=None
            )
        
        self.pool = MeanPooling()
        
        if model_name == "microsoft/deberta-v3-xsmall":
            self.fc = nn.Sequential(
                nn.Linear(self.config.hidden_size, self.config.hidden_size),
                nn.ReLU(),
                nn.Linear(self.config.hidden_size, 6),
            )
        else:
            self.fc = nn.Linear(self.config.hidden_size, 6)


    def forward(self, inputs):
        outputs = self.model(**inputs)
        
        if self.model_type in ["WLP", "WLP_scale"]:
            all_hidden_states = torch.stack(outputs[1])
            hidden_states = self.layer_pool(all_hidden_states)
        else:
            hidden_states = outputs[0]
        
        feature = self.pool(hidden_states, inputs['attention_mask'])
        output = self.fc(feature)
        if self.model_type == "WLP_scale":
            output = output.sigmoid() * 4 + 1
        return output

In [30]:
def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [31]:
predictions = []
mname_lst = []
for info in CFG.ROUND_MODELS:
    mname = info['path'][0].split('/')[2]
    print(mname)
    mname_lst.append(mname)
    _predictions = []
    test_dataset = TestDataset(all_data_df, maxlen=info['max_len'], tail=info['tail'])
    test_loader = DataLoader(test_dataset,
                         batch_size=CFG.BS,
                         shuffle=False,
                         collate_fn=DataCollatorWithPadding(tokenizer=CFG.TOKENIZER, padding='longest'),
                         num_workers=CFG.N_WORKERS, pin_memory=True, drop_last=False)
    c = CFG.CONFIG if info['model_name'] == 'microsoft/deberta-v3-base' else "../input/fb3-distribution/config.pth"
    model = CustomModel(config_path=c, model_name=info['model_name'], model_type=info['model_type'])
    r_preds = []
    for path in info['path']:
        r_preds = []
        for r in ['up', 'cut']:
            state = torch.load(f"{path}_{r}_best.pth", map_location=torch.device('cpu'))
            model.load_state_dict(state['model'])
            prediction = inference_fn(test_loader, model, device)
            r_preds.append(prediction)
        prediction = np.mean(r_preds, axis=0)
        _predictions.append(prediction)
        
        del state, prediction
        gc.collect()
        torch.cuda.empty_cache()
    predictions.append(_predictions)
    
predictions = np.array(predictions)

microsoft-deberta-v3-base_seed0_fold0


  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

microsoft-deberta-v3-base_seed0_fold0


  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

microsoft-deberta-v3-base_seed0_fold0


  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

  0%|          | 0/7984 [00:00<?, ?it/s]

In [32]:
all_data_df.to_csv(f"{CFG.OUTPUT}/text_for_distillation.csv", index=None)
np.save(f"{CFG.OUTPUT}/label_for_distillation", predictions)
!echo "{mname_lst}" > "{CFG.OUTPUT}/model_names.txt"

In [41]:
predictions.shape

(3, 4, 7984, 6)

In [37]:
import json
!kaggle datasets init -p {CFG.OUTPUT}

with open(f"{CFG.OUTPUT}/dataset-metadata.json", "r") as f:
    d = json.load(f)
    
t = f"FB3 distillation data v4"
d['title'] = t
d['id'] = "takamichitoda/"+"-".join(t.split())

with open(f"{CFG.OUTPUT}/dataset-metadata.json", "w") as f:
    json.dump(d, f)

Data package template written to: /home/jupyter/distillation_data_v4/dataset-metadata.json


In [38]:
!kaggle datasets create -p {CFG.OUTPUT}
#!kaggle datasets version -m "test" -p {CFG.OUTPUT}/

!kaggle datasets list -m --sort-by "updated"

Starting upload for file label_for_distillation.npy
100%|███████████████████████████████████████| 2.19M/2.19M [00:03<00:00, 675kB/s]
Upload successful: label_for_distillation.npy (2MB)
Skipping folder: tokenizer; use '--dir-mode' to upload folders
Starting upload for file model_names.txt
100%|███████████████████████████████████████████| 124/124 [00:02<00:00, 56.6B/s]
Upload successful: model_names.txt (124B)
Starting upload for file text_for_distillation.csv
100%|██████████████████████████████████████| 18.0M/18.0M [00:03<00:00, 5.53MB/s]
Upload successful: text_for_distillation.csv (18MB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/datasets/takamichitoda/FB3-distillation-data-v4
ref                                                title                                 size  lastUpdated          downloadCount  voteCount  usabilityRating  
-------------------------------------------------  -----------------------------------  -----  --------------