In [None]:
!nvidia-smi

Thu Aug 18 15:52:33 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive so the /content/drive/... paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -qqq --upgrade wandb
!pip install -qqq transformers
!pip install -qqq sentencepiece
!pip install -qqq colorama

[K     |████████████████████████████████| 1.8 MB 33.4 MB/s 
[K     |████████████████████████████████| 181 kB 62.4 MB/s 
[K     |████████████████████████████████| 157 kB 68.2 MB/s 
[K     |████████████████████████████████| 63 kB 1.7 MB/s 
[K     |████████████████████████████████| 157 kB 68.1 MB/s 
[K     |████████████████████████████████| 157 kB 66.1 MB/s 
[K     |████████████████████████████████| 157 kB 72.4 MB/s 
[K     |████████████████████████████████| 157 kB 65.5 MB/s 
[K     |████████████████████████████████| 156 kB 70.4 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 4.7 MB 28.3 MB/s 
[K     |████████████████████████████████| 101 kB 7.2 MB/s 
[K     |████████████████████████████████| 6.6 MB 56.4 MB/s 
[K     |████████████████████████████████| 596 kB 67.2 MB/s 
[K     |████████████████████████████████| 1.3 MB 35.0 MB/s 
[?25h

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Import Required Libraries 📚</h1></span>

In [None]:
import os
import gc
import copy
import time
import random
import string
import joblib
import re

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

# Utils
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW
from transformers import DataCollatorWithPadding

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

<img src="https://i.imgur.com/gb6B4ig.png" width="400" alt="Weights & Biases" />

<span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;"> Weights & Biases (W&B) is a set of machine learning tools that helps you build better models faster. <strong>Kaggle competitions require fast-paced model development and evaluation</strong>. There are a lot of components: exploring the training data, training different models, combining trained models in different combinations (ensembling), and so on.</span>

> <span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">⏳ Lots of components = Lots of places to go wrong = Lots of time spent debugging</span>

<span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">W&B can be useful for Kaggle competitions with its lightweight and interoperable tools:</span>

* <span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">Quickly track experiments,<br></span>
* <span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">Version and iterate on datasets, <br></span>
* <span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">Evaluate model performance,<br></span>
* <span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">Reproduce models,<br></span>
* <span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">Visualize results and spot regressions,<br></span>
* <span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">Share findings with colleagues.</span>

<span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">To learn more about Weights and Biases check out this <strong><a href="https://www.kaggle.com/ayuraj/experiment-tracking-with-weights-and-biases">kernel</a></strong>.</span>

In [None]:
import wandb

# SECURITY: an API key used to be hard-coded in this cell. Any key that has been
# committed or shared should be revoked at https://wandb.ai/authorize and re-issued.
# The key is now read from the WANDB_API_KEY environment variable instead.
try:
    wandb_api_key = os.environ.get("WANDB_API_KEY")
    if not wandb_api_key:
        raise ValueError("WANDB_API_KEY environment variable is not set")
    wandb.login(key=wandb_api_key)
    anony = None
except Exception as err:
    # Best effort: fall back to anonymous logging instead of aborting the notebook,
    # but report why the login did not succeed.
    anony = "must"
    print(f"W&B login failed: {err}")
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# def id_generator(size=12, chars=string.ascii_lowercase + string.digits):
#     return ''.join(random.SystemRandom().choice(chars) for _ in range(size))

# HASH_NAME = id_generator(size=12)
# print(HASH_NAME)

In [None]:
# Load the competition train/test tables from the mounted Drive.
# `df` is the labelled training frame that later cells extend in place; `test` is the unlabelled test frame.
df = pd.read_csv("/content/drive/MyDrive/Competitions/SIGNATE/STUDENT_CUP/penguin/input/train.csv")
test = pd.read_csv("/content/drive/MyDrive/Competitions/SIGNATE/STUDENT_CUP/penguin/input/test.csv")
# sub = pd.read_csv("/content/drive/MyDrive/Competitions/SIGNATE/STUDENT_CUP/penguin/output/exp_043(MultiLabel).csv")
df.head()

Unnamed: 0,id,description,jobflag
0,0,<li>Develop cutting-edge web applications that...,3
1,1,"<li> Designs and develops high quality, scalab...",3
2,2,<li>Functions as a point person for Network St...,4
3,3,"<li> Work on the technical design, development...",3
4,4,<li>Quantify the resources required for a task...,4


In [None]:
# pl_df = pd.concat([test, sub], axis=1).drop(columns="0").rename(columns={"1": "jobflag"})
# pl_df

In [None]:
# Central experiment configuration read by the training cells below.
CONFIG = {"seed": 2022,
          "epochs": 5,
          "model_name": "microsoft/deberta-v3-large",
          "train_batch_size": 8,
          "valid_batch_size": 16,
          "max_length": 512,
          "learning_rate": 1e-5,
          "scheduler": 'CosineAnnealingLR',  # name dispatched on by fetch_scheduler
          "min_lr": 1e-6,
          "T_max": 500, # period of the learning-rate cycle used by the scheduler
          "weight_decay": 1e-6,
          "n_fold": 10,
          "n_accumulate": 4,  # micro-batches accumulated before each optimizer step
          "num_classes": 4,  # NOTE(review): not read by any visible code; SignateModel hard-codes 4 outputs
          "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
        #   "hash_name": HASH_NAME,
          "competition": "Signate",
          "_wandb_kernel": "deb",
          "dropout": 0.1,
          "output_name": "exp_049(MultiLabel)",  # used for the checkpoint folder and the W&B run/group names
          "pooling": "mean pooling"  # NOTE(review): appears to be a descriptive label only; no visible code reads it
          }

# NOTE(review): there is no "T_0" entry, which fetch_scheduler reads when the
# scheduler is 'CosineAnnealingWarmRestarts'.
# The tokenizer object is stored in CONFIG so every cell shares one instance.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
CONFIG['group'] = f'{CONFIG["output_name"]}'

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 特徴量エンジニアリング (Feature Engineering)

In [None]:
# Extract the HTML tags that occur in the descriptions.
# A tag here is "<", up to four arbitrary characters, then ">".
#
# The result is sorted so that the order is identical in every kernel session.
# Iterating a plain set of strings depends on the interpreter's hash seed, and
# this list is later handed to tokenizer.add_tokens, which assigns ids in list
# order. An unstable order would make saved checkpoints disagree with a
# tokenizer rebuilt in another session.
l = sorted({tag
            for text in df["description"]
            for tag in re.findall(r"<.{0,4}>", text)})
l

['<em>',
 '<p>',
 '<span>',
 '</ol>',
 '<li>',
 '</li>',
 '</ul>',
 '<ul>',
 '</em>',
 '</p>']

In [None]:
# Try simply adding the extracted HTML tags to the tokenizer vocabulary.
# NOTE(review): no visible code calls resize_token_embeddings on the model, so this
# presumably relies on the new ids fitting inside the pretrained embedding matrix - confirm.

CONFIG["tokenizer"].add_tokens(l, special_tokens=True)

10

In [None]:
def get_over_sentence(tokenizer, train, max_length=256):
    """Collect the text that overflows ``max_length`` tokens, restarted at its first ``<li>``.

    Rows whose description is longer than ``max_length`` characters are tokenized
    with ``tokenizer.encode_plus``. When the token sequence itself exceeds
    ``max_length``, the overflowing ids are decoded back to text, and if that text
    contains ``<li>`` everything from the first ``<li>`` onwards is kept.

    Side effect: a ``len`` column (character count of ``description``) is written
    into ``train``.

    Returns:
        DataFrame with columns ``description`` (the kept overflow text) and
        ``jobflag`` (the label of the originating row), indexed by the
        originating row labels.
    """
    train["len"] = train["description"].apply(lambda x: len(x))
    candidates = train[train["len"] > max_length]

    tails = []
    tail_labels = []
    for row_label, text in candidates["description"].items():
        token_ids = tokenizer.encode_plus(text).input_ids
        if len(token_ids) <= max_length:
            continue
        overflow = tokenizer.decode(token_ids[max_length:])
        li_start = overflow.find("<li>")
        if li_start == -1:
            # No list item begins inside the overflow, so nothing is kept
            continue
        tails.append(overflow[li_start:])
        tail_labels.append(row_label)

    return pd.DataFrame({"description": tails,
                         "jobflag": train.loc[tail_labels, "jobflag"]})

In [None]:
# Character length of each description, and its decile (1 = shortest ... 10 = longest)
df["len"] = df["description"].apply(len)
df["len_category"] = pd.qcut(df["len"], 10, labels=list(range(1, 11)))

In [None]:
#効果なし
# over_df = get_over_sentence(CONFIG["tokenizer"], df, max_length=256)
# df = pd.concat([over_df, df[["description", "jobflag"]]], axis=0).reset_index()
# df

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Set Seed for Reproducibility</h1></span>

In [None]:
def set_seed(seed=42):
    '''Seed the random number generators used by this notebook.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and the current CUDA
    device), switches cuDNN to deterministic mode with benchmarking disabled,
    and writes the seed into the PYTHONHASHSEED environment variable.'''
    # Environment variables must be strings
    os.environ['PYTHONHASHSEED'] = str(seed)

    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # Both cuDNN switches are needed for repeatable results on that backend
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
set_seed(CONFIG['seed'])

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Create Folds</h1></span>

# Multi-label KFold

In [None]:
df

Unnamed: 0,id,description,jobflag,len,len_category
0,0,<li>Develop cutting-edge web applications that...,3,867,9
1,1,"<li> Designs and develops high quality, scalab...",3,1735,10
2,2,<li>Functions as a point person for Network St...,4,1448,10
3,3,"<li> Work on the technical design, development...",3,632,7
4,4,<li>Quantify the resources required for a task...,4,276,3
...,...,...,...,...,...
1511,1511,"<li>Support detailed reporting, statistical an...",1,599,7
1512,1512,<li>Collaborate with teams to support the ML t...,2,363,4
1513,1513,<li> Work with executives and other business l...,1,852,9
1514,1514,<li>Leading design ideation sessions to ensure...,3,331,4


In [None]:
# CV folds for the plain training frame.
# Folds are stratified jointly on the job label and the description-length decile.
df["len"] = df["description"].apply(len)
df["len_category"] = pd.qcut(df["len"], 10, labels=list(range(1, 11))).astype(int)
df["jobflag"] = df["jobflag"].astype(int)

# Stratification key: label digit followed by the length decile, e.g. 3 and 10 -> "310"
df["multi_label"] = df["jobflag"].astype(str).str.cat(df["len_category"].astype(str))

# kf = KFold(n_splits=CONFIG["n_fold"], shuffle=True, random_state=2022)
skf = StratifiedKFold(n_splits=CONFIG["n_fold"], shuffle=True, random_state=2022)

# split() yields POSITIONAL row indices, so they are written with .iloc.
# Using .loc would only be correct while df carries a default RangeIndex.
df["kfold"] = -1
kfold_col = df.columns.get_loc("kfold")
for fold, (_, val_) in enumerate(skf.split(X=df, y=df["multi_label"])):
    df.iloc[val_, kfold_col] = fold

df["kfold"] = df["kfold"].astype(int)
assert (df["kfold"] >= 0).all(), "every row must be assigned to a validation fold"
df

Unnamed: 0,id,description,jobflag,len,len_category,multi_label,kfold
0,0,<li>Develop cutting-edge web applications that...,3,867,9,39,6
1,1,"<li> Designs and develops high quality, scalab...",3,1735,10,310,6
2,2,<li>Functions as a point person for Network St...,4,1448,10,410,9
3,3,"<li> Work on the technical design, development...",3,632,7,37,2
4,4,<li>Quantify the resources required for a task...,4,276,3,43,5
...,...,...,...,...,...,...,...
1511,1511,"<li>Support detailed reporting, statistical an...",1,599,7,17,9
1512,1512,<li>Collaborate with teams to support the ML t...,2,363,4,24,1
1513,1513,<li> Work with executives and other business l...,1,852,9,19,8
1514,1514,<li>Leading design ideation sessions to ensure...,3,331,4,34,1


In [None]:
#疑似ラベリングは効果なし
# #cv for pseudo labeling df
# pl_df["len"]  = pl_df["description"].apply(lambda x: len(x))
# pl_df["len"].sort_values(ascending=False)
# pl_df["len_category"] = pd.qcut(pl_df["len"], 10, labels=[i for i in range(1, 11)])

# pl_df = pl_df.astype({"jobflag": "str", "len_category": "str"})

# pl_df["multi_label"] = pl_df["jobflag"].str.cat(pl_df["len_category"])

# pl_df = pl_df.astype({"jobflag": "int", "len_category": "int"})


# # kf = KFold(n_splits=CONFIG["n_fold"], shuffle=True, random_state=2022)
# skf = StratifiedKFold(n_splits=CONFIG["n_fold"], shuffle=True, random_state=2022)

# for fold, ( _, val_) in enumerate(skf.split(X=pl_df, y=pl_df["multi_label"])):
#     pl_df.loc[val_ , "kfold"] = int(fold)
    
# pl_df["kfold"] = pl_df["kfold"].astype(int)
# pl_df.head()
# pl_df

# Stratified KFold




In [None]:
# kf = KFold(n_splits=CONFIG["n_fold"], shuffle=True, random_state=2022)
# skf = StratifiedKFold(n_splits=CONFIG["n_fold"], shuffle=True, random_state=2022)

# for fold, ( _, val_) in enumerate(skf.split(X=df, y=df["jobflag"])):
#     df.loc[val_ , "kfold"] = int(fold)
    
# df["kfold"] = df["kfold"].astype(int)
# df.head()

In [None]:
###data augmentation###は下に移動しました。

def get_split(x):
    """Split a description on short HTML tags and return the non-empty pieces.

    A separator is "<", one to four arbitrary characters, then ">". Empty
    fragments produced by adjacent separators are dropped.
    """
    fragments = re.split(r"<.{1,4}>", x)
    return [fragment for fragment in fragments if fragment]

df["split"] = df["description"].apply(get_split)
df["split"]

0       [Develop cutting-edge web applications that pe...
1       [ Designs and develops high quality, scalable ...
2       [Functions as a point person for Network Strat...
3       [ Work on the technical design, development, r...
4       [Quantify the resources required for a task/pr...
                              ...                        
1511    [Support detailed reporting, statistical analy...
1512    [Collaborate with teams to support the ML tech...
1513    [ Work with executives and other business lead...
1514    [Leading design ideation sessions to ensure we...
1515    [Detection of Issues &amp; Impact Assessments ...
Name: split, Length: 1516, dtype: object

In [None]:
df.loc[378, "description"]

'<li>Collaborates with internal stakeholders (e.g., Solution Architect, Account Delivery Executive, Pursuit Lead, Sales Solution Specialist) in the pre-sale process by understanding business requirements and providing industry and technical input and/or solution offerings to help shape the deal. Supports drafting proposals and/or statement of work (SOW). </li><li>Provides input on staffing and skill requirements for delivery to Resource Deployment, Technical Delivery Managers (TDMs), and/or Project Managers. </li></ul>Technical Delivery<ul><li>Follows capacity process outlined by Global Capacity Management team. Maintains tools with up-to-date skills and availability. </li><li>Leads meetings with customers/partners to understand business needs. Uses business, industry and technology strategies to map customer/partner requirements to the adoption and optimization of Microsoft technology solutions. Engages others appropriately to understand and define customer requirements. </li><li>Part

In [None]:
# df["description"] = df["description"].str[4:].str.strip()
# df["description"] = df["description"].str.replace("<..>", "")
# df["description"] = df["description"].str.replace("<...>", "")

# df["description"]

# Create Weights


In [None]:
# Balanced per-class weights for the loss, ordered by job label 1..4.
# `classes` is passed as an ndarray: recent scikit-learn releases validate this
# argument and reject a plain Python list.
weights = compute_class_weight(class_weight="balanced", classes=np.array([1, 2, 3, 4]), y=df["jobflag"])
weights

array([0.80982906, 4.30681818, 0.83296703, 0.75049505])

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Dataset Class</h1></span>

In [None]:
class SignateDataset(Dataset):
    """Dataset that tokenizes one job description per item.

    Reads the ``description`` and ``jobflag`` columns of ``df``. Items are
    returned unpadded; padding is left to whatever collate function the
    DataLoader is given.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        # Column values are cached once so item access does not go through pandas
        self.targets = df['jobflag'].values
        self.description = df["description"].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``target`` for row ``index``."""
        encoded = self.tokenizer.encode_plus(
            self.description[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
        )
        sample = {field: encoded[field] for field in ('input_ids', 'attention_mask')}
        sample['target'] = self.targets[index]
        return sample

In [None]:
collate_fn = DataCollatorWithPadding(tokenizer=CONFIG['tokenizer'])

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Create Model</h1></span>

In [None]:
class MeanPooling(nn.Module):
    """Attention-mask-weighted mean of hidden states along dimension 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average ``last_hidden_state`` over dim 1, ignoring masked-out positions.

        ``attention_mask`` gets a trailing axis and is broadcast to the shape of
        ``last_hidden_state``. The denominator is clamped to 1e-9 so a row with
        an all-zero mask yields zeros instead of dividing by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(dim=1)
        kept_count = mask.sum(dim=1).clamp(min=1e-9)
        return masked_total / kept_count

In [None]:
class SignateModel(nn.Module):
    """Pretrained transformer backbone + masked mean pooling + linear classifier.

    Args:
        model_name: checkpoint name or path passed to ``AutoModel.from_pretrained``.
        dropout: dropout probability applied to the pooled representation.
        num_classes: width of the output layer. Defaults to 4, the value that
            was previously hard-coded, so existing two-argument calls are unchanged.
    """

    def __init__(self, model_name, dropout, num_classes=4):
        super(SignateModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        # Reuse the configuration the backbone was built from instead of loading
        # the same checkpoint's config a second time.
        self.config = self.model.config
        self.drop = nn.Dropout(p=dropout)
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, num_classes)

    def forward(self, ids, mask):
        """Return unnormalised class scores for a batch of token ids and its attention mask."""
        out = self.model(input_ids=ids, attention_mask=mask,
                         output_hidden_states=False)
        # Pool with the same mask so padded positions do not contribute
        out = self.pooler(out.last_hidden_state, mask)
        out = self.drop(out)
        outputs = self.fc(out)
        return outputs

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Loss Function</h1></span>

In [None]:
def criterion(outputs, labels, device):
    """Class-weighted cross-entropy between ``outputs`` and ``labels``.

    The weights come from the notebook-level ``weights`` array and are moved to
    ``device``. One is subtracted from ``labels`` before the loss is computed, so
    the labels are expected to start at 1.
    """
    class_weights = torch.Tensor(weights).to(device)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    shifted_labels = labels - 1
    return loss_fn(outputs, shifted_labels)

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Training Function</h1></span>

In [None]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader`` with gradient accumulation.

    Gradients are accumulated over ``CONFIG['n_accumulate']`` batches before each
    optimizer (and scheduler) step.

    Args:
        model: called as ``model(ids, mask)``.
        optimizer: optimizer that is stepped and zeroed.
        scheduler: stepped after every optimizer step; may be ``None``.
        dataloader: yields dicts with ``input_ids``, ``attention_mask``, ``target``.
        device: device the batch tensors are moved to.
        epoch: epoch number, shown in the progress bar only.

    Returns:
        Sample-weighted mean training loss for the epoch. This is the plain loss,
        not the loss divided by the accumulation factor, so it is on the same
        scale as the validation loss. Returns 0.0 for an empty ``dataloader``.
    """
    model.train()

    dataset_size = 0
    running_loss = 0.0
    epoch_loss = 0.0  # defined up front so an empty dataloader cannot leave it unbound
    n_accumulate = CONFIG['n_accumulate']
    num_steps = len(dataloader)

    # Start from clean gradients so nothing left over from earlier work leaks in
    optimizer.zero_grad()

    bar = tqdm(enumerate(dataloader), total=num_steps)
    for step, data in bar:
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)

        outputs = model(ids, mask)

        loss = criterion(outputs, targets, device)
        # Only the value that is back-propagated is scaled; `loss` itself stays
        # unscaled for reporting.
        (loss / n_accumulate).backward()

        # Step on every full accumulation window, and also on the final batch so
        # a trailing partial window is applied instead of carried into the next epoch.
        is_last_step = (step + 1) == num_steps
        if (step + 1) % n_accumulate == 0 or is_last_step:
            optimizer.step()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Validation Function</h1></span>

In [None]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: called as ``model(ids, mask)``.
        dataloader: yields dicts with ``input_ids``, ``attention_mask``, ``target``.
        device: device the batch tensors are moved to.
        epoch: epoch number, shown in the progress bar only.

    Returns:
        ``(epoch_loss, f1_macro)``: the sample-weighted mean loss and the macro F1
        between the argmax predictions and ``target - 1``.
    """
    model.eval()

    predictions = []
    references = []
    dataset_size = 0
    running_loss = 0.0
    epoch_loss = 0.0  # defined up front so an empty dataloader cannot leave it unbound

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)

        outputs = model(ids, mask)

        loss = criterion(outputs, targets, device)

        # Collect flat lists directly instead of nesting per batch and
        # flattening afterwards.
        predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
        # Targets are shifted by one to line up with the argmax class indices
        references.extend((targets - 1).cpu().tolist())

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        # The learning rate is no longer displayed here: it was read from a
        # notebook-global `optimizer` that is not a parameter of this function.
        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss)

    # scikit-learn's argument order is (y_true, y_pred)
    f1_macro = f1_score(references, predictions, average="macro")

    gc.collect()

    return epoch_loss, f1_macro

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Run Training</h1></span>

In [None]:
def run_training(model, optimizer, scheduler, device, num_epochs, fold):
    """Train and validate ``model`` for ``num_epochs`` epochs on one fold.

    Reads the notebook-level ``train_loader`` and ``valid_loader`` and logs to the
    active W&B run. Two checkpoints are written under the experiment folder:
    ``Loss-Fold-{fold}.bin`` whenever the validation loss improves, and
    ``BestF1Model/Loss-Fold-{fold}.bin`` whenever the macro F1 improves.

    Args:
        model: model to train.
        optimizer: optimizer passed to ``train_one_epoch``.
        scheduler: learning-rate scheduler or ``None``.
        device: device used for both training and validation batches.
        num_epochs: number of epochs to run.
        fold: fold number, used in the checkpoint file names.

    Returns:
        ``(model, history, f1_macro, best_epoch_f1)`` where ``model`` carries the
        weights of the lowest-validation-loss epoch, ``history`` maps
        "Train Loss"/"Valid Loss" to per-epoch lists, ``f1_macro`` is the F1 at
        the lowest-loss epoch (NaN if the loss never improved), and
        ``best_epoch_f1`` is the highest F1 seen.
    """
    # To automatically log gradients
    wandb.watch(model, log_freq=100)

    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    path = f"/content/drive/MyDrive/kaggle/StudentCup/model/{CONFIG['output_name']}"
    best_f1_dir = f"{path}/BestF1Model"
    # Creates the experiment folder, any missing parents and the F1 sub-folder;
    # a no-op when they already exist.
    os.makedirs(best_f1_dir, exist_ok=True)

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    best_epoch_f1 = -np.inf
    # Defined up front: if the validation loss is NaN in every epoch the
    # "improved" branch never runs and this would otherwise be unbound at return.
    f1_macro = float("nan")
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)

        val_epoch_loss, f1 = valid_one_epoch(model, valid_loader, device=device,
                                             epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)

        # Log all metrics of the epoch in one call so they share a single W&B step
        wandb.log({"Train Loss": train_epoch_loss,
                   "Valid Loss": val_epoch_loss,
                   "F1-macro": f1})

        # Checkpoint on validation-loss improvement; these are also the weights
        # that get loaded back into the model before returning.
        if val_epoch_loss <= best_epoch_loss:
            print(f"{b_}Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            f1_macro = f1
            wandb.run.summary["Best Loss"] = best_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = f"{path}/Loss-Fold-{fold}.bin"

            torch.save(model.state_dict(), PATH)
            print(f"Model Saved{sr_}")

        # Independently of the loss, save the model whenever the F1 score improves.
        # This checkpoint is written to disk only; the in-memory best weights stay
        # tied to the loss criterion so the returned model is well defined.
        if f1 >= best_epoch_f1:
            best_epoch_f1 = f1
            print(f"{b_}F1-score Improved({best_epoch_f1})")
            PATH = f"{best_f1_dir}/Loss-Fold-{fold}.bin"

            torch.save(model.state_dict(), PATH)
            print(f"Model Saved{sr_}")

        print(f"Epoch {epoch} f1_score: ", f1)

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))
    print("Best F1: {:.4f}".format(best_epoch_f1))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history, f1_macro, best_epoch_f1

In [None]:
def prepare_loaders(fold):
    """Build the training and validation DataLoaders for one fold.

    Rows of the notebook-level ``df`` with ``kfold == fold`` form the validation
    set; all other rows form the training set. The training set is then extended
    with one augmented copy of every training row: the row's ``split`` segments
    are re-joined as ``<li>...</li>`` items after dropping one randomly chosen
    segment (rows with a single segment are re-joined unchanged).

    Args:
        fold: number of the fold held out for validation.

    Returns:
        ``(train_loader, valid_loader)``.
    """
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    # df_pl = pl_df[pl_df.kfold != fold].reset_index(drop=True)

    # ---- data augmentation, applied to the training rows only ----

    n_sample = 1  # a row must have more than this many segments before one is dropped

    augmented_rows = []
    for segments, flag, row_fold in zip(df_train["split"], df_train["jobflag"], df_train["kfold"]):
        # Work on a copy: the lists inside the "split" column are the same objects
        # held by the notebook-level df, so removing from them in place would
        # shrink the source data a little more on every fold.
        segments = list(segments)
        if len(segments) > n_sample:
            segments.remove(random.choice(segments))
        description = "".join("<li>" + segment + "</li>" for segment in segments)
        augmented_rows.append({"description": description, "jobflag": flag, "kfold": row_fold})

    # Concatenate once instead of appending row by row (quadratic, and
    # DataFrame.append no longer exists in pandas 2).
    if augmented_rows:
        df_train = pd.concat([df_train, pd.DataFrame(augmented_rows)], ignore_index=True)

    # df_train = pd.concat([df_train, df_pl], axis=0)  # join the pseudo-labelled rows onto the training data
    train_dataset = SignateDataset(df_train, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])
    valid_dataset = SignateDataset(df_valid, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])

    train_loader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'], collate_fn=collate_fn,
                              num_workers=2, shuffle=True, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'], collate_fn=collate_fn,
                              num_workers=2, shuffle=False, pin_memory=True)

    return train_loader, valid_loader

In [None]:
def fetch_scheduler(optimizer):
    """Create the learning-rate scheduler named by ``CONFIG['scheduler']``.

    Args:
        optimizer: optimizer the scheduler is attached to.

    Returns:
        A ``CosineAnnealingLR`` or ``CosineAnnealingWarmRestarts`` instance, or
        ``None`` when ``CONFIG['scheduler']`` is ``None``.

    Raises:
        KeyError: 'CosineAnnealingWarmRestarts' is selected but ``CONFIG`` has no 'T_0'.
        ValueError: the configured scheduler name is not recognised.
    """
    name = CONFIG['scheduler']

    if name is None:
        return None

    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=CONFIG['T_max'],
                                              eta_min=CONFIG['min_lr'])

    if name == 'CosineAnnealingWarmRestarts':
        if 'T_0' not in CONFIG:
            raise KeyError("CONFIG['T_0'] must be set to use CosineAnnealingWarmRestarts")
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CONFIG['T_0'],
                                                        eta_min=CONFIG['min_lr'])

    # Previously an unknown name fell through to `return scheduler` with the
    # variable never assigned.
    raise ValueError(f"Unknown scheduler: {name!r}")

<span style="color: #000508; font-family: Segoe UI; font-size: 1.5em; font-weight: 300;">Start Training</span>

In [None]:
# is_available must be CALLED: the bare function object is always truthy, so the
# check used to report a GPU even when none was present.
if torch.cuda.is_available():
  print('GPU available')
else:
  print('Please set GPU via Edit -> Notebook Settings.')

GPU available


In [None]:
# Train one model per fold and collect two F1 values per fold:
#   f1_average      - macro F1 at the epoch with the lowest validation loss
#   best_f1_average - highest macro F1 reached in any epoch
f1_average = []
best_f1_average = []
for fold in range(0, CONFIG['n_fold']):
    print(f"{y_}====== Fold: {fold} ======{sr_}")
    # `anony` comes from the login cell: None after a successful login, "must"
    # otherwise. Passing it here (instead of a fixed 'must') lets runs be logged
    # under the logged-in account.
    run = wandb.init(project='Signate',
                     config=CONFIG,
                     job_type='Train',
                     group=CONFIG['group'],
                     tags=[CONFIG['model_name'], f'{CONFIG["output_name"]}'],
                     name=f'{CONFIG["output_name"]}-fold-{fold}',
                     anonymous=anony)

    # Create Dataloaders
    train_loader, valid_loader = prepare_loaders(fold=fold)

    model = SignateModel(CONFIG['model_name'], CONFIG["dropout"])
    model.to(CONFIG['device'])

    # Define Optimizer and Scheduler
    optimizer = AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'])
    scheduler = fetch_scheduler(optimizer)

    model, history, best_f1_fold, best_epoch_f1 = run_training(model, optimizer, scheduler,
                                  device=CONFIG['device'],
                                  num_epochs=CONFIG['epochs'],
                                  fold=fold)

    print("F1-macro: ", best_f1_fold)
    f1_average.append(best_f1_fold)
    best_f1_average.append(best_epoch_f1)
    # The cross-validation averages are written into the last fold's run summary,
    # which has to happen before that run is finished.
    if fold == CONFIG["n_fold"] - 1:
        f1_cv = sum(f1_average) / CONFIG["n_fold"]
        best_f1_cv = sum(best_f1_average) / CONFIG["n_fold"]
        run.summary["CV"] = f1_cv
        run.summary["BestF1CV"] = best_f1_cv
    run.finish()

    # Drop every per-fold object that references the model (the optimizer and
    # scheduler hold its parameters and state) and return cached GPU memory
    # before the next fold builds a fresh model.
    del model, history, train_loader, valid_loader, optimizer, scheduler
    _ = gc.collect()
    torch.cuda.empty_cache()
    print()


print(f"CV: {f1_cv:5f}")
print(f"Best f1 CV: {best_f1_cv:5f}")

In [None]:
# W&B page of this experiment's run group (the account name is fixed in the URL).
url = f"https://wandb.ai/ando0718/Signate/groups/{CONFIG['group']}/"

# Only used to show the W&B group page inside this interactive session.
from IPython import display

# Embed the page in an IFrame with an explicit width and height.
# iF = display.IFrame(url, width=1080, height=720)
iF = display.IFrame(url, width=1500, height=600)

# Last expression of the cell, so the frame is rendered as the cell output
iF