In [1]:
# Report the GPU, driver and CUDA version attached to this runtime.
! nvidia-smi

Sat Jul 30 04:01:50 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os

class Config:
    """Static experiment settings, used as a plain attribute namespace.

    The class object itself (not an instance) is passed to ``setup``, which
    attaches further attributes to it (resolved directories, torch device).
    """

    # ---- experiment identity -------------------------------------------
    AUTHOR = "wanwan7123"
    NAME = "feedback-mlm-deberta-large"
    MODEL_PATH = "microsoft/deberta-large"
    # Extra Kaggle datasets that setup() downloads on Colab; entries are
    # split on "/" there, so presumably "owner/dataset" slugs. None needed here.
    DATASET_PATH = list()

    # ---- locations -----------------------------------------------------
    COMPETITION = "feedback-prize-effectiveness"
    COLAB_PATH = "/content/drive/MyDrive/DataAnalysis/competicion/competicion_feedback"
    DRIVE_PATH = os.path.join(COLAB_PATH, AUTHOR)
    # JSON file holding the Kaggle API "username" and "key" fields.
    api_path = "/content/drive/MyDrive/kaggle.json"

    # ---- hyper-parameters ----------------------------------------------
    seed = 41
    num_fold = 5
    trn_fold = list(range(5))
    fc_dropout = 0.1
    weight_decay = 0.001
    beta = 0.9, 0.98
    num_warmup_steps_rate = 0.01
    clip_grad_norm = None
    gradient_accumulation_steps = 1

    # ---- misc ----------------------------------------------------------
    upload_from_colab = True

In [3]:
import os
import re
import gc
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval
from tqdm.auto import tqdm
from pathlib import Path
from glob import glob

import numpy as np
import pandas as pd
import scipy 
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import log_loss
!pip install torch==1.10

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
from torch.nn.utils.rnn import pad_sequence

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.10
  Downloading torch-1.10.0-cp37-cp37m-manylinux1_x86_64.whl (881.9 MB)
[K     |██████████████████████████████▎ | 834.1 MB 1.1 MB/s eta 0:00:43tcmalloc: large alloc 1147494400 bytes == 0x6487e000 @  0x7efcaae75615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7
[K     |████████████████████████████████| 881.9 MB 17 kB/s 
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.12.0+cu113
    Uninstalling torch-1.12.0+cu113:
      Successfully uninstalled torch-1.12.0+cu113
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour 

In [4]:
def setup(cfg):
    cfg.COLAB = 'google.colab' in sys.modules
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if cfg.COLAB:
        print('This environment is Google Colab')

        # mount
        from google.colab import drive
        if not os.path.isdir('/content/drive'):
            drive.mount('/content/drive') 

        # pip install
        ! pip install transformers==4.16.2
        ! pip install tokenizers==0.11.6
        ! pip install transformers[sentencepiece]

        # use kaggle api (need kaggle token)
        f = open(cfg.api_path, 'r')
        json_data = json.load(f) 
        os.environ['KAGGLE_USERNAME'] = json_data['username']
        os.environ['KAGGLE_KEY'] = json_data['key']

        # set dirs
        cfg.DRIVE = cfg.DRIVE_PATH
        cfg.EXP = (cfg.NAME if cfg.NAME is not None 
            else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
        )
        cfg.INPUT = os.path.join(cfg.DRIVE, 'Input')
        cfg.OUTPUT = os.path.join(cfg.DRIVE, 'Output')
        cfg.SUBMISSION = os.path.join(cfg.DRIVE, 'Submission')
        cfg.DATASET = os.path.join(cfg.DRIVE, 'Dataset')

        cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
        cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

        # make dirs
        for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
        
        if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
            # load dataset
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c $cfg.COMPETITION -p $cfg.INPUT
            filepath = os.path.join(cfg.INPUT,cfg.COMPETITION+'.zip')
            ! unzip -d $cfg.INPUT $filepath
            
        
        for path in cfg.DATASET_PATH:
            datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
            if not os.path.exists(datasetpath):
                os.makedirs(datasetpath, exist_ok=True)
                ! kaggle datasets download $path -p $datasetpath
                filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
                ! unzip -d $datasetpath $filepath

    else:
        print('This environment is Kaggle Kernel')

        # set dirs
        cfg.INPUT = f'../input/{cfg.COMPETITION}'
        cfg.EXP = cfg.NAME
        cfg.OUTPUT_EXP = cfg.NAME
        cfg.SUBMISSION = './'
        cfg.DATASET = '../input/'
        
        cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

        # make dirs
        for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
    return cfg

In [5]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, NumPy, and PyTorch on the CPU and on all CUDA
    devices, exports ``PYTHONHASHSEED`` and pins cuDNN to deterministic,
    non-autotuned kernels.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for interpreters started after this point; the hash seed of
    # the already-running kernel cannot be changed retroactively.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU, not only the current device (safe to call without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning can select different kernels from run to run, which would
    # undermine the deterministic flag above.
    torch.backends.cudnn.benchmark = False

In [6]:
# setup
cfg = setup(Config)

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import (AutoModelForMaskedLM,
                          AutoTokenizer, LineByLineTextDataset,
                          DataCollatorForLanguageModeling,
                          Trainer, TrainingArguments)
import tokenizers
import sentencepiece
%env TOKENIZERS_PARALLELISM=true
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

df=pd.read_csv(os.path.join(cfg.INPUT, 'mlm/mlm_for_feedback.csv'))
df=df.dropna().reset_index(drop=True)
display(df.head())

This environment is Google Colab
Mounted at /content/drive
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.16.2
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 14.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 80.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 88.7 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |██████████████████████

Unnamed: 0,full_text
0,Phones & Driving\n\nDrivers should not be able...
1,Cell Phone Operation While Driving\n\nThe abil...
2,People are debating whether if drivers should ...
3,Texting and driving\n\nOver half of drivers in...
4,Operating a motor vehicle while on your cell p...


In [7]:
# Encode line breaks as a literal [BR] marker so each text stays on a single line.
df['full_text'] = [essay.replace('\n', '[BR]') for essay in df['full_text']]

In [8]:
# Preview the first rows to confirm the newlines were replaced by [BR].
display(df.head())

Unnamed: 0,full_text
0,Phones & Driving[BR][BR]Drivers should not be ...
1,Cell Phone Operation While Driving[BR][BR]The ...
2,People are debating whether if drivers should ...
3,Texting and driving[BR][BR]Over half of driver...
4,Operating a motor vehicle while on your cell p...


In [9]:
# Dump the texts to a plain-text corpus file, one entry per line.
corpus_path = os.path.join(cfg.INPUT, 'mlm/corpus.txt')
with open(corpus_path, 'w', encoding='utf-8') as corpus_file:
    corpus_file.writelines(essay + '\n' for essay in df['full_text'])

In [10]:
# Seed all RNGs before loading: the checkpoint carries no weights for the MLM
# prediction head (see the "newly initialized" warning below), so those
# parameters are drawn randomly at load time.
set_seed(cfg.seed)

# Take the checkpoint name from the config rather than repeating the literal,
# so the two cannot drift apart.
model_name = cfg.MODEL_PATH

model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/475 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight']
- This IS expected if you are initializing DebertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['cls.predictions.transform.dense.we

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

In [11]:
# Vocabulary size of the stock tokenizer, before any custom token is added.
print("vocab size", len(tokenizer))

vocab size 50265


In [12]:
# Register the [BR] line-break marker as a token of its own.
tokenizer.add_tokens(['[BR]'], special_tokens=True)
vocab_size = len(tokenizer)
print("vocab size", vocab_size)

# Grow the embedding matrix so the new token gets a vector.
model.resize_token_embeddings(vocab_size)

vocab size 50266


Embedding(50266, 1024)

In [13]:
# Persist the tokenizer (now including [BR]) under the experiment output directory.
tokenizer.save_pretrained(os.path.join(cfg.OUTPUT_EXP, 'tokenizer'))

('/content/drive/MyDrive/DataAnalysis/competicion/competicion_feedback/wanwan7123/Output/feedback-mlm-deberta-large/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/DataAnalysis/competicion/competicion_feedback/wanwan7123/Output/feedback-mlm-deberta-large/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/DataAnalysis/competicion/competicion_feedback/wanwan7123/Output/feedback-mlm-deberta-large/tokenizer/vocab.json',
 '/content/drive/MyDrive/DataAnalysis/competicion/competicion_feedback/wanwan7123/Output/feedback-mlm-deberta-large/tokenizer/merges.txt',
 '/content/drive/MyDrive/DataAnalysis/competicion/competicion_feedback/wanwan7123/Output/feedback-mlm-deberta-large/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/DataAnalysis/competicion/competicion_feedback/wanwan7123/Output/feedback-mlm-deberta-large/tokenizer/tokenizer.json')

In [14]:
# Tokenise the corpus once, one example per line of the text file.
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=os.path.join(cfg.INPUT, 'mlm/corpus.txt'),  # mention train text file here
    block_size=256)

# NOTE(review): evaluation uses the very same corpus as training, so the
# reported validation loss measures fit, not generalisation. The dataset
# object is reused instead of tokenising the identical file a second time.
valid_dataset = train_dataset

# Dynamic masking: 15% of the tokens are selected for the MLM objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

training_args = TrainingArguments(
    output_dir= cfg.EXP_MODEL,  # select model path for checkpoint
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # NOTE(review): Config.gradient_accumulation_steps is 1 but this run uses 2;
    # left hard-coded because the intended value is unclear - confirm.
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    weight_decay=cfg.weight_decay,
    evaluation_strategy='steps',
    save_total_limit=1,
    eval_steps=500,
    save_steps=500,
    # Per the TrainingArguments docs these two are meant to be used together
    # with load_best_model_at_end, which is disabled here.
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    load_best_model_at_end=False,
    report_to="none",
    fp16=True,
    # Use the experiment seed from the config instead of the library default.
    seed=cfg.seed,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset)

trainer.train()
# Write the final weights and config directly into the experiment model directory.
trainer.save_model(cfg.EXP_MODEL)

Using amp half precision backend
***** Running training *****
  Num examples = 11406
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 14260


Step,Training Loss,Validation Loss
500,3.6044,2.309995
1000,2.2054,1.86247
1500,1.8893,1.680219
2000,1.7241,1.553409
2500,1.614,1.472004
3000,1.5517,1.414549
3500,1.4876,1.366595
4000,1.4478,1.314063
4500,1.3874,1.288961
5000,1.3735,1.253332


***** Running Evaluation *****
  Num examples = 11406
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/DataAnalysis/competicion/competicion_feedback/wanwan7123/Output/feedback-mlm-deberta-large/model/checkpoint-500
Configuration saved in /content/drive/MyDrive/DataAnalysis/competicion/competicion_feedback/wanwan7123/Output/feedback-mlm-deberta-large/model/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/DataAnalysis/competicion/competicion_feedback/wanwan7123/Output/feedback-mlm-deberta-large/model/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 11406
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/DataAnalysis/competicion/competicion_feedback/wanwan7123/Output/feedback-mlm-deberta-large/model/checkpoint-1000
Configuration saved in /content/drive/MyDrive/DataAnalysis/competicion/competicion_feedback/wanwan7123/Output/feedback-mlm-deberta-large/model/checkpoint-1000/config.json
Model weights sa