In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import gc

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel, AdamW


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for input_path in (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Shell escape: ask the driver tool for GPU status.
! nvidia-smi

/usr/bin/sh: nvidia-smi: 未找到命令


In [2]:
# Data analysis / preprocessing: load the pairwise validation file.
validation_csv_path = "/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv"
df = pd.read_csv(validation_csv_path)
df.head()

预处理方式
1. 特殊符号过滤
2. 大小写
3. 停用词

In [3]:
def special_filter():
    """Placeholder for the special-symbol filtering step; not implemented, returns None."""
    return None
def lowercase():
    """Placeholder for the letter-case normalisation step; not implemented, returns None."""
    return None
def stop_word():
    """Placeholder for the stop-word handling step; not implemented, returns None."""
    return None
def no_action():
    """Explicit no-op: does nothing and returns None."""
    return None
# Baseline: apply no preprocessing.
no_action()

In [4]:
import matplotlib.pyplot as plt

# Data analysis: distribution of comment lengths.
# Collect every distinct comment from both columns. dict.fromkeys de-duplicates
# while keeping first-seen order, so `sentences` (and therefore any positional
# lookup such as sentences[-1]) is the same on every kernel restart; a plain
# set() of strings has no stable order across Python processes.
sentences = list(dict.fromkeys(
    df['less_toxic'].values.tolist() + df['more_toxic'].values.tolist()
))
print(len(sentences))

# Lengths are measured in characters (len of the string), not in tokens.
sen_lengths = [len(sentence) for sentence in sentences]
fig, ax = plt.subplots()
ax.hist(sen_lengths, bins=100)
ax.set(
    title="Comment length distribution",
    xlabel="Length (characters)",
    ylabel="Number of comments",
)
plt.show()

# TODO: remove overly long samples.

In [5]:
# Display the last entry of `sentences` as a spot check.
sentences[-1]

In [6]:
# Split into training and test sets: a seeded 90% random sample is used for
# training, the rows that were not sampled form the test set.
train_data = df.sample(frac=0.9, random_state=200)  # random_state is the seed value
test_data = df.drop(index=train_data.index)
for split in (train_data, test_data):
    print(split.shape)

In [7]:
class MyDataset(Dataset):
    """Pairwise dataset over the 'more_toxic' / 'less_toxic' columns of a DataFrame.

    Each item is a dict holding the token ids and attention mask of both texts
    of a pair, plus a constant target of 1.
    """

    def __init__(self, df, tokenizer, max_length):
        """Store the frame, the tokenizer and the max length handed to the tokenizer.

        Args:
            df: DataFrame with 'more_toxic' and 'less_toxic' columns.
            tokenizer: object exposing ``encode_plus`` whose result provides
                'input_ids' and 'attention_mask'.
            max_length: value forwarded to the tokenizer as ``max_length``.
        """
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.more_toxic = df['more_toxic'].values
        self.less_toxic = df['less_toxic'].values

    def __len__(self):
        """Number of pairs, i.e. rows of the wrapped DataFrame."""
        return len(self.df)

    def _encode(self, text):
        """Tokenize one text with the settings shared by both sides of a pair."""
        return self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length'
        )

    @staticmethod
    def _as_long(values):
        """Wrap ``values`` in an int64 tensor."""
        return torch.tensor(values, dtype=torch.long)

    def __getitem__(self, index):
        """Return the encoded pair stored at position ``index``."""
        more_text = self.more_toxic[index]
        less_text = self.less_toxic[index]

        encoded_more = self._encode(more_text)
        encoded_less = self._encode(less_text)

        return {
            'more_toxic_ids': self._as_long(encoded_more['input_ids']),
            'more_toxic_mask': self._as_long(encoded_more['attention_mask']),
            'less_toxic_ids': self._as_long(encoded_less['input_ids']),
            'less_toxic_mask': self._as_long(encoded_less['attention_mask']),
            # The target is the constant 1 for every pair.
            'target': self._as_long(1)
        }

In [8]:
class MyModel(nn.Module):
    """Pretrained backbone followed by dropout and a linear head."""

    def __init__(self, config):
        """Build the layers from ``config``.

        Reads ``config.model_name`` (backbone loaded through AutoModel),
        ``config.dropout``, ``config.hidden_dim`` and ``config.num_labels``.
        """
        super().__init__()
        self.model = AutoModel.from_pretrained(config.model_name)
        self.drop = nn.Dropout(p=config.dropout)
        self.fc = nn.Linear(config.hidden_dim, config.num_labels)

    def forward(self, ids, mask):
        """Run the backbone on ``ids`` / ``mask`` and return the head output."""
        backbone_out = self.model(input_ids=ids, attention_mask=mask)
        # Element 1 of the backbone output feeds the head.
        # NOTE(review): presumably the pooled output for BERT-style models - confirm
        # before swapping in a different backbone.
        features = backbone_out[1]
        return self.fc(self.drop(features))

In [9]:
def criterion(outputs1, outputs2, targets):
    """Placeholder criterion: returns its three arguments unchanged as a tuple.

    No loss value is computed here.
    """
    passthrough = (outputs1, outputs2, targets)
    return passthrough

In [10]:
def train_one_epoch(model, config, optimizer, criterion, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader`` with a pairwise criterion.

    Args:
        model: callable as ``model(ids, mask)``; must yield one score per example.
        config: object providing ``n_accumulate`` (gradient accumulation steps).
        optimizer: optimizer stepped every ``n_accumulate`` batches.
        criterion: callable as ``criterion(more_scores, less_scores, targets)``
            returning a scalar loss tensor.
        scheduler: optional LR scheduler, stepped together with the optimizer.
        dataloader: sized iterable of batches with the keys 'more_toxic_ids',
            'more_toxic_mask', 'less_toxic_ids', 'less_toxic_mask', 'target'.
        device: device the batch tensors are moved to.
        epoch: epoch index, only shown in the progress bar.

    Returns:
        Sample-weighted mean loss over the epoch (0.0 for an empty dataloader).
    """
    model.train()
    dataset_size = 0
    running_loss = 0.0
    # Defined up-front so an empty dataloader returns 0.0 instead of raising.
    epoch_loss = 0.0
    num_batches = len(dataloader)

    bar = tqdm(enumerate(dataloader), total=num_batches)
    for step, data in bar:
        more_toxic_ids = data['more_toxic_ids'].to(device, dtype = torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype = torch.long)
        less_toxic_ids = data['less_toxic_ids'].to(device, dtype = torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = more_toxic_ids.size(0)

        # Flatten the scores to one value per example so they have the same
        # shape as `targets`; a ranking loss needs both inputs and the target
        # to share their shape.
        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask).reshape(batch_size)
        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask).reshape(batch_size)

        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)
        # Only the back-propagated value is scaled for accumulation; the
        # reported loss below stays on the per-batch scale.
        (loss / config.n_accumulate).backward()

        # Also step on the final batch so a trailing, incomplete accumulation
        # window is applied instead of leaking into the next epoch.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % config.n_accumulate == 0 or is_last_batch:
            optimizer.step()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss

In [11]:
@torch.no_grad()
def test_one_epoch(model,criterion,dataloader, device, epoch):
    model.eval()
    
    dataset_size = 0
    running_loss = 0.0
    
    right = 0
    total = 0
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:        
        more_toxic_ids = data['more_toxic_ids'].to(device, dtype = torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype = torch.long)
        less_toxic_ids = data['less_toxic_ids'].to(device, dtype = torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype=torch.long)
        
        batch_size = more_toxic_ids.size(0)

        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)
        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)
        right += (more_toxic_outputs>less_toxic_outputs).sum()
        total += batch_size 
        
        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)
        
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        
        epoch_loss = running_loss / dataset_size
        
        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])   
        
    print("acc:{}".format(right/total))
    
    gc.collect()
    
    return epoch_loss

In [12]:
from dataclasses import dataclass

@dataclass
class Config:
    """Training configuration shared by the model, data loaders and training loop."""
    # Checkpoint name handed to AutoTokenizer / AutoModel.from_pretrained.
    model_name: str = "bert-base-cased"
    # Input width of the linear head in MyModel.
    # NOTE(review): presumably has to equal the backbone's output width - confirm when changing model_name.
    hidden_dim: int = 768
    # max_length forwarded to the tokenizer by MyDataset.
    max_length: int = 512
    # Dropout probability applied before the linear head.
    dropout: float = 0.3
    # Output width of the linear head.
    num_labels: int = 1
    # Learning rate passed to the optimizer.
    # NOTE(review): 1e-3 looks high for fine-tuning a pretrained transformer - confirm.
    lr: float = 0.001
    # Number of train/evaluate rounds run by the training cell.
    epoch: int = 2
    # Gradient accumulation steps used by train_one_epoch.
    n_accumulate: int = 1
    # Weight decay passed to the optimizer.
    weight_decay: float = 1e-6
# Instantiate with the defaults and display the resulting configuration.
config = Config()
config

In [13]:
# A `!VAR=value` shell escape only assigns the variable inside a throwaway
# subshell, so the kernel process (and therefore PyTorch) never sees it.
# Set it on the kernel's own environment instead.
# Debugging aid only. NOTE(review): presumably has to be set before the first
# CUDA call in this process to take effect - confirm.
import os

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [14]:
# Train!

tokenizer = AutoTokenizer.from_pretrained(config.model_name)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = MyModel(config)
# Move the model to its device before the optimizer is constructed.
model.to(device)

criterion = nn.MarginRankingLoss(margin=0.5)
optimizer = AdamW(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
scheduler = None

# Quick smoke run: train on a seeded 1% sample of the training split.
# NOTE(review): batch_size=16 with config.max_length tokens and two forward
# passes per step is memory hungry; the CUDA memory summary recorded further
# down in this notebook reports an OOM - consider lowering one of them.
tmp_train_data = train_data.sample(frac=0.01,random_state=200)
train_loader = DataLoader(MyDataset(tmp_train_data,tokenizer,config.max_length),batch_size=16,shuffle=True)
# Evaluate on the held-out split. Building this loader from the training
# sample would only measure how well the model fits its own training data.
test_loader = DataLoader(MyDataset(test_data,tokenizer,config.max_length),batch_size=16)

for i in range(config.epoch):
    train_one_epoch(model,config,optimizer,criterion,scheduler,train_loader,device,i)
    test_one_epoch(model,criterion,test_loader,device,i)


In [None]:
# Shell escape: show the current GPU status.
!nvidia-smi

# GPUtil is needed by the import below; uncomment to install it if it is missing.
# !pip install GPUtil

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Print GPU utilisation, release PyTorch's cached GPU memory, print it again.

    Only memory that is cached but no longer referenced can be released; tensors
    that are still alive (for example a model kept in a notebook variable) stay
    allocated.

    The CUDA context is deliberately left intact. Closing it through numba
    would destroy the context PyTorch is using in this same process, and every
    existing CUDA tensor - including an already loaded model - would become
    unusable until the kernel is restarted.
    """
    import gc

    print("Initial GPU Usage")
    gpu_usage()

    # Drop unreachable Python objects first so their GPU tensors can be freed,
    # then hand the cached blocks back to the driver.
    gc.collect()
    torch.cuda.empty_cache()

    print("GPU Usage after emptying the cache")
    gpu_usage()

# Run the GPU cache clean-up defined above.
free_gpu_cache()    

|===========================================================================|\n|                  PyTorch CUDA memory summary, device ID 0                 |\n|---------------------------------------------------------------------------|\n|            CUDA OOMs: 1            |        cudaMalloc retries: 1         |\n|===========================================================================|\n|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |\n|---------------------------------------------------------------------------|\n| Allocated memory      |   15011 MB |   15107 MB |   23364 MB |    8352 MB |\n|       from large pool |   15008 MB |   15104 MB |   23360 MB |    8352 MB |\n|       from small pool |       3 MB |       3 MB |       3 MB |       0 MB |\n|---------------------------------------------------------------------------|\n| Active memory         |   15011 MB |   15107 MB |   23364 MB |    8352 MB |\n|       from large pool |   15008 MB |   15104 MB |   23360 MB |    8352 MB |\n|       from small pool |       3 MB |       3 MB |       3 MB |       0 MB |\n|---------------------------------------------------------------------------|\n| GPU reserved memory   |   15076 MB |   15172 MB |   15172 MB |   98304 KB |\n|       from large pool |   15072 MB |   15168 MB |   15168 MB |   98304 KB |\n|       from small pool |       4 MB |       4 MB |       4 MB |       0 KB |\n|---------------------------------------------------------------------------|\n| Non-releasable memory |   66135 KB |   79575 KB |  415431 KB |  349295 KB |\n|       from large pool |   65280 KB |   77568 KB |  410618 KB |  345338 KB |\n|       from small pool |     855 KB |    2045 KB |    4813 KB |    3957 KB |\n|---------------------------------------------------------------------------|\n| Allocations           |     358    |     360    |     444    |      86    |\n|       from large pool |     193    |     195    |     269    |      76    |\n|       from small pool 
|     165    |     165    |     175    |      10    |\n|---------------------------------------------------------------------------|\n| Active allocs         |     358    |     360    |     444    |      86    |\n|       from large pool |     193    |     195    |     269    |      76    |\n|       from small pool |     165    |     165    |     175    |      10    |\n|---------------------------------------------------------------------------|\n| GPU reserved segments |     129    |     131    |     131    |       2    |\n|       from large pool |     127    |     129    |     129    |       2    |\n|       from small pool |       2    |       2    |       2    |       0    |\n|---------------------------------------------------------------------------|\n| Non-releasable allocs |      21    |      22    |      30    |       9    |\n|       from large pool |      19    |      19    |      23    |       4    |\n|       from small pool |       2    |       4    |       7    |       5    |\n|===========================================================================|\n

In [None]:
# Ten further train/evaluate rounds, reusing the model, optimizer and loaders
# that already exist in the kernel.
for i in range(10):
    train_one_epoch(
        model=model,
        config=config,
        optimizer=optimizer,
        criterion=criterion,
        scheduler=scheduler,
        dataloader=train_loader,
        device=device,
        epoch=i,
    )
    test_one_epoch(
        model=model,
        criterion=criterion,
        dataloader=test_loader,
        device=device,
        epoch=i,
    )

In [None]:
# Debug: push a single training batch through the model by hand and print the
# intermediate values.
right = 0
total = 0
# Inspection only - disable autograd so no graph is built and no activations
# are kept in GPU memory.
with torch.no_grad():
    for step, data in enumerate(train_loader):
        more_toxic_ids = data['more_toxic_ids'].to(device, dtype = torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype = torch.long)
        less_toxic_ids = data['less_toxic_ids'].to(device, dtype = torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = more_toxic_ids.size(0)
        print(batch_size)

        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)
        print(more_toxic_outputs)
        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)
        print(less_toxic_outputs)
        # .item() keeps the counter a plain Python int rather than a tensor.
        right += (more_toxic_outputs > less_toxic_outputs).sum().item()
        print(right)
        total += batch_size

        # Flatten the scores to one value per example so they have the same
        # shape as `targets` when handed to the criterion.
        loss = criterion(
            more_toxic_outputs.reshape(batch_size),
            less_toxic_outputs.reshape(batch_size),
            targets,
        )
        print(loss)
        # Only the first batch is inspected.
        break