In [None]:
%pylab inline
plt.style.use("bmh")

In [None]:
import pathlib
import pandas as pd

from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from torch.utils.data import Dataset, SubsetRandomSampler

import torch
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from tqdm import tqdm
tqdm.pandas()

In [None]:
RS = 7345
# Seed both NumPy and PyTorch: the classifier head initialisation and dropout
# draw from torch's RNG, so seeding NumPy alone does not make runs repeatable.
np.random.seed(42)
torch.manual_seed(42)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
#device = torch.device('cpu')

# Загрузка данных

In [None]:
DATA_DIR = pathlib.Path("./")

In [None]:
# The Kaggle input paths are absolute. Passing an absolute path to
# Path.joinpath() discards the base directory, so the input directory is
# spelled out explicitly instead of being "joined" onto DATA_DIR.
KAGGLE_INPUT_DIR = pathlib.Path("/kaggle/input/sibur20-naming-data")

# Both frames are indexed by `pair_id`.
train = pd.read_csv(KAGGLE_INPUT_DIR / "train.csv", index_col="pair_id")
test = pd.read_csv(KAGGLE_INPUT_DIR / "test.csv", index_col="pair_id")

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.info()

In [None]:
test.info()

# Статистика таргета

In [None]:
train.is_duplicate.value_counts()

In [None]:
train[train.is_duplicate==1].sample(15)

In [None]:
train[train.is_duplicate==0].sample(15)

# Очистка данных

In [None]:
import pycountry
import re

In [None]:
countries = [country.name.lower() for country in pycountry.countries]

In [None]:
train["name_1"] = train["name_1"].str.lower()
train["name_2"] = train["name_2"].str.lower()

test["name_1"] = test["name_1"].str.lower()
test["name_2"] = test["name_2"].str.lower()

In [None]:
train[train.name_1.str.contains("gmbh")]

In [None]:
legal_entities = ["ltd.", "co.", "inc.", "b.v.", "s.c.r.l.", "gmbh", "pvt."]

for entity in tqdm(legal_entities):
    # re.escape keeps the dots literal. Unescaped, "." is a wildcard, so e.g.
    # "co." would also eat " com" out of " company". A raw f-string avoids the
    # invalid "\s" escape of a plain string literal.
    # Spellings without the trailing dot are matched only if listed explicitly.
    entity_pattern = re.compile(rf"\s+{re.escape(entity)}\s*")
    train.replace(entity_pattern, "", inplace=True)
    test.replace(entity_pattern, "", inplace=True)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.replace(re.compile(r"\s+\(.*\)"), "", inplace=True)
test.replace(re.compile(r"\s+\(.*\)"), "", inplace=True)

In [None]:
# Build ONE pattern for all country names instead of one full-frame pass per country.
#  * re.escape: names contain regex metacharacters (".", "(", ")", ...).
#  * longest names first: "nigeria" must win over its prefix "niger",
#    "dominican republic" over "dominica", and so on.
#  * (?<!\w) / (?!\w): match whole names only, so "oman" is not cut out of
#    "romania". Lookarounds are used rather than \b because some names end in
#    a non-word character.
country_alternatives = "|".join(
    re.escape(name) for name in sorted(countries, key=len, reverse=True)
)
country_pattern = re.compile(rf"(?<!\w)(?:{country_alternatives})(?!\w)")

train.replace(country_pattern, "", inplace=True)
test.replace(country_pattern, "", inplace=True)

In [None]:
train.replace(re.compile(r"[^\w\s]"), "", inplace=True)
test.replace(re.compile(r"[^\w\s]"), "", inplace=True)

In [None]:
train.sample(25)

In [None]:
test.sample(25)

In [None]:
train['full_name'] = train['name_1'] + ' # ' + train['name_2']
test['full_name'] = test['name_1'] + ' # ' + test['name_2']

# Токенизатор

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [None]:
train['full_name'].iloc[0]

In [None]:
def tokenize(s, max_length=32):
    """Encode a string with the module-level BERT `tokenizer`.

    The sequence gets special tokens added, is truncated to `max_length`
    tokens and padded up to `max_length`.

    Args:
        s: text to encode.
        max_length: fixed length of the produced sequence (default 32).

    Returns:
        Tuple `(input_ids, attention_mask)` taken from the encoder output,
        both requested as PyTorch tensors (`return_tensors='pt'`).
    """
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    encoded_dict = tokenizer.encode_plus(s,
                                         add_special_tokens=True,
                                         max_length=max_length,
                                         padding='max_length',
                                         truncation=True,
                                         return_attention_mask=True,
                                         return_tensors='pt')
    return encoded_dict['input_ids'], encoded_dict['attention_mask']

### Побуквенный токенизатор

In [None]:
def char_tokenize(sample, length=512):
    """Character-level tokenizer: every character becomes its Unicode code point.

    Layout of the produced sequence: CLS (99998), the code points of `sample`,
    SEP (99999), then zero padding up to `length`. The attention mask is 1 on
    CLS, the characters and SEP, and 0 on the padding.

    Args:
        sample: string to encode. It is truncated to `length - 2` characters
            so that CLS and SEP always fit.
        length: total length of the produced sequence; must be at least 2.

    Returns:
        Tuple `(ids, attention_mask)` of int64 tensors, each of shape (1, length).

    Raises:
        ValueError: if `length` is too small to hold CLS and SEP.
    """
    cls_token, sep_token = 99998, 99999

    if length < 2:
        raise ValueError("length must be at least 2 to hold the CLS and SEP tokens")

    # Reserve two positions for CLS/SEP. Truncating to `length` (as before)
    # made the SEP write run past the end of the buffer for long strings.
    sample = sample[:length - 2]
    n_chars = len(sample)

    res = np.zeros(length, dtype=np.int64)
    att = np.zeros(length, dtype=np.int64)

    res[0] = cls_token
    res[1:n_chars + 1] = np.array([ord(char) for char in sample], dtype=np.int64)
    # Indexing by n_chars (not the loop variable) also works for an empty string.
    res[n_chars + 1] = sep_token
    att[:n_chars + 2] = 1

    res = torch.tensor(res, dtype=torch.int64).reshape(1, -1)
    att = torch.tensor(att, dtype=torch.int64).reshape(1, -1)

    return res, att

# Датасет

In [None]:
class SiburDataset(Dataset):
    """Bootstrapped (sampled with replacement) dataset with a controllable class balance.

    To save memory, samples are tokenized lazily, at the moment they are drawn.
    """

    def __init__(self, ones, zeros, size=2048, p=0.5, tokenizer=char_tokenize):
        """
        ones - all positive samples of the dataset.
        zeros - all negative samples of the dataset.
        size - number of draws, i.e. effectively the dataset size for one epoch.
        p - probability of drawing a positive sample; 0.5 gives a balanced sample.
        tokenizer - callable mapping a string to an (ids, attention_mask) pair.
        """
        self.size = size
        self.p = p
        self.tokenizer = tokenizer
        self.ones, self.ones_len = ones, len(ones)
        self.zeros, self.zeros_len = zeros, len(zeros)

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        """Draw a random pair; `idx` is only checked against `size`, it does not select the sample."""
        if idx >= self.size:
            raise StopIteration

        # First decide the class, then pick a random element of that class.
        draw_positive = np.random.random() < self.p
        if draw_positive:
            pool, pool_len = self.ones, self.ones_len
        else:
            pool, pool_len = self.zeros, self.zeros_len
        pair = pool[np.random.randint(0, pool_len)]

        ids_1, mask_1 = self.tokenizer(pair[0])
        ids_2, mask_2 = self.tokenizer(pair[1])
        label = torch.tensor(1 if draw_positive else 0, dtype=torch.long)

        # Tokenizers return (1, L) tensors; flatten them so the loader can stack.
        return ids_1.reshape(-1), mask_1.reshape(-1), ids_2.reshape(-1), mask_2.reshape(-1), label

In [None]:
class SiburFullDataset(Dataset):
    """Full labelled dataset, one item per DataFrame row (no sub-sampling)."""

    def __init__(self, data, tokenizer=char_tokenize):
        """
        data - DataFrame with the columns `name_1`, `name_2` and `is_duplicate`.
        tokenizer - callable mapping a string to an (ids, attention_mask) pair.
        """
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the flattened ids/masks of both names and the label of row `idx` (positional)."""
        row = self.data.iloc[idx]

        ids_1, mask_1 = self.tokenizer(row['name_1'])
        ids_2, mask_2 = self.tokenizer(row['name_2'])
        label = torch.tensor(row['is_duplicate'], dtype=torch.long)

        return ids_1.reshape(-1), mask_1.reshape(-1), ids_2.reshape(-1), mask_2.reshape(-1), label

In [None]:
class SiburPredictDataset(Dataset):
    """Unlabelled dataset used for the final prediction."""

    def __init__(self, data, tokenizer=char_tokenize):
        """
        data - indexable collection of pairs; item[0] and item[1] are the two names.
        tokenizer - callable mapping a string to an (ids, attention_mask) pair.
        """
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the flattened ids/masks of both names of pair `idx` (no target)."""
        pair = self.data[idx]

        ids_1, mask_1 = self.tokenizer(pair[0])
        ids_2, mask_2 = self.tokenizer(pair[1])

        return ids_1.reshape(-1), mask_1.reshape(-1), ids_2.reshape(-1), mask_2.reshape(-1)

In [None]:
# Training / validation split: one stratified shuffle, 80% of the rows go to training.
# The splitter yields POSITIONAL row indices.
split = StratifiedShuffleSplit(1, train_size=0.8, random_state=42)
tridx, cvidx = next(iter(split.split(train, train["is_duplicate"])))

In [None]:
# tridx / cvidx are positional indices produced by the splitter, while `train`
# is indexed by `pair_id`. Both subsets therefore have to be taken with .iloc:
# .loc would interpret the positions as pair_id labels and select the wrong
# rows (overlapping with the training part, or raising KeyError).
trdat = train.iloc[tridx]
valdat = train.iloc[cvidx]

In [None]:
#columns = ['name_1', 'name_2']
#batch_size=128
#ones = trdat.loc[trdat['is_duplicate'] == 1, columns].values
#zeros = trdat.loc[trdat['is_duplicate'] == 0, columns].values
#train_dataset = SiburDataset(ones, zeros, p=0.01, size=20480, tokenizer=tokenize)
#train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)

In [None]:
#batch_size=128
#ones = valdat.loc[valdat['is_duplicate'] == 1, columns].values
#zeros = valdat.loc[valdat['is_duplicate'] == 0, columns].values
#valid_dataset = SiburDataset(ones, zeros, p=0.01, size=10240, tokenizer=tokenize)
#valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size)

In [None]:
batch_size = 128
train_dataset = SiburFullDataset(trdat, tokenizer=tokenize)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)

valid_dataset = SiburFullDataset(valdat, tokenizer=tokenize)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size)

In [None]:
# Проверка датасета
X_1, att_1, X_2, att_2, target = next(iter(train_loader))
X_1.shape, att_1.shape, X_2.shape, att_2.shape, target.shape

# Модель

In [None]:
class SiburBerta(torch.nn.Module):
    """Pair classifier built on multilingual BERT.

    Both names go through the same BERT backbone; the two pooled outputs are
    concatenated and passed through a head of two linear layers.
    """

    def __init__(self, freeze=False):
        """
        freeze - when True, the backbone parameters are excluded from gradient
                 updates, so only the linear head is trained.
        """
        super().__init__()

        self.backbone = BertModel.from_pretrained('bert-base-multilingual-cased')

        # Freeze BERT
        if freeze:
            for weight in self.backbone.parameters():
                weight.requires_grad_(False)

        # 1536 input features = two concatenated backbone outputs.
        self.linear_1 = torch.nn.Linear(in_features=1536, out_features=256)
        self.linear_2 = torch.nn.Linear(in_features=256, out_features=2)
        self.softmax = torch.nn.Softmax(dim=1)
        self.relu = torch.nn.ReLU()
        self.flatten = torch.nn.Flatten()

    def forward(self, X_1, att_1, X_2, att_2):
        """Return softmax scores over the two output classes for each pair.

        X_1, att_1 - token ids and attention mask of the first name.
        X_2, att_2 - token ids and attention mask of the second name.

        NOTE(review): the output is already softmax-normalised, yet the notebook
        trains this model with torch.nn.CrossEntropyLoss, which is documented to
        expect unnormalised logits. Consider returning logits and applying
        softmax only when scoring / predicting.
        """
        # Element [1] of the backbone output is taken for each name.
        first = self.backbone(X_1, att_1)[1]
        second = self.backbone(X_2, att_2)[1]

        features = self.flatten(torch.cat((first, second), 1))
        hidden = self.relu(self.linear_1(features))

        return self.softmax(self.linear_2(hidden))
    
    
class NNModel():
    """Training / evaluation / prediction wrapper around a two-input PyTorch model.

    The wrapped model is called as `model(X_1, att_1, X_2, att_2)` and column 1
    of its output is used as the positive-class score. Batches are moved to the
    module-level `device`.
    """

    def __init__(self, model, metric_func=f1_score, best_model_name='_best_model.pt'):
        """
        model - PyTorch model.
        metric_func - metric function called as metric_func(target, preds), "greater is better".
        best_model_name - file name used to checkpoint the best model during training.
        """
        self.model = model
        self.metric_func = metric_func
        self.best_score = 0
        self.best_ep = 0
        self.best_model_name = best_model_name
        # Criterion used by _train_epoch; set by train().
        self.loss_func = None
        self.preds = np.array([])
        self.target = np.array([])

    def train(self, epochs, learning_rate, weight_decay, schedule_rate, loss,
              train_loader, validation_loader=None, early_stopping=5):
        """Train the model, optionally with validation-based checkpointing and early stopping.

        epochs - maximum number of epochs.
        learning_rate, weight_decay - Adam hyper-parameters.
        schedule_rate - multiplicative learning-rate decay applied after every epoch.
        loss - criterion called as loss(preds, target).
        train_loader - loader yielding (X_1, att_1, X_2, att_2, target) batches.
        validation_loader - optional loader with the same batch layout.
        early_stopping - number of epochs without validation improvement before stopping.

        Returns (loss_history, train_history, val_history), one entry per epoch.
        """
        # Keep the criterion on the instance. Previously the `loss` argument was
        # overwritten by the epoch loss and _train_epoch silently used a global
        # variable named `loss` instead of the criterion passed in here.
        self.loss_func = loss
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate,
                                          weight_decay=weight_decay)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                         step_size=1,
                                                         gamma=schedule_rate)
        loss_history = []
        train_history = []
        val_history = []
        stop_counter = 0

        for i in range(epochs):
            num_ep = i + 1
            epoch_loss, metric = self._train_epoch(train_loader, num_ep)
            loss_history.append(epoch_loss)
            train_history.append(metric)

            if validation_loader is not None:
                val_metric = self.evaluate(validation_loader, num_ep)
                val_history.append(val_metric)

                if val_metric > self.best_score:
                    # New best validation score: checkpoint and reset patience.
                    self.save(self.best_model_name)
                    self.best_ep = num_ep
                    self.best_score = val_metric
                    stop_counter = 0
                else:
                    stop_counter += 1

                if stop_counter >= early_stopping:
                    print('Early stopping on validation score.')
                    break

            self.scheduler.step()

        # A checkpoint exists only if some validation score exceeded 0.
        if self.best_score > 0:
            self.load(self.best_model_name)
            print(f'Best model from {self.best_ep} iteration loaded.')

        return loss_history, train_history, val_history

    def _train_epoch(self, train_loader, num_ep):
        """Run one optimisation pass over `train_loader`.

        Returns (mean batch loss, metric accumulated over the whole epoch).
        """
        self._clear_score()
        self.model.train()
        losses = []
        metric = 0.0  # stays defined even when the loader yields no batches
        with tqdm(total=len(train_loader)) as progress_bar:
            for X_1, att_1, X_2, att_2, target in train_loader:
                X_1 = X_1.to(device)
                X_2 = X_2.to(device)
                att_1 = att_1.to(device)
                att_2 = att_2.to(device)
                target = target.to(device)

                preds = self.model(X_1, att_1, X_2, att_2)
                loss_val = self.loss_func(preds, target)

                self.optimizer.zero_grad()
                loss_val.backward()
                self.optimizer.step()

                losses.append(loss_val.item())
                metric = self._score(target, preds[:, 1])

                progress_bar.update()
                progress_bar.set_description('Epoch {}: {:>5s} loss = {:.5f}, metric = {:.2f}'.format(
                                             num_ep, 'train', np.mean(losses),  metric))

        return np.mean(losses), metric

    def _score(self, target, preds, threshold=0.5):
        """Binarise `preds` at `threshold`, add the batch to the running buffers
        and return the metric computed over everything accumulated so far."""
        preds = preds.cpu().detach().numpy()
        target = target.cpu().detach().numpy()

        preds = (preds > threshold).astype(int)

        self.preds = np.append(self.preds, preds)
        self.target = np.append(self.target, target)

        metric = self.metric_func(self.target, self.preds)

        return metric

    def _clear_score(self):
        """Reset the running prediction / target buffers used by _score."""
        self.preds = np.array([])
        self.target = np.array([])

    def evaluate(self, validation_loader, num_ep):
        """Compute the metric over `validation_loader` without gradient tracking."""
        self._clear_score()
        self.model.eval()
        metric = 0.0  # stays defined even when the loader yields no batches

        with tqdm(total=len(validation_loader)) as progress_bar:
            with torch.no_grad():
                for X_1, att_1, X_2, att_2, target in validation_loader:
                    X_1 = X_1.to(device)
                    X_2 = X_2.to(device)
                    att_1 = att_1.to(device)
                    att_2 = att_2.to(device)
                    target = target.to(device)

                    preds = self.model(X_1, att_1, X_2, att_2)[:, 1]

                    metric = self._score(target, preds)

                    progress_bar.update()
                    progress_bar.set_description('Epoch {}: {:>5s} metric = {:.2f}'.format(
                                                 num_ep, 'validation', metric))

        return metric

    def predict(self, data_loader):
        """Return a 1-D numpy array with the column-1 model output for every
        sample yielded by `data_loader` (batches of X_1, att_1, X_2, att_2)."""
        result = np.array([])
        self.model.eval()

        with tqdm(total=len(data_loader)) as progress_bar:
            with torch.no_grad():
                for X_1, att_1, X_2, att_2 in data_loader:
                    X_1 = X_1.to(device)
                    X_2 = X_2.to(device)
                    att_1 = att_1.to(device)
                    att_2 = att_2.to(device)

                    preds = self.model(X_1, att_1, X_2, att_2)[:, 1]
                    preds = preds.cpu().detach().numpy()
                    result = np.append(result, preds)

                    progress_bar.update()
                    progress_bar.set_description('{:>5s}'.format('Prediction'))
        return result

    def load(self, path):
        """Load model weights from `path` (deserialised onto the CPU first); returns self."""
        self.model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
        return self

    def save(self, path):
        """Save the model's state dict to `path`; returns self."""
        torch.save(self.model.state_dict(), path)
        return self

    def to(self, device):
        """Move the wrapped model to `device`; returns self."""
        self.model.to(device)
        return self

In [None]:
# Wrap the BERT-based classifier (backbone frozen via freeze=True) and move it to `device`.
model = NNModel(SiburBerta(freeze=True).to(device))
# Per-class loss weights: class index 0 -> 1, class index 1 -> 100.
weights = torch.tensor([1, 100], dtype=torch.float32).to(device)
# NOTE(review): CrossEntropyLoss is documented to expect raw logits, while
# SiburBerta.forward ends with a softmax - confirm this combination is intended.
loss = torch.nn.CrossEntropyLoss(weight=weights)

In [None]:
loss_history, train_history, val_history = model.train(epochs=20, learning_rate=5e-4, weight_decay=1e-3,
                                                       schedule_rate=0.95, loss=loss, 
                                                       train_loader=train_loader,
                                                       validation_loader=valid_loader, early_stopping=3)

In [None]:
#не нужно, т.к. если указан валидационный сет, то лучшая модель сохраняется сама
#model.save('Berta_1.pt')

In [None]:
plt.plot(loss_history)
plt.title('Loss')

In [None]:
plt.plot(train_history, label='train', c='y')
plt.plot(val_history, label='test', c='blue')
plt.title('Learning curves')
plt.legend();

# Сабмит

In [None]:
test_dataset = SiburPredictDataset(test.loc[:, ['name_1', 'name_2']].values,  tokenizer=tokenize)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128)

In [None]:
# тест функции
#preds = np.random.random(len(test))

In [None]:
#model.load('/kaggle/input/berta-weights-sibur/_best_model.pt')

In [None]:
preds = model.predict(test_loader)

In [None]:
def submit(preds, threshold=0.5, filename='submit.csv'):
    """Binarise the scores and write a submission file.

    preds - array of scores, aligned row by row with the global `test` frame.
    threshold - scores strictly above this value are labelled 1, the rest 0.
    filename - path of the CSV to write (columns: pair_id, is_duplicate).

    Also prints the number and the share of positive labels.
    """
    predicted_labels = (preds > threshold).astype(int)
    result = pd.DataFrame({'pair_id': test.index, 'is_duplicate': predicted_labels})

    print(f'Число положительных классов для threshold={threshold}: {result["is_duplicate"].sum()} / {result["is_duplicate"].mean():.2%}')

    result.to_csv(filename, index=False)
    print('Done!')

In [None]:
for thr in [0.5, 0.6, 0.7, 0.8, 0.9]:
    filename = f'submit_{thr}.csv'
    submit(preds, threshold=thr, filename=filename)