## Config

Prepping things up for TPU

In [None]:
import os
# Use .get(): indexing os.environ raises a bare KeyError when the variable is
# missing, so the helpful message below would never be shown.
assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

In [None]:
# PyTorch/XLA build to install; the trailing #@param marker lists the selectable values.
VERSION = "nightly"  #@param ["1.5" , "20200516", "nightly"]
# Download the PyTorch/XLA environment setup script and run it for the chosen version.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version $VERSION

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  4139  100  4139    0     0  17103      0 --:--:-- --:--:-- --:--:-- 17032
Updating TPU and VM. This may take around 2 minutes.
Updating TPU runtime to pytorch-nightly ...
Collecting cloud-tpu-client
  Downloading https://files.pythonhosted.org/packages/56/9f/7b1958c2886db06feb5de5b2c191096f9e619914b6c31fdf93999fdbbd8b/cloud_tpu_client-0.10-py3-none-any.whl
Collecting google-api-python-client==1.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/9a/b4/a955f393b838bc47cbb6ae4643b9d0f90333d3b4db4dc1e819f36aad18cc/google_api_python_client-1.8.0-py3-none-any.whl (57kB)
[K     |████████████████████████████████| 61kB 2.0MB/s 
Uninstalling torch-1.5.1+cu101:
Insta

In [None]:
import os
from pathlib import Path
from google.colab import drive
import shutil

def create_path(path):
    """Ensure the directory ``path`` exists and hand the same object back.

    An already existing directory is returned untouched; otherwise the
    directory (and any missing parents) is created via ``path.mkdir``.
    """
    already_there = os.path.isdir(path)
    if already_there:
        return path
    path.mkdir(parents=True, exist_ok=True)
    return path

colab_path = Path('/content')

In [None]:
create_path(colab_path/'dataset');
create_path(colab_path/'models');

!git clone --quiet 'https://github.com/tezike/download_google_drive.git'
os.chdir('download_google_drive')
!python download_gdrive.py '10rH0nAxa7mWS289xIyRP-mOOowqiIolL' '../dataset/temp.tgz'
shutil.rmtree('../download_google_drive')
os.chdir('..')

## Colab_setup

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')
root_dir = Path('/content/drive/My Drive')
# Project folder on Drive; create_path makes it if it is missing.
base_path = create_path(root_dir/'Rakuten')
# Last expression of the cell: displays the resolved project path.
base_path

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


PosixPath('/content/drive/My Drive/Rakuten')

## Imports

In [None]:
!pip install transformers -q

[K     |████████████████████████████████| 778kB 3.4MB/s 
[K     |████████████████████████████████| 890kB 14.7MB/s 
[K     |████████████████████████████████| 1.1MB 24.5MB/s 
[K     |████████████████████████████████| 3.0MB 33.3MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
import os
import re
import time
import string
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup

from tqdm import notebook
from pathlib import Path

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

Prepping things up for TPU

In [None]:
import torch_xla
import torch_xla.core.xla_model as xm #handles most of the basic tasks
import torch_xla.debug.metrics as met
import torch_xla.distributed.parallel_loader as pl #handles dataloading on multiple processes
import torch_xla.distributed.xla_multiprocessing as xmp

## Config

In [None]:
class Config:
    """Bundle of model handles, filesystem paths and hyper-parameters.

    Building an instance loads the CamemBERT tokenizer and model
    configuration via ``from_pretrained`` and makes sure the ``dataset`` and
    ``models`` folders exist under the module-level ``base_path``.
    """

    def __init__(self):
        model_name = 'camembert-base'

        # --- model handles -------------------------------------------------
        self.MODEL_NAME = model_name
        # Stored as a class, not an instance: the classifier instantiates it.
        self.CLAS_MODEL = transformers.CamembertModel
        self.TOKENIZER = transformers.CamembertTokenizer.from_pretrained(
            pretrained_model_name_or_path=model_name,
            do_lower_case=True,
        )
        self.MODEL_CONFIG = transformers.CamembertConfig.from_pretrained(model_name)

        # --- paths ---------------------------------------------------------
        colab_root = Path('/content')
        self.COLAB_PATH = colab_root
        self.BASE_PATH = base_path
        self.DATA_PATH = create_path(base_path / 'dataset')
        self.MODEL_PATH = create_path(base_path / 'models')
        self.TEST_FILE = colab_root / 'SIGIR-2020-EComDC-release/data/x_test_task1_phase1.tsv'
        self.CLEAN_DF = self.DATA_PATH / 'clean_folds_df.csv'

        # --- hyper-parameters ----------------------------------------------
        self.MAX_SEQ_LEN = 256
        if torch.cuda.is_available():
            device_name = "cuda"
        else:
            device_name = "cpu"
        self.DEVICE = torch.device(device_name)
        self.TRAIN_BATCH_SIZE = 32
        self.VALID_BATCH_SIZE = 16
        self.NUM_EPOCHS = 10


config = Config()

## DATASET

In [None]:
class BertDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes text and pads it to a fixed length.

    Args:
        text: indexable collection of strings.
        label: indexable collection of numeric labels, aligned with ``text``.

    Each item is a dict with:
        ``ids``     - token ids padded to ``config.MAX_SEQ_LEN`` (long tensor)
        ``mask``    - attention mask, 1 for real tokens and 0 for padding (long tensor)
        ``targets`` - the label as a float tensor
    """

    def __init__(self, text, label):
        self.text, self.label = text, label
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_SEQ_LEN

    def __len__(self):
        return len(self.label)

    def __getitem__(self, i):
        # Collapse runs of whitespace into single spaces.
        text = ' '.join(self.text[i].split())

        # Tokenize with the Hugging Face tokenizer, truncating to max_len.
        out = self.tokenizer.encode_plus(text, None,
                                         add_special_tokens=True,
                                         max_length=self.max_len,
                                         truncation=True)

        ids = out['input_ids']
        mask = out['attention_mask']

        # Pad with the tokenizer's own pad id instead of a hard-coded 0:
        # id 0 is not guaranteed to be the padding token of the vocabulary.
        # Fall back to 0 only when the tokenizer defines no pad token.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0

        padding_length = self.max_len - len(ids)
        ids = ids + ([pad_id] * padding_length)
        mask = mask + ([0] * padding_length)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.label[i], dtype=torch.float)
        }

## Utils

In [None]:
class AverageMeter():
    """Running statistics tracker: latest value, weighted sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every tracked statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running mean."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count

In [None]:
class EarlyStopping():
    """Track a validation score, checkpoint on improvement and flag when to stop.

    Args:
        patience: number of consecutive non-improving calls after which
            ``early_stop`` is set to True.
        mode: ``"max"`` if a larger score is better, ``"min"`` if smaller is better.
        delta: minimum gain over the best score that counts as an improvement.

    Call the instance once per epoch with ``(epoch_score, model, model_path)``.
    """

    def __init__(self, patience=7, mode="max", delta=0.001):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        # Lowercase np.inf: the np.Inf alias was removed in NumPy 2.0.
        if self.mode == "min":
            self.val_score = np.inf
        else:
            self.val_score = -np.inf

    def __call__(self, epoch_score, model, model_path):
        # Internally "higher is better": negate the score in "min" mode.
        if self.mode == "min":
            score = -1.0 * epoch_score
        else:
            score = np.copy(epoch_score)

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif score < self.best_score + self.delta:
            self.counter += 1
            xm.master_print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        """Save ``model.state_dict()`` to ``model_path`` unless the score is inf/NaN."""
        # np.isfinite rejects +/-inf and NaN. The previous membership test
        # against a list containing np.nan could not catch a computed NaN,
        # because NaN never compares equal to itself.
        if np.isfinite(epoch_score):
            xm.master_print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score, epoch_score))
            xm.save(model.state_dict(), model_path)
        self.val_score = epoch_score

## MODEL

In [None]:
class ClasModel(nn.Module):
    """Transformer encoder followed by mean+max pooling and a linear classifier.

    Args:
        pretrained: if True, load the encoder weights with ``from_pretrained``;
            otherwise build the encoder from ``config.MODEL_CONFIG`` alone.
        num_classes: size of the output layer.
    """

    def __init__(self, pretrained=True, num_classes=27):
        super(ClasModel, self).__init__()
        if pretrained:
            self.model = config.CLAS_MODEL.from_pretrained(config.MODEL_NAME, config=config.MODEL_CONFIG)
        else:
            self.model = config.CLAS_MODEL(config.MODEL_CONFIG)

        self.drop = nn.Dropout(0.4)

        # Take the width from the encoder configuration instead of hard-coding
        # 768, so a different base model works unchanged. Doubled because the
        # mean-pooled and max-pooled vectors are concatenated.
        hidden_size = config.MODEL_CONFIG.hidden_size
        self.lin = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, ids, mask):
        """Return the classifier output for token ids ``ids`` and attention mask ``mask``."""
        # Index rather than tuple-unpack: works for a plain tuple return as
        # well as for a ModelOutput object with a different number of fields.
        h_0 = self.model(ids, attention_mask=mask)[0]

        # Pool over dimension 1 of the encoder output.
        # NOTE(review): padded positions are included in both poolings - confirm this is intended.
        mean_pool = torch.mean(h_0, 1)
        max_pool = torch.max(h_0, 1)[0]

        out = torch.cat([mean_pool, max_pool], 1)

        out = self.lin(self.drop(out))

        return out

    # def load_lm_encoder(self,clas_model, lm_path=None):
    #     clas_model_dict = clas_model.state_dict()
    #     if lm_path is not None:
    #         lm_model_dict = torch.load(lm_path).model.state_dict()
    #         needed_dict = {k[6:]:v for k, v in lm_model_dict.items() if str(k)[6:] in clas_model_dict.keys()}
    #         clas_model_dict.update(needed_dict)
    #     clas_model.load_state_dict(clas_model_dict)

## Prep data

In [None]:
# Unpack the downloaded archive; tar extracts into the current working directory.
!tar -xzf $colab_path/'dataset/temp.tgz'

In [None]:
def clean_text(text):
    '''Strip markup-like noise from a string.

    Removes, in order: text in square brackets, http(s)/www links, HTML-style
    tags, newline characters, and any word containing a digit. Letter case and
    punctuation are left untouched.
    '''
    # Raw strings: escapes such as "\[" or "\S" inside a normal string literal
    # are invalid escape sequences and trigger warnings on recent Python.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# df_all = pd.read_csv(config.BASE_PATH/'dataset/df_all.csv').sample(frac=1).reset_index(drop=True); 
# df_all.fillna(' ', inplace=True)
# df_all.Title = df_all.Title.apply(str).apply(lambda x: clean_text(x))
# df_all.Description = df_all.Description.apply(str).apply(lambda x: clean_text(x))
# y = df_all.Prdlbl.values
# df_all.sample(1)

In [None]:
# le = LabelEncoder()
# le.fit(df_all.Prdlbl)
# le.classes_

In [None]:
# df_all.Prdlbl = le.transform(df_all.Prdlbl)
# df_all['fold'] = -1

# kfold = StratifiedKFold(5, shuffle=True)

# for i, (trn, val) in enumerate(kfold.split(X=df_all, y=y)):
#     df_all.loc[val, 'fold'] = i

# df_all.to_csv(config.CLEAN_DF, index=False)

# df_all = pd.read_csv(config.CLEAN_DF)
# df_all.fillna(' ', inplace=True)

In [None]:
#run this instead
# Load the pre-computed, fold-annotated frame and blank out missing values.
df_all = pd.read_csv(config.CLEAN_DF).fillna(' ')

# Re-fit the label encoder on the original labels so that predictions can be
# mapped back to product type codes later on.
temp_df = (
    pd.read_csv(config.BASE_PATH/'dataset/df_all.csv')
    .sample(frac=1)
    .reset_index(drop=True)
)
le = LabelEncoder().fit(temp_df.Prdlbl)
# Displayed as the cell output: the encoded ids of all known classes.
le.transform(le.classes_)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26])

## Train

In [None]:
WRAPPED_MODEL = xmp.MpModelWrapper(ClasModel(pretrained=True))
def world_func(fold):
    """Train and validate the classifier for one cross-validation fold on one TPU core.

    Meant to run inside every process started by ``xmp.spawn``. Rows of the
    module-level ``df_all`` whose ``fold`` column equals ``fold`` form the
    validation split; all other rows are used for training. The model state
    is written to ``config.MODEL_PATH`` by ``EarlyStopping`` whenever the
    validation macro-F1 improves.

    Args:
        fold: fold index used as the validation split.
    """
    xm.master_print(f'Running Fold {fold}')
    device = xm.xla_device()
    model_path = config.MODEL_PATH/f'torch_xla_pretrained_xla_roberta_fold{fold}.bin'

    global WRAPPED_MODEL
    model = WRAPPED_MODEL.to(device)

    criterion = nn.CrossEntropyLoss()

    def loss_fn(y_pred, y_true):
        return criterion(y_pred, y_true)

    def train(train_dl, model, optimizer, device, scheduler=None):
        """Run one training epoch over ``train_dl``."""
        xm.master_print('Training...')
        model.train()
        loss_all = AverageMeter()

        for i, batch in enumerate(train_dl):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            # CrossEntropyLoss expects integer class indices, not floats.
            targets = batch['targets'].to(device, dtype=torch.long)

            optimizer.zero_grad()
            out = model(ids, mask)

            loss = loss_fn(out, targets)
            loss_all.update(loss.detach().item(), ids.size(0))

            if i % 100 == 0:
                xm.master_print(f'Batch: {i}, Training loss: {loss_all.avg}')

            loss.backward()
            xm.optimizer_step(optimizer)

            if scheduler is not None:
                scheduler.step()

    def evaluate(valid_dl, model, device):
        """Run the model over ``valid_dl``; return (targets, predictions, mean loss)."""
        xm.master_print('Evaluating...')
        model.eval()
        fin_targs = []
        fin_outs = []
        loss_all = AverageMeter()

        with torch.no_grad():
            for i, batch in enumerate(valid_dl):
                ids = batch['ids'].to(device, dtype=torch.long)
                mask = batch['mask'].to(device, dtype=torch.long)
                targets = batch['targets'].to(device, dtype=torch.long)

                out = model(ids, mask)
                loss = loss_fn(out, targets)

                loss_all.update(loss.detach().item(), ids.size(0))

                if i % 100 == 0:
                    xm.master_print(f'Batch: {i}, Evaluation loss: {loss_all.avg}')

                targ_np = targets.cpu().detach().numpy().tolist()

                soft_out = nn.Softmax(dim=1)(out.cpu().detach())
                out_np = soft_out.argmax(-1).numpy().tolist()

                fin_targs.extend(targ_np)
                fin_outs.extend(out_np)

        # Return the accumulated mean loss (this used to be a constant 0).
        return fin_targs, fin_outs, loss_all.avg

    def gather_lists(parts):
        """Flatten the per-core lists handed over by xm.mesh_reduce into one list."""
        return [item for part in parts for item in part]

    train_df, valid_df = df_all.query(f'fold != {fold}'), df_all.query(f'fold == {fold}')
    train_df.reset_index(drop=True, inplace=True), valid_df.reset_index(drop=True, inplace=True)

    train_ds = BertDataset((train_df.Title + 'xxfld' + train_df.Description).values, train_df.Prdlbl.values)
    valid_ds = BertDataset((valid_df.Title + 'xxfld' + valid_df.Description).values, valid_df.Prdlbl.values)

    # Each core gets its own shard of the data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_ds,
                                                                    num_replicas=xm.xrt_world_size(),
                                                                    rank=xm.get_ordinal(),
                                                                    shuffle=True)

    train_dl = torch.utils.data.DataLoader(train_ds, batch_size=config.TRAIN_BATCH_SIZE,
                                           drop_last=True, num_workers=4,
                                           sampler=train_sampler)

    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_ds,
                                                                    num_replicas=xm.xrt_world_size(),
                                                                    rank=xm.get_ordinal(),
                                                                    shuffle=False)

    valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=config.VALID_BATCH_SIZE,
                                           drop_last=False, num_workers=4,
                                           sampler=valid_sampler)

    # Learning rate is scaled with the number of cores.
    lr = 1e-05 * xm.xrt_world_size()

    es = EarlyStopping(patience=4, mode='max')

    model_params = list(model.named_parameters())

    # Parameters whose name contains one of these fragments get no weight decay.
    # Parameter names are fully qualified, so this has to be a substring match;
    # an exact `n in no_decay` test never matches and decays every parameter.
    no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']

    optimizer_params = [
        {'params': [p for n, p in model_params if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        # no weight decay should be applied
        {'params': [p for n, p in model_params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    optimizer = AdamW(optimizer_params, lr=lr)

    # Number of optimizer steps this core performs over the whole run.
    num_train_steps = int(len(train_df) / config.TRAIN_BATCH_SIZE / xm.xrt_world_size() * config.NUM_EPOCHS)

    xm.master_print(f'num_train_steps = {num_train_steps}, world_size = {xm.xrt_world_size()}')

    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=num_train_steps)

    for epoch in range(config.NUM_EPOCHS):
        start_epoch = time.time()
        xm.master_print('-'*50)
        xm.master_print(f'Running Epoch #{epoch} {"🔥"*epoch}')
        xm.master_print(f'{"-"*50} \n')

        # Without set_epoch the DistributedSampler yields the same order every epoch.
        train_sampler.set_epoch(epoch)

        train_para_loader = pl.ParallelLoader(train_dl, [device])

        start = time.time()
        train(train_para_loader.per_device_loader(device), model, optimizer, device, scheduler)
        end = time.time()
        xm.master_print(f'Training time: {round(end-start, 2)} secs')

        # Validate on the validation loader (this used to wrap train_dl).
        valid_para_loader = pl.ParallelLoader(valid_dl, [device])

        start = time.time()
        fin_targs, fin_outs, valid_loss = evaluate(valid_para_loader.per_device_loader(device), model, device)
        end = time.time()
        xm.master_print(f'Evaluation time: {round(end-start, 2)} secs \n')

        # Collect the predictions of every core so that all processes compute
        # identical metrics. Otherwise cores can disagree about whether the
        # score improved, and then not all of them reach the xm.save
        # rendezvous inside EarlyStopping (or only some of them stop early).
        fin_targs = xm.mesh_reduce('valid_targets', fin_targs, gather_lists)
        fin_outs = xm.mesh_reduce('valid_preds', fin_outs, gather_lists)

        # calc metrics
        accuracy = accuracy_score(fin_targs, fin_outs)
        macro_f1 = f1_score(fin_targs, fin_outs, average='macro')

        xm.master_print(f'VALID ACCURACY: {accuracy}')
        xm.master_print(f'VALID MACRO_F1: {macro_f1}')

        es(macro_f1, model, model_path=model_path)
        if es.early_stop:
            xm.master_print('Early Stopping...')
            break

        end_epoch = time.time()

        # Whole-epoch duration (this used to print the evaluation time again).
        xm.master_print(f'Total time: {round(end_epoch-start_epoch, 2)} secs \n')

## Inference

In [None]:
def make_inference(fold):
    """Predict product type codes for the test file with the checkpoint of ``fold``.

    Loads the saved state dict for the fold, runs the model over
    ``config.TEST_FILE`` and writes a tab-separated submission file to both
    ``config.DATA_PATH`` and ``config.COLAB_PATH``.

    Args:
        fold: fold index whose checkpoint is loaded; also part of the output file name.
    """
    test_df = pd.read_csv(config.TEST_FILE, sep='\t').fillna(' ')
    model_path = config.MODEL_PATH/f'torch_xla_pretrained_xla_roberta_fold{fold}.bin'
    # Placeholder labels: BertDataset needs a label array, the values are never used.
    test_df['Prdlbl'] = 0

    # NOTE(review): the text is fed to the model without clean_text - confirm
    # this matches how the training text was prepared.
    test_ds = BertDataset((test_df.Title + 'xxfld' + test_df.Description).values, test_df.Prdlbl.values)

    test_dl = torch.utils.data.DataLoader(test_ds, batch_size=config.VALID_BATCH_SIZE,
                                          num_workers=4, shuffle=False)

    device = xm.xla_device()
    model = ClasModel(pretrained=True).to(device)
    model.load_state_dict(torch.load(str(model_path)))

    model.eval()

    fin_outs = []

    with torch.no_grad():
        for batch in tqdm(test_dl, total=len(test_dl)):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)

            out = model(ids, mask)

            soft_out = nn.Softmax(dim=1)(out.cpu().detach())
            fin_outs.extend(soft_out.argmax(-1).numpy().tolist())

    sub_df = test_df.copy()
    # Map encoded class ids back to the original product type codes.
    sub_df['Prdtypecode'] = le.inverse_transform(fin_outs)
    submission = sub_df[['Integer_id', 'Image_id', 'Product_id', 'Prdtypecode']]

    file_name = f'y_test_task1_phase1_pred_fold{fold}.tsv'
    # Save to Drive and also to the local Colab filesystem.
    for out_dir in (config.DATA_PATH, config.COLAB_PATH):
        submission.to_csv(out_dir/file_name, index=False, sep='\t')

    # Print the real path; the old message embedded stray quotes in it.
    print(f"Submission file saved at: \n {config.DATA_PATH/file_name}")

## Fin

In [None]:
import gc; gc.collect()

1171

In [None]:
# Train fold 0 on 8 processes, then predict the test set with its checkpoint.
fold = 0
FLAGS = {}  # handed to every worker as `flags`; mp_wrapper does not read it
def mp_wrapper(rank, flags):
    """Entry point for xmp.spawn: set the default tensor type and run world_func for the module-level `fold`."""
    torch.set_default_tensor_type('torch.FloatTensor')
    world_func(fold)

# Launch 8 worker processes using the 'fork' start method.
xmp.spawn(mp_wrapper, args=(FLAGS,), nprocs=8, start_method='fork')

# run inference
make_inference(fold)

Running Fold 0
num_train_steps = 2653, world_size = 8
--------------------------------------------------
Running Epoch #0 
-------------------------------------------------- 

Training...
Batch: 0, Training loss: 3.3006930351257324
Batch: 100, Training loss: 2.2969900759139863
Batch: 200, Training loss: 1.782161213568787
Training time: 347.63 secs
Evaluating...
Batch: 0, Evaluation loss: 0.9767934679985046
Batch: 100, Evaluation loss: 0.7326810224221485
Batch: 200, Evaluation loss: 0.7430760172469106
Evaluation time: 107.66 secs 

VALID ACCURACY: 0.8397405660377358
VALID MACRO_F1: 0.7941439795245313
Validation score improved (-inf --> 0.7941439795245313). Saving model!
Total time: 107.66 secs 

--------------------------------------------------
Running Epoch #1 🔥
-------------------------------------------------- 

Training...
Batch: 0, Training loss: 1.0407016277313232
Batch: 100, Training loss: 0.6969152792255477
Batch: 200, Training loss: 0.6627295613288879
Training time: 321.98 sec

HBox(children=(FloatProgress(value=0.0, max=59.0), HTML(value='')))


Submission file saved at: 
 /content/drive/My Drive/Rakuten/dataset/'y_test_task1_phase1_pred_fold0.tsv'


In [None]:
# Predict the test set with the checkpoint saved for fold 1.
make_inference(fold=1)

HBox(children=(FloatProgress(value=0.0, max=59.0), HTML(value='')))


Submission file saved at: 
 /content/drive/My Drive/Rakuten/dataset/'y_test_task1_phase1_pred_fold1.tsv'


In [None]:
# Train fold 1 on 8 processes, then predict the test set with its checkpoint.
fold = 1
FLAGS = {}  # handed to every worker as `flags`; mp_wrapper does not read it
def mp_wrapper(rank, flags):
    """Entry point for xmp.spawn: set the default tensor type and run world_func for the module-level `fold`."""
    torch.set_default_tensor_type('torch.FloatTensor')
    world_func(fold)

# Launch 8 worker processes using the 'fork' start method.
xmp.spawn(mp_wrapper, args=(FLAGS,), nprocs=8, start_method='fork')

# run inference
make_inference(fold)

Running Fold 1
num_train_steps = 2653, world_size = 8
--------------------------------------------------
Running Epoch #0 
-------------------------------------------------- 

Training...
Batch: 0, Training loss: 3.311984062194824
Batch: 100, Training loss: 2.31468132108745
Batch: 200, Training loss: 1.784091452460977
Training time: 369.89 secs
Evaluating...
Batch: 0, Evaluation loss: 0.7396478056907654
Batch: 100, Evaluation loss: 0.7432409026245079
Batch: 200, Evaluation loss: 0.7306556246470456
Evaluation time: 112.36 secs 

VALID ACCURACY: 0.8413915094339622
VALID MACRO_F1: 0.78413319658671
Validation score improved (-inf --> 0.78413319658671). Saving model!
Total time: 112.36 secs 

--------------------------------------------------
Running Epoch #1 🔥
-------------------------------------------------- 

Training...
Batch: 0, Training loss: 0.7491677403450012
Batch: 100, Training loss: 0.7152793224495236
Batch: 200, Training loss: 0.6621590823824726
Training time: 341.54 secs
Evalu

KeyboardInterrupt: ignored

In [None]:
# Train fold 2 on 8 processes, then predict the test set with its checkpoint.
fold = 2
FLAGS = {}  # handed to every worker as `flags`; mp_wrapper does not read it
def mp_wrapper(rank, flags):
    """Entry point for xmp.spawn: set the default tensor type and run world_func for the module-level `fold`."""
    torch.set_default_tensor_type('torch.FloatTensor')
    world_func(fold)

# Launch 8 worker processes using the 'fork' start method.
xmp.spawn(mp_wrapper, args=(FLAGS,), nprocs=8, start_method='fork')

# run inference
make_inference(fold)

Exception in device=TPU:0: Cannot replicate if number of devices (1) is different from 8
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py", line 330, in _mp_start_fn
    _start_fn(index, pf_cfg, fn, args)
Exception in device=TPU:1: Cannot replicate if number of devices (1) is different from 8
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py", line 323, in _start_fn
    _setup_replication()
  File "/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py", line 316, in _setup_replication
    xm.set_replication(device, [device])
  File "/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py", line 330, in _mp_start_fn
    _start_fn(index, pf_cfg, fn, args)
Exception in device=TPU:2: Cannot replicate if number of devices (1) is different from 8
  File "/usr/local/lib/python3.6/dis

Exception: ignored

In [None]:
# Train fold 0 on 8 processes, then predict the test set with its checkpoint.
fold = 0
FLAGS = {}  # handed to every worker as `flags`; mp_wrapper does not read it
def mp_wrapper(rank, flags):
    """Entry point for xmp.spawn: set the default tensor type and run world_func for the module-level `fold`."""
    torch.set_default_tensor_type('torch.FloatTensor')
    world_func(fold)

# Launch 8 worker processes using the 'fork' start method.
xmp.spawn(mp_wrapper, args=(FLAGS,), nprocs=8, start_method='fork')

# run inference
make_inference(fold)

Running Fold 0
num_train_steps = 1592, world_size = 8
Training...
Batch: 0, Training loss: 3.33591365814209
Batch: 100, Training loss: 2.3373561082500043
Batch: 200, Training loss: 1.8479502334523557
Training time: 343.19077825546265 secs
Evaluating...
Batch: 0, Evaluation loss: 0.9810035228729248
Batch: 100, Evaluation loss: 0.7740882924287626
Batch: 200, Evaluation loss: 0.7900369395664082
Evaluation time: 110.76497888565063 secs
VALID ACCURACY: 0.8307783018867925
VALID MACRO_F1: 0.7765953461835677
Validation score improved (-inf --> 0.7765953461835677). Saving model!
Training...
Batch: 0, Training loss: 1.0125924348831177
Batch: 100, Training loss: 0.7501974397956734
Batch: 200, Training loss: 0.718463994377288
Training time: 318.9730176925659 secs
Evaluating...
Batch: 0, Evaluation loss: 0.759476363658905
Batch: 100, Evaluation loss: 0.4876207639970402
Batch: 200, Evaluation loss: 0.4936477908151067
Evaluation time: 119.61412382125854 secs
VALID ACCURACY: 0.8787735849056604
VALID M

HBox(children=(FloatProgress(value=0.0, max=59.0), HTML(value='')))


Submission file saved at: 
 /content/drive/My Drive/Rakuten/dataset/'y_test_task1_phase1_pred_fold0.tsv'


In [None]:
# Train fold 1 on 8 processes, then predict the test set with its checkpoint.
fold = 1
FLAGS = {}  # handed to every worker as `flags`; mp_wrapper does not read it
def mp_wrapper(rank, flags):
    """Entry point for xmp.spawn: set the default tensor type and run world_func for the module-level `fold`."""
    torch.set_default_tensor_type('torch.FloatTensor')
    world_func(fold)

# Launch 8 worker processes using the 'fork' start method.
xmp.spawn(mp_wrapper, args=(FLAGS,), nprocs=8, start_method='fork')

# run inference
make_inference(fold)

Running Fold 1
num_train_steps = 2653, world_size = 8
--------------------------------------------------
Running Epoch #0 
-------------------------------------------------- 

Training...
Batch: 0, Training loss: 3.345763683319092
Batch: 100, Training loss: 2.3367084984732145
Batch: 200, Training loss: 1.7959325710932414
Training time: 421.09 secs
Evaluating...
Batch: 0, Evaluation loss: 0.6838085055351257
Batch: 100, Evaluation loss: 0.743703978781653
Batch: 200, Evaluation loss: 0.7319193085034689
Evaluation time: 127.83 secs 

VALID ACCURACY: 0.8461084905660378
VALID MACRO_F1: 0.7982494137308145
Validation score improved (-inf --> 0.7982494137308145). Saving model!
--------------------------------------------------
Running Epoch #1 🔥
-------------------------------------------------- 

Training...
Batch: 0, Training loss: 0.7143270373344421
Batch: 100, Training loss: 0.7104518386987176
Batch: 200, Training loss: 0.653398674667178
Training time: 343.51 secs
Evaluating...
Batch: 0, Ev

Exception in device=TPU:5: tensorflow/compiler/xla/xla_client/mesh_service.cc:294 : Failed to meet rendezvous 'torch_xla.core.xla_model.save': Socket closed (14)
Exception in device=TPU:2: tensorflow/compiler/xla/xla_client/mesh_service.cc:294 : Failed to meet rendezvous 'torch_xla.core.xla_model.save': Socket closed (14)
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py", line 231, in _start_fn
    fn(gindex, *args)
  File "<ipython-input-67-e55a66988613>", line 5, in mp_wrapper
    world_func(fold)
Traceback (most recent call last):
  File "<ipython-input-64-501291fcc907>", line 181, in world_func
    es(macro_f1, model, model_path=model_path)
  File "/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py", line 231, in _start_fn
    fn(gindex, *args)
  File "<ipython-input-12-a3626b0087a3>", line 31, in __call__
    self.save_checkpoint(epoch_score, model, model_path)
  File "<i

Exception: ignored

In [None]:
# Predict the test set with the checkpoint saved for fold 1.
make_inference(1)

HBox(children=(FloatProgress(value=0.0, max=59.0), HTML(value='')))


Submission file saved at: 
 /content/drive/My Drive/Rakuten/dataset/'y_test_task1_phase1_pred_fold1.tsv'


Model 1: 86.94 on the leaderboard (LB), 98 on cross-validation (CV).

In [None]:
# Train fold 2 on 8 processes, then predict the test set with its checkpoint.
fold = 2
FLAGS = {}  # handed to every worker as `flags`; mp_wrapper does not read it
def mp_wrapper(rank, flags):
    """Entry point for xmp.spawn: set the default tensor type and run world_func for the module-level `fold`."""
    torch.set_default_tensor_type('torch.FloatTensor')
    world_func(fold)

# Launch 8 worker processes using the 'fork' start method.
xmp.spawn(mp_wrapper, args=(FLAGS,), nprocs=8, start_method='fork')

# run inference
make_inference(fold)

Running Fold 2
num_train_steps = 2653, world_size = 8
--------------------------------------------------
Running Epoch #0 
-------------------------------------------------- 

Training...
Batch: 0, Training loss: 3.3442208766937256
Batch: 100, Training loss: 2.3326966514681824
Batch: 200, Training loss: 1.7737489384205187
Training time: 362.49 secs
Evaluating...
Batch: 0, Evaluation loss: 0.5701291561126709
Batch: 100, Evaluation loss: 0.7560484521459825
Batch: 200, Evaluation loss: 0.7328266193024555
Evaluation time: 116.84 secs 

VALID ACCURACY: 0.845754716981132
VALID MACRO_F1: 0.8023497792939893
Validation score improved (-inf --> 0.8023497792939893). Saving model!
--------------------------------------------------
Running Epoch #1 🔥
-------------------------------------------------- 

Training...
Batch: 0, Training loss: 0.5969314575195312
Batch: 100, Training loss: 0.7191913231174545
Batch: 200, Training loss: 0.6337611312741664
Training time: 343.47 secs
Evaluating...
Batch: 0, 

HBox(children=(FloatProgress(value=0.0, max=59.0), HTML(value='')))


Submission file saved at: 
 /content/drive/My Drive/Rakuten/dataset/'y_test_task1_phase1_pred_fold2.tsv'


In [None]:
# Train fold 3 on 8 processes, then predict the test set with its checkpoint.
fold = 3
FLAGS = {}  # handed to every worker as `flags`; mp_wrapper does not read it
def mp_wrapper(rank, flags):
    """Entry point for xmp.spawn: set the default tensor type and run world_func for the module-level `fold`."""
    torch.set_default_tensor_type('torch.FloatTensor')
    world_func(fold)

# Launch 8 worker processes using the 'fork' start method.
xmp.spawn(mp_wrapper, args=(FLAGS,), nprocs=8, start_method='fork')

# run inference
make_inference(fold)

Running Fold 3
num_train_steps = 2653, world_size = 8
--------------------------------------------------
Running Epoch #0 
-------------------------------------------------- 

Training...
Batch: 0, Training loss: 3.4504196643829346
Batch: 100, Training loss: 2.318692701877934
Batch: 200, Training loss: 1.772972083803433
Training time: 345.6 secs
Evaluating...
Batch: 0, Evaluation loss: 0.7343528866767883
Batch: 100, Evaluation loss: 0.7351292678625276
Batch: 200, Evaluation loss: 0.724468876918157
Evaluation time: 110.17 secs 

VALID ACCURACY: 0.8474056603773585
VALID MACRO_F1: 0.7985932520183028
Validation score improved (-inf --> 0.7985932520183028). Saving model!
--------------------------------------------------
Running Epoch #1 🔥
-------------------------------------------------- 

Training...
Batch: 0, Training loss: 0.7009313106536865
Batch: 100, Training loss: 0.698773587989335
Batch: 200, Training loss: 0.6408535894469836
Training time: 340.63 secs
Evaluating...
Batch: 0, Eval

HBox(children=(FloatProgress(value=0.0, max=59.0), HTML(value='')))


Submission file saved at: 
 /content/drive/My Drive/Rakuten/dataset/'y_test_task1_phase1_pred_fold3.tsv'


In [None]:
# Train fold 4 on 8 processes, then predict the test set with its checkpoint.
fold = 4
FLAGS = {}  # handed to every worker as `flags`; mp_wrapper does not read it
def mp_wrapper(rank, flags):
    """Entry point for xmp.spawn: set the default tensor type and run world_func for the module-level `fold`."""
    torch.set_default_tensor_type('torch.FloatTensor')
    world_func(fold)

# Launch 8 worker processes using the 'fork' start method.
xmp.spawn(mp_wrapper, args=(FLAGS,), nprocs=8, start_method='fork')

# run inference
make_inference(fold)

Running Fold 4
num_train_steps = 2653, world_size = 8
--------------------------------------------------
Running Epoch #0 
-------------------------------------------------- 

Training...
Batch: 0, Training loss: 3.3859407901763916
Batch: 100, Training loss: 2.3578107888155646
Batch: 200, Training loss: 1.806221757955219
Training time: 359.04 secs
Evaluating...
Batch: 0, Evaluation loss: 0.821690022945404
Batch: 100, Evaluation loss: 0.7831577971430108
Batch: 200, Evaluation loss: 0.759827194373999
Evaluation time: 115.28 secs 

VALID ACCURACY: 0.8340801886792453
VALID MACRO_F1: 0.7875756128803119
Validation score improved (-inf --> 0.7875756128803119). Saving model!
--------------------------------------------------
Running Epoch #1 🔥
-------------------------------------------------- 

Training...
Batch: 0, Training loss: 0.912128746509552
Batch: 100, Training loss: 0.7352562018550268
Batch: 200, Training loss: 0.6702111697760388
Training time: 338.54 secs
Evaluating...
Batch: 0, Eva

If you are using a big model, declare the model as a global above the multiprocessing function. Also change the pytorch_xla version to nightly.