In [2]:
import collections
import os
import random

import matplotlib.pyplot as plt
import nlp
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
from transformers import AdamW, AutoModel, AutoTokenizer
import warnings
warnings.simplefilter('ignore')

# seeds
SEED = 42


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU, plus every CUDA device when CUDA is available). ``PYTHONHASHSEED``
    is also exported to the process environment.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # CPU-side generators all take the seed as their single argument.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

    if not torch.cuda.is_available():
        return

    # Seed the current CUDA device, then all of them.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


seed_everything(SEED)

if torch.cuda.is_available():
    # Trade cuDNN autotuning for repeatable kernels so seeded runs are comparable.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Report which GPU this run is using.
    current_device = torch.cuda.current_device()
    print("Device:", torch.cuda.get_device_name(current_device))


# config
# Input / output locations (relative to the notebook's working directory).
data_dir = "input/"
TRAIN_FILE = os.path.join(data_dir, "data.csv")
TEST_FILE = os.path.join(data_dir, "test.csv")
MODELS_DIR = "models/"

# Hardware: first CUDA device when present, otherwise CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Pretrained checkpoint used for both the tokenizer and the encoder.
MODEL_NAME = 'bert-base-uncased'

# Training hyper-parameters.
NUM_CLASSES = 4
NUM_SPLITS = 5
EPOCHS = 5
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 128


# dataset
def make_folded_df(csv_file, num_splits=5):
    """Load the training CSV and tag every row with a stratified fold index.

    The ``jobflag`` column is decremented by one and renamed to ``labels``
    (presumably converting 1-based flags to 0-based class indices; the
    inference code adds one back before writing the submission). A ``kfold``
    column is appended holding, for each row, the index of the fold in which
    that row is part of the validation split.

    Args:
        csv_file: Path to a CSV file containing at least a ``jobflag`` column.
        num_splits: Number of stratified folds.

    Returns:
        The loaded DataFrame with ``jobflag`` renamed to ``labels`` and a new
        float ``kfold`` column (it is initialised with NaN, so fold ids are
        stored as floats).
    """
    df = pd.read_csv(csv_file)
    df["jobflag"] = df["jobflag"] - 1
    df["kfold"] = np.nan
    df = df.rename(columns={'jobflag': 'labels'})
    label = df["labels"].tolist()

    # Look the column up by name: a hard-coded position would overwrite some
    # other column whenever the CSV does not have exactly three columns.
    kfold_col = df.columns.get_loc("kfold")

    skfold = StratifiedKFold(num_splits, shuffle=True, random_state=SEED)
    for fold, (_, valid_indexes) in enumerate(skfold.split(range(len(label)), label)):
        # valid_indexes are row positions, so assign positionally in one shot.
        df.iloc[valid_indexes, kfold_col] = fold
    return df

def make_dataset(df, tokenizer, device):
    """Turn a DataFrame into a tokenized, torch-formatted ``nlp.Dataset``.

    Each row's ``description`` text is passed through ``tokenizer`` with
    padding/truncation to a fixed length of 128. The resulting dataset is
    formatted to yield torch tensors for the model inputs and ``labels``.

    Args:
        df: DataFrame with ``description`` and ``labels`` columns.
        tokenizer: Callable tokenizer; its output is expected to provide
            ``input_ids``, ``token_type_ids`` and ``attention_mask``.
        device: Device passed to ``set_format`` for the returned tensors.

    Returns:
        The tokenized dataset restricted to the four tensor columns.
    """
    def encode(example):
        # Fixed-length encoding: pad short texts, truncate long ones.
        return tokenizer(example["description"],
                         padding="max_length",
                         truncation=True,
                         max_length=128)

    tensor_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']

    tokenized = nlp.Dataset.from_pandas(df).map(encode)
    tokenized.set_format(type='torch', columns=tensor_columns, device=device)
    return tokenized


# model

class Classifier(nn.Module):
    """Pretrained transformer encoder with a dropout + linear classification head.

    The hidden state at sequence position 0 is taken as the sentence
    representation and projected to ``num_classes`` logits.

    Args:
        model_name: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
        num_classes: Number of output logits.
    """

    def __init__(self, model_name, num_classes=4):
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder's own config instead of assuming 768,
        # so checkpoints with a different hidden width work unchanged.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.zeros_(self.linear.bias)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalised class logits, one row per input sequence.

        Args:
            input_ids: Token id tensor fed to the encoder.
            attention_mask: Attention mask tensor fed to the encoder.
            token_type_ids: Segment id tensor fed to the encoder.

        Returns:
            Tensor of logits with ``num_classes`` entries per sequence.
        """
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        # With return_dict=False the encoder returns a tuple whose first item
        # is the per-token hidden states. Index it rather than unpacking a
        # fixed number of items, since not every encoder also returns a
        # pooled output.
        sequence_output = encoder_outputs[0]
        first_token_state = sequence_output[:, 0, :]
        return self.linear(self.dropout(first_token_state))


# training function
def train_fn(dataloader, model, criterion, optimizer, scheduler, device, epoch):
    """Run one training epoch and report loss, accuracy and macro-F1.

    Args:
        dataloader: Iterable of batches; each batch is a mapping with the keys
            ``input_ids``, ``attention_mask``, ``token_type_ids`` and
            ``labels``. Must support ``len()`` and expose ``.dataset``.
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler, also stepped once per batch.
        device: Unused; kept for interface compatibility. Batches are fed to
            the model as-is, so they must already live on the model's device.
        epoch: Zero-based epoch index, used only for the progress-bar label.

    Returns:
        Tuple ``(train_loss, train_acc, train_f1)``: mean per-batch loss,
        fraction of correct predictions over ``len(dataloader.dataset)``, and
        macro-averaged F1 over all samples seen.
    """
    model.train()
    total_loss = 0.0
    total_corrects = 0
    all_labels = []
    all_preds = []

    progress = tqdm(dataloader, total=len(dataloader))

    for i, batch in enumerate(progress):
        progress.set_description(f"<Train> Epoch{epoch+1}")

        # Look tensors up by name: positional unpacking of batch.values()
        # silently mixes inputs up if the column order ever changes.
        labels = batch["labels"]

        optimizer.zero_grad()
        outputs = model(batch["input_ids"], batch["attention_mask"], batch["token_type_ids"])

        loss = criterion(outputs, labels)  # compute the loss
        preds = outputs.argmax(dim=1)  # predicted label = highest logit

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        # Accumulate as a plain Python int rather than a growing device tensor.
        total_corrects += (preds == labels).sum().item()

        all_labels += labels.tolist()
        all_preds += preds.tolist()

        # Running metrics over everything seen so far in this epoch.
        progress.set_postfix(loss=total_loss/(i+1), f1=f1_score(all_labels, all_preds, average="macro"))

    train_loss = total_loss / len(dataloader)
    train_acc = total_corrects / len(dataloader.dataset)
    train_f1 = f1_score(all_labels, all_preds, average="macro")

    return train_loss, train_acc, train_f1


def eval_fn(dataloader, model, criterion, device, epoch):
    """Evaluate the model on a dataloader without updating any weights.

    Args:
        dataloader: Iterable of batches; each batch is a mapping with the keys
            ``input_ids``, ``attention_mask``, ``token_type_ids`` and
            ``labels``. Must support ``len()`` and expose ``.dataset``.
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Unused; kept for interface compatibility. Batches are fed to
            the model as-is, so they must already live on the model's device.
        epoch: Zero-based epoch index, used only for the progress-bar label.

    Returns:
        Tuple ``(valid_loss, valid_acc, valid_f1)``: mean per-batch loss,
        fraction of correct predictions over ``len(dataloader.dataset)``, and
        macro-averaged F1 over all samples seen.
    """
    model.eval()
    total_loss = 0.0
    total_corrects = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        progress = tqdm(dataloader, total=len(dataloader))

        for i, batch in enumerate(progress):
            progress.set_description(f"<Valid> Epoch{epoch+1}")

            # Look tensors up by name: positional unpacking of batch.values()
            # silently mixes inputs up if the column order ever changes.
            labels = batch["labels"]
            outputs = model(batch["input_ids"], batch["attention_mask"], batch["token_type_ids"])

            loss = criterion(outputs, labels)
            preds = outputs.argmax(dim=1)  # predicted label = highest logit

            total_loss += loss.item()
            # Accumulate as a plain Python int rather than a growing device tensor.
            total_corrects += (preds == labels).sum().item()

            all_labels += labels.tolist()
            all_preds += preds.tolist()

            # Running metrics over everything seen so far in this pass.
            progress.set_postfix(loss=total_loss/(i+1), f1=f1_score(all_labels, all_preds, average="macro"))

    valid_loss = total_loss / len(dataloader)
    valid_acc = total_corrects / len(dataloader.dataset)
    valid_f1 = f1_score(all_labels, all_preds, average="macro")

    return valid_loss, valid_acc, valid_f1


def plot_training(train_losses, train_accs, train_f1s,
                  valid_losses, valid_accs, valid_f1s,
                  epoch, fold):
    """Save train-vs-valid curves for loss, accuracy and F1 of one fold.

    Three PNG files are written (overwriting earlier ones for the same fold):
    ``figures/loss_plot_fold={fold}.png``, ``figures/acc_plot_fold={fold}.png``
    and ``figures/f1_plot_fold={fold}.png``.

    Args:
        train_losses, train_accs, train_f1s: Per-epoch training metrics.
        valid_losses, valid_accs, valid_f1s: Per-epoch validation metrics.
        epoch: Zero-based index of the latest epoch; every metric list is
            expected to hold ``epoch + 1`` values.
        fold: Fold index, used in the output file names.
    """
    # Create the output directory up front so savefig cannot fail on a fresh checkout.
    os.makedirs("figures", exist_ok=True)

    epoch_numbers = range(1, epoch+2)  # x axis: 1-based epoch numbers
    curves = {
        "loss": (train_losses, valid_losses),
        "acc": (train_accs, valid_accs),
        "f1": (train_f1s, valid_f1s),
    }

    for metric, (train_values, valid_values) in curves.items():
        history = pd.DataFrame({"Train": train_values,
                                "Valid": valid_values},
                               index=epoch_numbers)
        # Draw on an explicit figure and close it afterwards: no state leaks
        # into the implicit current figure and no empty figure is left behind.
        fig, ax = plt.subplots()
        sns.lineplot(data=history, ax=ax)
        fig.savefig(f"figures/{metric}_plot_fold={fold}.png", dpi=300)
        plt.close(fig)

def trainer(fold, df):
    """Fine-tune a fresh classifier with ``fold`` held out for validation.

    Rows whose ``kfold`` differs from ``fold`` form the training set and the
    remaining rows the validation set. After every epoch the metric plots are
    refreshed, and the model weights are saved whenever the validation
    macro-F1 improves on the best value seen so far.

    Args:
        fold: Index of the validation fold.
        df: DataFrame with ``description``, ``labels`` and ``kfold`` columns.

    Returns:
        The best validation macro-F1 reached over all epochs.
    """
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_dataset = make_dataset(train_df, tokenizer, DEVICE)
    valid_dataset = make_dataset(valid_df, tokenizer, DEVICE)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False
    )

    model = Classifier(MODEL_NAME, num_classes=NUM_CLASSES)
    model = model.to(DEVICE)

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    # Dummy scheduler: gamma=1.0 keeps the learning rate constant.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100000, gamma=1.0)

    train_losses = []
    train_accs = []
    train_f1s = []
    valid_losses = []
    valid_accs = []
    valid_f1s = []

    best_loss = np.inf
    best_acc = 0
    best_f1 = 0

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(MODELS_DIR, exist_ok=True)

    for epoch in range(EPOCHS):
        train_loss, train_acc, train_f1 = train_fn(train_dataloader, model, criterion, optimizer, scheduler, DEVICE, epoch)
        valid_loss, valid_acc, valid_f1 = eval_fn(valid_dataloader, model, criterion, DEVICE, epoch)
        print(f"Loss: {valid_loss}  Acc: {valid_acc}  f1: {valid_f1}  ", end="")

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        train_f1s.append(train_f1)
        valid_losses.append(valid_loss)
        valid_accs.append(valid_acc)
        valid_f1s.append(valid_f1)

        plot_training(train_losses, train_accs, train_f1s,
                      valid_losses, valid_accs, valid_f1s,
                      epoch, fold)

        # Running bests; only best_f1 drives checkpointing and is returned.
        best_loss = min(best_loss, valid_loss)
        best_acc = max(best_acc, valid_acc)
        if valid_f1 > best_f1:
            best_f1 = valid_f1
            print("model saving!", end="")
            torch.save(model.state_dict(), MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth")
        print("\n")

    return best_f1


# training
# Train one model per fold and collect each fold's best validation macro-F1.
df = make_folded_df(TRAIN_FILE, NUM_SPLITS)
f1_scores = []
for fold in range(NUM_SPLITS):
    print(f"fold {fold}", "="*80)
    f1 = trainer(fold, df)
    f1_scores.append(f1)
    print(f"<fold={fold}> best score: {f1}\n")

# Cross-validation score: plain mean of the per-fold best F1 values.
cv = sum(f1_scores) / len(f1_scores)
print(f"CV: {cv}")

# Persist the per-fold scores and the CV mean as a small text report.
lines = "".join(f"fold={i}: {f1}\n" for i, f1 in enumerate(f1_scores))
lines += f"CV    : {cv}"
# Create the report directory first so a missing folder cannot discard the
# summary at the very end of a long training run.
os.makedirs("result", exist_ok=True)
with open(f"result/{MODEL_NAME}_result.txt", mode='w') as f:
    f.write(lines)


# inference
# Load the best checkpoint of every fold for ensembling.
models = []
for fold in range(NUM_SPLITS):
    # Build the head with the same class count that was used for training.
    model = Classifier(MODEL_NAME, num_classes=NUM_CLASSES)
    # map_location lets checkpoints saved on a GPU load on whichever device is active.
    state_dict = torch.load(MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth", map_location=DEVICE)
    model.load_state_dict(state_dict)
    model.to(DEVICE)
    model.eval()
    models.append(model)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
test_df = pd.read_csv(TEST_FILE)
# Placeholder labels: make_dataset formats a "labels" column, which the test set lacks.
test_df["labels"] = -1
test_dataset = make_dataset(test_df, tokenizer, DEVICE)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

final_output = []
with torch.no_grad():
    progress = tqdm(test_dataloader, total=len(test_dataloader))

    for batch in progress:
        progress.set_description("<Test>")

        # Access tensors by key instead of relying on the order of batch.values().
        fold_logits = [
            fold_model(batch["input_ids"], batch["attention_mask"], batch["token_type_ids"])
            for fold_model in models
        ]

        # Ensemble by averaging logits across folds. Softmax is monotonic, so
        # taking argmax of the mean logits gives the same class as argmax of
        # their softmax.
        mean_logits = torch.stack(fold_logits).mean(dim=0)
        final_output.extend(mean_logits.argmax(dim=1).cpu().tolist())

submit = pd.read_csv(os.path.join(data_dir, "submit_sample.csv"), names=["id", "labels"])
submit["labels"] = final_output
# Shift predictions back up by one, undoing the -1 applied to jobflag at load time.
submit["labels"] = submit["labels"] + 1

os.makedirs("./output", exist_ok=True)
try:
    # `cv` exists only when the training cell ran in this session; keep the
    # try body minimal so unrelated errors from to_csv are not swallowed.
    submit_path = "./output/Bert_submission_cv{}.csv".format(str(cv).replace(".", "")[:10])
except NameError:
    submit_path = "./output/submission.csv"
submit.to_csv(submit_path, index=False, header=False)

Device: NVIDIA GeForce RTX 3060


  0%|          | 0/2426 [00:00<?, ?it/s]

  0%|          | 0/607 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5089432001113892  Acc: 0.8220757825370676  f1: 0.78549063097655  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4897942185401917  Acc: 0.8401976935749588  f1: 0.8278349877849817  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5089629292488098  Acc: 0.8352553542009885  f1: 0.78765044717482  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.524033498764038  Acc: 0.8467874794069192  f1: 0.817538056560454  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5388268440961838  Acc: 0.8632619439868204  f1: 0.8315078388631147  model saving!

<fold=0> best score: 0.8315078388631147



  0%|          | 0/2426 [00:00<?, ?it/s]

  0%|          | 0/607 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5539555847644806  Acc: 0.800658978583196  f1: 0.7239946010113116  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.511066448688507  Acc: 0.8039538714991763  f1: 0.7444081235220475  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5111147075891495  Acc: 0.8154859967051071  f1: 0.7464313324016885  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6225999742746353  Acc: 0.8220757825370676  f1: 0.7789546285077811  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6286753609776496  Acc: 0.8121911037891268  f1: 0.7443688069852301  

<fold=1> best score: 0.7789546285077811



  0%|          | 0/2426 [00:00<?, ?it/s]

  0%|          | 0/607 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.45950919985771177  Acc: 0.828665568369028  f1: 0.696688421561518  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.42364715933799746  Acc: 0.8500823723228995  f1: 0.8110460040003846  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.519795161485672  Acc: 0.8105436573311368  f1: 0.7712600004027007  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.49212340712547303  Acc: 0.8401976935749588  f1: 0.7917577278559502  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5077079772949219  Acc: 0.8319604612850082  f1: 0.7799357864754008  

<fold=2> best score: 0.8110460040003846



  0%|          | 0/2427 [00:00<?, ?it/s]

  0%|          | 0/606 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4847229480743408  Acc: 0.8366336633663366  f1: 0.6968075406678069  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4120050936937332  Acc: 0.8597359735973598  f1: 0.8075501741552512  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.3721938908100128  Acc: 0.8844884488448845  f1: 0.8327200732003059  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.39284360110759736  Acc: 0.8679867986798679  f1: 0.8258821443027093  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4976678937673569  Acc: 0.8597359735973598  f1: 0.8357389864067937  model saving!

<fold=3> best score: 0.8357389864067937



  0%|          | 0/2427 [00:00<?, ?it/s]

  0%|          | 0/606 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4674301266670227  Acc: 0.8135313531353136  f1: 0.7109500715663224  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4831689357757568  Acc: 0.8234323432343235  f1: 0.7910954492944154  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4802979439496994  Acc: 0.8399339933993399  f1: 0.7937137929422423  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5206526637077331  Acc: 0.8382838283828383  f1: 0.7838051218694727  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5255190283060074  Acc: 0.858085808580858  f1: 0.8105744035300384  model saving!

<fold=4> best score: 0.8105744035300384

CV: 0.8135643722616225


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predic

  0%|          | 0/1517 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

<Figure size 432x288 with 0 Axes>

In [8]:
# Marker output: makes it easy to see that execution reached this final cell.
print("finish")

finish
