In [None]:
!nvidia-smi

In [None]:
!pip install --upgrade jupyter

In [3]:
!pip install -q transformers

In [None]:
!pip install ipywidgets

In [None]:
!pip install ipywidgets widgetsnbextension pandas-profiling

In [6]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [None]:
!pip install -U scikit-learn

In [None]:
!pip install tensorboard

In [17]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from transformers import AdamW
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [18]:
class JpSentiDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [19]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [20]:
df = pd.read_csv("train35404_nolast.tsv", sep='\t', header=None,
                 names=["id", "text1", "text2","last_text", "label", "usage"])
df["text"] = df["text1"] + "[SEP]" + df["text2"] + "[SEP]" + df["last_text"]

df_train = df[(df["usage"] == "train")]
temp1 = df_train["text"]
train_docs = temp1[:].tolist()
temp2 = df_train["label"]
train_labels = temp2[:].tolist()
print(len(train_docs))

3248378


In [21]:
df_dev = pd.read_csv("1000_dev.tsv", sep='\t', header=None,
                 names=["id", "text1", "text2","last_text", "label", "usage"])
df_dev["text"] = df_dev["text1"] + "[SEP]" + df_dev["text2"] + "[SEP]" + df_dev["last_text"]

df_dev1 = df_dev[(df_dev["usage"] == "test")]
temp3 = df_dev1["text"]
dev_docs = temp3[:].tolist()
temp4 = df_dev1["label"]
dev_labels = temp4[:].tolist()
print(len(dev_docs))

device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)

61580
cuda:0


In [22]:
model_name = "cl-tohoku/bert-large-japanese"
# model_name = "cl-tohoku/bert-base-japanese-v2"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model = model.to(device)
tokenizer = BertTokenizer.from_pretrained(model_name)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1354281605.0), HTML(value='')))




Some weights of the model checkpoint at cl-tohoku/bert-large-japanese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were 

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=236001.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=174.0), HTML(value='')))




The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [23]:
train_encodings = tokenizer(train_docs, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
dev_encodings = tokenizer(dev_docs, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
train_dataset = JpSentiDataset(train_encodings, train_labels)
dev_dataset = JpSentiDataset(dev_encodings, dev_labels)

In [24]:
training_args = TrainingArguments(
    output_dir="./results/5e-6",  # output directory
    num_train_epochs=10,  # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,  # batch size for evaluation
    evaluation_strategy="epoch",     # Evaluation is done at the end of each epoch.
    dataloader_pin_memory=False,
    save_strategy="epoch",
    logging_dir="./logs/5e-6",
    logging_strategy="epoch",
    tf32=True,
    learning_rate=5e-6
)

In [25]:
trainer = Trainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=dev_dataset,  # evaluation dataset
    compute_metrics=compute_metrics  # The function that will be used to compute metrics at evaluation
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 3248378
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2030240
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6001,0.742837,0.613901,0.613584,0.614276,0.613901
2,0.5359,0.812284,0.616661,0.616432,0.61694,0.616661
3,0.4861,0.889823,0.615525,0.61537,0.61571,0.615525
4,0.4437,0.94853,0.611838,0.611831,0.611847,0.611838
5,0.4088,1.007247,0.611903,0.611853,0.611961,0.611903
6,0.3814,1.07073,0.611026,0.610967,0.611094,0.611026
7,0.3597,1.097035,0.609922,0.609901,0.609946,0.609922
8,0.3438,1.145269,0.610864,0.610859,0.61087,0.610864


***** Running Evaluation *****
  Num examples = 61580
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-203024
Configuration saved in ./results/checkpoint-203024/config.json
Model weights saved in ./results/checkpoint-203024/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 61580
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-406048
Configuration saved in ./results/checkpoint-406048/config.json
Model weights saved in ./results/checkpoint-406048/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 61580
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-609072
Configuration saved in ./results/checkpoint-609072/config.json
Model weights saved in ./results/checkpoint-609072/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encoding

trainer.train()

In [None]:
save_dir = "./train35404_5e-7_epoch10"
tokenizer.save_pretrained(save_dir)
model.save_pretrained(save_dir)