In [None]:
# dependencies installments
!pip install transformers

In [None]:
# Mount Google Drive; the dataset path and the model save path used in this
# notebook both live under /content/gdrive.
from google.colab import drive

drive.mount(
    '/content/gdrive',
    force_remount=True,  # request a fresh mount on every run of this cell
)

Mounted at /content/gdrive


In [None]:
# load data
import os
import json

data_path = '/content/gdrive/MyDrive/news-split-data-processed'


def _load_json(file_name):
    """Read and parse one JSON file from the dataset directory ``data_path``.

    Args:
        file_name: Name of the JSON file inside ``data_path``.

    Returns:
        The parsed JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Be explicit about the encoding instead of relying on the platform default.
    with open(os.path.join(data_path, file_name), 'r', encoding='utf-8') as f:
        return json.load(f)


train_text = _load_json('train_text.json')
train_label = _load_json('train_label.json')

# The test split (test_text.json / test_label.json) is not loaded in this notebook.

# Texts and labels are sliced with the same sample size further down, so a
# length mismatch would silently misalign examples and labels.
if len(train_text) != len(train_label):
    raise ValueError(
        'train_text has {} items but train_label has {}'.format(
            len(train_text), len(train_label)
        )
    )


In [None]:
# tokenize
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Only a small portion of the data is used to try out the result: the first
# quarter of the training texts, together with the labels at the same positions.
sample_size = len(train_text) // 4
train_encodings = tokenizer(
    train_text[:sample_size],
    truncation=True,
    padding=True,
)
train_label = train_label[:sample_size]

In [None]:
# define dataset
import torch
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` must offer ``.items()`` yielding ``(field_name, values)``
    pairs where ``values`` is indexable by example position; ``labels`` must
    be indexable by example position and support ``len()``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field at position ``idx`` as a
        # tensor, plus the label stored under the 'labels' key.
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example


In [None]:
# use Trainer to train
from transformers import (
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

training_args = TrainingArguments(
    output_dir='./results-3',        # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,               # report the training loss every 100 steps
)

train_dataset = NewsDataset(train_encodings, train_label)

# The classifier head of this checkpoint is newly (randomly) initialised, and
# Trainer only seeds the RNGs when it is constructed, i.e. after the model
# already exists. Seed first so the head initialisation is reproducible.
set_seed(training_args.seed)

# NOTE(review): num_labels is not passed, so the library default is used --
# presumably this is a binary task; confirm against the values in train_label.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset          # training dataset
)

trainer.train()


Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier

Step,Training Loss
100,0.602
200,0.253
300,0.1562
400,0.1482
500,0.155
600,0.1223
700,0.1265
800,0.1182
900,0.1226
1000,0.1092


Saving model checkpoint to ./results-3/checkpoint-500
Configuration saved in ./results-3/checkpoint-500/config.json
Model weights saved in ./results-3/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results-3/checkpoint-1000
Configuration saved in ./results-3/checkpoint-1000/config.json
Model weights saved in ./results-3/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results-3/checkpoint-1500
Configuration saved in ./results-3/checkpoint-1500/config.json
Model weights saved in ./results-3/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./results-3/checkpoint-2000
Configuration saved in ./results-3/checkpoint-2000/config.json
Model weights saved in ./results-3/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./results-3/checkpoint-2500
Configuration saved in ./results-3/checkpoint-2500/config.json
Model weights saved in ./results-3/checkpoint-2500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingfa

TrainOutput(global_step=2607, training_loss=0.11544157000487912, metrics={'train_runtime': 1955.3107, 'train_samples_per_second': 21.332, 'train_steps_per_second': 1.333, 'total_flos': 5525347665340416.0, 'train_loss': 0.11544157000487912, 'epoch': 1.0})

In [17]:
# save model

# Write the trained model (configuration and weights) to Google Drive.
model_save_dir = '/content/gdrive/MyDrive/news_peace_models/distilbert-uncased-train-on-processed-data'
trainer.save_model(model_save_dir)

Saving model checkpoint to /content/gdrive/MyDrive/news_peace_models/distilbert-uncased-train-on-processed-data
Configuration saved in /content/gdrive/MyDrive/news_peace_models/distilbert-uncased-train-on-processed-data/config.json
Model weights saved in /content/gdrive/MyDrive/news_peace_models/distilbert-uncased-train-on-processed-data/pytorch_model.bin
