# distilbert-base-uncased

# Install Libraries

In [1]:
!pip install transformers



# Import Library

In [2]:
import json
import copy
import time
import torch
import random
import datetime
import pickle
import numpy as np
import pandas as pd
from os import listdir, path
import tensorflow as tf
from xml.dom import minidom
from os.path import isfile, join
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import BertConfig, BertTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, AdamW, get_linear_schedule_with_warmup

# 1. Setup

## 1.1. Using Colab GPU for Training

In [3]:
# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [4]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


# 2. Loading KP Dataset

In [5]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## 2.1. Get All Filenames

In [6]:
root = "drive/MyDrive/NLP/Tugas Akhir/keyword extraction resources/"
saved_dataset_path = "drive/MyDrive/NLP/Tugas Akhir/keyword extraction resources/saved_dataset/"

In [7]:
# testarr = [1,2,3,4,5]
# with open(saved_dataset_path + 'testarr', 'wb') as f:
#     pickle.dump(testarr, f)

In [8]:
# with open(saved_dataset_path + 'testarr', 'rb') as f:
#     testtesttest = pickle.load(f)
# testtesttest

In [9]:
if not path.isfile(saved_dataset_path + 'dataset_filenames'):
  dataset_path = "drive/MyDrive/NLP/Tugas Akhir/keyword extraction resources/dataset/train"
  datasets_filenames = [f for f in listdir(dataset_path) if isfile(join(dataset_path, f))]
  #Sementara dibikin 5 filenames aja dulu
  # datasets_filenames = datasets_filenames[:5]
  #Sementara dibikin 5 filenames aja dulu
else:
  print("Dataset filenames already exist. Retrieving")
  with open(saved_dataset_path + 'dataset_filenames', 'rb') as f:
    datasets_filenames = pickle.load(f)

Dataset filenames already exist. Retrieving


In [10]:
print(len(datasets_filenames))
print(datasets_filenames)

450
['art_and_culture-20906350.xml', 'art_and_culture-20927511.xml', 'art_and_culture-20938615.xml', 'art_and_culture-20906382.xml', 'art_and_culture-20918624.xml', 'art_and_culture-20927518.xml', 'art_and_culture-20956483.xml', 'business-20906848.xml', 'art_and_culture-20922011.xml', 'art_and_culture-20906975.xml', 'art_and_culture-20932442.xml', 'art_and_culture-20934732.xml', 'art_and_culture-20922861.xml', 'art_and_culture-20941845.xml', 'art_and_culture-20938179.xml', 'art_and_culture-20924855.xml', 'art_and_culture-20945617.xml', 'art_and_culture-20919723.xml', 'art_and_culture-20900470.xml', 'art_and_culture-20946492.xml', 'art_and_culture-20923803.xml', 'art_and_culture-20943010.xml', 'art_and_culture-20927516.xml', 'art_and_culture-20927491.xml', 'art_and_culture-20927139.xml', 'art_and_culture-20902975.xml', 'art_and_culture-20927486.xml', 'business-20913435.xml', 'art_and_culture-20944018.xml', 'art_and_culture-20925876.xml', 'art_and_culture-20951992.xml', 'art_and_culture-

In [11]:
if not path.isfile(saved_dataset_path + 'dataset_filenames'):  
  with open(saved_dataset_path + 'dataset_filenames', 'wb') as f:
    pickle.dump(datasets_filenames, f)
else:
  print("Dataset filenames already exist.")

Dataset filenames already exist.


## 2.2. Read Dataset

In [12]:
if not path.isfile(saved_dataset_path + 'files_paragraphs'):
  files = []
  # parse an xml file by name
  # file = minidom.parse(root+'dataset/train/art_and_culture-20893614.xml')
  for i in range(len(datasets_filenames)):
    print(i)
    files.append(minidom.parse(root+'dataset/train/'+datasets_filenames[i]))
  print("Jumlah data: ", len(files))
else:  
  print("Dataset files_paragraphs already exist. Retrieving...")
  with open(saved_dataset_path + 'files_paragraphs', 'rb') as f:
    files_paragpraphs = pickle.load(f)  

Dataset files_paragraphs already exist. Retrieving...


In [13]:
def makeListOfSentences(file):
  sentencesList = []
  sentences = file.getElementsByTagName('sentence')
  for sentence in sentences: 
    eachSentence = ""
    words = sentence.getElementsByTagName('word')
    for i in range(len(words)-1):
      if(words[i+1].childNodes[0].data == "."):
        eachSentence += words[i].childNodes[0].data + words[i+1].childNodes[0].data
      elif(i+1 == len(words)-1):
        eachSentence += words[i].childNodes[0].data + " " + words[i+1].childNodes[0].data + "."
      else:
        eachSentence += words[i].childNodes[0].data + " "
    sentencesList.append(eachSentence)
  return sentencesList

In [14]:
if not path.isfile(saved_dataset_path + 'files_paragraphs'):
  files_paragpraphs = []
  for i in range(len(files)):
    print(i)
    files_paragpraphs.append(makeListOfSentences(files[i]))
  print(len(files_paragpraphs))
else:
  print("Dataset files_paragraphs already exist.")

Dataset files_paragraphs already exist.


In [15]:
files_paragpraphs[0]

['The Tree of Life to premiere in UK.',
 "An insider 's guide to the world of cinema by David Gritten.",
 "It 's a very British coup -- legendary US director Terrence Malick 's long-awaited new film The Tree of Life , starring Brad Pitt and Sean Penn , is to receive its world premiere not in Cannes next month , but a week earlier in the UK.",
 'Its distribution company , Icon , has confirmed its May 4 British release date to me.',
 'Speculation about the film has been swirling for more than a year ; at one point , it was expected to be screened in Cannes last year.',
 'This all adds to the mystique that surrounds the elusive Malick.',
 "The film 's American distributor , Fox Searchlight , was clearly taken by surprise by news of the UK opening , describing it as `` not true '' to a Hollywood website.",
 "My guess is that the film will still be shown in Cannes , though not in competition for the Palme d'Or.",
 'What do you get when you combine the talents of Johnny Depp , gonzo author-j

In [16]:
files_paragpraphs[1]

['Penelope Cruz gets Hollywood Walk of Fame star.',
 'Penelope Cruz gets Hollywood Walk of Fame star -LRB- AP -RRB- LOS ANGELES -LRB- AP -RRB- -- Penelope Cruz has been enshrined in concrete.',
 'The Oscar-winning actress unveiled her star on the Hollywood Walk of Fame star Friday , flanked by leading men Javier Bardem and Johnny Depp.',
 "The event is timed ahead of next month 's release of '' `` Pirates of the Caribbean : On Stranger Tides , '' '' in which she stars with Depp.",
 "The Spanish star says that when she came to the United States in 1994 , she only knew how to say '' `` How are you ? '' ''.",
 "and '' `` I want to work with Johnny Depp '' '' in English.",
 "She jokes that now , she knows how to say '' `` I want to work with Johnny Depp again.. '' ''.",
 "The 36-year-old actress co-starred with husband Bardem in her Oscar-winning role in '' `` Vicky Christina Barcelona.. '' ''.",
 'They welcomed their son in January.']

In [17]:
len(files_paragpraphs[0])

14

In [18]:
if not path.isfile(saved_dataset_path + 'files_paragraphs'):
  with open(saved_dataset_path + 'files_paragraphs', 'wb') as f:
    pickle.dump(files_paragpraphs, f)
else:
  print("Dataset files_paragraphs already exist.")

Dataset files_paragraphs already exist.


## 2.3 Tokenization

In [19]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


Encode until token of words

In [20]:
if not path.isfile(saved_dataset_path + 'tokenized_files'):
  tokenized_files = []
  for i in range(len(files_paragpraphs)):
    print(i)
    tokenized_sentences = []
    for sentence in files_paragpraphs[i]:
      tokenized_sentences.append(tokenizer.tokenize(sentence))
    tokenized_files.append(tokenized_sentences)
else:
  print("Dataset tokenized_files already exist. Retrieving..")
  with open(saved_dataset_path + 'tokenized_files', 'rb') as f:
    tokenized_files = pickle.load(f)

print("Jumlah data: ", len(tokenized_files))

Dataset tokenized_files already exist. Retrieving..
Jumlah data:  450


In [21]:
print(len(tokenized_files))
print(len(tokenized_files[0]))
print(len(tokenized_files[0][0]))

450
14
9


In [22]:
print(tokenized_files[0][0])

['the', 'tree', 'of', 'life', 'to', 'premiere', 'in', 'uk', '.']


In [23]:
if not path.isfile(saved_dataset_path + 'tokenized_files'):
  with open(saved_dataset_path + 'tokenized_files', 'wb') as f:
    pickle.dump(tokenized_files, f)
else:
  print("Dataset tokenized_files already exist.")

Dataset tokenized_files already exist.


Full Encoded (until number)

In [24]:
if not (path.isfile(saved_dataset_path + 'files_input_ids') and path.isfile(saved_dataset_path + 'files_attention_masks')):
  max_len = 0
  for i in range(len(tokenized_files)):
    # for j in range(len(tokenized_files[i])):
    for sentence in (tokenized_files[i]):
      if(len(sentence) > max_len):
        max_len = len(sentence)
  print(max_len)      
else:
  print("Dataset files_input_ids & files_attention_masks already exist. Retrieving...")
  with open(saved_dataset_path + 'files_input_ids', 'rb') as f:
    files_input_ids = pickle.load(f)
  with open(saved_dataset_path + 'files_attention_masks', 'rb') as f:
    files_attention_masks = pickle.load(f)

Dataset files_input_ids & files_attention_masks already exist. Retrieving...


In [25]:
for i in range(len(tokenized_files)):
  for j in range(len(tokenized_files[i])):
    if(len(tokenized_files[i][j]) == 87):
      print(i,j)

4 10
21 6
152 13


In [26]:
len(tokenized_files[4][10])

87

In [27]:
if not (path.isfile(saved_dataset_path + 'files_input_ids') and path.isfile(saved_dataset_path + 'files_attention_masks')):
  files_input_ids = []
  files_attention_masks = []

  for i in range(len(files_paragpraphs)):
    input_ids = []
    attention_masks = []
    # print(i)
  
    for sentence in files_paragpraphs[i]:
      encoded_dict = tokenizer.encode_plus(
                        sentence,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len+2,           # Pad & truncate all sentences.
                        padding='max_length',
                        truncation = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
      )

      input_ids.append(encoded_dict['input_ids'][0])
      attention_masks.append(encoded_dict['attention_mask'][0])

    files_input_ids.append(input_ids)
    files_attention_masks.append(attention_masks)
else:
  print("Dataset files_input_ids & files_attention_masks already exist.")

Dataset files_input_ids & files_attention_masks already exist.


In [28]:
np.count_nonzero(files_input_ids[4][10])

89

In [29]:
print(len(files_input_ids))
print(len(files_input_ids[0]))
print(len(files_input_ids[0][0]))
print(np.count_nonzero(files_input_ids[0][0]))

450
14
255
11


In [30]:
if not (path.isfile(saved_dataset_path + 'files_input_ids') and path.isfile(saved_dataset_path + 'files_attention_masks')):
  with open(saved_dataset_path + 'files_input_ids', 'wb') as f:
    pickle.dump(files_input_ids, f)
  with open(saved_dataset_path + 'files_attention_masks', 'wb') as f:
    pickle.dump(files_attention_masks, f)
else:
  print("Dataset files_input_ids & files_attention_masks already exist.")

Dataset files_input_ids & files_attention_masks already exist.


## 2.4. Read Label

In [31]:
def read_keywords(filename):
  keywords = []

  # Opening JSON file
  f = open(root+'dataset/label/train.reader.json',)
   
  # returns JSON object as 
  # a dictionary
  data = json.load(f)
   
  # Iterating through the json
  # list
  for keyword in data[filename]:
    keywords.append(keyword[0])
   
  # Closing file
  f.close()

  return(keywords)

In [32]:
def duplicate_keywords(files_input_ids, keywords):
  duplicated_keywords = []
  for i in range(len(keywords)):
    for j in range(len(files_input_ids[i])):
      duplicated_keywords.append(keywords[i])
  
  return duplicated_keywords

In [33]:
def split_keywords_phrases(keywords):
  keyword_phrases_list = []
  for i in range(len(keywords)):
    keyword_phrases = []
    for j in range(len(keywords[i])):
      keyword_phrases += keywords[i][j].split(" ")

    keyword_phrases_list.append(keyword_phrases)

  return keyword_phrases_list

In [34]:
def process_label(labels_shape, duplicated_keywords, tokenized_sentences):

  labels = [ [ 0 for i in range(labels_shape[1]) ] for j in range(labels_shape[0]) ]

  for i in range(len(tokenized_sentences)):
    for j in range(len(tokenized_sentences[i])):
      if(tokenized_sentences[i][j].lower() in duplicated_keywords[i]):
        labels[i][j+1] = 1 #tambah 1 karena ada token awal CLS 

  return labels

In [35]:
def flatten_data(files_input_ids):
  input_ids = []
  for each_file_input_ids in files_input_ids:
    input_ids += each_file_input_ids
  return input_ids

In [36]:
def encode_label(labels):
  return tf.keras.utils.to_categorical(labels, num_classes=2)

In [37]:
input_ids = flatten_data(files_input_ids)
print(len(input_ids))
print(len(input_ids[0]))

9810
255


In [38]:
tokenized_sentences = flatten_data(tokenized_files)
print(len(tokenized_sentences))

9810


In [39]:
attention_masks = flatten_data(files_attention_masks)
print(len(attention_masks))

9810


In [40]:
if not path.isfile(saved_dataset_path + 'labels'):
  files_keywords = []
  for dataset_filename in datasets_filenames:
    keywords = read_keywords(dataset_filename[:-4]) #-4 untuk menghilangkan .xml dari filename
    files_keywords.append(keywords)
  print(len(files_keywords))
  print(files_keywords[0])
else:
  print("Dataset labels already exist. Retrieving")
  with open(saved_dataset_path + 'labels', 'rb') as f:
    labels = pickle.load(f)

Dataset labels already exist. Retrieving


In [41]:
if not path.isfile(saved_dataset_path + 'labels'):
  duplicated_keywords = duplicate_keywords(files_input_ids, files_keywords)
  print(len(duplicated_keywords))
  print(len(duplicated_keywords[0]))
else:
  print("Dataset labels already exist.")

Dataset labels already exist.


In [42]:
if not path.isfile(saved_dataset_path + 'labels'):
  phrases_splitted_duplicated_keywords = split_keywords_phrases(duplicated_keywords)
  print(len(phrases_splitted_duplicated_keywords))
  print(len(phrases_splitted_duplicated_keywords[0]))
else:
  print("Dataset labels already exist.")

Dataset labels already exist.


In [43]:
labels_shape = [len(input_ids), len(input_ids[0])]

In [44]:
labels_shape

[9810, 255]

In [45]:
if not path.isfile(saved_dataset_path + 'labels'):
  labels = process_label(labels_shape, phrases_splitted_duplicated_keywords, tokenized_sentences)
  print(labels[-1])
else:
  print("Dataset labels already exist.")


Dataset labels already exist.


In [46]:
if not path.isfile(saved_dataset_path + 'labels'):
  with open(saved_dataset_path + 'labels', 'wb') as f:
    pickle.dump(labels, f)
else:
  print("Dataset labels already exist.")

Dataset labels already exist.


## 3.4. Training & Validation Split

In [47]:
with open(saved_dataset_path + 'labels', 'rb') as f:
  labels = pickle.load(f)  

In [48]:
input_ids = torch.stack(input_ids)
input_ids.shape

torch.Size([9810, 255])

In [49]:
attention_masks = torch.stack(attention_masks)
attention_masks.shape

torch.Size([9810, 255])

In [50]:
labels = torch.tensor(labels)
labels.shape

torch.Size([9810, 255])

In [51]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

8,829 training samples
  981 validation samples


In [52]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

# 3. Model

In [53]:
model_checkpoint = "distilbert-base-uncased"

In [54]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, 
    num_labels=2, 
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

model.cuda()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
          

## 3.1. Predict (Test Shape)

In [55]:
# def get_list_of_keywords(tokenized_sentences, prediction):
#   keywords=[]
#   for i in range(len(tokenized_sentences)):
#     for j in range(len(tokenized_sentences[i])):
#       if(prediction[i][j+1]==1): #+1 karena ada token CLS di awal sentence
#         keywords.append(tokenized_sentences[i][j])
#   return keywords

In [56]:
# num_sentences = len(tokenized_files[0]) #file pertama
# num_sentences

In [57]:
# len(tokenized_sentences[0])

In [58]:
# prediction = model(input_ids[:num_sentences].to(device), attention_mask=attention_masks[:num_sentences].to(device))
# prediction

In [59]:
# final_prediction = prediction[0].detach().to('cpu').numpy()
# final_prediction.shape

In [60]:
# prediction_temp = np.argmax(final_prediction[:], axis=2)
# prediction_temp.shape

In [61]:
# files_keywords[0]

In [62]:
# get_list_of_keywords(tokenized_sentences[:num_sentences], prediction_temp)

## 3.1 Optimizer & Learning Rate Scheduler

In [63]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [64]:
# Number of training epochs. The BERT authors recommend between 2 and 4. 
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
epochs = 2

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

## Training Loop

In [65]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [66]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [67]:
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # In PyTorch, calling `model` will in turn call the model's `forward` 
        # function and pass down the arguments. The `forward` function is 
        # documented here: 
        # https://huggingface.co/transformers/model_doc/bert.html#bertforsequenceclassification
        # The results are returned in a results object, documented here:
        # https://huggingface.co/transformers/main_classes/output.html#transformers.modeling_outputs.SequenceClassifierOutput
        # Specifically, we'll get the loss (because we provided labels) and the
        # "logits"--the model outputs prior to activation.
        result = model(b_input_ids,
                       attention_mask=b_input_mask, 
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        logits = result.logits

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using 
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            result = model(b_input_ids,
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        # Get the loss and "logits" output by the model. The "logits" are the 
        # output values prior to applying an activation function like the 
        # softmax.
        loss = result.loss
        logits = result.logits
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of    276.    Elapsed: 0:00:50.
  Batch    80  of    276.    Elapsed: 0:01:40.
  Batch   120  of    276.    Elapsed: 0:02:30.
  Batch   160  of    276.    Elapsed: 0:03:20.
  Batch   200  of    276.    Elapsed: 0:04:10.
  Batch   240  of    276.    Elapsed: 0:05:00.

  Average training loss: 0.30
  Training epcoh took: 0:05:45

Running Validation...
  Accuracy: 0.98
  Validation Loss: 0.27
  Validation took: 0:00:15

Training...
  Batch    40  of    276.    Elapsed: 0:00:50.
  Batch    80  of    276.    Elapsed: 0:01:40.
  Batch   120  of    276.    Elapsed: 0:02:30.
  Batch   160  of    276.    Elapsed: 0:03:20.
  Batch   200  of    276.    Elapsed: 0:04:10.
  Batch   240  of    276.    Elapsed: 0:05:00.

  Average training loss: 0.26
  Training epcoh took: 0:05:45

Running Validation...
  Accuracy: 0.97
  Validation Loss: 0.27
  Validation took: 0:00:15

Training complete!
Total training took 0:11:59 (h:mm:ss)


In [68]:
# Display floats with two decimal places.
pd.set_option('precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.3,0.27,0.98,0:05:45,0:00:15
2,0.26,0.27,0.97,0:05:45,0:00:15


# 4. Save Module

In [69]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = './saved_model/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))


Saving model to ./saved_model/


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

In [70]:
# Copy the model files to a directory in your Google Drive.
!cp -r ./saved_model/ "drive/MyDrive/NLP/Tugas Akhir/keyword extraction resources/"

In [71]:
config_class, model_class, tokenizer_class = (BertConfig, AutoModelForTokenClassification, BertTokenizer)

In [72]:
# Load a trained model and vocabulary that you have fine-tuned
model = model_class.from_pretrained(output_dir)
tokenizer = tokenizer_class.from_pretrained(output_dir)

# Copy the model to the GPU.
model.to(device)

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
          