In [1]:
!pip install langdetect
!pip install fastai

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 981.5/981.5 kB 54.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... done
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993225 sha256=97cd4a3ce871a6a0c0550184f2893834fb5befd341a01455f1d844aed8e50dea
  Stored in directory: /root/.cache/pip/wheels/6a/67/f8/9cf1a8ff87e0b37f738769df49cc142a655489a6d27b68089f
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
Collecting fastai
  Downloading fastai-2.7.14-py3-none-any.whl (232 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.2/232.2 kB 18.6 MB/s eta 0:00:00
Collecting fastdownload<2,>=0.0.5
  Downloading fastdownload-0.0.7-

# 1. Import data

In [2]:
import pandas as pd

# Input CSV, resolved relative to the notebook's working directory.
filename = 'bios.csv'

# Parse the whole file and keep only its `text` column (a pandas Series).
texts = pd.read_csv(filename).loc[:, 'text']

# 2. Import pre-trained model

In [3]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch

# Checkpoint identifier shared by the tokenizer and the model so that both match.
pretrained_weights = 'gpt2'

# Tokenizer first, then the language model (same order as the download output of this cell).
tokenizer, model = (
    GPT2TokenizerFast.from_pretrained(pretrained_weights),
    GPT2LMHeadModel.from_pretrained(pretrained_weights),
)

Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/523M [00:00<?, ?B/s]

# 3. Load data to fast.ai Learner

In [7]:
from fastai.text.all import *

class TransformersTokenizer(Transform):
    "Transform that maps text to token ids (and back) through the wrapped `tokenizer`."

    def __init__(self, tokenizer):
        # The wrapped object must provide `tokenize`, `convert_tokens_to_ids` and `decode`.
        self.tokenizer = tokenizer

    def encodes(self, x):
        "Tokenize `x` and return its token ids as a long tensor, on CUDA when it is available."
        if torch.cuda.is_available():
            target = 'cuda'
        else:
            target = 'cpu'
        pieces = self.tokenizer.tokenize(x)
        ids = self.tokenizer.convert_tokens_to_ids(pieces)
        # `.long()` forces an integer dtype on the resulting tensor.
        return tensor(ids).to(target).long()

    def decodes(self, x):
        "Decode the token ids in `x` back to text, wrapped in a `TitledStr`."
        ids = x.cpu().numpy()  # move off the accelerator before handing to the tokenizer
        return TitledStr(self.tokenizer.decode(ids))

In [4]:
def splitter(model):
    """Split a GPT2 `model` into 4 parameter groups for differential learning rates.

    Groups, in order:
      1-3. the decoder blocks of `model.transformer.h`, cut into three contiguous
           slices (blocks 0-3, 4-7 and 8-11 for a 12-block model);
      4.   the embedding matrices `wte` and `wpe` plus the output LayerNorm `ln_f`.

    The slice boundaries are computed from the number of blocks rather than being
    hard-coded for 12, so every decoder block ends up in one of the returned groups
    whatever the depth of the checkpoint.

    Returns an `L` holding the result of `params` for each group.
    Raises `ValueError` when the model has fewer than 3 decoder blocks.
    """
    blocks = list(model.transformer.h)
    n_blocks, n_slices = len(blocks), 3
    if n_blocks < n_slices:
        raise ValueError(f"Expected at least {n_slices} decoder blocks, got {n_blocks}")

    # Boundaries n*k//3 are [0, 4, 8, 12] for 12 blocks and remain contiguous otherwise.
    bounds = [n_blocks * k // n_slices for k in range(n_slices + 1)]
    groups = [nn.Sequential(*blocks[lo:hi]) for lo, hi in zip(bounds, bounds[1:])]

    # Fourth group: embedding matrices wte and wpe + LayerNorm at the model output
    groups.append(nn.Sequential(model.transformer.wte, model.transformer.wpe, model.transformer.ln_f))

    return L(groups).map(params)

In [8]:
# The model output is indexable; only its first element is kept as the prediction
# so that fine-tuning can proceed (presumably element 0 holds the logits — TODO confirm).
class DropOutput(Callback):
    "Callback that replaces the prediction with its first element."

    def after_pred(self):
        first_output = self.pred[0]
        self.learn.pred = first_output

In [13]:
def get_learner(train_pct=0.7, bs=4, seq_len=256):
    '''Load the module-level `texts` into fast.ai and build a `Learner` around the module-level `model`.

    train_pct: fraction of `texts`, taken in order from the start, used for training;
               the remaining items form the validation split.
    bs:        batch size given to the dataloaders (default 4, the value that used to be hard-coded).
    seq_len:   sequence length given to the dataloaders (default 256, the value that used to be hard-coded).

    Returns the `Learner` after `to_fp16()` has been applied.
    '''
    # Sequential split: the first `train_len` indices train, the rest validate
    n_texts = len(texts)
    train_len = int(n_texts * train_pct)
    splits = [list(range(train_len)), list(range(train_len, n_texts))]

    # Transform texts into token ids through the module-level `tokenizer`
    tls = TfmdLists(texts, TransformersTokenizer(tokenizer), splits=splits, dl_type=LMDataLoader)

    # Create a dataloaders object that contains attributes used in the training loop
    dls = tls.dataloaders(bs=bs, seq_len=seq_len)

    # `splitter` defines the parameter groups; `DropOutput` trims the model output before the loss
    learn = Learner(dls, model, loss_func=CrossEntropyLossFlat(),
                    splitter=splitter, cbs=[DropOutput],
                    metrics=[accuracy, Perplexity()]).to_fp16()

    return learn

In [14]:
# Build the learner with the default training fraction spelled out.
learn = get_learner(train_pct=0.7)

In [6]:
def initial_fine_tune(learn):
    '''Run 1 epoch on the frozen pre-trained model and save it as "1epoch".

    Steps: print the validation results obtained before any training, freeze the
    learner, take one third of the `valley` suggestion of `lr_find()` as learning
    rate, train for one cycle, plot the recorded losses and save.

    Returns `(learn, lr)` so that `lr` can be passed to the later fine-tuning steps.
    '''
    # The value returned by `validate()` used to be discarded inside this function,
    # so nothing was shown; print it to actually report the pre-fine-tuning metrics.
    print(learn.validate())

    # Freeze pre-trained model
    learn.freeze()

    # Fine-tune once, at a third of the suggested learning rate
    lr_rec = learn.lr_find().valley
    lr = lr_rec * 1/3
    learn.fit_one_cycle(1, lr)

    learn.recorder.plot_loss()

    learn.save("1epoch")

    return learn, lr

In [8]:
def fine_tune_10epoch(learn, lr):
    '''Train the frozen pre-trained model for 10 epochs and save it as "10epoch".

    `lr` is meant to be the rate returned by `initial_fine_tune()`; one tenth of it is used.
    Returns `(learn, reduced_lr)` where `reduced_lr` is the rate actually trained with.
    '''
    n_epochs = 10
    reduced_lr = lr * (1/10)

    # Keep the pre-trained layers frozen while training
    learn.freeze()
    learn.fit_one_cycle(n_epochs, reduced_lr)

    learn.save("10epoch")

    return learn, reduced_lr

In [None]:
def fine_tune_unfreeze_2_layers(learn, lr):
    '''Train 1 epoch after `freeze_to(-2)` and save the model as "unfreeze_2_layers".

    `lr` is meant to be the rate from `initial_fine_tune()`; half of it becomes the
    upper end of the learning-rate slice and the lower end is that value / 2.6**4.
    Returns `(learn, top_lr)` where `top_lr` is the halved rate.
    '''
    learn.freeze_to(-2)

    top_lr = lr * (1/2)
    bottom_lr = top_lr / (2.6**4)
    learn.fit_one_cycle(1, slice(bottom_lr, top_lr))

    learn.recorder.plot_loss()
    learn.save("unfreeze_2_layers")

    return learn, top_lr

In [None]:
def fine_tune_unfreeze_3_layers(learn, lr):
    '''Train 1 epoch after `freeze_to(-3)` and save the model as "unfreeze_3_layers".

    `lr` is meant to be the rate returned by `fine_tune_unfreeze_2_layers()`; half of it
    becomes the upper end of the learning-rate slice and the lower end is that value / 2.6**4.
    Returns only `learn` (the adjusted rate is not handed back).
    '''
    learn.freeze_to(-3)

    top_lr = lr * (1/2)
    bottom_lr = top_lr / (2.6**4)
    learn.fit_one_cycle(1, slice(bottom_lr, top_lr))

    learn.recorder.plot_loss()
    learn.save("unfreeze_3_layers")

    return learn

In [None]:
def fine_tune_unfreeze(learn, lr):
    '''Train 1 epoch with everything unfrozen and save the model as "fine-tuned".

    `lr` is meant to be the rate returned by `fine_tune_unfreeze_2_layers()`; a tenth of it
    becomes the upper end of the learning-rate slice and the lower end is that value / 2.6**4.
    Returns only `learn`.
    '''
    learn.unfreeze()

    top_lr = lr * (1/10)
    bottom_lr = top_lr / (2.6**4)
    learn.fit_one_cycle(1, slice(bottom_lr, top_lr))

    learn.recorder.plot_loss()
    learn.save("fine-tuned")

    return learn