In [2]:
# Load Python programming language corpus
# Different from the course website (original data is no longer available)
from datasets import load_dataset

raw_datasets = load_dataset("theothertom/codeparrot-python-only")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['code', 'repo_name', 'path', 'language', 'license', 'size'],
        num_rows: 45001
    })
    validation: Dataset({
        features: ['code', 'repo_name', 'path', 'language', 'license', 'size'],
        num_rows: 5000
    })
})

In [3]:
# Check size
print(f"Rows: {len(raw_datasets['train'])}")
print(f"Size: {raw_datasets['train'].data.nbytes / 1e6:.1f} MB")

Rows: 45001
Size: 360.7 MB


In [4]:
print(raw_datasets["train"][0]['code'])

from django import forms
from django.core.exceptions import ValidationError
from django.core.validators import validate_slug
from django.db import models
from django.utils import simplejson as json
from django.utils.text import capfirst
from django.utils.translation import ugettext_lazy as _

from philo.forms.fields import JSONFormField
from philo.utils.registry import RegistryIterator
from philo.validators import TemplateValidator, json_validator
#from philo.models.fields.entities import *


class TemplateField(models.TextField):
	"""A :class:`TextField` which is validated with a :class:`.TemplateValidator`. ``allow``, ``disallow``, and ``secure`` will be passed into the validator's construction."""
	def __init__(self, allow=None, disallow=None, secure=True, *args, **kwargs):
		super(TemplateField, self).__init__(*args, **kwargs)
		self.validators.append(TemplateValidator(allow, disallow, secure))


class JSONDescriptor(object):
	def __init__(self, field):
		self.field = field
	
	def 

In [5]:
# Define a generator function
def get_training_corpus():
    dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        yield samples["code"]
        
training_corpus = get_training_corpus()

In [6]:
# Understand old tokenizer behavior
from transformers import AutoTokenizer
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

example = '''def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b'''

print(old_tokenizer.tokenize(example))

['def', 'Ġadd', '_', 'n', 'umbers', '(', 'a', ',', 'Ġb', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`', '."', '""', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']


In [7]:
# Train a new tokenizer from the corpus
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)






In [8]:
tokens = tokenizer.tokenize(example)
print(tokens)

['def', 'Ġadd', '_', 'numbers', '(', 'a', ',', 'Ġb', '):', 'ĊĠĠĠ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`."""', 'ĊĠĠĠ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']


In [9]:
print(len(tokens))
print(len(old_tokenizer.tokenize(example)))

27
36


In [10]:
example = """class LinearLayer():
    def __init__(self, input_size, output_size):
        self.weight = torch.randn(input_size, output_size)
        self.bias = torch.zeros(output_size)

    def __call__(self, x):
        return x @ self.weights + self.bias
    """
tokenizer.tokenize(example)

['class',
 'ĠLinear',
 'Layer',
 '():',
 'ĊĠĠĠ',
 'Ġdef',
 'Ġ__',
 'init',
 '__(',
 'self',
 ',',
 'Ġinput',
 '_',
 'size',
 ',',
 'Ġoutput',
 '_',
 'size',
 '):',
 'ĊĠĠĠĠĠĠĠ',
 'Ġself',
 '.',
 'weight',
 'Ġ=',
 'Ġtorch',
 '.',
 'randn',
 '(',
 'input',
 '_',
 'size',
 ',',
 'Ġoutput',
 '_',
 'size',
 ')',
 'ĊĠĠĠĠĠĠĠ',
 'Ġself',
 '.',
 'bias',
 'Ġ=',
 'Ġtorch',
 '.',
 'zeros',
 '(',
 'output',
 '_',
 'size',
 ')',
 'ĊĊĠĠĠ',
 'Ġdef',
 'Ġ__',
 'call',
 '__(',
 'self',
 ',',
 'Ġx',
 '):',
 'ĊĠĠĠĠĠĠĠ',
 'Ġreturn',
 'Ġx',
 'Ġ@',
 'Ġself',
 '.',
 'weights',
 'Ġ+',
 'Ġself',
 '.',
 'bias',
 'ĊĠĠĠĠ']

In [11]:
# Saving the tokenizer to disk
tokenizer.save_pretrained("../data/tokenizer_python_52k")

('../data/tokenizer_python_52k/tokenizer_config.json',
 '../data/tokenizer_python_52k/special_tokens_map.json',
 '../data/tokenizer_python_52k/vocab.json',
 '../data/tokenizer_python_52k/merges.txt',
 '../data/tokenizer_python_52k/added_tokens.json',
 '../data/tokenizer_python_52k/tokenizer.json')

In [12]:
# Upload to Hugging Face Hub
import os
from dotenv import load_dotenv

load_dotenv()

tokenizer.push_to_hub(
    "tensor-polinomics/tokenizer_python_52k",
    token=os.getenv("HF_TOKEN_WRITE")  # Bypasses all cached credentials
)

README.md: 0.00B [00:00, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/tensor-polinomics/tokenizer_python_52k/commit/84d21af557eba3635dbad11da610d307dbdb01d4', commit_message='Upload tokenizer', commit_description='', oid='84d21af557eba3635dbad11da610d307dbdb01d4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tensor-polinomics/tokenizer_python_52k', endpoint='https://huggingface.co', repo_type='model', repo_id='tensor-polinomics/tokenizer_python_52k'), pr_revision=None, pr_num=None)

In [27]:
# BatchEncoding object example
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
example = "Hugging Face's transformers library is among the greatest!"
encoding = tokenizer(example)
print(type(encoding))

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [28]:
tokenizer.is_fast

True

In [29]:
# We can access tokens directly
print(encoding.tokens())

['[CLS]', 'hugging', 'face', "'", 's', 'transformers', 'library', 'is', 'among', 'the', 'greatest', '!', '[SEP]']


In [30]:
encoding.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]

In [32]:
tokenizer_roberta = AutoTokenizer.from_pretrained("roberta-base")
example_short = "81s"
encoding_roberta = tokenizer_roberta(example_short)
encoding_roberta.tokens()

['<s>', '81', 's', '</s>']

In [33]:
encoding_bert = tokenizer(example_short)
encoding_bert.tokens()

['[CLS]', '81', '##s', '[SEP]']

In [77]:
start, end = encoding.word_to_chars(5)
print(f"Token 2 corresponds to characters from {start} to {end}: '{example[start:end]}'")

Token 2 corresponds to characters from 28 to 35: 'k at Hu'


In [37]:
# Pipeline example
from transformers import pipeline
token_classifier = pipeline("token-classification", model="dbmdz/bert-large-cased-finetuned-conll03-english")
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
token_classifier(example)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'entity': 'I-PER',
  'score': np.float32(0.99938285),
  'index': 4,
  'word': 'S',
  'start': 11,
  'end': 12},
 {'entity': 'I-PER',
  'score': np.float32(0.99815494),
  'index': 5,
  'word': '##yl',
  'start': 12,
  'end': 14},
 {'entity': 'I-PER',
  'score': np.float32(0.99590707),
  'index': 6,
  'word': '##va',
  'start': 14,
  'end': 16},
 {'entity': 'I-PER',
  'score': np.float32(0.99923277),
  'index': 7,
  'word': '##in',
  'start': 16,
  'end': 18},
 {'entity': 'I-ORG',
  'score': np.float32(0.9738931),
  'index': 12,
  'word': 'Hu',
  'start': 33,
  'end': 35},
 {'entity': 'I-ORG',
  'score': np.float32(0.976115),
  'index': 13,
  'word': '##gging',
  'start': 35,
  'end': 40},
 {'entity': 'I-ORG',
  'score': np.float32(0.9887976),
  'index': 14,
  'word': 'Face',
  'start': 41,
  'end': 45},
 {'entity': 'I-LOC',
  'score': np.float32(0.9932106),
  'index': 16,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [38]:
token_classifier = pipeline("token-classification", aggregation_strategy="simple")
token_classifier(example)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'entity_group': 'PER',
  'score': np.float32(0.9981694),
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': np.float32(0.9796019),
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': np.float32(0.9932106),
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [39]:
# Manually build up the pipeline
from transformers import AutoModelForTokenClassification, AutoTokenizer

model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

inputs = tokenizer(example, return_tensors="pt")
outputs = model(**inputs)
inputs, outputs

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


({'input_ids': tensor([[  101,  1422,  1271,  1110,   156,  7777,  2497,  1394,  1105,   146,
           1250,  1120, 20164, 10932, 10289,  1107,  6010,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 TokenClassifierOutput(loss=None, logits=tensor([[[ 8.7508, -2.2626, -1.5300, -2.2889, -0.6513, -2.0016, -0.0112,
           -2.0860,  0.3335],
          [ 8.4973, -2.3986, -1.3582, -2.7887,  0.7575, -1.8873,  0.4344,
           -1.9900, -0.3397],
          [ 9.4719, -2.2261, -0.9849, -2.6116,  0.1219, -2.0627, -0.1259,
           -1.8758, -0.0609],
          [ 9.8670, -2.2175, -1.3125, -2.4866, -0.2550, -1.8536,  0.0856,
           -1.7520, -0.6437],
          [-0.2011, -2.1873, -1.5316, -2.7110,  8.4025, -2.4168, -0.6980,
           -3.0337, -0.0997],
          [ 0.1065, -2.0520, -1.4787, -2.8139,  7.4525, -2.8399, -0.0626,
           -3.3666, -

In [41]:
print(inputs["input_ids"].shape)
print(outputs.logits.shape)

torch.Size([1, 19])
torch.Size([1, 19, 9])


In [48]:
import torch
probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()
predictions = outputs.logits.argmax(dim=-1)[0].tolist()
predictions

[0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 6, 6, 6, 0, 8, 0, 0]

In [49]:
model.config.id2label

{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

In [58]:
results = []
tokens = inputs.tokens()

for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label != "0":
        results.append({
            "entity": label,
            "score": probabilities[idx][pred],
            "word": tokens[idx],
            "start": start,
            "end": end
        })
results

[{'entity': 'O',
  'score': 0.9994322657585144,
  'word': '[CLS]',
  'start': 28,
  'end': 35},
 {'entity': 'O',
  'score': 0.9989631175994873,
  'word': 'My',
  'start': 28,
  'end': 35},
 {'entity': 'O',
  'score': 0.999708354473114,
  'word': 'name',
  'start': 28,
  'end': 35},
 {'entity': 'O',
  'score': 0.9998350143432617,
  'word': 'is',
  'start': 28,
  'end': 35},
 {'entity': 'I-PER',
  'score': 0.9993828535079956,
  'word': 'S',
  'start': 28,
  'end': 35},
 {'entity': 'I-PER',
  'score': 0.9981548190116882,
  'word': '##yl',
  'start': 28,
  'end': 35},
 {'entity': 'I-PER',
  'score': 0.995907187461853,
  'word': '##va',
  'start': 28,
  'end': 35},
 {'entity': 'I-PER',
  'score': 0.9992327690124512,
  'word': '##in',
  'start': 28,
  'end': 35},
 {'entity': 'O',
  'score': 0.999804675579071,
  'word': 'and',
  'start': 28,
  'end': 35},
 {'entity': 'O',
  'score': 0.9995046854019165,
  'word': 'I',
  'start': 28,
  'end': 35},
 {'entity': 'O',
  'score': 0.9996776580810547,

In [55]:
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True, return_tensors="pt")
offsets = inputs_with_offsets["offset_mapping"]
offsets

tensor([[[ 0,  0],
         [ 0,  2],
         [ 3,  7],
         [ 8, 10],
         [11, 12],
         [12, 14],
         [14, 16],
         [16, 18],
         [19, 22],
         [23, 24],
         [25, 29],
         [30, 32],
         [33, 35],
         [35, 40],
         [41, 45],
         [46, 48],
         [49, 57],
         [57, 58],
         [ 0,  0]]])

In [56]:
example[12:14]

'yl'

In [59]:
example[33:45]

'Hugging Face'

In [None]:
# Manually grouping tokens into entities—this version doesn't work well
import numpy as np

results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True, return_tensors="pt")
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"][0].tolist()

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = model.config.id2label[pred]
    
    if label != "0":
        # report the B- or I- label 
        label = label[2:]
        start, _ = offsets[idx]
        
        # Grab all the tokens labled with I-<label>
        all_scores = []
        while (
            idx < len(predictions) and
            model.config.id2label[predictions[idx]] == f"I-{label}"
        ):
            all_scores.append(probabilities[idx][pred])
            _, end = offsets[idx]
            idx += 1
            
        # The score is the meean of all the scores of the tokens in that group entity
        score = np.mean(all_scores).item()
        word = example[start:end]
        results.append({
            "entity": label,
            "score": score,
            "word": word,
            "start": start,
            "end": end
        })
    idx += 1
    
print(results)


[{'entity': '', 'score': nan, 'word': 'My name is Sylvain and I work at Hugging Face in Brooklyn', 'start': 0, 'end': 57}, {'entity': '', 'score': nan, 'word': 'My name is Sylvain and I work at Hugging Face in Brooklyn', 'start': 0, 'end': 57}, {'entity': '', 'score': nan, 'word': 'name is Sylvain and I work at Hugging Face in Brooklyn', 'start': 3, 'end': 57}, {'entity': '', 'score': nan, 'word': 'is Sylvain and I work at Hugging Face in Brooklyn', 'start': 8, 'end': 57}, {'entity': 'PER', 'score': 0.998169407248497, 'word': 'Sylvain', 'start': 11, 'end': 18}, {'entity': '', 'score': nan, 'word': '', 'start': 23, 'end': 18}, {'entity': '', 'score': nan, 'word': '', 'start': 25, 'end': 18}, {'entity': '', 'score': nan, 'word': '', 'start': 30, 'end': 18}, {'entity': 'ORG', 'score': 0.9796018600463867, 'word': 'Hugging Face', 'start': 33, 'end': 45}, {'entity': 'LOC', 'score': 0.99321049451828, 'word': 'Brooklyn', 'start': 49, 'end': 57}, {'entity': '', 'score': nan, 'word': 'My name is

In [None]:
# Improved version that handles missing B- labels

import numpy as np

results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True, return_tensors="pt")
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"][0].tolist()

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = model.config.id2label[pred]
    
    # Start entity on B- OR I- (model skips B- tags)
    if label.startswith("B-") or label.startswith("I-"):
        entity_type = label[2:]
        start, end = offsets[idx]
        
        all_scores = [probabilities[idx][pred]]
        idx += 1
        
        # Continue while same entity type (I- tag)
        while idx < len(predictions):
            next_label = model.config.id2label[predictions[idx]]
            if next_label == f"I-{entity_type}":
                all_scores.append(probabilities[idx][predictions[idx]])
                _, end = offsets[idx]
                idx += 1
            else:
                break
                
        score = np.mean(all_scores).item()
        word = example[start:end]
        results.append({
            "entity_group": entity_type,
            "score": score,
            "word": word,
            "start": start,
            "end": end
        })
    else:
        idx += 1
    
print(results)

[{'entity_group': 'PER', 'score': 0.998169407248497, 'word': 'Sylvain', 'start': 11, 'end': 18}, {'entity_group': 'ORG', 'score': 0.9796018600463867, 'word': 'Hugging Face', 'start': 33, 'end': 45}, {'entity_group': 'LOC', 'score': 0.99321049451828, 'word': 'Brooklyn', 'start': 49, 'end': 57}]


In [73]:
# Check what labels the model actually uses
print("Label map:", model.config.id2label)

# Check what predictions look like
print("Predictions:", predictions[:10])
print("Labels:", [model.config.id2label[p] for p in predictions])

Label map: {0: 'O', 1: 'B-MISC', 2: 'I-MISC', 3: 'B-PER', 4: 'I-PER', 5: 'B-ORG', 6: 'I-ORG', 7: 'B-LOC', 8: 'I-LOC'}
Predictions: [0, 0, 0, 0, 4, 4, 4, 4, 0, 0]
Labels: ['O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'I-LOC', 'O', 'O']
