Commit

init

req
tok
new
remove
fix
uu
yhn112 committed Sep 17, 2021
1 parent 2d1bc32 commit 01f839f
Showing 26 changed files with 1,287 additions and 10 deletions.
83 changes: 83 additions & 0 deletions .gitignore
@@ -0,0 +1,83 @@
# node and NPM
npm-debug.log
node_modules

# swap files
*~
*.swp

examples/data/*
examples/runs/*
examples/.ipynb_checkpoints/*

env.sh
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
bin/
build/
develop-eggs/
dist/
eggs/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg/

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject
.idea
.ipynb_checkpoints

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/
docs/tmp*

# OS X garbage
.DS_Store

# Debian things
debian/reproducible-experiment-platform
debian/files
*.substvars
*.debhelper.log

# protobuf stuff
hivemind/proto/*_pb2*

# libp2p-daemon binary
hivemind/hivemind_cli/p2pd
31 changes: 31 additions & 0 deletions config.json
@@ -0,0 +1,31 @@
{
  "architectures": [
    "AlbertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu_new",
  "hidden_act_gated": true,
  "hidden_dropout_prob": 0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 16384,
  "layer_norm_eps": 1e-12,
  "layers_to_keep": [],
  "position_embedding_type": "rotary",
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 64,
  "num_hidden_groups": 1,
  "num_hidden_layers": 8,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30000
}
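
For context, a config file like the one above can be loaded with the standard transformers API. The sketch below is a minimal example, not part of the commit: it assumes the file is saved locally as config.json, and fields such as "position_embedding_type": "rotary" and "hidden_act_gated" are not interpreted by stock transformers ALBERT, so they presumably take effect only through the model code added elsewhere in this commit.

# Minimal sketch (not part of the commit): instantiate an untrained ALBERT-style
# model from this config. Custom fields like "position_embedding_type": "rotary"
# are stored on the config but only honored by this repository's own model code.
from transformers import AlbertConfig, AlbertForMaskedLM

config = AlbertConfig.from_json_file("config.json")
model = AlbertForMaskedLM(config)
print(f"parameters: {sum(p.numel() for p in model.parameters()):,}")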
8 changes: 4 additions & 4 deletions dataset.py
@@ -14,7 +14,7 @@ def __init__(self, X, Y=None, device="cpu"):
        self.X = X
        self.Y = Y
        self.device = device

    def __len__(self):
        return len(self.X["input_ids"])

@@ -30,7 +30,7 @@ def make_dataset(tokenizer, data, has_labels=True, device="cpu",
                 answer_field="answer", pos_label=True):
    questions = [elem[first_key] for elem in data]
    passages = [elem[second_key] for elem in data]
-    X = tokenizer(text=questions, text_pair=passages, truncation=True)
+    X = tokenizer(text=questions, text_pair=passages, truncation=True, max_length=512)
    if has_labels:
        Y = torch.FloatTensor([int(elem[answer_field]==pos_label) for elem in data])
    else:
@@ -39,7 +39,7 @@


class OrderedBatchSampler(Sampler):

    def __init__(self, data, batch_size, length_func=None, shuffle=True, random_state=187):
        if length_func is None:
            length_func = lambda x: 0
@@ -62,4 +62,4 @@ def make_dataloader(dataset, batch_size=16, shuffle=True, key="input_ids"):
    length_func = lambda x: len(x[key]) if key else None
    sampler = OrderedBatchSampler(dataset, batch_size=batch_size,
                                  length_func=length_func, shuffle=shuffle)
-    return DataLoader(dataset, collate_fn=collate_fn, batch_sampler=sampler)
+    return DataLoader(dataset, collate_fn=collate_fn, batch_sampler=sampler)
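
The only functional change above is capping tokenized question/passage pairs at 512 tokens. A minimal usage sketch follows; the record fields and the first_key/second_key arguments are assumptions, since their defaults are hidden in the collapsed part of this diff, and it is likewise assumed that make_dataset returns the Dataset wrapper defined at the top of the file.

# Hypothetical usage sketch for dataset.py (field names and key arguments are assumed).
from transformers import AlbertTokenizerFast
from dataset import make_dataset, make_dataloader

tokenizer = AlbertTokenizerFast.from_pretrained("albert-base-v2")
data = [{"question": "is the sky blue", "passage": "The sky appears blue to an observer.", "answer": True}]
dataset = make_dataset(tokenizer, data, has_labels=True,
                       first_key="question", second_key="passage",
                       answer_field="answer", pos_label=True)
loader = make_dataloader(dataset, batch_size=16, shuffle=False)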
2 changes: 2 additions & 0 deletions lib/__init__.py
@@ -0,0 +1,2 @@
from .models import *
from .modules import *
Empty file added lib/data/__init__.py
148 changes: 148 additions & 0 deletions lib/data/data_collator.py
@@ -0,0 +1,148 @@
import random
import warnings
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import torch
from transformers import DataCollatorForLanguageModeling
from transformers.data.data_collator import _collate_batch, tolist
from transformers.models.albert import AlbertTokenizer, AlbertTokenizerFast
from transformers.tokenization_utils_base import BatchEncoding


def _is_start_piece_sp(piece):
    """Check if the current word piece is the starting piece (sentence piece)."""
    special_pieces = set(list('!"#$%&"()*+,-./:;?@[\\]^_`{|}~'))
    special_pieces.add(u"€".encode("utf-8"))
    special_pieces.add(u"£".encode("utf-8"))
    # Note(mingdachen):
    # For foreign characters, we always treat them as a whole piece.
    english_chars = set(list("abcdefghijklmnopqrstuvwxyz"))
    if (
        piece.startswith("▁")
        or piece.startswith("<")
        or piece in special_pieces
        or not all(i.lower() in english_chars.union(special_pieces) for i in piece)
    ):
        return True
    else:
        return False


@dataclass
class AlbertDataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
    """
    Data collator used for language modeling that masks entire words.
    - collates batches of tensors, honoring their tokenizer's pad_token
    - preprocesses batches for masked language modeling
    .. note::
        This collator relies on details of the implementation of subword tokenization by
        :class:`~transformers.AlbertTokenizer`, specifically that start-of-word tokens are prefixed with `▁`.
        For tokenizers that do not adhere to this scheme, this collator will produce an output that is roughly
        equivalent to :class:`.DataCollatorForLanguageModeling`.
    """

    def __call__(
        self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        if isinstance(examples[0], (dict, BatchEncoding)):
            batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
        else:
            batch = {"input_ids": _collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)}

        # If special token mask has been preprocessed, pop it from the dict.
        special_tokens_mask = batch.pop("special_tokens_mask", None)

        mask_labels = []
        for e in batch["input_ids"]:
            ref_tokens = self.tokenizer.convert_ids_to_tokens(tolist(e))
            mask_labels.append(self._whole_word_mask(ref_tokens))
        batch_mask = _collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)

        batch["input_ids"], batch["labels"] = self.mask_tokens(
            batch["input_ids"], batch_mask, special_tokens_mask=special_tokens_mask
        )
        return batch

    def _whole_word_mask(self, input_tokens: List[str], max_predictions=512):
        """
        Get 0/1 labels for masked tokens with whole word mask proxy
        """
        if not isinstance(self.tokenizer, (AlbertTokenizer, AlbertTokenizerFast)):
            warnings.warn("AlbertDataCollatorForWholeWordMask is only suitable for AlbertTokenizer-like tokenizers.")

        cand_indexes = []
        for i, token in enumerate(input_tokens):
            if token in (self.tokenizer.cls_token, self.tokenizer.sep_token, self.tokenizer.pad_token):
                continue

            if len(cand_indexes) >= 1 and not _is_start_piece_sp(token):
                cand_indexes[-1].append(i)
            else:
                cand_indexes.append([i])

        random.shuffle(cand_indexes)
        num_to_predict = min(max_predictions, max(1, int(round(len(input_tokens) * self.mlm_probability))))

        mask_labels = torch.zeros((len(input_tokens),), dtype=torch.long)
        covered_indexes = set()

        for index_set in cand_indexes:
            if len(covered_indexes) >= num_to_predict:
                break

            # If adding a whole-word mask would exceed the maximum number of
            # predictions, then just skip this candidate.
            if len(covered_indexes) + len(index_set) > num_to_predict:
                continue

            is_any_index_covered = any(index in covered_indexes for index in index_set)
            if is_any_index_covered:
                continue

            for index in index_set:
                covered_indexes.add(index)
                mask_labels[index] = 1

        return mask_labels

    def mask_tokens(
        self, inputs: torch.Tensor, mask_labels: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        Passing 'mask_labels' means we use whole-word masking (WWM): indices are masked directly according to
        the reference tokens of each sequence.
        """
        assert self.mlm
        labels = inputs.clone()
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)

        probability_matrix = mask_labels

        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()

        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)

        masked_indices = probability_matrix.bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels
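
A minimal usage sketch for the collator above, not part of the commit; it assumes a transformers version from the same era as this commit (one that still exposes _collate_batch and tolist), the public albert-base-v2 tokenizer, and that the rest of the lib package added in this commit is importable.

# Minimal sketch (not part of the commit): whole-word masking on a toy batch.
from transformers import AlbertTokenizerFast
from lib.data.data_collator import AlbertDataCollatorForWholeWordMask

tokenizer = AlbertTokenizerFast.from_pretrained("albert-base-v2")
collator = AlbertDataCollatorForWholeWordMask(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

texts = ["hivemind trains transformers collaboratively", "whole word masking groups subword pieces"]
examples = [tokenizer(text, return_special_tokens_mask=True) for text in texts]
batch = collator(examples)
print(batch["input_ids"].shape, int((batch["labels"] != -100).sum()), "tokens masked")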
