Commit

init

req
tok
new
remove
fix
uu
yhn112 committed Sep 17, 2021
1 parent 2d1bc32 commit 01f839f
Showing 26 changed files with 1,287 additions and 10 deletions.
83 changes: 83 additions & 0 deletions .gitignore
@@ -0,0 +1,83 @@
# node and NPM
npm-debug.log
node_modules

# swap files
*~
*.swp

examples/data/*
examples/runs/*
examples/.ipynb_checkpoints/*

env.sh
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
bin/
build/
develop-eggs/
dist/
eggs/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg/

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject
.idea
.ipynb_checkpoints

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/
docs/tmp*

# OS X garbage
.DS_Store

# Debian things
debian/reproducible-experiment-platform
debian/files
*.substvars
*.debhelper.log

# protobuf stuff
hivemind/proto/*_pb2*

# libp2p-daemon binary
hivemind/hivemind_cli/p2pd
31 changes: 31 additions & 0 deletions config.json
@@ -0,0 +1,31 @@
{
  "architectures": [
    "AlbertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu_new",
  "hidden_act_gated": true,
  "hidden_dropout_prob": 0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 16384,
  "layer_norm_eps": 1e-12,
  "layers_to_keep": [],
  "position_embedding_type": "rotary",
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 64,
  "num_hidden_groups": 1,
  "num_hidden_layers": 8,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30000
}
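
For context, a config file like the one above can be loaded with the standard transformers API. The sketch below is a minimal example, not part of the commit: it assumes the file is saved locally as config.json, and fields such as "position_embedding_type": "rotary" and "hidden_act_gated" are not interpreted by stock transformers ALBERT, so they presumably take effect only through the model code added elsewhere in this commit.

# Minimal sketch (not part of the commit): instantiate an untrained ALBERT-style
# model from this config. Custom fields like "position_embedding_type": "rotary"
# are stored on the config but only honored by this repository's own model code.
from transformers import AlbertConfig, AlbertForMaskedLM

config = AlbertConfig.from_json_file("config.json")
model = AlbertForMaskedLM(config)
print(f"parameters: {sum(p.numel() for p in model.parameters()):,}")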
8 changes: 4 additions & 4 deletions dataset.py
@@ -14,7 +14,7 @@ def __init__(self, X, Y=None, device="cpu"):
        self.X = X
        self.Y = Y
        self.device = device

    def __len__(self):
        return len(self.X["input_ids"])

@@ -30,7 +30,7 @@ def make_dataset(tokenizer, data, has_labels=True, device="cpu",
                 answer_field="answer", pos_label=True):
    questions = [elem[first_key] for elem in data]
    passages = [elem[second_key] for elem in data]
-    X = tokenizer(text=questions, text_pair=passages, truncation=True)
+    X = tokenizer(text=questions, text_pair=passages, truncation=True, max_length=512)
    if has_labels:
        Y = torch.FloatTensor([int(elem[answer_field]==pos_label) for elem in data])
    else:
@@ -39,7 +39,7 @@


class OrderedBatchSampler(Sampler):

    def __init__(self, data, batch_size, length_func=None, shuffle=True, random_state=187):
        if length_func is None:
            length_func = lambda x: 0
@@ -62,4 +62,4 @@ def make_dataloader(dataset, batch_size=16, shuffle=True, key="input_ids"):
    length_func = lambda x: len(x[key]) if key else None
    sampler = OrderedBatchSampler(dataset, batch_size=batch_size,
                                  length_func=length_func, shuffle=shuffle)
-    return DataLoader(dataset, collate_fn=collate_fn, batch_sampler=sampler)
+    return DataLoader(dataset, collate_fn=collate_fn, batch_sampler=sampler)
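
The only functional change above is capping tokenized question/passage pairs at 512 tokens. A minimal usage sketch follows; the record fields and the first_key/second_key arguments are assumptions, since their defaults are hidden in the collapsed part of this diff, and it is likewise assumed that make_dataset returns the Dataset wrapper defined at the top of the file.

# Hypothetical usage sketch for dataset.py (field names and key arguments are assumed).
from transformers import AlbertTokenizerFast
from dataset import make_dataset, make_dataloader

tokenizer = AlbertTokenizerFast.from_pretrained("albert-base-v2")
data = [{"question": "is the sky blue", "passage": "The sky appears blue to an observer.", "answer": True}]
dataset = make_dataset(tokenizer, data, has_labels=True,
                       first_key="question", second_key="passage",
                       answer_field="answer", pos_label=True)
loader = make_dataloader(dataset, batch_size=16, shuffle=False)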
2 changes: 2 additions & 0 deletions lib/__init__.py
@@ -0,0 +1,2 @@
from .models import *
from .modules import *
Empty file added lib/data/__init__.py
148 changes: 148 additions & 0 deletions lib/data/data_collator.py
@@ -0,0 +1,148 @@
import random
import warnings
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import torch
from transformers import DataCollatorForLanguageModeling
from transformers.data.data_collator import _collate_batch, tolist
from transformers.models.albert import AlbertTokenizer, AlbertTokenizerFast
from transformers.tokenization_utils_base import BatchEncoding


def _is_start_piece_sp(piece):
    """Check if the current word piece is the starting piece (sentence piece)."""
    special_pieces = set(list('!"#$%&"()*+,-./:;?@[\\]^_`{|}~'))
    special_pieces.add(u"€".encode("utf-8"))
    special_pieces.add(u"£".encode("utf-8"))
    # Note(mingdachen):
    # For foreign characters, we always treat them as a whole piece.
    english_chars = set(list("abcdefghijklmnopqrstuvwxyz"))
    if (
        piece.startswith("▁")
        or piece.startswith("<")
        or piece in special_pieces
        or not all(i.lower() in english_chars.union(special_pieces) for i in piece)
    ):
        return True
    else:
        return False


@dataclass
class AlbertDataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
    """
    Data collator used for language modeling that masks entire words.
    - collates batches of tensors, honoring their tokenizer's pad_token
    - preprocesses batches for masked language modeling
    .. note::
        This collator relies on details of the implementation of subword tokenization by
        :class:`~transformers.AlbertTokenizer`, specifically that start-of-word tokens are prefixed with `▁`.
        For tokenizers that do not adhere to this scheme, this collator will produce an output that is roughly
        equivalent to :class:`.DataCollatorForLanguageModeling`.
    """

    def __call__(
        self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        if isinstance(examples[0], (dict, BatchEncoding)):
            batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
        else:
            batch = {"input_ids": _collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)}

        # If special token mask has been preprocessed, pop it from the dict.
        special_tokens_mask = batch.pop("special_tokens_mask", None)

        mask_labels = []
        for e in batch["input_ids"]:
            ref_tokens = self.tokenizer.convert_ids_to_tokens(tolist(e))
            mask_labels.append(self._whole_word_mask(ref_tokens))
        batch_mask = _collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)

        batch["input_ids"], batch["labels"] = self.mask_tokens(
            batch["input_ids"], batch_mask, special_tokens_mask=special_tokens_mask
        )
        return batch

    def _whole_word_mask(self, input_tokens: List[str], max_predictions=512):
        """
        Get 0/1 labels for masked tokens with whole word mask proxy
        """
        if not isinstance(self.tokenizer, (AlbertTokenizer, AlbertTokenizerFast)):
            warnings.warn("AlbertDataCollatorForWholeWordMask is only suitable for AlbertTokenizer-like tokenizers.")

        cand_indexes = []
        for i, token in enumerate(input_tokens):
            if token in (self.tokenizer.cls_token, self.tokenizer.sep_token, self.tokenizer.pad_token):
                continue

            if len(cand_indexes) >= 1 and not _is_start_piece_sp(token):
                cand_indexes[-1].append(i)
            else:
                cand_indexes.append([i])

        random.shuffle(cand_indexes)
        num_to_predict = min(max_predictions, max(1, int(round(len(input_tokens) * self.mlm_probability))))

        mask_labels = torch.zeros((len(input_tokens),), dtype=torch.long)
        covered_indexes = set()

        for index_set in cand_indexes:
            if len(covered_indexes) >= num_to_predict:
                break

            # If adding a whole-word mask would exceed the maximum number of
            # predictions, then just skip this candidate.
            if len(covered_indexes) + len(index_set) > num_to_predict:
                continue

            is_any_index_covered = any(index in covered_indexes for index in index_set)
            if is_any_index_covered:
                continue

            for index in index_set:
                covered_indexes.add(index)
                mask_labels[index] = 1

        return mask_labels

    def mask_tokens(
        self, inputs: torch.Tensor, mask_labels: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        Passing 'mask_labels' means we use whole-word masking (WWM): indices are masked directly according to
        the reference tokens of each sequence.
        """
        assert self.mlm
        labels = inputs.clone()
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)

        probability_matrix = mask_labels

        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()

        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)

        masked_indices = probability_matrix.bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels
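
A minimal usage sketch for the collator above, not part of the commit; it assumes a transformers version from the same era as this commit (one that still exposes _collate_batch and tolist), the public albert-base-v2 tokenizer, and that the rest of the lib package added in this commit is importable.

# Minimal sketch (not part of the commit): whole-word masking on a toy batch.
from transformers import AlbertTokenizerFast
from lib.data.data_collator import AlbertDataCollatorForWholeWordMask

tokenizer = AlbertTokenizerFast.from_pretrained("albert-base-v2")
collator = AlbertDataCollatorForWholeWordMask(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

texts = ["hivemind trains transformers collaboratively", "whole word masking groups subword pieces"]
examples = [tokenizer(text, return_special_tokens_mask=True) for text in texts]
batch = collator(examples)
print(batch["input_ids"].shape, int((batch["labels"] != -100).sum()), "tokens masked")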
