In [2]:
import six
import collections
import json
import unicodedata
import numpy as np

In [3]:
import tensorflow as tf
from tensorflow import keras

In [4]:
def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        elif isinstance(text, unicode):
            return text
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python2 or Python 3?")

In [5]:
embedding_dict = {}
with open("/Users/vijay/MIDS/W266/Project/AnsweringMachines/glove.6B.300d.txt") as fp:
    for line in fp:
        e = line.split(" ")
        word = e[0]
        embeddings = np.array(e[1:])
        if word not in embedding_dict:
            embedding_dict[word] = embeddings

In [6]:
# handle "[CLS]" and "[SEP]"
embedding_dict['[CLS]'] = np.random.uniform(0, 1, 300)
embedding_dict['[SEP]'] = np.random.uniform(0, 1, 300)
embedding_dict['[UNK]'] = np.random.uniform(0, 1, 300)

In [7]:
def load_vocab():
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    for k in embedding_dict.keys():
        token = convert_to_unicode(k)
        if not token:
            break
        token = token.strip()
        vocab[token] = index
        index += 1
    return vocab

In [8]:
vocab = load_vocab()

In [9]:
type(vocab)

collections.OrderedDict

In [10]:
inv_vocab = {v: k for k, v in vocab.items()}

In [11]:
inv_vocab[3217]

'unusual'

In [12]:
len(vocab)

400003

In [13]:
def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False

In [14]:
def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
        (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False

In [15]:
def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False

In [16]:
class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer.

        Args:
          do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = convert_to_unicode(text)
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        text = self._tokenize_chinese_chars(text)

        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case:
                token = token.lower()
            token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                    start_new_word = False
            output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
            (cp >= 0x3400 and cp <= 0x4DBF) or  #
            (cp >= 0x20000 and cp <= 0x2A6DF) or  #
            (cp >= 0x2A700 and cp <= 0x2B73F) or  #
            (cp >= 0x2B740 and cp <= 0x2B81F) or  #
            (cp >= 0x2B820 and cp <= 0x2CEAF) or
            (cp >= 0xF900 and cp <= 0xFAFF) or  #
            (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

In [17]:
def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens

In [18]:
class WordpieceTokenizer(object):
    """Runs WordPiece tokenziation."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer.

        Returns:
          A list of wordpiece tokens.
        """

        text = convert_to_unicode(text)

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)

        return output_tokens

In [19]:
wpt = WordpieceTokenizer(vocab)

In [20]:
wpt.tokenize("islet")

['islet']

In [21]:
def printable_text(text):
    """Returns text encoded in a way suitable for print or `tf.logging`."""

    # These functions want `str` for both Python2 and Python3, but in one case
    # it's a Unicode string and in the other it's a byte string.
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text
        elif isinstance(text, unicode):
            return text.encode("utf-8")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python2 or Python 3?")

In [22]:
class SquadExample(object):
    """A single training/test example for simple sequence classification.

        For examples without an answer, the start and end position are -1.
    """

    def __init__(self,
               qas_id,
               question_text,
               doc_tokens,
               orig_answer_text=None,
               start_position=None,
               end_position=None,
               is_impossible=False):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        s = ""
        s += "qas_id: %s" % (printable_text(self.qas_id))
        s += ", question_text: %s" % (
            printable_text(self.question_text))
        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        if self.start_position:
            s += ", original_answer: [%s]" % (" ".join(self.orig_answer_text))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
        if self.start_position:
            s += ", end_position: %d" % (self.end_position)
        if self.start_position:
            s += ", is_impossible: %r" % (self.is_impossible)
        return s

In [118]:
def read_squad_examples(input_file, is_training):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    show_first = False
    examples = []
    paragraphs = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            paragraphs.append(paragraph_text)
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            not_in_vocab = 0
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        if c.lower() not in vocab:
                            not_in_vocab += 1
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            if not_in_vocab > 5:
                print(paragraph_text)
                
            if show_first:
                show_first = False
                print("Paragraph: {:s}".format(paragraph_text))
                print(char_to_word_offset)
                print(doc_tokens)
            
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                
                if is_training:

                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]

                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        #if actual_text.find(cleaned_answer_text) == -1:
                        #  tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                        #                     actual_text, cleaned_answer_text)
                        #  continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                
                examples.append(example)

    return examples, paragraphs

In [119]:
squad_examples, paragraphs = read_squad_examples("../AnsweringMachines/dataset/train-v1.1.json", True)

Winters are cold and damp, and prevailing wind patterns that blow offshore minimize the moderating effects of the Atlantic Ocean; yet the Atlantic and the partial shielding from colder air by the Appalachians keep the city warmer in the winter than inland North American cities at similar or lesser latitudes such as Pittsburgh, Cincinnati, and Indianapolis. The daily mean temperature in January, the area's coldest month, is 32.6 °F (0.3 °C); however, temperatures usually drop to 10 °F (−12 °C) several times per winter, and reach 50 °F (10 °C) several days each winter month. Spring and autumn are unpredictable and can range from chilly to warm, although they are usually mild with low humidity. Summers are typically warm to hot and humid, with a daily mean temperature of 76.5 °F (24.7 °C) in July and an average humidity level of 72%. Nighttime conditions are often exacerbated by the urban heat island phenomenon, while daytime temperatures exceed 90 °F (32 °C) on average of 17 days each su

Nanjing has a humid subtropical climate (Köppen Cfa) and is under the influence of the East Asian monsoon. The four seasons are distinct, with damp conditions seen throughout the year, very hot and muggy summers, cold, damp winters, and in between, spring and autumn are of reasonable length. Along with Chongqing and Wuhan, Nanjing is traditionally referred to as one of the "Three Furnacelike Cities" along the Yangtze River (长江流域三大火炉) for the perennially high temperatures in the summertime. However, the time from mid-June to the end of July is the plum blossom blooming season in which the meiyu (rainy season of East Asia; literally "plum rain") occurs, during which the city experiences a period of mild rain as well as dampness. Typhoons are uncommon but possible in the late stages of summer and early part of autumn. The annual mean temperature is around 15.46 °C (59.8 °F), with the monthly 24-hour average temperature ranging from 2.4 °C (36.3 °F) in January to 27.8 °C (82.0 °F) in July.

Present-day statutes from across the nation use the same words and phrases, requiring modern executions to take place within a wall or enclosure to exclude public view. Connecticut General Statute § 54–100 requires death sentences to be conducted in an "enclosure" which "shall be so constructed as to exclude public view." Kentucky Revised Statute 431.220 and Missouri Revised Statute § 546.730 contain substantially identical language. New Mexico's former death penalty, since repealed, see N.M. Stat. § 31-14-12, required executions be conducted in a "room or place enclosed from public view." Similarly, a dormant Massachusetts law, see Mass. Gen. Law ch. 279 § 60, required executions to take place "within an enclosure or building." North Carolina General Statute § 15-188 requires death sentences to be executed "within the walls" of the penitentiary, as do Oklahoma Statute Title 22 § 1015 and Montana Code § 46-19-103. Ohio Revised Code § 2949.22 requires that "[t]he enclosure shall exclude

Along with the rest of South West England, Somerset has a temperate climate which is generally wetter and milder than the rest of the country. The annual mean temperature is approximately 10 °C (50.0 °F). Seasonal temperature variation is less extreme than most of the United Kingdom because of the adjacent sea temperatures. The summer months of July and August are the warmest with mean daily maxima of approximately 21 °C (69.8 °F). In winter mean minimum temperatures of 1 °C (33.8 °F) or 2 °C (35.6 °F) are common. In the summer the Azores high pressure affects the south-west of England, but convective cloud sometimes forms inland, reducing the number of hours of sunshine. Annual sunshine rates are slightly less than the regional average of 1,600 hours. In December 1998 there were 20 days without sun recorded at Yeovilton. Most the rainfall in the south-west is caused by Atlantic depressions or by convection. Most of the rainfall in autumn and winter is caused by the Atlantic depression

The climate of the coastal plain is influenced by the Atlantic Ocean, which keeps conditions mild in winter and moderate, although humid, in summer. The highest coastal, daytime temperature averages less than 89 °F (32 °C) during summer months. The coast has mild temperatures in winter, with daytime highs rarely below 40 °F (4 °C). The average daytime temperature in the coastal plain is usually in the mid-50s °F (11–14 °C) in winter. Temperatures in the coastal plain only occasionally drop below the freezing point at night. The coastal plain averages only around 1 inch (2.5 cm) of snow or ice annually, and in many years, there may be no snow or ice at all.
The Appalachian Mountains are the coolest area of the state, with temperatures averaging in the low 40s and upper 30s °F (6–3 °C) for highs in the winter and falling into the low 20s °F (−5 °C) or lower on winter nights. Relatively cool summers have temperatures rarely rising above 80 °F (27 °C). Average snowfall in many areas exceed

Estonia is situated in the northern part of the temperate climate zone and in the transition zone between maritime and continental climate. Estonia has four seasons of near-equal length. Average temperatures range from 16.3 °C (61.3 °F) on the Baltic islands to 18.1 °C (64.6 °F) inland in July, the warmest month, and from −3.5 °C (25.7 °F) on the Baltic islands to −7.6 °C (18.3 °F) inland in February, the coldest month. The average annual temperature in Estonia is 5.2 °C (41.4 °F). The average precipitation in 1961–1990 ranged from 535 to 727 mm (21.1 to 28.6 in) per year.
Paris has a typical Western European oceanic climate (Köppen climate classification: Cfb ) which is affected by the North Atlantic Current. The overall climate throughout the year is mild and moderately wet. Summer days are usually warm and pleasant with average temperatures hovering between 15 and 25 °C (59 and 77 °F), and a fair amount of sunshine. Each year, however, there are a few days where the temperature rise

The period between the foundation of the Han dynasty and Wang Mang's reign is known as the Western Han dynasty (simplified Chinese: 西汉; traditional Chinese: 西漢; pinyin: Xī Hàn) or Former Han dynasty (simplified Chinese: 前汉; traditional Chinese: 前漢; pinyin: Qiánhàn) (206 BC – 9 AD). During this period the capital was at Chang'an (modern Xi'an). From the reign of Guangwu the capital was moved eastward to Luoyang. The era from his reign until the fall of Han is known as the Eastern Han dynasty (simplified Chinese: 东汉; traditional Chinese: 東漢; pinyin: Dōng Hàn) or the Later Han dynasty (simplified Chinese: 后汉; traditional Chinese: 後漢; pinyin: Hòu Hàn) (25–220 AD).
Hyderabad has a tropical wet and dry climate (Köppen Aw) bordering on a hot semi-arid climate (Köppen BSh). The annual mean temperature is 26.6 °C (79.9 °F); monthly mean temperatures are 21–33 °C (70–91 °F). Summers (March–June) are hot and humid, with average highs in the mid-to-high 30s Celsius; maximum temperatures often exce

In [25]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self,
               unique_id,
               example_index,
               doc_span_index,
               tokens,
               token_to_orig_map,
               token_is_max_context,
               input_ids,
               input_mask,
               segment_ids,
               start_position=None,
               end_position=None,
               is_impossible=None):
        self.unique_id = unique_id
        self.example_index = example_index
        self.doc_span_index = doc_span_index
        self.tokens = tokens
        self.token_to_orig_map = token_to_orig_map
        self.token_is_max_context = token_is_max_context
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible

In [26]:
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):

    """Returns tokenized answer spans that better match the annotated answer."""

      # The SQuAD annotations are character based. We first project them to
      # whitespace-tokenized words. But then after WordPiece tokenization, we can
      # often find a "better match". For example:
      #
      #   Question: What year was John Smith born?
      #   Context: The leader was John Smith (1895-1943).
      #   Answer: 1895
      #
      # The original whitespace-tokenized answer will be "(1895-1943).". However
      # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
      # the exact answer, 1895.
      #
      # However, this is not always possible. Consider the following:
      #
      #   Question: What country is the top exporter of electornics?
      #   Context: The Japanese electronics industry is the lagest in the world.
      #   Answer: Japan
      #
      # In this case, the annotator chose "Japan" as a character sub-span of
      # the word "Japanese". Since our WordPiece tokenizer does not split
      # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
      # in SQuAD, but does happen.
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)

In [27]:
def convert_by_vocab(vocab, items):
    """Converts a sequence of [tokens|ids] using the vocab."""
    output = []
    for item in items:
        output.append(vocab[item])
    return output

In [28]:
def convert_tokens_to_ids(vocab, tokens):
    return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
    return convert_by_vocab(inv_vocab, ids)

In [29]:
class FullTokenizer(object):
    """Runs end-to-end tokenziation."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab()
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)

        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)

In [30]:
def _check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""

    # Because of the sliding window approach taken to scoring documents, a single
    # token can appear in multiple documents. E.g.
    #  Doc: the man went to the store and bought a gallon of milk
    #  Span A: the man went to the
    #  Span B: to the store and bought
    #  Span C: and bought a gallon of
    #  ...
    #
    # Now the word 'bought' will have two scores from spans B and C. We only
    # want to consider the score with "maximum context", which we define as
    # the *minimum* of its left and right context (the *sum* of left and
    # right context will always be the same, of course).
    #
    # In the example the maximum context for 'bought' would be span C since
    # it has 1 left context and 3 right context, while span B has 4 left context
    # and 0 right context.
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    return cur_span_index == best_span_index

In [31]:
from IPython.display import clear_output
from progress.bar import Bar

In [46]:
import time, sys
from IPython.display import clear_output

def update_progress(progress):
    bar_length=50
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1

    block = int(round(bar_length * progress))

    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

In [100]:
def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 doc_stride, max_query_length, is_training):
                                 
    """Loads a data file into a list of `InputBatch`s."""
    count = 0.0
    unique_id = 1000000000
    max_length = len(examples)
    features = []
    max_query_len = 0
    max_text_length = 0
    for (example_index, example) in enumerate(examples):
        query_tokens = tokenizer.tokenize(example.question_text)

        # cap queries to 64 tokens
        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]
        if len(query_tokens) > max_query_len:
            max_query_len = len(query_tokens)

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        tok_start_position = None
        tok_end_position = None
        if is_training and example.is_impossible:
            tok_start_position = -1
            tok_end_position = -1
        if is_training and not example.is_impossible:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1
            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
                example.orig_answer_text)

        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we do a sliding window approach, where we take chunks
        # of the up to our max length with a stride of `doc_stride`.
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)

        # each input is: [CLS] [Qt] [Qt] [SEP] [Tt] [Tt]
        # where [Qt]s are tokens from query
        # and [Tt]s are tokens from the text
        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            token_is_max_context = {}
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in query_tokens:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]

                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                               split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            start_position = None
            end_position = None
            if is_training and not example.is_impossible:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = doc_span.start
                doc_end = doc_span.start + doc_span.length - 1
                out_of_span = False
                if not (tok_start_position >= doc_start and
                        tok_end_position <= doc_end):
                    out_of_span = True
                if out_of_span:
                    start_position = 0
                    end_position = 0
                else:
                    doc_offset = len(query_tokens) + 2
                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset

            if is_training and example.is_impossible:
                start_position = 0
                end_position = 0

            if example_index < 0:
                print("*** Example ***")
                print("unique_id: %s" % (unique_id))
                print("example_index: %s" % (example_index))
                print("doc_span_index: %s" % (doc_span_index))
                print("tokens: %s" % " ".join(
                    [printable_text(x) for x in tokens]))
                print("token_to_orig_map: %s" % " ".join(
                    ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
                print("token_is_max_context: %s" % " ".join([
                    "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context)
                    ]))
                print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                print(
                    "input_mask: %s" % " ".join([str(x) for x in input_mask]))
                print(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
                if is_training and example.is_impossible:
                    print("impossible example")
                if is_training and not example.is_impossible:
                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
                    print("start_position: %d" % (start_position))
                    print("end_position: %d" % (end_position))
                    print(
                        "answer: %s" % (printable_text(answer_text)))

            feature = InputFeatures(
                    unique_id=unique_id,
                    example_index=example_index,
                    doc_span_index=doc_span_index,
                    tokens=tokens,
                    token_to_orig_map=token_to_orig_map,
                    token_is_max_context=token_is_max_context,
                    input_ids=input_ids,
                    input_mask=input_mask,
                    segment_ids=segment_ids,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=example.is_impossible)

            features.append(feature)
            # Run callback
            #output_fn(feature)

        unique_id += 1
        count += 1.0
        if (count % 1000) == 0:
            update_progress(count/max_length)

    print("{} queries were greater than 64 tokens".format(num_queries_greater))
    return features

In [34]:
tokenizer = FullTokenizer("/Users/vijay/MIDS/w266/Project/BERT/uncased_L-12_H-768_A-12/vocab.txt")

In [102]:
features = convert_examples_to_features(squad_examples, tokenizer, 384, 128, 32, True)

Progress: [##################################################] 99.3%
49 queries were greater than 64 tokens


In [37]:
max_features

88152

In [38]:
reduced_features = features[:5000]

In [40]:
max_features = len(reduced_features)

In [41]:
X_squad = np.ndarray(shape=(max_features,384,300), dtype=float)

In [48]:
for i, f in enumerate(reduced_features):
    f_vector = [ ]
    for _id in f.input_ids:
        w = inv_vocab[_id]
        f_vector.append(embedding_dict[w])
    v = np.asarray(f_vector)
    X_squad[i-1] = v
    update_progress(i/max_features)

Progress: [##################################################] 100.0%


In [49]:
X_squad.shape

(5000, 384, 300)

In [55]:
y_squad = np.ndarray(shape=(max_features,384), dtype=float)

In [75]:
X_train[0].shape

(384, 300)

In [79]:
for i, f in enumerate(reduced_features):
    y = np.zeros((384,))
    y[f.start_position] = 1.0
    y_squad[i-1] = y
    update_progress(i/max_features)

Progress: [##################################################] 100.0%


In [80]:
X_train = X_squad[:4000]
X_test = X_squad[4000:]

In [81]:
y_train = y_squad[:4000]
y_test = y_squad[4000:]

In [108]:
model = keras.models.Sequential([
        keras.layers.Flatten(input_shape=[384, 300]),
        keras.layers.Dropout(rate=0.5),
        keras.layers.Dense(600, activation='relu'),
        keras.layers.Dense(384, activation="softmax")
])

In [121]:
model.layers[0].trainable = False

In [123]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_5 (Flatten)          (None, 115200)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 115200)            0         
_________________________________________________________________
dense_11 (Dense)             (None, 600)               69120600  
_________________________________________________________________
dense_12 (Dense)             (None, 384)               230784    
Total params: 69,351,384
Trainable params: 69,351,384
Non-trainable params: 0
_________________________________________________________________


In [124]:
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

In [125]:
model.fit(X_train, y_train, epochs=10)

Epoch 1/10

KeyboardInterrupt: 

In [99]:
model.evaluate(X_test, y_test)



[5.2082183494567875, 0.012]