<a href="https://colab.research.google.com/github/karino2/US-patent-analysis/blob/triplet_loss_colab/colab/bert_tokenized_claim_setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Create tokenized dataset for BERT, 2000-2000 prediction benchmark trial**

Basic code is from here: 
https://github.com/karino2/US-patent-analysis/blob/master/notebook/tfidf_nearest.ipynb

In [0]:
import os
import datetime
import pickle
import gzip

import tensorflow as tf

In [2]:
tf.__version__

'1.13.1'

In [3]:
!git clone -b docker https://github.com/yoheikikuta/bert.git

fatal: destination path 'bert' already exists and is not an empty directory.


In [4]:
!ls

adc.json  bert	sample_data


**Check tpu name**

In [5]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf

# Fail fast when the runtime has no TPU attached: the TPU address is read from
# the COLAB_TPU_ADDR environment variable.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

# Colab-only import: authenticate the current user.
from google.colab import auth
auth.authenticate_user()

# Open a session against the TPU worker to list its devices and hand it the
# credentials loaded below.
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  # NOTE(review): expects /content/adc.json to already exist -- presumably
  # written by the authentication step above; confirm.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.
  # Now credentials are set for all future sessions on this TPU.

TPU address is grpc://10.81.75.66:8470
TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 10635577033367471859),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 976606133020435086),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 4501356155637751213),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 13632525968536816513),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 8641085653688703972),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 369903914460703330),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 9085190748269255527),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 10709987216607880277),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:6, TPU, 17179869184, 4476668466533816

# Dataset setup

Download dataframe from cloud storage

In [6]:
!mkdir ./bert/data

mkdir: cannot create directory ‘./bert/data’: File exists


In [7]:
!gsutil cp gs://karino2-uspatent/citations_info_2000.df.gz ./bert/data/
!gsutil cp gs://karino2-uspatent/testset_app_1000.df.gz ./bert/data/
!gsutil cp gs://karino2-uspatent/training_app_1000.df.gz ./bert/data/
!gsutil cp gs://karino2-uspatent/grants_for_2000.df.gz ./bert/data/

Copying gs://karino2-uspatent/citations_info_2000.df.gz...
/ [1 files][234.3 KiB/234.3 KiB]                                                
Operation completed over 1 objects/234.3 KiB.                                    
Copying gs://karino2-uspatent/testset_app_1000.df.gz...
/ [1 files][ 13.4 MiB/ 13.4 MiB]                                                
Operation completed over 1 objects/13.4 MiB.                                     
Copying gs://karino2-uspatent/training_app_1000.df.gz...
/ [1 files][ 14.2 MiB/ 14.2 MiB]                                                
Operation completed over 1 objects/14.2 MiB.                                     
Copying gs://karino2-uspatent/grants_for_2000.df.gz...
- [1 files][ 44.6 MiB/ 44.6 MiB]                                                
Operation completed over 1 objects/44.6 MiB.                                     


In [0]:
import pandas as pd
import numpy as np

In [0]:
citations_info_target = pd.read_pickle("./bert/data/citations_info_2000.df.gz")
training_app_df = pd.read_pickle("./bert/data/training_app_1000.df.gz")
testset_app_df = pd.read_pickle("./bert/data/testset_app_1000.df.gz")
grants_target_df = pd.read_pickle("./bert/data/grants_for_2000.df.gz")

In [0]:
citations_info_target.shape

(4179, 41)

In [10]:
training_app_df.head().app_id

0    14222691
1    12515852
2    12033424
3    12402344
4    12155425
Name: app_id, dtype: int64

### Utility: Retrieve just claim. Remove all tags.

In [0]:
import re

In [0]:
# Captures everything between the opening <claims ...> tag and the LAST
# </claims> tag (the group is greedy); DOTALL lets "." span the newlines
# inside the claims section.
CLAIM_PAT = re.compile(
    r'<claims[^>]*>(.*)</claims>',
    flags=re.MULTILINE | re.DOTALL,
)
# Matches one markup tag at a time (non-greedy) so tags can be blanked out.
TAG_PAT = re.compile(r"<.*?>")

In [0]:
def whole_xml_to_claim_xml(whole):
    """Return the raw XML found inside the <claims ...>...</claims> element.

    Args:
        whole: Full patent document XML as a string.

    Returns:
        The text captured by the first group of CLAIM_PAT, with inner tags
        still present.

    Raises:
        ValueError: If `whole` contains no <claims> element. Without this
            check the failure would surface as an opaque AttributeError on
            None in the middle of a bulk `.map(...)` over the dataframe.
    """
    mat = CLAIM_PAT.search(whole)
    if mat is None:
        raise ValueError("no <claims>...</claims> section found in document XML")
    return mat.group(1)
  
  
def whole_xml_to_claim(whole):
    """Extract the claims section of a patent XML document as tag-free text.

    Every markup tag matched by TAG_PAT is replaced with a single space, so
    the surrounding whitespace and newlines of the claims XML are preserved.
    """
    claim_xml = whole_xml_to_claim_xml(whole)
    return TAG_PAT.sub(' ', claim_xml)

# Bert tokenizer setup

In [0]:
# GCS directory that holds the pre-trained BERT vocab.txt and bert_config.json.
# The value must not carry leading/trailing whitespace: it is interpolated
# directly into gsutil commands as the prefix of a gs:// URL.
VOCAB_CONFIG_PATH='gs://yohei-kikuta/mlstudy-phys/bert/models/pre-trained-models/uncased_L-12_H-768_A-12' #@param {type:"string"}

In [15]:
!gsutil cp {VOCAB_CONFIG_PATH}/vocab.txt ./bert/model/patent/vocab.txt
!gsutil cp {VOCAB_CONFIG_PATH}/bert_config.json ./bert/model/patent/bert_config.json

Copying gs://yohei-kikuta/mlstudy-phys/bert/models/pre-trained-models/uncased_L-12_H-768_A-12/vocab.txt...
/ [1 files][226.1 KiB/226.1 KiB]                                                
Operation completed over 1 objects/226.1 KiB.                                    
Copying gs://yohei-kikuta/mlstudy-phys/bert/models/pre-trained-models/uncased_L-12_H-768_A-12/bert_config.json...
/ [1 files][  313.0 B/  313.0 B]                                                
Operation completed over 1 objects/313.0 B.                                      


In [16]:
!ls ./bert/model/patent

bert_config.json  vocab.txt


In [0]:
import sys
sys.path.append("./bert")

In [0]:
import os

import modeling
import optimization
import tokenization

from run_classifier import RteProcessor
from run_classifier import model_fn_builder
from run_classifier import file_based_input_fn_builder
from run_classifier import file_based_convert_examples_to_features

### Tokenize

In [0]:
tokenizer = tokenization.FullTokenizer(
    vocab_file="./bert/model/patent/vocab.txt", do_lower_case=True)

In [0]:
testset_app_claims = testset_app_df["xml"].map(whole_xml_to_claim)

In [0]:
def claim_to_ids(claim):
  """Tokenize claim text with the notebook-level tokenizer and return vocabulary ids."""
  wordpieces = tokenizer.tokenize(claim)
  return tokenizer.convert_tokens_to_ids(wordpieces)

In [0]:
%%time
test_ids = [claim_to_ids(claim) for claim in testset_app_claims]

CPU times: user 25.7 s, sys: 20.4 ms, total: 25.7 s
Wall time: 25.7 s


In [0]:
len(test_ids)

1000

In [0]:
%%time
grants_ids = [claim_to_ids(claim) for claim in grants_target_df["xml"].map(whole_xml_to_claim)]

CPU times: user 1min 21s, sys: 82.8 ms, total: 1min 21s
Wall time: 1min 21s


In [0]:
len(grants_ids)

2524

In [0]:
%%time
with gzip.open("test_grants_ids.pkl.gz", 'w') as f:
     pickle.dump((test_ids, grants_ids), f)

CPU times: user 1.87 s, sys: 15.9 ms, total: 1.89 s
Wall time: 1.89 s


In [0]:
!gsutil cp test_grants_ids.pkl.gz gs://karino2-uspatent/features/test_grants_ids.pkl.gz

Copying file://test_grants_ids.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/3.0 MiB.                                      


### Add training-set application tokens too (for triplet loss)

In [21]:
%%time
training_ids = [claim_to_ids(claim) for claim in training_app_df["xml"].map(whole_xml_to_claim)]

CPU times: user 24.1 s, sys: 24.6 ms, total: 24.1 s
Wall time: 24.1 s


In [22]:
%%time
with gzip.open("training_app_ids.pkl.gz", 'w') as f:
     pickle.dump(training_ids, f)

CPU times: user 478 ms, sys: 6.95 ms, total: 485 ms
Wall time: 485 ms


In [23]:
!gsutil cp training_app_ids.pkl.gz gs://karino2-uspatent/features/training_app_ids.pkl.gz

Copying file://training_app_ids.pkl.gz [Content-Type=application/octet-stream]...
/ [1 files][794.7 KiB/794.7 KiB]                                                
Operation completed over 1 objects/794.7 KiB.                                    


# Tokenization is done. Everything below is obsolete

Feature creation used to be slow, so the following code was written to precompute the features.
Feature creation has since become fast enough to do on the fly.

So we do not need to build feature here anymore.


### Load claim ids

In [0]:
%%time
with gzip.open("test_grants_ids.pkl.gz", 'rb') as f:
     (test_ids, grants_ids) = pickle.load(f)

CPU times: user 352 ms, sys: 158 ms, total: 509 ms
Wall time: 529 ms


In [0]:
CLS_ID = tokenizer.vocab["[CLS]"]
SEP_ID = tokenizer.vocab["[SEP]"]

In [0]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to the maximum length."""

  # This is a simple heuristic which will always truncate the longer sequence
  # one token at a time. This makes more sense than truncating an equal percent
  # of tokens from each, since if one sequence is very short then each token
  # that's truncated likely contains more information than a longer sequence.
  while True:
    total_length = len(tokens_a) + len(tokens_b)
    if total_length <= max_length:
      break
    if len(tokens_a) > len(tokens_b):
      tokens_a.pop()
    else:
      tokens_b.pop()

In [0]:
# Label vocabulary and the fixed BERT input length (in tokens).
label_list = ["not_entailment", "entailment"]
max_seq_length = 512

# Map each label string to its position in label_list. A comprehension is used
# instead of a module-level for-loop so that no loop variables (previously `i`
# and `label`) are left behind in the notebook namespace.
label_map = {label: index for index, label in enumerate(label_list)}

In [0]:
from run_classifier import InputFeatures

def original_convert_single_pair(ids_a_input, ids_b_input):
  """Build BERT inputs for one pair of token-id sequences.

  The result follows the BERT sequence-pair convention:

    input_ids:   [CLS] ids_a... [SEP] ids_b... [SEP] 0 0 0 ...
    segment_ids:   0     0 ...    0     1 ...    1   0 0 0 ...
    input_mask:    1     1 ...    1     1 ...    1   0 0 0 ...

  Segment id 0 marks the first sequence (including [CLS] and the first
  [SEP]); segment id 1 marks the second sequence and the final [SEP]. The
  mask is 1 for real tokens and 0 for padding.

  Args:
    ids_a_input: token ids of the first sequence; copied, never modified.
    ids_b_input: token ids of the second sequence; copied, never modified.

  Returns:
    A tuple (input_ids, input_mask, segment_ids) of three lists, each of
    length max_seq_length.
  """
  # Work on copies so that truncation never touches the caller's sequences.
  first = list(ids_a_input)
  second = list(ids_b_input)

  # Three positions are reserved for [CLS], [SEP] and [SEP].
  _truncate_seq_pair(first, second, max_seq_length - 3)

  input_ids = [CLS_ID] + first + [SEP_ID] + second + [SEP_ID]
  segment_ids = [0] * (len(first) + 2) + [1] * (len(second) + 1)
  input_mask = [1] * len(input_ids)

  # Zero-pad all three lists up to the fixed sequence length.
  pad_count = max_seq_length - len(input_ids)
  if pad_count > 0:
    input_ids.extend([0] * pad_count)
    input_mask.extend([0] * pad_count)
    segment_ids.extend([0] * pad_count)

  assert len(input_ids) == max_seq_length
  assert len(input_mask) == max_seq_length
  assert len(segment_ids) == max_seq_length

  return (input_ids, input_mask, segment_ids)


### Optimization for our application

Duplicated id lists consume a huge amount of memory.
We cache the truncated ids and reuse the same objects when both ids_a_input and ids_b_input are long enough.

In [0]:
# Mask for a pair that fills the whole input: every position is a real token.
# A single list object is built once so it can be returned by reference for
# every such pair instead of being rebuilt each time.
NORMAL_INPUT_MASK = [1]*max_seq_length
# 257 zeros cover [CLS] + 255 ids + [SEP]; 255 ones cover 254 ids + final [SEP].
# The hard-coded split assumes max_seq_length == 512.
NORMAL_SEGMENT_IDS = [0]*257+[1]*255

In [0]:
# [CLS](0), 1-255(len=255), [SEP], 257-510(len=254), [SEP]

In [0]:
LEN_255_CACHE = {}
LEN_254_CACHE = {}

In [0]:
def truncage_with_cache(ids, target_len):
  """Return the first `target_len` ids, reusing a previously cached slice.

  The earlier version re-sliced and overwrote the cache entry on every call,
  so the cache was never actually read. Now the slice is computed once per
  distinct `ids` and the same object is returned on later calls.

  Args:
    ids: hashable sequence of token ids (a tuple); used as the cache key.
    target_len: 255 or 254, selecting LEN_255_CACHE or LEN_254_CACHE.

  Returns:
    ids[0:target_len], shared between calls with an equal `ids`.

  Raises:
    AssertionError: if target_len is neither 255 nor 254.
  """
  if target_len == 255:
    cache = LEN_255_CACHE
  else:
    assert target_len == 254
    cache = LEN_254_CACHE

  truncated = cache.get(ids)
  if truncated is None:
    # A slice is never None, so None reliably means "not cached yet".
    truncated = ids[0:target_len]
    cache[ids] = truncated
  return truncated

In [0]:

def fast_convert_single_pair(ids_a_input, ids_b_input):
  """Memory-lean variant of original_convert_single_pair for long tuple inputs.

  When either sequence has fewer than 256 ids, the work is delegated to
  original_convert_single_pair. Otherwise the first sequence is cut to 255
  ids and the second to 254 (through the truncation caches), and the shared
  NORMAL_INPUT_MASK / NORMAL_SEGMENT_IDS lists are returned by reference.

  Args:
    ids_a_input: tuple of token ids for the first sequence.
    ids_b_input: tuple of token ids for the second sequence.

  Returns:
    (input_ids, input_mask, segment_ids). On the fast path input_ids is a
    tuple and the other two are the shared module-level lists.
  """
  both_long = len(ids_a_input) >= 256 and len(ids_b_input) >= 256
  if not both_long:
    return original_convert_single_pair(ids_a_input, ids_b_input)

  head_a = truncage_with_cache(ids_a_input, 255)
  head_b = truncage_with_cache(ids_b_input, 254)

  cls_part = (CLS_ID,)
  sep_part = (SEP_ID,)
  input_ids = cls_part + head_a + sep_part + head_b + sep_part

  return (input_ids, NORMAL_INPUT_MASK, NORMAL_SEGMENT_IDS)


In [0]:
def feature_tuplist_to_feature_dict(ftups):
    """Turn a list of (input_ids, input_mask, segment_ids) tuples into columns.

    Args:
        ftups: sequence of 3-element feature tuples.

    Returns:
        Dict with keys 'input_ids', 'input_mask', 'segment_ids' (one entry per
        tuple, in order) and 'label_ids', which holds a fresh [0] per tuple.
    """
    columns = (('input_ids', 0), ("input_mask", 1), ("segment_ids", 2))
    features = {key: [row[pos] for row in ftups] for key, pos in columns}
    features["label_ids"] = [[0] for _ in ftups]
    return features

A list is not hashable, but every element here is an int, so the equivalent tuple is hashable. We use these id tuples as keys for caching.

In [0]:
test_ids_tup = [tuple(ids) for ids in test_ids]

In [0]:
grants_ids_tup = [tuple(ids) for ids in grants_ids]

In [0]:
pairs = [("{}_{}".format(i, j), app_claim, grants_claim) for i, app_claim in enumerate(test_ids_tup) for j, grants_claim in enumerate(grants_ids_tup)]

In [0]:
[tup[0] for tup in pairs[0:5]]

['0_0', '0_1', '0_2', '0_3', '0_4']

In [0]:
len(pairs)

2524000

Converting all pairs at once with the code below uses up all of the Colab memory.

```
%%time
fdict = feature_tuplist_to_feature_dict([fast_convert_single_pair(tup[1], tup[2]) for tup in pairs])
```

Instead, convert one fifth of the pairs at a time.

In [0]:
# i = 0
i = 4

In [0]:
%%time
fdict = feature_tuplist_to_feature_dict([fast_convert_single_pair(tup[1], tup[2]) for tup in pairs[i*252400*2:(i+1)*252400*2]])

CPU times: user 37.8 s, sys: 2.1 s, total: 39.9 s
Wall time: 39.9 s


In [0]:
index_pairs = [tup[0] for tup in pairs[i*252400*2:(i+1)*252400*2]]

In [0]:
filename = 'test_fdict_pair_{}.pkl.gz'.format(i)
filename

'test_fdict_pair_4.pkl.gz'

In [0]:
%%time
with gzip.open(filename, 'w') as f:
     pickle.dump((index_pairs, fdict), f)

CPU times: user 1min 31s, sys: 1.14 s, total: 1min 32s
Wall time: 1min 32s


In [0]:
!ls -l

total 724056
-rw-r--r-- 1 root root      2553 Jan  8 01:14 adc.json
drwxr-xr-x 7 root root      4096 Jan  8 01:34 bert
drwxr-xr-x 1 root root      4096 Jan  3 17:15 sample_data
-rw-r--r-- 1 root root 147796337 Jan  8 10:05 test_fdict_pair_0.pkl.gz
-rw-r--r-- 1 root root 147522631 Jan  8 10:12 test_fdict_pair_1.pkl.gz
-rw-r--r-- 1 root root 148051738 Jan  8 10:15 test_fdict_pair_2.pkl.gz
-rw-r--r-- 1 root root 147540571 Jan  8 10:17 test_fdict_pair_3.pkl.gz
-rw-r--r-- 1 root root 147314921 Jan  8 10:20 test_fdict_pair_4.pkl.gz
-rw-r--r-- 1 root root   3178875 Jan  8 08:19 test_grants_ids.pkl.gz


In [0]:
!gsutil cp {filename} gs://karino2-uspatent/features/{filename}

Copying file://test_fdict_pair_4.pkl.gz [Content-Type=application/octet-stream]...
|
Operation completed over 1 objects/140.5 MiB.                                    


In [0]:
(i+1)*252400*2

2524000

In [0]:
len(pairs) == (i+1)*252400*2

True

Load feature dicts

In [0]:
with gzip.open(filename, 'rb') as f:
    (index_pairs, fdict) = pickle.load(f)

# Trial and error

In [0]:
# `convert_single_pair` is not defined anywhere in the visible notebook; the
# list-based feature builder is named `original_convert_single_pair`.
first_fs = original_convert_single_pair(test_ids[0], grants_ids[0])

In [0]:
test_ids[0][0:5]

In [0]:
with gzip.open(filename, 'rb') as f:
    (index_pairs2, fdict2) = pickle.load(f)

In [0]:
index_pairs2[0:5]

['0_0', '0_1', '0_2', '0_3', '0_4']

In [0]:
index_pairs2 = None

In [0]:
claim_to_ids(testset_app_claims[0])[0:5]

In [0]:
fdict2 = None

Min id len check

In [0]:
min([len(tup[1]) for tup in pairs])

6

In [0]:
len(pairs[2][1])

1369

In [0]:
[len(tup[1]) for tup in pairs].index(6)

1007076

In [0]:
1007076/2524

399.0

In [0]:
testset_app_claims[399]

'\n \n  1 . A soap rag. \n \n'

In [0]:
 testset_app_df["xml"][399]

'<us-patent-application lang="EN" dtd-version="v4.2 2006-08-23" file="US20110167576A1-20110714.XML" status="PRODUCTION" id="us-patent-application" country="US" date-produced="20110628" date-publ="20110714">\n<us-bibliographic-data-application lang="EN" country="US">\n<publication-reference>\n<document-id>\n<country>US</country>\n<doc-number>20110167576</doc-number>\n<kind>A1</kind>\n<date>20110714</date>\n</document-id>\n</publication-reference>\n<application-reference appl-type="utility">\n<document-id>\n<country>US</country>\n<doc-number>12614955</doc-number>\n<date>20100111</date>\n</document-id>\n</application-reference>\n<us-application-series-code>12</us-application-series-code>\n<classifications-ipcr>\n<classification-ipcr>\n<ipc-version-indicator><date>20060101</date></ipc-version-indicator>\n<classification-level>A</classification-level>\n<section>B</section>\n<class>08</class>\n<subclass>B</subclass>\n<main-group>1</main-group>\n<subgroup>00</subgroup>\n<symbol-position>F</sy

In [0]:
whole_xml_to_claim_xml(testset_app_df["xml"][399])

'\n<claim id="CLM-00001" num="00001">\n<claim-text><b>1</b>. A soap rag.</claim-text>\n</claim>\n'

In [0]:
print( testset_app_df["xml"][399])

<us-patent-application lang="EN" dtd-version="v4.2 2006-08-23" file="US20110167576A1-20110714.XML" status="PRODUCTION" id="us-patent-application" country="US" date-produced="20110628" date-publ="20110714">
<us-bibliographic-data-application lang="EN" country="US">
<publication-reference>
<document-id>
<country>US</country>
<doc-number>20110167576</doc-number>
<kind>A1</kind>
<date>20110714</date>
</document-id>
</publication-reference>
<application-reference appl-type="utility">
<document-id>
<country>US</country>
<doc-number>12614955</doc-number>
<date>20100111</date>
</document-id>
</application-reference>
<us-application-series-code>12</us-application-series-code>
<classifications-ipcr>
<classification-ipcr>
<ipc-version-indicator><date>20060101</date></ipc-version-indicator>
<classification-level>A</classification-level>
<section>B</section>
<class>08</class>
<subclass>B</subclass>
<main-group>1</main-group>
<subgroup>00</subgroup>
<symbol-position>F</symbol-position>
<classificati

In [0]:
test_ids[399]

[1015, 1012, 1037, 7815, 17768, 1012]

In [0]:
tmp = {}

In [0]:
tmp[tuple(test_ids[399])] = "hello"

In [0]:
tmp

{(1015, 1012, 1037, 7815, 17768, 1012): 'hello'}

In [0]:
type(test_ids[0])

list

In [0]:
dic2 = fast_convert_single_pair(test_ids_tup[0], grants_ids_tup[0])

In [0]:
dic = original_convert_single_pair(test_ids[0], grants_ids[0])

In [0]:
dic[0] == list(dic2[0])

True

In [0]:
dic[0].index(SEP_ID)

256

In [0]:
dic[1][511]

1

In [0]:
dic[2][256:270]

[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [0]:
dic[2][510:]

[1, 1]

In [0]:
NORMAL_SEGMENT_IDS == dic[2]

True

In [0]:
dic[0].index(SEP_ID)

256

In [0]:
dic[0].index(SEP_ID, 257)

511