<a href="https://colab.research.google.com/github/yoheikikuta/US-patent-analysis/blob/master/colab/BERT_pretrain_with_patent_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT pretraining with patent data

In [0]:
# Authenticate this Colab session with a Google account.
# Presumably required for the gs:// bucket accesses made by later cells — TODO confirm.
from google.colab import auth
auth.authenticate_user()

## Data preparation

In [0]:
# Copy the input DataFrames (gzip-compressed pickles) from GCS into the working directory.
# Judging by the file names: citation info for the 3000+3000 applications ...
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-info/citations_info_3000+3000.df.gz ./

# ... and the XML of the training applications, the test applications and the grants.
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/training_app_3000.df.gz ./  
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/testset_app_3000.df.gz ./
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/grants_for_3000+3000.df.gz ./

Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-info/citations_info_3000+3000.df.gz...
- [1 files][506.5 KiB/506.5 KiB]                                                
Operation completed over 1 objects/506.5 KiB.                                    
Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/training_app_3000.df.gz...
\ [1 files][ 45.0 MiB/ 45.0 MiB]                                                
Operation completed over 1 objects/45.0 MiB.                                     
Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/testset_app_3000.df.gz...
/ [1 files][ 45.5 MiB/ 45.5 MiB]                                                
Operation completed over 1 objects/45.5 MiB.                                     
Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/grants_for_3000+3000.df.gz...
/ [1 files][129.4 MiB/129.4 MiB]                                                
Operation completed over 1 objects/129.4 MiB.           

In [0]:
import pandas as pd
import numpy as np

In [0]:
# Load the pickled DataFrames from the working directory; the ".gz" suffix is
# handled by read_pickle's default compression inference.
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source.
citations_info_target = pd.read_pickle("./citations_info_3000+3000.df.gz")
test_app = pd.read_pickle("./testset_app_3000.df.gz")
grants = pd.read_pickle("./grants_for_3000+3000.df.gz")
train_app = pd.read_pickle("./training_app_3000.df.gz")

In [0]:
# Peek at the first rows of the raw training applications.
train_app.head(3)

Unnamed: 0,app_id,xml
0,12130785,"<us-patent-application lang=""EN"" dtd-version=""..."
1,12652424,"<us-patent-application lang=""EN"" dtd-version=""..."
2,12214532,"<us-patent-application lang=""EN"" dtd-version=""..."


In [0]:
import re


# Captures everything between the opening <claims ...> tag and the last
# </claims> tag. DOTALL lets "." span line breaks.
CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>', re.MULTILINE | re.DOTALL)
# Any single XML tag (non-greedy, so adjacent tags are matched one by one).
TAG_PAT = re.compile(r"<.*?>")
# One tab/newline-like whitespace character plus the run of spaces after it.
# The previous character class [" "] also matched a literal double quote, so
# quotation marks following a line break were deleted; only spaces are
# consumed now.
LB_PAT = re.compile(r'[\t\n\r\f\v] *')
# A claim number (or number range) followed by ". (canceled) ".
CANCELED_PAT = re.compile(r'[0-9]+.*\. \(canceled\) ')
# A claim number such as " 1. " with optional surrounding spaces.
# NOTE(review): this also matches the integer part of decimals ("0.5" -> "5").
NUM_PAT = re.compile(r' ?[0-9]+ ?\. ?')


def whole_xml_to_claim_xml(whole):
    """Return the inner XML of the ``<claims>`` element of a patent document.

    Args:
        whole: The complete patent XML as a string.

    Returns:
        Group 1 of the ``CLAIM_PAT`` match, i.e. the text between the opening
        ``<claims ...>`` tag and the closing ``</claims>`` tag.

    Raises:
        ValueError: If ``CLAIM_PAT`` does not match anywhere in ``whole``.
    """
    mat = CLAIM_PAT.search(whole)
    if mat is None:
        # search() returns None on no match; fail with a message that points at
        # the offending document instead of "'NoneType' has no attribute 'group'".
        raise ValueError(
            "no <claims> element found in document starting with: %r" % whole[:80]
        )
    return mat.group(1)


def whole_xml_to_claim(whole):
    """Extract the claims section of a patent XML string and blank out its tags.

    The claims XML is obtained with ``whole_xml_to_claim_xml``; every match of
    ``TAG_PAT`` in it is then replaced by a single space.
    """
    claim_xml = whole_xml_to_claim_xml(whole)
    return re.sub(TAG_PAT, ' ', claim_xml)


def remove_linebreak_from_claim(claim):
    """Return ``claim`` with every match of ``LB_PAT`` deleted."""
    flattened = re.sub(LB_PAT, '', claim)
    return flattened


def remove_canceled_claim(claim):
    """Return ``claim`` with every match of ``CANCELED_PAT`` deleted."""
    without_canceled = re.sub(CANCELED_PAT, '', claim)
    return without_canceled


def remove_claim_numbers(claim):
    """Return ``claim`` with every match of ``NUM_PAT`` deleted."""
    without_numbers = re.sub(NUM_PAT, '', claim)
    return without_numbers

Test data will NOT be used for pretraining.

In [0]:
%%time

train_app["claim_app"] = train_app["xml"].map(whole_xml_to_claim).map(remove_canceled_claim).map(remove_claim_numbers).map(remove_linebreak_from_claim)
train_app = train_app.drop("xml", axis=1)
train_app.head()

# test_app["claim_app"] = test_app["xml"].map(whole_xml_to_claim).map(remove_canceled_claim).map(remove_claim_numbers).map(remove_linebreak_from_claim)
# test_app = test_app.drop("xml", axis=1)
# test_app.head()

grants["claim_cited_grant"] = grants["xml"].map(whole_xml_to_claim).map(remove_canceled_claim).map(remove_claim_numbers).map(remove_linebreak_from_claim)
grants = grants.drop("xml", axis=1)
grants.head()

CPU times: user 7.66 s, sys: 188 ms, total: 7.84 s
Wall time: 7.85 s


In [0]:
# Peek at the training applications after claim extraction.
train_app.head(3)

Unnamed: 0,app_id,claim_app
0,12130785,A system for differentiating noise from an arr...
1,12652424,A method of allocating resources in a data war...
2,12214532,A controlling method of a media processing app...


In [0]:
# Pick one cleaned claim text (row label 0 of the training applications) as a
# sample to inspect; despite its name, `test` is not taken from test_app.
test = train_app['claim_app'][0]

In [0]:
# Preview only: the result is displayed, not stored. Every "." is replaced by a
# line break here, so the periods themselves are dropped in this preview.
test.replace(".", "\n")

'A system for differentiating noise from an arrhythmia of a heart, comprising:a noise discriminator configured to receive an electrocardiogram (EGM) signal and to discriminate between an organized EGM signal and a chaotic EGM signal based at least in part on an impedance parameter associated with a lead that provides an electrical connection to the heart; a signal analyzer configured to determine whether a chaotic signal is caused by a disturbance in the lead\n The system of  claim 1 , further comprising a high voltage delivery system configured to deliver a high voltage therapy signal to the heart if the EGM signal is an organized signal\n The system of  claim 2 , further comprising a high voltage confirmation system configured to adjust or terminate the high voltage therapy based on the impedance parameter\n The system of  claim 3 , wherein the signal analyzer is part of the high voltage confirmation system\n The system of  claim 3 , wherein the lead comprises a high voltage lead for

To build the pretraining data, apply a simple preprocessing step: replace each (period + space) with (period + line break).

In [0]:
# with open("./test.txt", "w+") as f:
#     f.write(test.replace(". ", ".\n").lower())

In [0]:
# !cat ./test.txt

In [0]:
# len(pd.concat([train_app['claim_app'], grants['claim_cited_grant']]))

In [0]:
# %%time

# with open("./training_data.txt", "w+") as f:
#     for one_stuff in pd.concat([train_app['claim_app'], grants['claim_cited_grant']]):
#         f.write(one_stuff.replace(". ", ".\n").lower())

## Create training data for BERT

In [0]:
### Patent 2017 text data.
# !gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/patent-2017-pretrain-BERT/training_data.txt ./

In [0]:
# !wc -l ./training_data.txt

In [0]:
# !sed -n 510000,510010p ./training_data.txt

In [0]:
# !sed -n 1020000,1020010p ./training_data.txt

In [0]:
# !sed -n 1,510004p ./training_data.txt > ./training_data_1.txt
# !sed -n 510005,1020003p ./training_data.txt > ./training_data_2.txt
# !sed -n 1020004,1528943p ./training_data.txt > ./training_data_3.txt

In [0]:
# Fetch the google-research BERT repository into ./bert.
# NOTE(review): no commit or tag is pinned, so this takes whatever the default branch currently holds.
!git clone https://github.com/google-research/bert.git

Cloning into 'bert'...
remote: Enumerating objects: 333, done.[K
remote: Total 333 (delta 0), reused 0 (delta 0), pack-reused 333[K
Receiving objects: 100% (333/333), 279.30 KiB | 3.83 MiB/s, done.
Resolving deltas: 100% (183/183), done.


In [0]:
!gsutil cp -r gs://yohei-kikuta/mlstudy-phys/bert/models/pre-trained-models/uncased_L-12_H-768_A-12 ./

Copying gs://yohei-kikuta/mlstudy-phys/bert/models/pre-trained-models/uncased_L-12_H-768_A-12/bert_config.json...
Copying gs://yohei-kikuta/mlstudy-phys/bert/models/pre-trained-models/uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001...
Copying gs://yohei-kikuta/mlstudy-phys/bert/models/pre-trained-models/uncased_L-12_H-768_A-12/bert_model.ckpt.index...
Copying gs://yohei-kikuta/mlstudy-phys/bert/models/pre-trained-models/uncased_L-12_H-768_A-12/bert_model.ckpt.meta...
\ [4 files][420.9 MiB/420.9 MiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://yohei-kikuta/mlstudy-phys/bert/models/pre-trained-models/uncased_L-12_H-768_A-12/checkpoint...
Copying gs://yohei-kikuta/mlstudy-phys/bert/models/pre-trained-models/uncas

In [0]:
# List the working directory to check that the downloads and the bert checkout are present.
!ls

adc.json			sample_data
bert				testset_app_3000.df.gz
citations_info_3000+3000.df.gz	training_app_3000.df.gz
grants_for_3000+3000.df.gz	uncased_L-12_H-768_A-12


In [0]:
# Switch into the bert checkout; later cells run its scripts from here and
# refer to the model directory as ../uncased_L-12_H-768_A-12.
%cd bert

/content/bert


In [0]:
# List the contents of the bert checkout.
!ls

CONTRIBUTING.md		    predicting_movie_reviews_with_bert_on_tf_hub.ipynb
create_pretraining_data.py  README.md
extract_features.py	    requirements.txt
__init__.py		    run_classifier.py
LICENSE			    run_classifier_with_tfhub.py
modeling.py		    run_pretraining.py
modeling_test.py	    run_squad.py
multilingual.md		    sample_text.txt
optimization.py		    tokenization.py
optimization_test.py	    tokenization_test.py


In [0]:
# %%time

# !python create_pretraining_data.py \
#   --input_file=../training_data_3.txt \
#   --output_file=../training_data_3.tfrecord \
#   --vocab_file=../uncased_L-12_H-768_A-12/vocab.txt \
#   --do_lower_case=True \
#   --max_seq_length=512 \
#   --max_predictions_per_seq=20 \
#   --masked_lm_prob=0.15 \
#   --random_seed=12345 \
#   --dupe_factor=5

In [0]:
# About 350 [MiB]

# !stat -c %s ../training_data.tfrecord

In [0]:
# !gsutil cp ../training_data.tfrecord gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-pretrain-BERT/
# !gsutil cp ../training_data_1.tfrecord gs://yohei-kikuta/mlstudy-phys/patent-analysis/patent-2017-pretrain-BERT/
# !gsutil cp ../training_data_2.tfrecord gs://yohei-kikuta/mlstudy-phys/patent-analysis/patent-2017-pretrain-BERT/
# !gsutil cp ../training_data_3.tfrecord gs://yohei-kikuta/mlstudy-phys/patent-analysis/patent-2017-pretrain-BERT/

## Pretraining

In [0]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf

# Stop with a readable message when the runtime exposes no TPU address.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
# gRPC endpoint of the TPU; also passed to run_pretraining.py as --tpu_name.
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

# NOTE(review): tf.Session and tf.contrib look like TensorFlow 1.x-only APIs —
# confirm the runtime's TensorFlow version before re-running.
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  # They are read from /content/adc.json (presumably written by the Colab
  # authentication step — TODO confirm).
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.
  # Now credentials are set for all future sessions on this TPU.

TPU address is grpc://10.103.140.138:8470
TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 4676900968595902608),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 9114757423999702320),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 7352254183043986764),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 17823823701263564679),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 4511093959183907878),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 8119742463973857164),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 7218021206714621563),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 18249821039556106535),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:6, TPU, 17179869184, 146419785169

NOTE: The Cloud TPU must be granted access rights to the GCS location of INIT_CKPT.

In [0]:
# Settings of an earlier run under the 3000-pretrain-BERT prefix, kept for reference.
# INPUT_FILE = "gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-pretrain-BERT/training_data.tfrecord"
# OUTPUT_GCS = "gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-pretrain-BERT"
# INIT_CKPT = "gs://yohei-kikuta/mlstudy-phys/bert/models/pre-trained-models/uncased_L-12_H-768_A-12/bert_model.ckpt"

# Active settings (patent-2017 data).
# INPUT_FILE is a GCS prefix rather than a single file: the training command
# appends /training_data_N.tfrecord to it.
INPUT_FILE = "gs://yohei-kikuta/mlstudy-phys/patent-analysis/patent-2017-pretrain-BERT"
# Passed as --output_dir; same location as INPUT_FILE.
OUTPUT_GCS = "gs://yohei-kikuta/mlstudy-phys/patent-analysis/patent-2017-pretrain-BERT"
# Checkpoint prefix of the pre-trained uncased_L-12_H-768_A-12 model; only
# referenced by the commented-out --init_checkpoint flag of the training command.
INIT_CKPT = "gs://yohei-kikuta/mlstudy-phys/bert/models/pre-trained-models/uncased_L-12_H-768_A-12/bert_model.ckpt"

In [0]:
%%time

# Run BERT pretraining on the TPU over the three patent-2017 TFRecord shards,
# writing to OUTPUT_GCS. Both training and evaluation are enabled.
# --max_seq_length and --max_predictions_per_seq are set to 512 and 20 here.
# NOTE(review): only one --input_file list is given, so evaluation presumably
# reads the same records as training — confirm against run_pretraining.py.
# NOTE(review): with --init_checkpoint left out, the run presumably resumes
# from whatever checkpoint already exists in OUTPUT_GCS — confirm.
!python run_pretraining.py \
  --input_file={INPUT_FILE}/training_data_1.tfrecord,{INPUT_FILE}/training_data_2.tfrecord,{INPUT_FILE}/training_data_3.tfrecord \
  --output_dir={OUTPUT_GCS} \
  --use_tpu=True \
  --tpu_name={TPU_ADDRESS} \
  --num_tpu_cores=8 \
  --do_train=True \
  --do_eval=True \
  --bert_config_file=../uncased_L-12_H-768_A-12/bert_config.json \
  --train_batch_size=64 \
  --max_seq_length=512 \
  --max_predictions_per_seq=20 \
  --num_train_steps=300000 \
  --num_warmup_steps=1000 \
  --learning_rate=5e-5
#   --init_checkpoint={INIT_CKPT}  # Only need to add at the first training time.

W0820 05:35:43.306933 139671258257280 deprecation_wrapper.py:119] From /content/bert/optimization.py:87: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0820 05:35:43.308188 139671258257280 deprecation_wrapper.py:119] From run_pretraining.py:493: The name tf.app.run is deprecated. Please use tf.compat.v1.app.run instead.

W0820 05:35:43.308779 139671258257280 deprecation_wrapper.py:119] From run_pretraining.py:407: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.

W0820 05:35:43.308923 139671258257280 deprecation_wrapper.py:119] From run_pretraining.py:407: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.

W0820 05:35:43.309074 139671258257280 deprecation_wrapper.py:119] From /content/bert/modeling.py:93: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

W0820 05:35:43.309868 139671258257280 deprecation_wrapper.py:119] Fro

## Memo (train3000 + grants6000)

```
I0811 12:50:14.190979 140514602837888 run_pretraining.py:483] ***** Eval results *****
I0811 12:50:14.191113 140514602837888 run_pretraining.py:485]   global_step = 20
I0811 12:50:14.191493 140514602837888 run_pretraining.py:485]   loss = 1.0284224
I0811 12:50:14.191617 140514602837888 run_pretraining.py:485]   masked_lm_accuracy = 0.8165625
I0811 12:50:14.191753 140514602837888 run_pretraining.py:485]   masked_lm_loss = 0.86381125
I0811 12:50:14.191835 140514602837888 run_pretraining.py:485]   next_sentence_accuracy = 0.95
I0811 12:50:14.191914 140514602837888 run_pretraining.py:485]   next_sentence_loss = 0.1793103
CPU times: user 1.56 s, sys: 244 ms, total: 1.8 s
Wall time: 4min 17s
```

```
I0811 14:17:34.821915 139717559273344 run_pretraining.py:483] ***** Eval results *****
I0811 14:17:34.822047 139717559273344 run_pretraining.py:485]   global_step = 15000
I0811 14:17:34.822362 139717559273344 run_pretraining.py:485]   loss = 0.18834595
I0811 14:17:34.822492 139717559273344 run_pretraining.py:485]   masked_lm_accuracy = 0.9595625
I0811 14:17:34.822575 139717559273344 run_pretraining.py:485]   masked_lm_loss = 0.14227556
I0811 14:17:34.822673 139717559273344 run_pretraining.py:485]   next_sentence_accuracy = 0.99875
I0811 14:17:34.822756 139717559273344 run_pretraining.py:485]   next_sentence_loss = 0.0048195585
```

```
I0812 03:15:34.621796 140024347654016 run_pretraining.py:483] ***** Eval results *****
I0812 03:15:34.621907 140024347654016 run_pretraining.py:485]   global_step = 70000
I0812 03:15:34.622360 140024347654016 run_pretraining.py:485]   loss = 0.0005139347
I0812 03:15:34.622489 140024347654016 run_pretraining.py:485]   masked_lm_accuracy = 0.9999375
I0812 03:15:34.622562 140024347654016 run_pretraining.py:485]   masked_lm_loss = 0.00031679025
I0812 03:15:34.622630 140024347654016 run_pretraining.py:485]   next_sentence_accuracy = 1.0
I0812 03:15:34.622724 140024347654016 run_pretraining.py:485]   next_sentence_loss = 1.9996969e-07
```

## Memo (patent-2017 data)

```
I0818 07:42:20.731508 140399826028416 run_pretraining.py:483] ***** Eval results *****
I0818 07:42:20.731633 140399826028416 run_pretraining.py:485]   global_step = 10000
I0818 07:42:20.731997 140399826028416 run_pretraining.py:485]   loss = 0.53506017
I0818 07:42:20.732118 140399826028416 run_pretraining.py:485]   masked_lm_accuracy = 0.88384515
I0818 07:42:20.732220 140399826028416 run_pretraining.py:485]   masked_lm_loss = 0.4997728
I0818 07:42:20.732293 140399826028416 run_pretraining.py:485]   next_sentence_accuracy = 0.99
I0818 07:42:20.732356 140399826028416 run_pretraining.py:485]   next_sentence_loss = 0.02872859
CPU times: user 14.9 s, sys: 2.67 s, total: 17.6 s
Wall time: 50min 5s
```

```
I0818 08:36:06.739647 140538574493568 error_handling.py:96] evaluation_loop marked as finished
I0818 08:36:06.740006 140538574493568 run_pretraining.py:483] ***** Eval results *****
I0818 08:36:06.740121 140538574493568 run_pretraining.py:485]   global_step = 20000
I0818 08:36:06.740490 140538574493568 run_pretraining.py:485]   loss = 0.39322364
I0818 08:36:06.740602 140538574493568 run_pretraining.py:485]   masked_lm_accuracy = 0.9102295
I0818 08:36:06.740676 140538574493568 run_pretraining.py:485]   masked_lm_loss = 0.36800206
I0818 08:36:06.740731 140538574493568 run_pretraining.py:485]   next_sentence_accuracy = 0.99875
I0818 08:36:06.740785 140538574493568 run_pretraining.py:485]   next_sentence_loss = 0.006405907
CPU times: user 13.8 s, sys: 2.64 s, total: 16.4 s
Wall time: 49min 31s
```

```
I0818 17:46:50.941742 140184802654080 run_pretraining.py:483] ***** Eval results *****
I0818 17:46:50.941880 140184802654080 run_pretraining.py:485]   global_step = 50000
I0818 17:46:50.942254 140184802654080 run_pretraining.py:485]   loss = 0.3655657
I0818 17:46:50.942408 140184802654080 run_pretraining.py:485]   masked_lm_accuracy = 0.9144256
I0818 17:46:50.942533 140184802654080 run_pretraining.py:485]   masked_lm_loss = 0.33980185
I0818 17:46:50.942634 140184802654080 run_pretraining.py:485]   next_sentence_accuracy = 0.99625
I0818 17:46:50.942719 140184802654080 run_pretraining.py:485]   next_sentence_loss = 0.013906714
CPU times: user 48.4 s, sys: 7.54 s, total: 56 s
Wall time: 2h 22min 54s
```

```
I0819 06:43:39.739752 139906677770112 run_pretraining.py:483] ***** Eval results *****
I0819 06:43:39.739871 139906677770112 run_pretraining.py:485]   global_step = 100000
I0819 06:43:39.740208 139906677770112 run_pretraining.py:485]   loss = 0.2965531
I0819 06:43:39.740323 139906677770112 run_pretraining.py:485]   masked_lm_accuracy = 0.9267595
I0819 06:43:39.740409 139906677770112 run_pretraining.py:485]   masked_lm_loss = 0.27952158
I0819 06:43:39.740486 139906677770112 run_pretraining.py:485]   next_sentence_accuracy = 0.99875
I0819 06:43:39.740583 139906677770112 run_pretraining.py:485]   next_sentence_loss = 0.002464915
CPU times: user 1min 16s, sys: 11.4 s, total: 1min 27s
Wall time: 3h 55min 46s
```

```
I0819 16:27:09.458713 139819099187072 run_pretraining.py:483] ***** Eval results *****
I0819 16:27:09.458858 139819099187072 run_pretraining.py:485]   global_step = 200000
I0819 16:27:09.459412 139819099187072 run_pretraining.py:485]   loss = 0.25941798
I0819 16:27:09.459626 139819099187072 run_pretraining.py:485]   masked_lm_accuracy = 0.93457943
I0819 16:27:09.459730 139819099187072 run_pretraining.py:485]   masked_lm_loss = 0.23920113
I0819 16:27:09.459816 139819099187072 run_pretraining.py:485]   next_sentence_accuracy = 0.99875
I0819 16:27:09.459905 139819099187072 run_pretraining.py:485]   next_sentence_loss = 0.0025768017
CPU times: user 2min 42s, sys: 21.3 s, total: 3min 3s
Wall time: 7h 50min 29s
```

```
I0820 10:55:42.340659 139671258257280 run_pretraining.py:483] ***** Eval results *****
I0820 10:55:42.340777 139671258257280 run_pretraining.py:485]   global_step = 300000
I0820 10:55:42.341297 139671258257280 run_pretraining.py:485]   loss = 0.19169405
I0820 10:55:42.341431 139671258257280 run_pretraining.py:485]   masked_lm_accuracy = 0.95320743
I0820 10:55:42.341532 139671258257280 run_pretraining.py:485]   masked_lm_loss = 0.16689053
I0820 10:55:42.341613 139671258257280 run_pretraining.py:485]   next_sentence_accuracy = 1.0
I0820 10:55:42.341696 139671258257280 run_pretraining.py:485]   next_sentence_loss = 4.6706875e-05
CPU times: user 1min 38s, sys: 15.4 s, total: 1min 53s
Wall time: 5h 20min 8s
```