<a href="https://colab.research.google.com/github/yoheikikuta/US-patent-analysis/blob/master/colab/feature_extract_ELMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature extraction from ELMo

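Extract one mean-pooled ELMo feature vector per patent claim, corresponding to the "default" (mean-pooled) feature of the ELMo module published on TensorFlow Hub.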

ELMo paper: https://arxiv.org/abs/1802.05365

In [0]:
# Authenticate the current Colab user; presumably required for the gsutil
# copies from/to the gs:// bucket used later in this notebook.
from google.colab import auth
auth.authenticate_user()

## Data preparation

In [0]:
# Local directory the data files are copied into and read from.
# It must end with "/" because file paths are built by plain string
# concatenation (f"{DATA_DIR}<file name>").
DATA_DIR = "./"

In [0]:
# Copy the three gzipped DataFrame pickles (cited grants, test applications,
# training applications) from the GCS bucket into DATA_DIR.
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/grants_for_3000+3000.df.gz {DATA_DIR}
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/testset_app_3000.df.gz {DATA_DIR}
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/training_app_3000.df.gz {DATA_DIR}

Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/grants_for_3000+3000.df.gz...
| [1 files][129.4 MiB/129.4 MiB]                                                
Operation completed over 1 objects/129.4 MiB.                                    
Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/testset_app_3000.df.gz...
| [1 files][ 45.5 MiB/ 45.5 MiB]                                                
Operation completed over 1 objects/45.5 MiB.                                     
Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/training_app_3000.df.gz...
\ [1 files][ 45.0 MiB/ 45.0 MiB]                                                
Operation completed over 1 objects/45.0 MiB.                                     


In [0]:
import gzip
import pickle
import datetime
import json
import os
import pprint
import random
import string
import sys
import pandas as pd

In [0]:
# Load the downloaded frames. read_pickle handles the gzip compression itself.
# SECURITY NOTE: unpickling executes arbitrary code embedded in the file, so
# only load pickles that come from a trusted bucket.
grants = pd.read_pickle(f"{DATA_DIR}grants_for_3000+3000.df.gz")
test_app = pd.read_pickle(f"{DATA_DIR}testset_app_3000.df.gz")
train_app = pd.read_pickle(f"{DATA_DIR}training_app_3000.df.gz")

In [0]:
# Peek at the raw training frame (an application id plus the full patent XML per row).
train_app.head(3)

Unnamed: 0,app_id,xml
0,12130785,"<us-patent-application lang=""EN"" dtd-version=""..."
1,12652424,"<us-patent-application lang=""EN"" dtd-version=""..."
2,12214532,"<us-patent-application lang=""EN"" dtd-version=""..."


In [0]:
# Previous setting, kept for reference (512-token variant):
# TOKEN_LENGTH = 512
TOKEN_LENGTH = 4096  # Maximum number of tokens kept per claim; claim_to_tokens truncates to this length.


import re


# Captures everything between the opening <claims ...> tag and the LAST
# </claims> tag ('.*' is greedy and DOTALL lets '.' span newlines).
# MULTILINE has no effect here because the pattern uses no ^ or $ anchors.
CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>',re.MULTILINE|re.DOTALL)
# Any '<...>' tag, matched non-greedily.
TAG_PAT = re.compile(r"<.*?>")
# NOTE: throughout these patterns [" "] is a character class that matches
# either a double quote or a space, not only a space.
# One whitespace control character (tab, LF, CR, FF, VT) followed by any run
# of spaces / double quotes.
LB_PAT = re.compile(r'[\t\n\r\f\v][" "]*')
# Digits, then anything up to ". (canceled)" on the same line, then one
# space/quote. NOTE(review): '.*' is greedy, so text that precedes the marker
# on the same line (after the first digit) is removed as well — confirm this
# is acceptable for the input format.
CANCELED_PAT = re.compile(r'[0-9]+.*\. \(canceled\)[" "]')
# A run of digits followed by a period, each optionally surrounded by a single
# space/quote. NOTE(review): this also hits numbers inside the claim text
# (e.g. "3.5" becomes "5") — confirm this is acceptable.
NUM_PAT = re.compile(r'[" "]?[0-9]+[" "]?\.[" "]?')

def whole_xml_to_claim_xml(whole):
    """Return the raw XML found inside the claims element of a patent document.

    Args:
        whole: Full patent XML as a string.

    Returns:
        The text captured by CLAIM_PAT, i.e. everything between the opening
        ``<claims ...>`` tag and the closing ``</claims>`` tag (inner tags included).

    Raises:
        ValueError: If the document contains no ``<claims>...</claims>`` section.
    """
    mat = CLAIM_PAT.search(whole)
    if mat is None:
        # search() returns None when nothing matches; fail with a descriptive
        # error instead of the opaque AttributeError from None.group(1).
        raise ValueError("no <claims>...</claims> section found in the given XML")
    return mat.group(1)


def whole_xml_to_claim(whole):
    """Extract the claims section of a patent XML and replace every tag with a single space."""
    claim_xml = whole_xml_to_claim_xml(whole)
    return re.sub(TAG_PAT, ' ', claim_xml)


def remove_linebreak_from_claim(claim):
    """Delete every LB_PAT match (a line-break style character plus the run that follows it)."""
    return re.sub(LB_PAT, '', claim)


def remove_canceled_claim(claim):
    """Delete every CANCELED_PAT match (the "<number> ... . (canceled)" markers)."""
    return re.sub(CANCELED_PAT, '', claim)


def remove_claim_numbers(claim):
    """Delete every NUM_PAT match (digits followed by a period, e.g. claim numbers)."""
    return re.sub(NUM_PAT, '', claim)

def claim_to_tokens(claim):
    """Split the claim on whitespace and keep at most the first TOKEN_LENGTH tokens."""
    tokens = claim.split()
    return tokens[:TOKEN_LENGTH]

In [0]:
%%time

# Turn each raw patent XML into a truncated token list, then drop the bulky XML column.
# Step order matters: canceled-claim markers and claim numbers are removed
# before the line breaks are stripped.

# Training applications.
train_app["claim_app"] = (
    train_app["xml"]
    .map(whole_xml_to_claim)
    .map(remove_canceled_claim)
    .map(remove_claim_numbers)
    .map(remove_linebreak_from_claim)
    .map(claim_to_tokens)
)
train_app = train_app.drop(columns=["xml"])

# Test applications.
test_app["claim_app"] = (
    test_app["xml"]
    .map(whole_xml_to_claim)
    .map(remove_canceled_claim)
    .map(remove_claim_numbers)
    .map(remove_linebreak_from_claim)
    .map(claim_to_tokens)
)
test_app = test_app.drop(columns=["xml"])

# Cited grants.
grants["claim_cited_grant"] = (
    grants["xml"]
    .map(whole_xml_to_claim)
    .map(remove_canceled_claim)
    .map(remove_claim_numbers)
    .map(remove_linebreak_from_claim)
    .map(claim_to_tokens)
)
grants = grants.drop(columns=["xml"])

CPU times: user 10.1 s, sys: 463 ms, total: 10.6 s
Wall time: 10.6 s


In [0]:
# Same frame after preprocessing: the xml column is replaced by the claim_app token lists.
train_app.head(3)

Unnamed: 0,app_id,claim_app
0,12130785,"[A, system, for, differentiating, noise, from,..."
1,12652424,"[A, method, of, allocating, resources, in, a, ..."
2,12214532,"[A, controlling, method, of, a, media, process..."


In [0]:
# Token counts of the first five training claims (after truncation to TOKEN_LENGTH).
train_app['claim_app'].map(len).head(5)

0     677
1     913
2    1512
3     451
4     816
Name: claim_app, dtype: int64

## Feature extraction

In [0]:
from typing import List
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

# Record the TensorFlow version in use; the code below relies on TF1-style
# APIs (tf.ConfigProto, tf.Session, hub.Module).
tf.__version__

'1.14.0'

The ELMo module returns features of fixed shape (data_num, TOKEN_LENGTH, FEATURE_DIM), so it also emits features for the "" padding tokens, which are meaningless.  
To compute a mean feature that excludes the "" tokens, we do the following:


*   get the (data_num, TOKEN_LENGTH, FEATURE_DIM) features from the ELMo model
*   then average each sample's features over its true sequence length only, so that the "" padding tokens are excluded


In [0]:
# Session configuration shared by extract_elmo_feature: allow_growth asks
# TensorFlow to grow its GPU memory allocation as needed.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

def extract_elmo_feature(sentences: List[List[str]]) -> np.ndarray:
  """Run the TF-Hub ELMo module on a batch of pre-tokenized sentences.

  Every sentence is clipped to at most TOKEN_LENGTH tokens and right-padded
  with "" so the batch is rectangular; the true (clipped) lengths are passed
  to the module as ``sequence_len``.

  The module is loaded into the default graph on every call and the default
  graph is reset before returning (presumably so that repeated calls do not
  keep adding nodes to one graph). The reset now also happens when graph
  construction or the session run raises.

  Args:
    sentences: Batch of sentences, each one a list of string tokens.

  Returns:
    The evaluated "elmo" output of the module for the padded batch. Per the
    notebook's notes its shape is (len(sentences), TOKEN_LENGTH, FEATURE_DIM);
    positions beyond a sentence's true length belong to "" padding.

  Raises:
    TypeError: If a sentence is a raw string instead of a list of tokens.
  """
  def _clip_and_pad(tokens: List[str]) -> List[str]:
    # A raw string would otherwise be sliced character by character.
    if isinstance(tokens, str):
      raise TypeError("each sentence must be a list of tokens, not a raw string")
    clipped = list(tokens[:TOKEN_LENGTH])
    return clipped + [""] * (TOKEN_LENGTH - len(clipped))

  padded_tokens = [_clip_and_pad(tokens) for tokens in sentences]
  # Lengths must describe the clipped sentences, never more than TOKEN_LENGTH.
  sequence_lengths = [min(len(tokens), TOKEN_LENGTH) for tokens in sentences]

  try:
    elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False)
    embeddings = elmo(
        inputs={
            "tokens": padded_tokens,
            "sequence_len": sequence_lengths
        },
        signature="tokens",
        as_dict=True
    )["elmo"]

    with tf.Session(config=config) as sess:
      sess.run(tf.global_variables_initializer())
      sess.run(tf.tables_initializer())
      return sess.run(embeddings)
  finally:
    # Runs after the session context has exited, on success and on failure.
    tf.reset_default_graph()

In [0]:
# Destination prefix for the uploaded feature files (512-token variant kept for reference):
# GS_BASE = "gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/ELMO-512tokenmean"
# NOTE(review): the saved outputs of the upload cells in this notebook print an
# ".../ELMO-512tokenmean/elmo_4096_..." destination, so the recorded run appears
# to have uploaded the 4096-token files under the 512 prefix — confirm where
# the files actually landed.
GS_BASE = "gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/ELMO-4096tokenmean"

def dump_and_send(fname, obj):
  """Pickle ``obj`` into the gzip file ``fname`` and upload it to ``{GS_BASE}/{fname}``.

  The upload runs ``gsutil cp`` with an argument list (no shell interpolation)
  and the success message is printed only if the copy actually succeeded.

  Args:
    fname: Local file name to write; also used as the object name under GS_BASE.
    obj: Any picklable object.

  Raises:
    subprocess.CalledProcessError: If ``gsutil cp`` exits with a non-zero status.
  """
  import subprocess  # local import so this cell does not depend on the import cell

  destination = f"{GS_BASE}/{fname}"

  with gzip.open(fname, 'wb') as f:
    pickle.dump(obj, f)

  completed = subprocess.run(
      ["gsutil", "cp", fname, destination],
      stdout=subprocess.PIPE,
      stderr=subprocess.STDOUT,
      universal_newlines=True,
  )
  # Echo gsutil's own output so the cell still shows the transfer log.
  print(completed.stdout, end="")
  completed.check_returncode()
  print(f"send to {destination}")

In [0]:
# Number of claims sent through ELMo per call; the value is tied to TOKEN_LENGTH.
### For TOKEN_LENGTH = 512 case.
# BATCH_SIZE = 32

### For TOKEN_LENGTH = 4096 case.
BATCH_SIZE = 4

# Size of one per-token ELMo feature vector (last axis of the module output,
# per the shape described in the notes above).
FEATURE_DIM = 1024

Because holding all per-token features in memory at once causes a memory error, the mean features are computed batch by batch.

In [0]:
%%time

# Mean-pooled ELMo feature per training application, computed batch by batch.
# The claims are converted to a plain list once instead of on every iteration,
# and no throw-away (BATCH_SIZE, TOKEN_LENGTH, FEATURE_DIM) buffer is allocated.
train_claims = list(train_app['claim_app'])
token_lengths = [len(claim) for claim in train_claims]
mean_train_features = np.empty((len(train_claims), FEATURE_DIM))

for start in range(0, len(train_claims), BATCH_SIZE):
  batch_lengths = token_lengths[start: start + BATCH_SIZE]
  train_features = extract_elmo_feature(train_claims[start: start + BATCH_SIZE])
  for idx, (feature, token_length) in enumerate(zip(train_features, batch_lengths)):
    # Average over the real tokens only; later positions are "" padding.
    # A claim with zero tokens yields a NaN row (mean of an empty slice).
    mean_train_features[start + idx] = np.mean(feature[:token_length], axis=0)

CPU times: user 1h 45min 5s, sys: 21min 23s, total: 2h 6min 28s
Wall time: 1h 23min 17s


In [0]:
# Expect one FEATURE_DIM-sized row per training application.
mean_train_features.shape

(3000, 1024)

In [0]:
# First five components of the first mean feature vector, as a sanity check.
mean_train_features[0][:5]

array([ 0.08837943,  0.19150606, -0.51628566, -0.01050414, -0.07731926])

In [0]:
# Save and upload the training-application features (512-token variant kept for reference).
# dump_and_send("elmo_512_feature_train_app_3000.pkl.gz", mean_train_features)
dump_and_send("elmo_4096_feature_train_app_3000.pkl.gz", mean_train_features)

Copying file://elmo_4096_feature_train_app_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/12.5 MiB.                                     
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/ELMO-512tokenmean/elmo_4096_feature_train_app_3000.pkl.gz


In [0]:
%%time

# Mean-pooled ELMo feature per test application, computed batch by batch.
# The claims are converted to a plain list once instead of on every iteration,
# and no throw-away (BATCH_SIZE, TOKEN_LENGTH, FEATURE_DIM) buffer is allocated.
test_claims = list(test_app['claim_app'])
token_lengths = [len(claim) for claim in test_claims]
mean_test_features = np.empty((len(test_claims), FEATURE_DIM))

for start in range(0, len(test_claims), BATCH_SIZE):
  batch_lengths = token_lengths[start: start + BATCH_SIZE]
  test_features = extract_elmo_feature(test_claims[start: start + BATCH_SIZE])
  for idx, (feature, token_length) in enumerate(zip(test_features, batch_lengths)):
    # Average over the real tokens only; later positions are "" padding.
    # A claim with zero tokens yields a NaN row (mean of an empty slice).
    mean_test_features[start + idx] = np.mean(feature[:token_length], axis=0)

CPU times: user 1h 41min 31s, sys: 21min 24s, total: 2h 2min 55s
Wall time: 1h 22min 7s


In [0]:
# Expect one FEATURE_DIM-sized row per test application.
mean_test_features.shape

(3000, 1024)

In [0]:
# Save and upload the test-application features (512-token variant kept for reference).
# dump_and_send("elmo_512_feature_test_app_3000.pkl.gz", mean_test_features)
dump_and_send("elmo_4096_feature_test_app_3000.pkl.gz", mean_test_features)

Copying file://elmo_4096_feature_test_app_3000.pkl.gz [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/ 12.5 MiB]                                                / [1 files][ 12.5 MiB/ 12.5 MiB]                                                -
Operation completed over 1 objects/12.5 MiB.                                     
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/ELMO-512tokenmean/elmo_4096_feature_test_app_3000.pkl.gz


In [0]:
%%time

# Mean-pooled ELMo feature per cited grant, computed batch by batch.
# The claims are converted to a plain list once instead of on every iteration,
# and no throw-away (BATCH_SIZE, TOKEN_LENGTH, FEATURE_DIM) buffer is allocated.
grant_claims = list(grants['claim_cited_grant'])
token_lengths = [len(claim) for claim in grant_claims]
mean_grants_features = np.empty((len(grant_claims), FEATURE_DIM))

for start in range(0, len(grant_claims), BATCH_SIZE):
  batch_lengths = token_lengths[start: start + BATCH_SIZE]
  grants_features = extract_elmo_feature(grant_claims[start: start + BATCH_SIZE])
  for idx, (feature, token_length) in enumerate(zip(grants_features, batch_lengths)):
    # Average over the real tokens only; later positions are "" padding.
    # A claim with zero tokens yields a NaN row (mean of an empty slice).
    mean_grants_features[start + idx] = np.mean(feature[:token_length], axis=0)

CPU times: user 4h 19min 41s, sys: 57min 6s, total: 5h 16min 47s
Wall time: 3h 23min 6s


In [0]:
# Expect one FEATURE_DIM-sized row per cited grant.
mean_grants_features.shape

(6440, 1024)

In [0]:
# Save and upload the cited-grant features (512-token variant kept for reference).
# dump_and_send("elmo_512_feature_grants_3000_3000.pkl.gz", mean_grants_features)
dump_and_send("elmo_4096_feature_grants_3000_3000.pkl.gz", mean_grants_features)

Copying file://elmo_4096_feature_grants_3000_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/26.8 MiB.                                     
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/ELMO-512tokenmean/elmo_4096_feature_grants_3000_3000.pkl.gz


## Trial and error

In [0]:
# Scratch: build the ELMo graph with the raw-string ("default") signature on
# two toy sentences and average the per-token "elmo" output over axis 1.
# Nothing is evaluated here; this only adds ops to the default graph.
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False)

embeddings = elmo(
    ["the cat is on the mat", "dogs are in the fog"],
    signature="default",
    as_dict=True)["elmo"]

averaged_embeddings = tf.reduce_mean(embeddings, axis=1)

In [0]:
# Shows the (not yet evaluated) result of tf.reduce_mean; it is run in a session in the next cell.
averaged_embeddings

In [0]:
# Evaluate the averaged embeddings. Unlike extract_elmo_feature, this session
# is created without the shared `config`.
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(tf.tables_initializer())
  test = sess.run(averaged_embeddings)

In [0]:
# Shape of the evaluated averaged embeddings for the two toy sentences.
test.shape

In [0]:
# extract_elmo_feature expects token lists: its padding step concatenates each
# sentence with a list of "" tokens, which raises TypeError for a raw string.
test = extract_elmo_feature(["the cat is on the mat".split(), "dogs are in the fog".split()])

In [0]:
# Shape of the array returned by extract_elmo_feature for the two toy sentences.
test.shape

In [0]:
# Scratch: feed one raw claim string through the "default" signature.
# NOTE(review): grants_target_df is not defined anywhere in the visible
# notebook (the loaded frame is `grants`, whose claim column holds token
# lists), so this cell fails on a fresh kernel — confirm and update or remove.
embeddings = elmo(
    [grants_target_df['claim'][0]],
    signature="default",  # Treat input as raw string.
    as_dict=True
)["elmo"]

In [0]:
# Shows the embeddings object built in the previous cell (not evaluated in a session here).
embeddings

In [0]:
# First 512 whitespace-separated tokens of the first claim.
# NOTE(review): grants_target_df is not defined in the visible notebook.
grants_target_df['claim'][0].split()[0:512]

In [0]:
# Scratch: same claim through the "tokens" signature, using its first 512 tokens.
# NOTE(review): grants_target_df is not defined in the visible notebook.
# NOTE(review): sequence_len is hard-coded to 512, but the slice yields fewer
# tokens when the claim is shorter than 512 tokens — confirm.
embeddings = elmo(
    inputs = {
        "tokens": [grants_target_df['claim'][0].split()[0:512]],
        "sequence_len": [512]
    },
    signature="tokens",
    as_dict=True
)["elmo"]

In [0]:
# Shows the embeddings object built in the previous cell (not evaluated in a session here).
embeddings

In [0]:
# NOTE(review): training_app_df is not defined in the visible notebook (the
# loaded frame is train_app), so this cell fails on a fresh kernel.
len(training_app_df)

In [0]:
# Mean of len() over the 'claim' column. Whether that counts characters or
# tokens depends on what the column holds — cannot be told from this notebook.
# NOTE(review): training_app_df is not defined in the visible notebook.
sum(training_app_df['claim'].map(len)) / len(training_app_df)

In [0]:
# len() of the first five entries of the 'claim' column.
# NOTE(review): training_app_df is not defined in the visible notebook.
list(training_app_df['claim'].map(len))[:5]

In [0]:
# Inspect a single claim entry.
# NOTE(review): training_app_df is not defined in the visible notebook.
training_app_df['claim'][3]

In [0]:
# Histogram of claim lengths for the training applications.
# NOTE(review): training_app_df is not defined in the visible notebook.
training_app_df['claim'].map(len).hist()

In [0]:
# Histogram of claim lengths for the test applications.
# NOTE(review): testset_app_df is not defined in the visible notebook.
testset_app_df['claim'].map(len).hist()

In [0]:
# Histogram of claim lengths for the cited grants.
# NOTE(review): grants_target_df is not defined in the visible notebook.
grants_target_df['claim'].map(len).hist()