<a href="https://colab.research.google.com/github/yoheikikuta/US-patent-analysis/blob/master/colab/feature_extract_TransformerXL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature extraction with the Transformer-XL model

Based on https://github.com/huggingface/pytorch-transformers

In [0]:
# Authenticate this Colab session as the current user. The gsutil commands
# further down read from and write to a GCS bucket, which presumably requires
# these credentials.
from google.colab import auth
auth.authenticate_user()

## Data preparation

In [0]:
# Local directory the data files are copied into and read from. Paths are built
# by plain string concatenation (f"{DATA_DIR}<name>"), so keep the trailing slash.
DATA_DIR = "./"

In [0]:
# Copy the three gzip-compressed pickled DataFrames (cited grants, test
# applications, training applications) from the GCS bucket into DATA_DIR.
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/grants_for_3000+3000.df.gz {DATA_DIR}
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/testset_app_3000.df.gz {DATA_DIR}
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/training_app_3000.df.gz {DATA_DIR}

Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/grants_for_3000+3000.df.gz...
/ [1 files][129.4 MiB/129.4 MiB]                                                
Operation completed over 1 objects/129.4 MiB.                                    
Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/testset_app_3000.df.gz...
\ [1 files][ 45.5 MiB/ 45.5 MiB]                                                
Operation completed over 1 objects/45.5 MiB.                                     
Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/training_app_3000.df.gz...
| [1 files][ 45.0 MiB/ 45.0 MiB]                                                
Operation completed over 1 objects/45.0 MiB.                                     


In [0]:
import gzip
import pickle
import datetime
import json
import os
import pprint
import random
import string
import sys
import pandas as pd

In [0]:
# Load the downloaded DataFrames. read_pickle unpickles the files, which can
# execute arbitrary code, so only load pickles from a trusted source.
# Each frame carries an "xml" column that the claim clean-up cell below consumes.
grants = pd.read_pickle(f"{DATA_DIR}grants_for_3000+3000.df.gz")
test_app = pd.read_pickle(f"{DATA_DIR}testset_app_3000.df.gz")
train_app = pd.read_pickle(f"{DATA_DIR}training_app_3000.df.gz")

In [0]:
import re


# Everything between the opening and closing <claims> tags. DOTALL lets "."
# span line breaks; the greedy group runs up to the last </claims> in the text.
CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>', re.MULTILINE | re.DOTALL)
# Any single markup tag (non-greedy, so adjacent tags are matched one by one).
TAG_PAT = re.compile(r"<.*?>")
# One whitespace control character (tab, newline, ...) and the spaces after it.
# The previous character class [" "] also matched a literal double quote, so
# quotation marks next to a line break were silently removed; a plain space is
# what is meant here and in the two patterns below.
LB_PAT = re.compile(r'[\t\n\r\f\v] *')
# A numbered claim marked as canceled, e.g. "3. (canceled) ".
CANCELED_PAT = re.compile(r'[0-9]+.*\. \(canceled\) ')
# A claim number such as "1." with an optional space before the digits, before
# the period and after it.
# NOTE(review): this also matches the integer part of a decimal such as "0.5";
# left unchanged because the extracted features depend on it.
NUM_PAT = re.compile(r' ?[0-9]+ ?\. ?')


def whole_xml_to_claim_xml(whole):
    """Return the markup found between the ``<claims ...>`` and ``</claims>`` tags.

    ``whole`` is searched with ``CLAIM_PAT`` and the captured group is returned.
    When the pattern does not match, ``search`` yields ``None`` and the
    ``.group`` access raises ``AttributeError``.
    """
    return CLAIM_PAT.search(whole).group(1)


def whole_xml_to_claim(whole):
    """Extract the claims section of ``whole`` and blank out its markup.

    Every tag matched by ``TAG_PAT`` is replaced by a single space, so the
    result is plain text with spaces where tags used to be.
    """
    claim_xml = whole_xml_to_claim_xml(whole)
    return re.sub(TAG_PAT, ' ', claim_xml)


def remove_linebreak_from_claim(claim):
    """Delete every span of ``claim`` matched by ``LB_PAT`` and return the rest."""
    without_breaks = re.sub(LB_PAT, '', claim)
    return without_breaks


def remove_canceled_claim(claim):
    """Delete every span of ``claim`` matched by ``CANCELED_PAT`` and return the rest."""
    without_canceled = re.sub(CANCELED_PAT, '', claim)
    return without_canceled


def remove_claim_numbers(claim):
    """Delete every span of ``claim`` matched by ``NUM_PAT`` and return the rest."""
    without_numbers = re.sub(NUM_PAT, '', claim)
    return without_numbers

In [0]:
%%time

def _xml_to_clean_claim(whole):
    """Run the full claim clean-up on one raw XML document.

    Steps, in order: extract the claim text, drop canceled claims, drop claim
    numbers, then drop line breaks.
    """
    claim = whole_xml_to_claim(whole)
    claim = remove_canceled_claim(claim)
    claim = remove_claim_numbers(claim)
    return remove_linebreak_from_claim(claim)


# Replace the raw "xml" column of each frame with the cleaned claim text.
# The frames are rebound without "xml", so this cell cannot be re-run without
# reloading the data first.
train_app["claim_app"] = train_app["xml"].map(_xml_to_clean_claim)
train_app = train_app.drop(columns="xml")

test_app["claim_app"] = test_app["xml"].map(_xml_to_clean_claim)
test_app = test_app.drop(columns="xml")

grants["claim_cited_grant"] = grants["xml"].map(_xml_to_clean_claim)
grants = grants.drop(columns="xml")

CPU times: user 8.73 s, sys: 148 ms, total: 8.87 s
Wall time: 8.88 s


In [0]:
# Spot-check the cleaned claim text of the first three training applications.
train_app.head(3)

Unnamed: 0,app_id,claim_app
0,12130785,A system for differentiating noise from an arr...
1,12652424,A method of allocating resources in a data war...
2,12214532,A controlling method of a media processing app...


## Feature extraction: TransformerXL

In [0]:
# Install the Hugging Face library that provides the TransformerXL model/tokenizer.
# NOTE(review): the version is not pinned, so a later release may change the
# model API or outputs; pin the version that produced the stored features.
!pip install --quiet pytorch-transformers

In [0]:
import torch
# Import only the names this notebook uses instead of a wildcard, so it is
# clear where they come from and the kernel namespace is not polluted.
from pytorch_transformers import TransfoXLModel, TransfoXLTokenizer

In [0]:
# Record the PyTorch version used for this run.
torch.__version__

'1.1.0'

In [0]:
# Number of CUDA devices visible to PyTorch. The model is moved to 'cuda:0'
# below, so this must be at least 1.
torch.cuda.device_count()

1

In [0]:
# (model class, tokenizer class, pretrained weights name) triples consumed by
# the loading loop below.
MODELS = [(TransfoXLModel,  TransfoXLTokenizer, 'transfo-xl-wt103')]

In [0]:
# Load the pretrained tokenizer and model for every entry in MODELS.
# `tokenizer` and `model` are rebound on each iteration, so after the loop they
# refer to the last entry (MODELS holds a single entry here).
for model_class, tokenizer_class, pretrained_weights in MODELS:
    # Load pretrained model/tokenizer; per-layer hidden states and attention
    # maps are not requested.
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights,
                                      output_hidden_states=False,
                                      output_attentions=False)

In [0]:
# Move the model to the first CUDA device. Input tensors must be moved to the
# same `device` before they are passed to the model.
device = torch.device('cuda:0')
model = model.to(device)

In [0]:
import numpy as np

In [0]:
# GCS prefix under which the extracted feature files are uploaded.
GS_BASE = "gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/TransformerXL"

def dump_and_send(fname, obj):
  with gzip.open(fname, 'w') as f:
     pickle.dump(obj, f)
  !gsutil cp {fname} {GS_BASE}/{fname}
  print(f"send to {GS_BASE}/{fname}")

In [0]:
# Maximum number of characters of a claim that is passed to the model; the
# extraction loops slice each claim with claim[:MAX_STR_LEN] before tokenizing.
MAX_STR_LEN = 20000

In [0]:
def model_predict(claim):
    """Return the mean-pooled last hidden state of ``claim`` as a 1-D array.

    The text is tokenized with the global ``tokenizer``, run through the global
    ``model`` on ``device`` as a batch of one, and the last hidden states are
    averaged over the token axis.

    Args:
        claim: Claim text to embed.

    Returns:
        A 1-D NumPy array with one value per hidden unit (the callers store it
        in rows of width 1024).
    """
    with torch.no_grad():
        input_ids = torch.tensor([tokenizer.encode(claim)]).to(device)
        last_hidden_states = model(input_ids)[0]
    # Select the single batch entry explicitly. A bare squeeze() would also
    # drop the token axis when the claim is only one token long, and the mean
    # over axis 0 would then collapse the feature vector to a scalar.
    token_states = last_hidden_states[0].to('cpu').numpy()
    return np.mean(token_states, axis=0)

In [0]:
%%time

# Feature matrix with one row per training application. np.empty leaves the
# buffer uninitialized; the loop below overwrites every row.
# NOTE(review): 1024 is hard-coded; presumably the model's hidden size (the
# hidden-state shapes printed later in this notebook end in 1024). Confirm.
train_features = np.empty((len(train_app), 1024))

for idx, claim in enumerate(train_app['claim_app']):
    # Claims are cut to MAX_STR_LEN characters before tokenization.
    train_features[idx] = model_predict(claim[:MAX_STR_LEN])

CPU times: user 20min 14s, sys: 15min 28s, total: 35min 43s
Wall time: 35min 46s


In [0]:
# Sanity check: (number of training applications, feature width).
train_features.shape

(3000, 1024)

In [0]:
# Peek at the first five feature values of the first training application.
train_features[0][:5]

array([-0.25386164,  0.01783523,  0.01317462, -0.08112203, -0.08672263])

In [0]:
# Persist the training-application features locally and upload them to GS_BASE.
dump_and_send("TransformerXL_feature_train_app_3000.pkl.gz", train_features)

Copying file://TransformerXL_feature_train_app_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/12.5 MiB.                                     
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/TransformerXL/TransformerXL_feature_train_app_3000.pkl.gz


In [0]:
%%time

# Feature matrix with one row per test application. np.empty leaves the buffer
# uninitialized; the loop below overwrites every row.
# NOTE(review): 1024 is hard-coded; presumably the model's hidden size. Confirm.
test_features = np.empty((len(test_app), 1024))

for idx, claim in enumerate(test_app['claim_app']):
    # Claims are cut to MAX_STR_LEN characters before tokenization.
    test_features[idx] = model_predict(claim[:MAX_STR_LEN])

CPU times: user 20min 35s, sys: 15min 52s, total: 36min 27s
Wall time: 36min 30s


In [0]:
# Persist the test-application features locally and upload them to GS_BASE.
dump_and_send("TransformerXL_feature_test_app_3000.pkl.gz", test_features)

Copying file://TransformerXL_feature_test_app_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/12.5 MiB.                                     
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/TransformerXL/TransformerXL_feature_test_app_3000.pkl.gz


In [0]:
%%time

# Feature matrix with one row per cited grant. np.empty leaves the buffer
# uninitialized; the loop below overwrites every row.
# NOTE(review): 1024 is hard-coded; presumably the model's hidden size. Confirm.
grants_features = np.empty((len(grants), 1024))

for idx, claim in enumerate(grants['claim_cited_grant']):
    # Claims are cut to MAX_STR_LEN characters before tokenization.
    grants_features[idx] = model_predict(claim[:MAX_STR_LEN])

CPU times: user 1h 1min 8s, sys: 47min 59s, total: 1h 49min 7s
Wall time: 1h 49min 15s


In [0]:
# Persist the cited-grant features locally and upload them to GS_BASE.
dump_and_send("TransformerXL_feature_grants_app_3000.pkl.gz", grants_features)

Copying file://TransformerXL_feature_grants_app_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/26.8 MiB.                                     
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/TransformerXL/TransformerXL_feature_grants_app_3000.pkl.gz


# Trial and error

In [0]:
# Keep the ids on the same device as the model (moved to `device` above);
# feeding a CPU tensor to a CUDA model raises a device-mismatch error.
input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")]).to(device)

In [0]:
# Show the token ids and the tensor shape, (batch, number of tokens).
print(input_ids)
print(input_ids.shape)

tensor([[   24,   611,    73,    24,     5, 34559,    15,    56,  2043]])
torch.Size([1, 9])


In [0]:
# Run the model once on the sample sentence; the result is a tuple that the
# following cells inspect element by element.
# NOTE(review): this call is not wrapped in torch.no_grad(), unlike model_predict.
model_output = model(input_ids)

In [0]:
# Number of elements in the output tuple.
len(model_output)

3

In [0]:
# Shape of the first output, the last hidden states used by model_predict.
model_output[0].shape

torch.Size([1, 9, 1024])

In [0]:
# Second output: a sequence of tensors, presumably the per-layer memory states
# ("mems") of TransformerXL. TODO confirm against the library documentation.
print(len(model_output[1]))
print(model_output[1][0].shape)

18
torch.Size([1600, 1, 1024])


In [0]:
# Third output: presumably the hidden states of every layer.
# NOTE(review): the loading cell above passes output_hidden_states=False, so a
# third element is probably absent on a fresh run and the recorded output looks
# like it came from an earlier model configuration. Confirm before relying on it.
print(len(model_output[2]))
print(model_output[2][0].shape)

19
torch.Size([1, 9, 1024])


In [0]:
# Element-wise check that the first output equals the last entry of the third
# output. This also depends on the third element being present.
model_output[0] == model_output[2][-1]

tensor([[[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]], dtype=torch.uint8)

In [0]:
# Encode text
# Attempt to batch two sentences of different length. torch.tensor cannot build
# a rectangular tensor from token lists of unequal length, which presumably is
# the ValueError recorded below. The extraction code therefore feeds one claim
# at a time.
input_ids = torch.tensor([tokenizer.encode("Here is some text to encode"), tokenizer.encode("here is some text to encode but in the lower case"),])
with torch.no_grad():
    last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples

ValueError: ignored

10 apps (CPU): 6 min 7s  
10 apps (GPU): 6 sec

In [0]:
# # Encode text
# with torch.no_grad():
#     input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")]).to(device)
#     print(input_ids)
#     last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples
# test = last_hidden_states.to('cpu')
# del input_ids, last_hidden_states
# torch.cuda.empty_cache()

In [0]:
# test.shape