<a href="https://colab.research.google.com/github/yoheikikuta/US-patent-analysis/blob/master/colab/feature_extract_USE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature extraction from Universal Sentence Encoder model

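This notebook extracts the "default" output of the Universal Sentence Encoder module published on TensorFlow Hub and uses it as the feature vector for each claim text.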

In [0]:
# Authenticate the current Colab user. Presumably this is what lets the
# gsutil commands later in the notebook access the GCS bucket — TODO confirm.
from google.colab import auth
auth.authenticate_user()

## Data preparation

In [0]:
# Local directory prefix for the downloaded data files. It is concatenated
# directly with file names in f-strings, so it must end with a path separator.
DATA_DIR = "./"

In [0]:
# Copy the three gzip-compressed DataFrame files (grants, test-set
# applications, training applications) from GCS into DATA_DIR.
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/grants_for_3000+3000.df.gz {DATA_DIR}
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/testset_app_3000.df.gz {DATA_DIR}
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/training_app_3000.df.gz {DATA_DIR}

Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/grants_for_3000+3000.df.gz...
/ [1 files][129.4 MiB/129.4 MiB]                                                
Operation completed over 1 objects/129.4 MiB.                                    
Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/testset_app_3000.df.gz...
\ [1 files][ 45.5 MiB/ 45.5 MiB]                                                
Operation completed over 1 objects/45.5 MiB.                                     
Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/training_app_3000.df.gz...
\ [1 files][ 45.0 MiB/ 45.0 MiB]                                                
Operation completed over 1 objects/45.0 MiB.                                     


In [0]:
import gzip
import pickle
import datetime
import json
import os
import pprint
import random
import string
import sys
import pandas as pd

In [0]:
# Load the downloaded frames in the same order as the download cell:
# grants, test-set applications, training applications.
# NOTE(review): read_pickle unpickles the file contents, so only load files
# that come from a trusted source.
grants, test_app, train_app = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        f"{DATA_DIR}grants_for_3000+3000.df.gz",
        f"{DATA_DIR}testset_app_3000.df.gz",
        f"{DATA_DIR}training_app_3000.df.gz",
    )
)

Define the regular expressions and helper functions used to preprocess the claim text.

In [0]:
import re


# NOTE: the previous versions of LB_PAT, CANCELED_PAT and NUM_PAT used the
# character class [" "], which matches a double quote OR a space. The intent
# was a plain space, so quotation marks in the claim text were being eaten.

# Everything between the opening <claims ...> tag and the last </claims> tag.
# DOTALL lets "." span the line breaks inside the claims section; MULTILINE
# was dropped because the pattern has no ^ or $ anchors for it to affect.
CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>', re.DOTALL)
# Any single tag (non-greedy, so every tag is matched on its own).
TAG_PAT = re.compile(r"<.*?>")
# One tab / line-break / form-feed / vertical-tab character plus the spaces
# that directly follow it.
LB_PAT = re.compile(r'[\t\n\r\f\v] *')
# Markers such as "1-3. (canceled) ": a number, anything up to ". (canceled)"
# on the same line, and the trailing space.
CANCELED_PAT = re.compile(r'[0-9]+.*\. \(canceled\) ')
# Claim numbering such as "1. ": digits and a period, each optionally padded
# by a single space.
NUM_PAT = re.compile(r' ?[0-9]+ ?\. ?')


def whole_xml_to_claim_xml(whole):
    """Return the inner XML of the claims section of one patent document.

    Args:
        whole: Full XML text of a patent document.

    Returns:
        The text captured by CLAIM_PAT's first group.

    Raises:
        ValueError: If CLAIM_PAT does not match anywhere in ``whole``.
    """
    mat = CLAIM_PAT.search(whole)
    if mat is None:
        # search() returns None when nothing matches; report that clearly
        # instead of failing with "'NoneType' object has no attribute 'group'".
        raise ValueError(
            "no <claims>...</claims> section found in XML starting with "
            f"{whole[:80]!r}"
        )
    return mat.group(1)


def whole_xml_to_claim(whole):
    """Extract the claims section of a patent XML and blank out its tags.

    The claims section is obtained with whole_xml_to_claim_xml; every
    TAG_PAT match inside it is then replaced by a single space.
    """
    claim_xml = whole_xml_to_claim_xml(whole)
    return TAG_PAT.sub(' ', claim_xml)


def remove_linebreak_from_claim(claim):
    """Return ``claim`` with every LB_PAT match deleted."""
    return re.sub(LB_PAT, '', claim)


def remove_canceled_claim(claim):
    """Return ``claim`` with every CANCELED_PAT match deleted."""
    without_canceled = CANCELED_PAT.sub('', claim)
    return without_canceled


def remove_claim_numbers(claim):
    """Return ``claim`` with every NUM_PAT match deleted.

    NOTE(review): NUM_PAT matches any digits followed by a period, so the
    integer part of a decimal such as "0.5" is removed as well — confirm
    this is acceptable for the claim text.
    """
    return re.sub(NUM_PAT, '', claim)

In [0]:
%%time

train_app["claim_app"] = train_app["xml"].map(whole_xml_to_claim).map(remove_canceled_claim).map(remove_claim_numbers).map(remove_linebreak_from_claim)
train_app = train_app.drop("xml", axis=1)
train_app.head()

test_app["claim_app"] = test_app["xml"].map(whole_xml_to_claim).map(remove_canceled_claim).map(remove_claim_numbers).map(remove_linebreak_from_claim)
test_app = test_app.drop("xml", axis=1)
test_app.head()

grants["claim_cited_grant"] = grants["xml"].map(whole_xml_to_claim).map(remove_canceled_claim).map(remove_claim_numbers).map(remove_linebreak_from_claim)
grants = grants.drop("xml", axis=1)
grants.head()

CPU times: user 8.1 s, sys: 134 ms, total: 8.23 s
Wall time: 8.24 s


In [0]:
# Preview the first three rows of the preprocessed training applications.
train_app.head(3)

Unnamed: 0,app_id,claim_app
0,12130785,A system for differentiating noise from an arr...
1,12652424,A method of allocating resources in a data war...
2,12214532,A controlling method of a media processing app...


## Feature extraction

In [0]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

# Display the installed TensorFlow version for the record.
# NOTE(review): hub.Module, tf.Session and tf.logging used later in this
# notebook look like TF 1.x APIs — confirm before running under TF 2.
tf.__version__

'1.14.0'

In [0]:
# GCS prefix under which dump_and_send uploads the feature files.
GS_BASE = "gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/USE"
# File names passed to dump_and_send for the train, test and grant features;
# each is used both as the local file name and as the name under GS_BASE.
TRAIN_FEATURE_FNAME = "use_feature_train_app_3000.pkl.gz"
TEST_FEATURE_FNAME = "use_feature_test_app_3000.pkl.gz"
GRANTS_FEATURE_FNAME = "use_feature_grants_3000_3000.pkl.gz"

def dump_and_send(fname, obj):
  with gzip.open(fname, 'w') as f:
     pickle.dump(obj, f)
  !gsutil cp {fname} {GS_BASE}/{fname}

In [0]:
# TF Hub handle of the Universal Sentence Encoder (large, version 3) module.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
# Callable module object; the cells below apply it to batches of claim text.
embed = hub.Module(module_url)
# Restrict TensorFlow logging to ERROR-level messages.
tf.logging.set_verbosity(tf.logging.ERROR)

In [0]:
%%time

train_features = np.empty((len(train_app), 512))  # data num * feature dim

for i in range(3):
  messages = train_app["claim_app"].values.tolist()[1000*i : 1000*(i+1)]

  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    train_features[1000*i : 1000*(i+1)] = session.run(embed(messages))

CPU times: user 18.2 s, sys: 2.36 s, total: 20.6 s
Wall time: 26.9 s


In [0]:
# Sanity check: first five components of the first training feature vector.
train_features[0][:5]

array([ 0.02815786,  0.00533784,  0.05273789, -0.06909271, -0.06702347])

In [0]:
# Sanity check: first five components of the last training feature vector
# (presumably to confirm that the final batch was written — TODO confirm).
train_features[-1][:5]

array([ 0.02958738,  0.08695184,  0.11543479, -0.07463112, -0.02645765])

In [0]:
%%time

test_features = np.empty((len(test_app), 512))  # data num * feature dim

for i in range(3):
  messages = test_app["claim_app"].values.tolist()[1000*i : 1000*(i+1)]

  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    test_features[1000*i : 1000*(i+1)] = session.run(embed(messages))

CPU times: user 21.4 s, sys: 1.77 s, total: 23.2 s
Wall time: 27.1 s


In [0]:
%%time

grant_features = np.empty((len(grants), 512))  # data num * feature dim

for i in range(7):
  messages = grants["claim_cited_grant"].values.tolist()[1000*i : 1000*(i+1)]

  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    grant_features[1000*i : 1000*(i+1)] = session.run(embed(messages))

CPU times: user 1min 8s, sys: 4.71 s, total: 1min 12s
Wall time: 1min 21s


In [0]:
# Save and upload each feature array under its file name, in the order
# train, test, grants.
for feature_fname, feature_array in (
    (TRAIN_FEATURE_FNAME, train_features),
    (TEST_FEATURE_FNAME, test_features),
    (GRANTS_FEATURE_FNAME, grant_features),
):
  dump_and_send(feature_fname, feature_array)

Copying file://use_feature_train_app_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/6.2 MiB.                                      
Copying file://use_feature_test_app_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/6.2 MiB.                                      
Copying file://use_feature_grants_3000_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/13.3 MiB.                                     
