<a href="https://colab.research.google.com/github/yoheikikuta/US-patent-analysis/blob/master/colab/feature_extract_W2V.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature extraction from word-embedding models (Word2Vec and fastText)



In [0]:
# Authenticate this Colab session with the user's Google account; presumably
# required so the gsutil commands below can access the GCS bucket.
from google.colab import auth
auth.authenticate_user()

## Data preparation

In [0]:
# Local directory prefix for the downloaded data. It is concatenated directly
# with file names in f-strings below, so it must end with a path separator.
DATA_DIR = "./"

In [3]:
# Copy the three gzipped DataFrame pickles (cited grants, test applications,
# training applications) from GCS into DATA_DIR.
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/grants_for_3000+3000.df.gz {DATA_DIR}
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/testset_app_3000.df.gz {DATA_DIR}
!gsutil cp gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/training_app_3000.df.gz {DATA_DIR}

Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/grants_for_3000+3000.df.gz...
\ [1 files][129.4 MiB/129.4 MiB]                                                
Operation completed over 1 objects/129.4 MiB.                                    
Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/testset_app_3000.df.gz...
- [1 files][ 45.5 MiB/ 45.5 MiB]                                                
Operation completed over 1 objects/45.5 MiB.                                     
Copying gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-xml/training_app_3000.df.gz...
\ [1 files][ 45.0 MiB/ 45.0 MiB]                                                
Operation completed over 1 objects/45.0 MiB.                                     


In [0]:
import gzip
import pickle
import datetime
import json
import os
import pprint
import random
import string
import sys
import pandas as pd

In [0]:
# Load the downloaded DataFrames; pandas infers gzip compression from the
# ".gz" suffix by default.
# NOTE: read_pickle unpickles the files, which can execute arbitrary code —
# only load files that come from a trusted source.
grants = pd.read_pickle(f"{DATA_DIR}grants_for_3000+3000.df.gz")
test_app = pd.read_pickle(f"{DATA_DIR}testset_app_3000.df.gz")
train_app = pd.read_pickle(f"{DATA_DIR}training_app_3000.df.gz")

In [0]:
import re


# Captures everything between the first `<claims ...>` opening tag and the
# last `</claims>` closing tag. DOTALL lets `.` span newlines; MULTILINE has no
# effect because the pattern contains no `^` / `$` anchors.
CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>',re.MULTILINE|re.DOTALL)
# Shortest `<...>` span. Compiled without DOTALL, so a tag that contains a
# newline is not matched.
TAG_PAT = re.compile(r"<.*?>")
# One tab / newline / carriage return / form feed / vertical tab followed by
# any run of spaces or double quotes.
# NOTE(review): the character class `[" "]` (used in the three patterns below)
# matches a double quote as well as a space; if only spaces were intended it
# should be `[ ]` — confirm.
LB_PAT = re.compile(r'[\t\n\r\f\v][" "]*')
# Digits, then any characters up to ". (canceled)" plus one trailing space or
# double quote. `.*` is greedy but cannot cross a newline (no DOTALL).
CANCELED_PAT = re.compile(r'[0-9]+.*\. \(canceled\)[" "]')
# Digits followed by a period, each optionally surrounded by one space or
# double quote (e.g. a leading "1. " claim number). Any digits directly before
# a period match, not only claim numbers.
NUM_PAT = re.compile(r'[" "]?[0-9]+[" "]?\.[" "]?')


def whole_xml_to_claim_xml(whole):
    """Return the raw markup found inside the ``<claims>`` element of ``whole``.

    Args:
        whole: Full patent XML document as a string.

    Returns:
        The text captured by ``CLAIM_PAT``: everything between the opening
        ``<claims ...>`` tag and the closing ``</claims>`` tag.

    Raises:
        ValueError: If ``whole`` contains no ``<claims>...</claims>`` section.
    """
    mat = CLAIM_PAT.search(whole)
    if mat is None:
        # Without this guard the failure surfaced as an opaque
        # "'NoneType' object has no attribute 'group'" from inside Series.map.
        raise ValueError("no <claims>...</claims> section found in document")
    return mat.group(1)


def whole_xml_to_claim(whole):
    """Extract the claims section of ``whole`` and replace every tag with a space."""
    claim_xml = whole_xml_to_claim_xml(whole)
    return TAG_PAT.sub(' ', claim_xml)


def remove_linebreak_from_claim(claim):
    """Delete every ``LB_PAT`` match from ``claim``.

    That is each tab/newline/CR/FF/VT character together with the run of
    spaces or double quotes that directly follows it.
    """
    without_breaks = LB_PAT.sub('', claim)
    return without_breaks


def remove_canceled_claim(claim):
    """Strip every ``CANCELED_PAT`` match (e.g. ``"1-3. (canceled) "``) from ``claim``."""
    without_canceled = CANCELED_PAT.sub('', claim)
    return without_canceled


def remove_claim_numbers(claim):
    """Strip every ``NUM_PAT`` match (digits followed by a period) from ``claim``."""
    without_numbers = NUM_PAT.sub('', claim)
    return without_numbers

In [7]:
%%time

train_app["claim_app"] = train_app["xml"].map(whole_xml_to_claim).map(remove_canceled_claim).map(remove_claim_numbers).map(remove_linebreak_from_claim)
train_app = train_app.drop("xml", axis=1)
train_app.head()

test_app["claim_app"] = test_app["xml"].map(whole_xml_to_claim).map(remove_canceled_claim).map(remove_claim_numbers).map(remove_linebreak_from_claim)
test_app = test_app.drop("xml", axis=1)
test_app.head()

grants["claim_cited_grant"] = grants["xml"].map(whole_xml_to_claim).map(remove_canceled_claim).map(remove_claim_numbers).map(remove_linebreak_from_claim)
grants = grants.drop("xml", axis=1)
grants.head()

CPU times: user 9.72 s, sys: 258 ms, total: 9.98 s
Wall time: 10 s


In [8]:
# Peek at the cleaned training applications (app_id plus extracted claim text).
train_app.head(3)

Unnamed: 0,app_id,claim_app
0,12130785,A system for differentiating noise from an arr...
1,12652424,A method of allocating resources in a data war...
2,12214532,A controlling method of a media processing app...


## Feature extraction: Word2Vec

In [0]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from scipy import spatial

import multiprocessing
# Number of CPUs in the system; passed as `workers` to Word2Vec below.
CPUNUM = multiprocessing.cpu_count()

In [0]:
### Use (train + grants) for w2v training

# Whitespace-tokenise every training-application claim, then every cited-grant
# claim, into one list of token lists (same order as before).
all_training_list = [
    claim.split()
    for claim_column in (train_app['claim_app'], grants['claim_cited_grant'])
    for claim in claim_column
]

In [11]:
# Number of tokenised documents (training applications + cited grants).
len(all_training_list)

9440

In [12]:
# First five tokens of the first document.
all_training_list[0][:5]

['A', 'system', 'for', 'differentiating', 'noise']

In [0]:
%%time

# Train Word2Vec on the tokenised claims: 100-dimensional vectors, context
# window of 5, words seen fewer than 5 times are dropped, 10 passes over the
# corpus, hierarchical softmax enabled (hs=1), one worker thread per CPU.
# NOTE(review): gensim documents that a fixed `seed` only yields fully
# reproducible vectors with a single worker thread — confirm whether exact
# reproducibility matters here.
w2v = Word2Vec(
    all_training_list,
    size=100, window=5, min_count=5, workers=CPUNUM, iter=10, hs=1, seed=23
)

CPU times: user 7min 29s, sys: 1.61 s, total: 7min 31s
Wall time: 3min 50s


In [0]:
# Sanity-check the embedding space via nearest neighbours. Query through
# `w2v.wv`: calling `most_similar` on the model object itself is deprecated
# in gensim.
w2v.wv.most_similar('generator')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('generator,', 0.6032677888870239),
 ('generation', 0.5030529499053955),
 ('oscillator', 0.5020977258682251),
 ('detector', 0.496626079082489),
 ('controller', 0.4901273846626282),
 ('regulator', 0.4858677089214325),
 ('driver', 0.48575833439826965),
 ('source', 0.48183900117874146),
 ('sensor', 0.481015145778656),
 ('motor', 0.46535730361938477)]

In [0]:
GS_BASE = "gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/W2V"

def dump_and_send(fname, obj):
  with gzip.open(fname, 'w') as f:
     pickle.dump(obj, f)
  !gsutil cp {fname} {GS_BASE}/{fname}
  print(f"send to {GS_BASE}/{fname}")

In [0]:
def text_to_vec(text):
    """Embed ``text`` as the mean Word2Vec vector of its in-vocabulary tokens.

    Args:
        text: Claim text; tokenised on whitespace.

    Returns:
        1-D array of length ``w2v.wv.vector_size``. An all-zero vector is
        returned when no token of ``text`` is in the model vocabulary
        (previously that case failed instead of yielding a vector).
    """
    # Go through `w2v.wv`: indexing the model object itself is deprecated in gensim.
    keyed_vectors = w2v.wv
    filtered_words = [word for word in text.split() if word in keyed_vectors.vocab]
    if not filtered_words:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.average(keyed_vectors[filtered_words], axis=0)

In [0]:
%%time

train_features = np.array([text_to_vec(claim) for claim in train_app['claim_app']])
dump_and_send("w2v_100_feature_train_app_3000.pkl.gz", train_features)

  


Copying file://w2v_100_feature_train_app_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/1.1 MiB.                                      
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/W2V/w2v_100_feature_train_app_3000.pkl.gz
CPU times: user 9.79 s, sys: 104 ms, total: 9.89 s
Wall time: 14.5 s


In [0]:
# One row per training application, one column per embedding dimension.
train_features.shape

(3000, 100)

In [0]:
# First five components of the first feature vector.
train_features[0][:5]

array([-0.30962306, -0.01635869,  0.04177235, -0.27590832,  0.02144193],
      dtype=float32)

In [0]:
# Last five components of the first feature vector.
train_features[0][-5:]

array([ 0.40655544, -0.37481394, -0.82976675,  0.00378964,  0.11228371],
      dtype=float32)

In [0]:
%%time

test_features = np.array([text_to_vec(claim) for claim in test_app['claim_app']])
dump_and_send("w2v_100_feature_test_app_3000.pkl.gz", test_features)

  


Copying file://w2v_100_feature_test_app_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/1.1 MiB.                                      
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/W2V/w2v_100_feature_test_app_3000.pkl.gz
CPU times: user 9.98 s, sys: 98.9 ms, total: 10.1 s
Wall time: 13.7 s


In [0]:
%%time

grants_features = np.array([text_to_vec(claim) for claim in grants['claim_cited_grant']])
dump_and_send("w2v_100_feature_grants_3000_3000.pkl.gz", grants_features)

  


Copying file://w2v_100_feature_grants_3000_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/2.3 MiB.                                      
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/W2V/w2v_100_feature_grants_3000_3000.pkl.gz
CPU times: user 28.2 s, sys: 181 ms, total: 28.3 s
Wall time: 32.3 s


## Feature extraction: pretrained fastText

In [0]:
import pandas as pd
import numpy as np
from scipy import spatial

import multiprocessing
# Number of CPUs in the system (same value as computed in the Word2Vec section).
CPUNUM = multiprocessing.cpu_count()

In [0]:
import gensim.downloader as api

# Load the pretrained "fasttext-wiki-news-subwords-300" vectors through
# gensim's downloader. The result is used below like a gensim KeyedVectors
# object (`.vocab`, `model[word]`, `.most_similar`).
model = api.load("fasttext-wiki-news-subwords-300")  # download the model and return as object ready for use



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Nearest neighbours of "generator" in the pretrained fastText space.
model.most_similar("generator")

  if np.issubdtype(vec.dtype, np.int):


[('generators', 0.8244898319244385),
 ('cogenerator', 0.7042113542556763),
 ('Generator', 0.6945025324821472),
 ('motor-generator', 0.6781628131866455),
 ('alternator', 0.6766893863677979),
 ('turbine-generator', 0.6669933795928955),
 ('engine-generator', 0.6461780071258545),
 ('turbogenerator', 0.6457508206367493),
 ('turbo-generator', 0.6451094150543213),
 ('regenerator', 0.6378977298736572)]

In [0]:
# First ten and last five vocabulary entries of the pretrained model.
list(model.vocab.keys())[:10], list(model.vocab.keys())[-5:]

([',', 'the', '.', 'and', 'of', 'to', 'in', 'a', '"', ':'],
 ['whitespotted', 'sacoglossan', 'Iseya', 'Bayyah', 'Vilaya'])

In [0]:
# Shape of a single word vector.
model['generator'].shape

(300,)

In [0]:
GS_BASE = "gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/fastText"

def dump_and_send(fname, obj):
  with gzip.open(fname, 'w') as f:
     pickle.dump(obj, f)
  !gsutil cp {fname} {GS_BASE}/{fname}
  print(f"send to {GS_BASE}/{fname}")

In [0]:
def text_to_vec(text):
    """Embed ``text`` as the mean pretrained vector of its in-vocabulary tokens.

    Args:
        text: Claim text; tokenised on whitespace.

    Returns:
        1-D array of length ``model.vector_size``. An all-zero vector is
        returned when no token of ``text`` is in ``model.vocab`` (previously
        that case failed instead of yielding a vector).
    """
    filtered_words = [word for word in text.split() if word in model.vocab]
    if not filtered_words:
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.average(model[filtered_words], axis=0)

In [0]:
%%time

train_features = np.array([text_to_vec(claim) for claim in train_app['claim_app']])
dump_and_send("fasttext_300_feature_train_app_3000.pkl.gz", train_features)

Copying file://fasttext_300_feature_train_app_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/3.2 MiB.                                      
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/fastText/fasttext_300_feature_train_app_3000.pkl.gz
CPU times: user 9.49 s, sys: 158 ms, total: 9.65 s
Wall time: 15.5 s


In [0]:
# One row per training application, one column per embedding dimension.
train_features.shape

(3000, 300)

In [0]:
# First five components of the first feature vector.
train_features[0][:5]

array([ 0.00127815, -0.00225469,  0.02024586,  0.01990329, -0.02750648],
      dtype=float32)

In [0]:
%%time

test_features = np.array([text_to_vec(claim) for claim in test_app['claim_app']])
dump_and_send("fasttext_300_feature_test_app_3000.pkl.gz", test_features)

Copying file://fasttext_300_feature_test_app_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/3.2 MiB.                                      
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/fastText/fasttext_300_feature_test_app_3000.pkl.gz
CPU times: user 9.7 s, sys: 220 ms, total: 9.92 s
Wall time: 14.7 s


In [0]:
%%time

grants_features = np.array([text_to_vec(claim) for claim in grants['claim_cited_grant']])
dump_and_send("fasttext_300_feature_grants_3000_3000.pkl.gz", grants_features)

Copying file://fasttext_300_feature_grants_3000_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/6.9 MiB.                                      
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/fastText/fasttext_300_feature_grants_3000_3000.pkl.gz
CPU times: user 28.6 s, sys: 382 ms, total: 29 s
Wall time: 34 s


## Feature extraction: Word2Vec trained on the patent-2017 corpus

In [13]:
import gensim.downloader as api

# Load the "patent-2017" dataset through gensim's downloader. Below it is
# iterated as a stream of documents indexed like dicts
# (document['claims']['claim']).
corpus = api.load("patent-2017")



In [0]:
import json

In [15]:
# Demo of the tokenisation used for the patent-2017 corpus: '.', ',', ':' and
# ';' are replaced by spaces before splitting on whitespace.
"This; is : a, test.".translate(str.maketrans({'.': ' ', ',': ' ', ':': ' ', ';': ' '})).split()

['This', 'is', 'a', 'test']

In [0]:
# Maps '.', ',', ':' and ';' to a space; built once instead of once per string.
_PUNCT_TO_SPACE = str.maketrans({'.': ' ', ',': ' ', ':': ' ', ';': ' '})


def extract_one_grant(document_claims_claim):
    """Tokenise all claims of one grant into a single flat list of words.

    Args:
        document_claims_claim: Iterable of claim records. Each record is
            expected to expose its text at
            ``record['claim-text']['claim-text']``, either as one string or as
            a list of strings.

    Returns:
        The tokens of every well-formed claim, in order, or ``None`` when no
        claim yielded any token. Claims that lack the expected layout or hold
        non-string text are skipped.
    """
    one_grant = []
    for claim_text in document_claims_claim:
        try:
            text_list = claim_text['claim-text']['claim-text']
            fragments = text_list if isinstance(text_list, list) else [text_list]
            # Tokenise into a scratch list first so that a malformed fragment
            # drops the whole claim rather than leaving it half added.
            claim_tokens = []
            for fragment in fragments:
                claim_tokens.extend(fragment.translate(_PUNCT_TO_SPACE).split())
        except (KeyError, TypeError, AttributeError):
            # Missing key, unexpected container type, or non-string text:
            # best-effort skip. Unlike a bare `except:`, this does not swallow
            # KeyboardInterrupt.
            continue
        one_grant.extend(claim_tokens)

    return one_grant or None

In [17]:
%%time

all_training_list = []

for idx, document in enumerate(corpus):
    if idx % 10000 == 0:
        print(idx)
    try:
        document_claims_claim = document['claims']['claim']
        one_grant = extract_one_grant(document_claims_claim)
        if one_grant:
            all_training_list.append(one_grant)
    except:
        pass

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
CPU times: user 16min 50s, sys: 15.8 s, total: 17min 5s
Wall time: 17min 4s


In [18]:
# Number of corpus documents that yielded at least one token.
len(all_training_list)

268847

In [19]:
# First five tokens of three sample documents.
all_training_list[0][:5], all_training_list[10000][:5], all_training_list[100000][:5]

(['a', 'first', 'lens', 'group', 'having'],
 ['a', 'resource', 'having', 'configuration', 'information'],
 ['receiving', 'from', 'a', 'plurality', 'of'])

In [0]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from scipy import spatial

import multiprocessing
# Number of CPUs in the system; passed as `workers` to Word2Vec below.
CPUNUM = multiprocessing.cpu_count()

In [21]:
%%time

# Train Word2Vec on the tokenised patent-2017 claims with the same
# hyperparameters as the earlier model: 100-dimensional vectors, window of 5,
# min_count of 5, 10 passes, hierarchical softmax, one worker per CPU.
# NOTE(review): this rebinds `w2v`, so later cells use this model instead of
# the one trained on train + grants.
w2v = Word2Vec(
    all_training_list,
    size=100, window=5, min_count=5, workers=CPUNUM, iter=10, hs=1, seed=23
)

CPU times: user 1h 33min 20s, sys: 20.2 s, total: 1h 33min 40s
Wall time: 47min 36s


In [22]:
# Sanity-check the patent-2017 embedding space via nearest neighbours. Query
# through `w2v.wv`: calling `most_similar` on the model object itself is
# deprecated in gensim.
w2v.wv.most_similar('generator')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('controller', 0.5964593291282654),
 ('generators', 0.585529625415802),
 ('generation', 0.5550556182861328),
 ('microcontroller', 0.5353302955627441),
 ('modulate', 0.5267602801322937),
 ('module', 0.5182672739028931),
 ('circuitry', 0.5132623910903931),
 ('generate', 0.5110422372817993),
 ('demodulator', 0.5064047574996948),
 ('engine', 0.505030632019043)]

In [0]:
GS_BASE = "gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/W2V-patent-2017"

def dump_and_send(fname, obj):
  with gzip.open(fname, 'w') as f:
     pickle.dump(obj, f)
  !gsutil cp {fname} {GS_BASE}/{fname}
  print(f"send to {GS_BASE}/{fname}")

In [0]:
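# Earlier variant of text_to_vec without a token limit (apparently superseded
# by the 512-token version in the next cell); kept commented out for reference.
# def text_to_vec(text):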
#     words = text.split()
#     filtered_words = []
    
#     for word in words:
#         if word in w2v.wv.vocab:
#             filtered_words.append(word)
#     vec = np.average(w2v[filtered_words], axis=0)
#     return vec

In [0]:
# Use only 512 tokens.
TOKEN_LEN = 512

def text_to_vec(text, max_tokens=None):
    """Embed ``text`` as the mean vector of its first in-vocabulary tokens.

    Args:
        text: Claim text; tokenised on whitespace.
        max_tokens: Maximum number of in-vocabulary tokens to average.
            ``None`` (the default) uses the module-level ``TOKEN_LEN``.
            Out-of-vocabulary tokens do not count towards the limit.

    Returns:
        1-D array of length ``w2v.wv.vector_size``. An all-zero vector is
        returned when no token of ``text`` is in the model vocabulary
        (previously that case failed instead of yielding a vector).
    """
    limit = TOKEN_LEN if max_tokens is None else max_tokens
    # Go through `w2v.wv`: indexing the model object itself is deprecated in gensim.
    keyed_vectors = w2v.wv
    filtered_words = []

    for word in text.split():
        if word in keyed_vectors.vocab:
            filtered_words.append(word)
            # Equality check after appending, as before: a limit of 0 or less
            # never triggers and therefore means "no limit".
            if len(filtered_words) == limit:
                break
    if not filtered_words:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.average(keyed_vectors[filtered_words], axis=0)

In [28]:
%%time

train_features = np.array([text_to_vec(claim) for claim in train_app['claim_app']])
# dump_and_send("w2v_patent2017_100_feature_train_app_3000.pkl.gz", train_features)
dump_and_send("w2v_patent2017_100_512tokens_feature_train_app_3000.pkl.gz", train_features)

  


Copying file://w2v_patent2017_100_512tokens_feature_train_app_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/1.1 MiB.                                      
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/W2V-patent-2017/w2v_patent2017_100_512tokens_feature_train_app_3000.pkl.gz
CPU times: user 5.55 s, sys: 158 ms, total: 5.71 s
Wall time: 11.9 s


In [29]:
# One row per training application, one column per embedding dimension.
train_features.shape

(3000, 100)

In [30]:
# First five components of the first feature vector.
train_features[0][:5]

array([-0.20959938,  0.10858778,  0.61401975,  0.28779393,  0.08379511],
      dtype=float32)

In [31]:
%%time

test_features = np.array([text_to_vec(claim) for claim in test_app['claim_app']])
# dump_and_send("w2v_patent2017_100_feature_test_app_3000.pkl.gz", test_features)
dump_and_send("w2v_patent2017_100_512tokens_feature_test_app_3000.pkl.gz", test_features)

  


Copying file://w2v_patent2017_100_512tokens_feature_test_app_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/1.1 MiB.                                      
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/W2V-patent-2017/w2v_patent2017_100_512tokens_feature_test_app_3000.pkl.gz
CPU times: user 5.52 s, sys: 184 ms, total: 5.71 s
Wall time: 10.4 s


In [32]:
%%time

grants_features = np.array([text_to_vec(claim) for claim in grants['claim_cited_grant']])
# dump_and_send("w2v_patent2017_100_feature_grants_3000_3000.pkl.gz", grants_features)
dump_and_send("w2v_patent2017_100_512tokens_feature_grants_3000_3000.pkl.gz", grants_features)

  


Copying file://w2v_patent2017_100_512tokens_feature_grants_3000_3000.pkl.gz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/2.3 MiB.                                      
send to gs://yohei-kikuta/mlstudy-phys/patent-analysis/3000-extracted-feature/W2V-patent-2017/w2v_patent2017_100_512tokens_feature_grants_3000_3000.pkl.gz
CPU times: user 12.3 s, sys: 165 ms, total: 12.5 s
Wall time: 18 s


# Making data for BERT pretraining

In [0]:
def extract_one_grant(document_claims_claim):
    """Collect the raw text of each claim of one grant, one string per claim.

    Args:
        document_claims_claim: Iterable of claim records. Each record is
            expected to expose its text at
            ``record['claim-text']['claim-text']``, either as one string or as
            a list of strings.

    Returns:
        A list with one string per well-formed claim (list-valued texts are
        joined with single spaces, no other preprocessing), or ``None`` when
        no claim could be extracted. Claims with a missing layout or
        non-string text are skipped, so every returned element is a ``str``.
    """
    one_grant = []
    for claim_text in document_claims_claim:
        try:
            text_list = claim_text['claim-text']['claim-text']
        except (KeyError, TypeError):
            # Record without the expected nested 'claim-text' layout.
            continue

        if isinstance(text_list, list):
            try:
                # No preprocessings. Just concatenate.
                one_grant.append(" ".join(text_list))
            except TypeError:
                # The list holds a non-string element: skip this claim.
                continue
        elif isinstance(text_list, str):
            one_grant.append(text_list)
        # Any other type (e.g. a nested dict) is skipped. It used to be
        # appended as-is, which made writing the claim to a text file fail.

    return one_grant or None

In [0]:
%%time

with open("./training_data_patent2017.txt", "w+") as f:
    for idx, document in enumerate(corpus):
        if idx % 10000 == 0:
            print(idx)
        try:
            document_claims_claim = document['claims']['claim']
            one_grant = extract_one_grant(document_claims_claim)
            if one_grant:
                for sentence in one_grant:
                    f.write(sentence)
                    f.write("\n")
                # Add line break between grants.
                f.write("\n")
        except:
            pass

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
CPU times: user 17min 29s, sys: 11.2 s, total: 17min 40s
Wall time: 17min 41s


In [0]:
# Inspect the start of the generated file (one claim per line, blank line between grants).
!head -n 20 ./training_data_patent2017.txt

a first lens group having a positive refractive power closest to a subject; a second lens group closest to the first lens group and having a negative refractive power; a third lens group having one of a positive or negative power; and a fourth lens group having one of a positive or negative power, wherein each of the first to fourth lens groups comprises an aspheric lens having at least one aspheric surface thereof; and one of a thin film filter or an optical filter for restricting the transmission of a portion of light between the second and third lens groups or between the third and fourth lens groups.

a comparator having a first input connected to receive the reference voltage and a second input connected, for the first regulation circuitry, to receive a voltage derived from the output voltage and, for the second regulation circuitry, a voltage derived from the voltage driving the internal load, and an oscillator controlled by the output of the comparator to provide the oscillator 

In [0]:
# Inspect the end of the generated file.
!tail -n 20 ./training_data_patent2017.txt

a first compartment having a first upper side; a first circuit board positioned in the first compartment and having a first heat generating component; a second circuit board positioned in the first compartment in vertical spaced apart relation to the first circuit board and having a second heat generating component; a liquid cooling device positioned in the first compartment between the first circuit board and the second circuit board, the liquid cooling device including a first surface in thermal contact with the first heat generating component and a second surface in thermal contact with the second heat generating component, wherein the second surface opposes the first surface of the liquid cooling device; a second compartment having a lower side including an air inlet and a second upper side including an air outlet; a hub connecting the second compartment to the first compartment in spaced apart relation so as to leave a gap between the first upper side and the lower side.
fabricati

In [0]:
# Upload the generated pretraining text file to GCS.
!gsutil cp ./training_data_patent2017.txt gs://yohei-kikuta/mlstudy-phys/patent-analysis/patent-2017-pretrain-BERT/

Copying file://./training_data_patent2017.txt [Content-Type=text/plain]...
/ [0 files][    0.0 B/729.8 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/
Operation completed over 1 objects/729.8 MiB.                                    
