# extract patent information from Google Patents

In [74]:
url_list = ["https://patents.google.com/patent/US9858496B2/en"]
# seed prior art
# Faster R-CNN

In [5]:
import requests
from bs4 import BeautifulSoup
import re
from collections import defaultdict

TWO_BYTE = re.compile(r'[^\x01-\x7E]')

def remove_two_byte(text):
  return TWO_BYTE.sub('', text)

def get_results(soup):
  result_1 = defaultdict(str)
  result_2 = defaultdict(list)
  result_3 = defaultdict(list)

  # for elm in soup.find_all('div'):
  #   if elm.get('itemprop') == 'content':
  #     if elm.contents[0].name == 'abstract':
  #       result_1[elm.contents[0].name] = remove_two_byte(elm.get_text(separator=" ", strip=True)).replace("\n"," ")
  #     else:
  #       result_1[elm.contents[0]['class'][0]] = remove_two_byte(elm.get_text(separator=" ", strip=True)).replace("\n"," ")

  tmp = soup.find('section', attrs={'itemprop':'abstract'})
  if tmp:
    result_1["abstract"] = remove_two_byte(tmp.get_text(separator=" ", strip=True)).replace("\n"," ")

  tmp = soup.find('section', attrs={'itemprop':'claims'})
  if tmp:
    result_1["claims"] = remove_two_byte(tmp.get_text(separator=" ", strip=True)).replace("\n"," ")
  
  tmp = soup.find('section', attrs={'itemprop':'description'})
  if tmp:
    result_1["description"]  = remove_two_byte(tmp.get_text(separator=" ", strip=True)).replace("\n"," ")

  tmp = soup.find('span', attrs={'itemprop':'assigneeSearch'})
  if tmp:
    result_1["assignee"]  = tmp.get_text(separator=" ", strip=True)
  
  tmp = soup.find('time', attrs={'itemprop':'priorityDate'})
  if tmp:
    result_1["priority_date"]  = tmp.get_text(separator=" ", strip=True)

  for elm in soup.find_all("span", attrs = {"itemprop":"examinerCited"}):
    if elm.find_parent("tr").get("itemprop") == "forwardReferencesOrig" or elm.find_parent("tr").get("itemprop") == "forwardReferencesFamily":
      result_2["ForwardReferences"] += ["https://patents.google.com" + elm.find_previous_sibling("a").get("href")]
    if elm.find_parent("tr").get("itemprop") == "backwardReferencesOrig" or elm.find_parent("tr").get("itemprop") == "backwardReferencesFamily":
      result_2["BackwardReferences"] += ["https://patents.google.com" + elm.find_previous_sibling("a").get("href")]
  
  for elm in soup.find_all('meta', attrs={'itemprop':'Leaf'}):
    result_3["cpc"] += [elm.find_previous_sibling("span", attrs={'itemprop':'Code'}).get_text(strip=True)]
    result_3["description"] += [elm.find_previous_sibling("span", attrs={'itemprop':'Description'}).get_text(strip=True)]
    
  return result_1, result_2, result_3

def get_text_dict(url):
  res = requests.get(url)
  res.encoding = res.apparent_encoding
  soup = BeautifulSoup(res.text, 'html.parser')
  result_1, result_2, result_3 = get_results(soup)
  if result_1["abstract"] == "":
    print("abstract none")
    result_1["abstract"] = "abstract"

  if result_1["claims"] == "":
    print("claims none")
    result_1["claims"] = "claims"

  if result_1["description"] == "":
    print("description none")
    result_1["description"] = "description"

  if result_1["assignee"] == "":
    print("assignee none")
    result_1["assignee"] = "unkown"

  if result_1["priority_date"] == "":
    print("priority_date none")
    result_1["priority_date"] = ""

  return result_1, result_2, result_3

In [6]:
%%time

from collections import defaultdict

references = defaultdict(dict)
cpc = defaultdict(dict)
abstract = []
claims = []
description = []
assignee = []
pub_num = []
priority_date = []
new_url_list = []

i=0

while len(claims) <= 1000:
  i += 1
  if i ==1 :
    for j, url in enumerate(url_list):
      num = url.split(sep="/")[-2]
      pub_num.append(num)
      tmp, references[num], cpc[num] = get_text_dict(url)
      abstract.append(tmp["abstract"])
      claims.append(tmp["claims"])
      description.append(tmp["description"])
      assignee.append(tmp["assignee"])
      priority_date.append(tmp["priority_date"])
      print("{}-{} : {}".format(i, j+1, url))

  if i > 1:
    tmp_dict = references.values()
    for d in tmp_dict:
      new_url_list += d["ForwardReferences"]
      new_url_list += d["BackwardReferences"]
    for url in new_url_list:
      if url in url_list:
        new_url_list.remove(url)
    print("Loop {}. {} data will be extracted.".format(i, len(new_url_list)))
    if len(new_url_list) > 1000:
      print("Terminate.")
      print("Total {} data.".format(len(claims)))
      break
    for j, url in enumerate(new_url_list):
      num = url.split(sep="/")[-2]
      pub_num.append(num)
      tmp, references[num], cpc[num] = get_text_dict(url)
      abstract.append(tmp["abstract"])
      claims.append(tmp["claims"])
      description.append(tmp["description"])
      assignee.append(tmp["assignee"])
      priority_date.append(tmp["priority_date"])
      print("{}-{} : {}".format(i, j+1, url))
    url_list += new_url_list
 
  print("Total {} data.".format(len(claims)))

1-1 : https://patents.google.com/patent/US9858496B2/en
Total 1 data.
Loop 2. 42 data will be extracted.
2-1 : https://patents.google.com/patent/US20180107866A1/en
2-2 : https://patents.google.com/patent/CN108520229A/en
2-3 : https://patents.google.com/patent/CN108573228A/en
2-4 : https://patents.google.com/patent/US10242294B2/en
2-5 : https://patents.google.com/patent/US10304009B1/en
2-6 : https://patents.google.com/patent/US10366430B2/en
2-7 : https://patents.google.com/patent/WO2019148729A1/en
2-8 : https://patents.google.com/patent/US10380741B2/en
2-9 : https://patents.google.com/patent/US10262237B2/en
2-10 : https://patents.google.com/patent/US20180260415A1/en
2-11 : https://patents.google.com/patent/US10255525B1/en
2-12 : https://patents.google.com/patent/US10303956B2/en
2-13 : https://patents.google.com/patent/CN107562925A/en
2-14 : https://patents.google.com/patent/WO2019068141A1/en
2-15 : https://patents.google.com/patent/US10169679B1/en
2-16 : https://patents.google.com/patent

3-98 : https://patents.google.com/patent/US20190129413A1/en
3-99 : https://patents.google.com/patent/DE102018210632A1/en
3-100 : https://patents.google.com/patent/US5729471A/en
3-101 : https://patents.google.com/patent/US5995651A/en
3-102 : https://patents.google.com/patent/US20020186144A1/en
3-103 : https://patents.google.com/patent/US20070239494A1/en
3-104 : https://patents.google.com/patent/WO2008081143A2/en
3-105 : https://patents.google.com/patent/US20080183535A1/en
3-106 : https://patents.google.com/patent/US20090116698A1/en
3-107 : https://patents.google.com/patent/US20140254923A1/en
3-108 : https://patents.google.com/patent/US20150023553A1/en
3-109 : https://patents.google.com/patent/US20160173568A1/en
3-110 : https://patents.google.com/patent/US20170206431A1/en
3-111 : https://patents.google.com/patent/US9870649B1/en
3-112 : https://patents.google.com/patent/US10013620B1/en
3-113 : https://patents.google.com/patent/US20180229737A1/en
3-114 : https://patents.google.com/patent/C

3-236 : https://patents.google.com/patent/US20170169315A1/en
3-237 : https://patents.google.com/patent/US20170206431A1/en
3-238 : https://patents.google.com/patent/US20060245653A1/en
3-239 : https://patents.google.com/patent/US9202144B2/en
3-240 : https://patents.google.com/patent/US20170011281A1/en
3-241 : https://patents.google.com/patent/US20170132472A1/en
3-242 : https://patents.google.com/patent/US20170206431A1/en
3-243 : https://patents.google.com/patent/US20170220876A1/en
3-244 : https://patents.google.com/patent/US20170294124A1/en
3-245 : https://patents.google.com/patent/US20180129887A1/en
3-246 : https://patents.google.com/patent/US20180137642A1/en
3-247 : https://patents.google.com/patent/US20180158189A1/en
3-248 : https://patents.google.com/patent/US20180165551A1/en
3-249 : https://patents.google.com/patent/US20180253622A1/en
3-250 : https://patents.google.com/patent/US10223610B1/en
3-251 : https://patents.google.com/patent/US20190102646A1/en
3-252 : https://patents.google.

3-372 : https://patents.google.com/patent/US20190020871A1/en
3-373 : https://patents.google.com/patent/US10198671B1/en
3-374 : https://patents.google.com/patent/US20190050681A1/en
3-375 : https://patents.google.com/patent/US20190073553A1/en
3-376 : https://patents.google.com/patent/US20190095777A1/en
3-377 : https://patents.google.com/patent/US10560696B2/en
3-378 : https://patents.google.com/patent/US20160148079A1/en
3-379 : https://patents.google.com/patent/US20170011281A1/en
3-380 : https://patents.google.com/patent/US20170124409A1/en
3-381 : https://patents.google.com/patent/US20170124415A1/en
3-382 : https://patents.google.com/patent/US20170132496A1/en
3-383 : https://patents.google.com/patent/US20170169315A1/en
3-384 : https://patents.google.com/patent/US20170206431A1/en
3-385 : https://patents.google.com/patent/US20180032857A1/en
3-386 : https://patents.google.com/patent/US20180096457A1/en
3-387 : https://patents.google.com/patent/US20180107926A1/en
3-388 : https://patents.google

3-508 : https://patents.google.com/patent/US9207760B1/en
3-509 : https://patents.google.com/patent/US9208404B2/en
3-510 : https://patents.google.com/patent/US9443198B1/en
3-511 : https://patents.google.com/patent/US9489598B2/en
3-512 : https://patents.google.com/patent/US10560362B2/en
3-513 : https://patents.google.com/patent/US10433112B2/en
3-514 : https://patents.google.com/patent/US20140278390A1/en
3-515 : https://patents.google.com/patent/US20170185871A1/en
3-516 : https://patents.google.com/patent/TWI624793B/en
3-517 : https://patents.google.com/patent/US10382770B2/en
3-518 : https://patents.google.com/patent/CN102210559A/en
3-519 : https://patents.google.com/patent/JP2011257805A/en
3-520 : https://patents.google.com/patent/CN102479329A/en
3-521 : https://patents.google.com/patent/US9207760B1/en
3-522 : https://patents.google.com/patent/JP2014153837A/en
abstract none
3-523 : https://patents.google.com/patent/JP5808371B2/en
3-524 : https://patents.google.com/patent/CN104680120B/en


3-648 : https://patents.google.com/patent/US10346693B1/en
3-649 : https://patents.google.com/patent/US10395140B1/en
3-650 : https://patents.google.com/patent/US10325352B1/en
3-651 : https://patents.google.com/patent/US10496899B1/en
3-652 : https://patents.google.com/patent/US10373323B1/en
3-653 : https://patents.google.com/patent/US10373027B1/en
3-654 : https://patents.google.com/patent/CN110222565A/en
3-655 : https://patents.google.com/patent/US6819790B2/en
3-656 : https://patents.google.com/patent/US7219085B2/en
3-657 : https://patents.google.com/patent/US7603000B2/en
3-658 : https://patents.google.com/patent/US7634137B2/en
3-659 : https://patents.google.com/patent/US7813822B1/en
3-660 : https://patents.google.com/patent/US8463025B2/en
3-661 : https://patents.google.com/patent/US7006881B1/en
3-662 : https://patents.google.com/patent/US7904187B2/en
3-663 : https://patents.google.com/patent/US6820897B2/en
3-664 : https://patents.google.com/patent/US8100552B2/en
3-665 : https://patents.

In [7]:
import pandas as pd

df = pd.DataFrame({
    "id" : pub_num,
    "priority_date": priority_date,
    "assignee" : assignee,
    "abstract" : abstract,
    "claims" : claims,
    "description" : description
})

df.head()

Unnamed: 0,id,priority_date,assignee,abstract,claims,description
0,US9858496B2,2016-01-20,Microsoft Technology Licensing LLC,"Abstract Systems, methods, and computer-readab...",Claims ( 17 ) What is claimed is: 1. A method ...,Description BACKGROUND As search engine capabi...
1,US20180107866A1,2016-10-19,Snap Inc,"Abstract Systems, devices, media, and methods ...",Claims ( 20 ) What is claimed is: 1 . A method...,Description TECHNICAL FIELD Embodiments of the...
2,CN108520229A,2018-04-04,北京旷视科技有限公司,Abstract The present invention provides a kind...,Claims ( 13 ) 1. a kind of image detecting met...,"Description Image detecting method, device, el..."
3,CN108573228A,2018-04-09,杭州华雁云态信息技术有限公司,Abstract A kind of electric line foreign matte...,Claims ( 10 ) 1. a kind of electric line forei...,Description A kind of electric line foreign ma...
4,US10242294B2,2017-05-01,Intel Corp,Abstract An example apparatus for classifying ...,Claims ( 16 ) What is claimed is: 1. An appara...,Description BACKGROUND Various object classifi...


In [9]:
import re
import string

TOP_CLAIM = re.compile(r'Claims\ \(\ [0-9]{1,3}\ \)(.{20,}?)\.')
TOP_CLAIM_2 = re.compile(r'Claims\ (.{20,}?)\.') 

def extract_top_claim(text):
    text = re.sub("What is claimed is: ", "", text)
    m = TOP_CLAIM.search(text)
    if m == None:
        m = TOP_CLAIM_2.search(text)
        if m == None:
            return
    text = re.sub("1\.\ ", "", m.group(1))
    text = re.sub("1 \.\ ", "", text)
    text = text.lower()
    text = re.sub("what is claimed is: ", "", text)
    text = text.translate(str.maketrans('','',string.punctuation))
    text = " ".join([w for w in text.split() if not re.match(r"^[0-9]{1,5}[a-z]$|^[0-9]{1,5}.*[0-9]$|^\(.*\)$|\\n|\\t|^\\", w)])
    return text

def abstract_preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('','',string.punctuation))
    text = " ".join([w for w in text.split() if not re.match(r"^[0-9]{1,5}[a-z]$|^[0-9]{1,5}.*[0-9]$|^\(.*\)$|\\n|\\t|^\\", w)])    
    return re.sub("^abstract ","", text)

In [64]:
df["top_claims"] = df["claims"].map(extract_top_claim)
df["preprocessed_abstract"] = df["abstract"].map(abstract_preprocess)

In [65]:
df.head()

Unnamed: 0,id,priority_date,assignee,abstract,claims,description,top_claims,preprocessed_abstract
0,US9858496B2,2016-01-20,Microsoft Technology Licensing LLC,"Abstract Systems, methods, and computer-readab...",Claims ( 17 ) What is claimed is: 1. A method ...,Description BACKGROUND As search engine capabi...,a method comprising receiving an input image g...,systems methods and computerreadable media for...
1,US20180107866A1,2016-10-19,Snap Inc,"Abstract Systems, devices, media, and methods ...",Claims ( 20 ) What is claimed is: 1 . A method...,Description TECHNICAL FIELD Embodiments of the...,a method comprising receiving by one or more p...,systems devices media and methods are presente...
2,CN108520229A,2018-04-04,北京旷视科技有限公司,Abstract The present invention provides a kind...,Claims ( 13 ) 1. a kind of image detecting met...,"Description Image detecting method, device, el...",a kind of image detecting method which is char...,the present invention provides a kind of image...
3,CN108573228A,2018-04-09,杭州华雁云态信息技术有限公司,Abstract A kind of electric line foreign matte...,Claims ( 10 ) 1. a kind of electric line forei...,Description A kind of electric line foreign ma...,a kind of electric line foreign matter intrusi...,a kind of electric line foreign matter intrusi...
4,US10242294B2,2017-05-01,Intel Corp,Abstract An example apparatus for classifying ...,Claims ( 16 ) What is claimed is: 1. An appara...,Description BACKGROUND Various object classifi...,an apparatus for classifying target objects us...,an example apparatus for classifying target ob...


In [66]:
df[df["preprocessed_abstract"]=="abstract"]

Unnamed: 0,id,priority_date,assignee,abstract,claims,description,top_claims,preprocessed_abstract
107,JP6268960B2,2013-11-15,オムロン株式会社,abstract,Claims ( 8 ) An extractor for extracting featu...,Description The present invention relates to a...,an extractor for extracting feature data of th...,abstract
565,JP5808371B2,2013-08-28,ヤフー株式会社,abstract,Claims ( 11 ) A storage unit for storing infor...,Description The present invention relates to a...,a storage unit for storing information about l...,abstract
574,JP3173040B2,1991-05-10,ミノルタ株式会社,abstract,Claims ( 4 ) (57) [Claims] 1. A device for pro...,Description DETAILED DESCRIPTION OF THE INVENT...,claims a device for processing image data cons...,abstract
601,JP5469216B2,2012-07-31,ファナック株式会社,abstract,Claims ( 7 ) A three-dimensional measuring mac...,Description The present invention relates to a...,a threedimensional measuring machine that meas...,abstract


In [67]:
ind_list = df[df["preprocessed_abstract"]=="abstract"].index
for ind in ind_list:
    df["preprocessed_abstract"][ind] = df["top_claims"][ind]

df.loc[ind_list]

Unnamed: 0,id,priority_date,assignee,abstract,claims,description,top_claims,preprocessed_abstract
107,JP6268960B2,2013-11-15,オムロン株式会社,abstract,Claims ( 8 ) An extractor for extracting featu...,Description The present invention relates to a...,an extractor for extracting feature data of th...,an extractor for extracting feature data of th...
565,JP5808371B2,2013-08-28,ヤフー株式会社,abstract,Claims ( 11 ) A storage unit for storing infor...,Description The present invention relates to a...,a storage unit for storing information about l...,a storage unit for storing information about l...
574,JP3173040B2,1991-05-10,ミノルタ株式会社,abstract,Claims ( 4 ) (57) [Claims] 1. A device for pro...,Description DETAILED DESCRIPTION OF THE INVENT...,claims a device for processing image data cons...,claims a device for processing image data cons...
601,JP5469216B2,2012-07-31,ファナック株式会社,abstract,Claims ( 7 ) A three-dimensional measuring mac...,Description The present invention relates to a...,a threedimensional measuring machine that meas...,a threedimensional measuring machine that meas...


In [114]:
import pickle
import gzip

def dump(fname, obj):
  with gzip.open(fname, 'wb') as f:
     pickle.dump(obj, f)

dump("df_20200413.pkl.gz", df)

In [71]:
references["US9858496B2"]

defaultdict(list,
            {'ForwardReferences': ['https://patents.google.com/patent/US20180107866A1/en',
              'https://patents.google.com/patent/CN108520229A/en',
              'https://patents.google.com/patent/CN108573228A/en',
              'https://patents.google.com/patent/US10242294B2/en',
              'https://patents.google.com/patent/US10304009B1/en',
              'https://patents.google.com/patent/US10366430B2/en',
              'https://patents.google.com/patent/WO2019148729A1/en',
              'https://patents.google.com/patent/US10380741B2/en',
              'https://patents.google.com/patent/US10262237B2/en',
              'https://patents.google.com/patent/US20180260415A1/en',
              'https://patents.google.com/patent/US10255525B1/en',
              'https://patents.google.com/patent/US10303956B2/en',
              'https://patents.google.com/patent/CN107562925A/en',
              'https://patents.google.com/patent/WO2019068141A1/en',
             

In [72]:
cpc["US9858496B2"]

defaultdict(list,
            {'cpc': ['G06K9/4671',
              'G06F16/5838',
              'G06F16/951',
              'G06F17/30864',
              'G06K9/3233',
              'G06K9/4628',
              'G06K9/6267',
              'G06K9/685',
              'G06N3/0454',
              'G06N3/084'],
             'description': ['Extracting features based on salient regional features, e.g. Scale Invariant Feature Transform [SIFT] keypoints',
              'Retrieval characterised by using metadata, e.g. metadata not derived from the content or metadata generated manually using metadata automatically derived from the content using colour',
              'Indexing; Web crawling techniques',
              '',
              'Determination of region of interest',
              'Integrating the filters into a hierarchical structure',
              'Classification techniques',
              'Involving plural approaches, e.g. verification by template match; resolving confusion among simil

In [73]:
dump("references_20200413.pkl.gz", references)
dump("cpc_20200413.pkl.gz", cpc)

In [82]:
df["preprocessed_abstract"].map(len).describe()

count     712.000000
mean      796.014045
std       226.456641
min       162.000000
25%       653.000000
50%       853.000000
75%       911.000000
max      1631.000000
Name: preprocessed_abstract, dtype: float64

# create ALBERT model

In [6]:
import pickle
import gzip

def load(fname):
    with gzip.open(fname, 'rb') as f:
        r = pickle.load(f)
        return r
    
df = load("df_20200413.pkl.gz")
references = load("references_20200413.pkl.gz")
cpc = load("cpc_20200413.pkl.gz")

In [7]:
df.head()

Unnamed: 0_level_0,priority_date,assignee,abstract,claims,description,top_claims,preprocessed_abstract
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
US9858496B2,2016-01-20,Microsoft Technology Licensing LLC,"Abstract Systems, methods, and computer-readab...",Claims ( 17 ) What is claimed is: 1. A method ...,Description BACKGROUND As search engine capabi...,a method comprising receiving an input image g...,systems methods and computerreadable media for...
US20180107866A1,2016-10-19,Snap Inc,"Abstract Systems, devices, media, and methods ...",Claims ( 20 ) What is claimed is: 1 . A method...,Description TECHNICAL FIELD Embodiments of the...,a method comprising receiving by one or more p...,systems devices media and methods are presente...
CN108520229A,2018-04-04,北京旷视科技有限公司,Abstract The present invention provides a kind...,Claims ( 13 ) 1. a kind of image detecting met...,"Description Image detecting method, device, el...",a kind of image detecting method which is char...,the present invention provides a kind of image...
CN108573228A,2018-04-09,杭州华雁云态信息技术有限公司,Abstract A kind of electric line foreign matte...,Claims ( 10 ) 1. a kind of electric line forei...,Description A kind of electric line foreign ma...,a kind of electric line foreign matter intrusi...,a kind of electric line foreign matter intrusi...
US10242294B2,2017-05-01,Intel Corp,Abstract An example apparatus for classifying ...,Claims ( 16 ) What is claimed is: 1. An appara...,Description BACKGROUND Various object classifi...,an apparatus for classifying target objects us...,an example apparatus for classifying target ob...


In [8]:
df.shape

(433, 7)

In [11]:
cite_pair = []

for num in df.index[0:43]:
    for cited_url in references[num]["ForwardReferences"]:
        text_a = df.at[num, "preprocessed_abstract"]
        text_b = df.at[cited_url.split(sep="/")[-2], "preprocessed_abstract"]
        cite_pair.append([text_a, text_b])
    
    for citing_url in references[num]["BackwardReferences"]:
        text_a = df.at[citing_url.split(sep="/")[-2], "preprocessed_abstract"]
        text_b = df.at[num, "preprocessed_abstract"]
        cite_pair.append([text_a, text_b])

len(cite_pair)

722

In [12]:
import random

not_cite_pair = []

for num in df.index[0:43]:
    text_a = df.at[num, "preprocessed_abstract"]
    cite = references[num]["ForwardReferences"] + references[num]["BackwardReferences"]
    cite = set([s.split(sep="/")[-2] for s in cite])
    length = len(cite)
    
    not_cite = set(df.index[43:]) - cite
    
    sample = random.sample(not_cite, length)
    for num in sample:      
        text_b = df.at[num, "preprocessed_abstract"]
        not_cite_pair.append([text_a, text_b])

len(not_cite_pair)

722

In [13]:
not_cite_pair[0:2]

[['systems methods and computerreadable media for providing fast and accurate object detection and classification in images are described herein in some examples a computing device can receive an input image the computing device can process the image and generate a convolutional feature map in some configurations the convolutional feature map can be processed through a region proposal network rpn to generate proposals for candidate objects in the image in various examples the computing device can process the convolutional feature map with the proposals through a fast regionbased convolutional neural network frcn proposal classifier to determine a class of each object in the image and a confidence score associated therewith the computing device can then provide a requestor with an output including the object classification andor confidence score',
  'field image processing means substance group of inventions relates to the field of automatic image analysis device of cascade processing o

In [14]:
pair = cite_pair + not_cite_pair
label = [1]*len(cite_pair) + [0]*len(not_cite_pair)

In [15]:
%%time

import torch
from transformers import AlbertTokenizer

# torch.set_default_tensor_type(torch.cuda.FloatTensor)

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

token = tokenizer.batch_encode_plus(pair, add_special_tokens=True, 
                                    return_token_type_ids=True, max_length=512, return_attention_masks=True, pad_to_max_length=True)

CPU times: user 3.28 s, sys: 1.27 s, total: 4.55 s
Wall time: 6.6 s


In [16]:
token["input_ids"][10]

[2,
 1242,
 3195,
 17,
 1428,
 10647,
 579,
 941,
 26,
 2674,
 1512,
 17,
 8137,
 3095,
 11643,
 17,
 4039,
 19,
 3502,
 50,
 745,
 235,
 108,
 19,
 109,
 3770,
 21,
 10626,
 3646,
 92,
 2588,
 40,
 6367,
 1961,
 14,
 10626,
 3646,
 92,
 953,
 14,
 1961,
 17,
 7920,
 21,
 1065,
 16261,
 3309,
 192,
 1580,
 2942,
 19,
 109,
 8091,
 18,
 14,
 1065,
 16261,
 3309,
 192,
 1580,
 2942,
 92,
 44,
 16697,
 120,
 21,
 632,
 5149,
 982,
 13,
 6952,
 103,
 20,
 7920,
 10869,
 26,
 2316,
 3916,
 19,
 14,
 1961,
 19,
 617,
 3770,
 14,
 10626,
 3646,
 92,
 953,
 14,
 1065,
 16261,
 3309,
 192,
 1580,
 2942,
 29,
 14,
 10869,
 120,
 21,
 1512,
 632,
 1281,
 1065,
 16261,
 3309,
 192,
 17371,
 982,
 6034,
 9881,
 5149,
 718,
 16292,
 20,
 3746,
 21,
 718,
 16,
 206,
 3095,
 19,
 14,
 1961,
 17,
 21,
 6548,
 1618,
 1598,
 80,
 1410,
 14,
 10626,
 3646,
 92,
 94,
 1181,
 21,
 3772,
 248,
 29,
 40,
 5196,
 215,
 14,
 3095,
 4039,
 17,
 248,
 6548,
 1618,
 3,
 1961,
 5511,
 1242,
 92,
 468,
 53,
 54,
 91

In [17]:
import torch
from torch.utils.data import Dataset, DataLoader

class PairDataset(Dataset):
    def __init__(self, input_ids, token_type_ids, attention_mask, label):
        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.label = label

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        one_token = self.input_ids[idx]
        one_token_type = self.token_type_ids[idx]
        one_mask = self.attention_mask[idx]
        one_label = self.label[idx]

#         device = torch.device("cuda")
        device = torch.device("cpu")
        sample = {'input_ids': torch.tensor(one_token, device=device), 
                'token_type_ids': torch.tensor(one_token_type, device=device), 
                'attention_mask': torch.tensor(one_mask, device=device), 
                'label': torch.tensor(one_label, device=device)
                }

        return sample

In [18]:
train_dataset = PairDataset(token["input_ids"], token["token_type_ids"], token["attention_mask"], label)

In [19]:
import numpy as np

np.array(token["input_ids"]).shape, np.array(token["token_type_ids"]).shape, np.array(token["attention_mask"]).shape, np.array(label).shape

((1444, 512), (1444, 512), (1444, 512), (1444,))

In [20]:
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [21]:
from transformers import AlbertForSequenceClassification

model =  AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels = 2)
model.train()

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [22]:
!pip install sklearn



In [23]:
from sklearn.metrics import classification_report
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=2e-5)

total_loss = []

for epoch in range(10):
    pred = []
    gt = []
    running_loss = 0.0

    for i, data in enumerate(train_dataloader, 0):
        input_ids = data["input_ids"]
        token_type_ids = data["token_type_ids"]
        attention_mask = data["attention_mask"]
        labels = data["label"]

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs[:2]
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        total_loss.append(loss.item())

        if i % 100 == 99:    # print every 100 mini-batches (400 pairs)
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0

    prob = torch.softmax(logits, dim=1).tolist()
    pred += [int(np.argmax(elm)) for elm in prob]
    gt += [int(elm) for elm in labels]
    print(classification_report(gt, pred, target_names=["not_cite", "cite"]))
    
print('Finished Training')

KeyboardInterrupt: 

training on Colab

https://colab.research.google.com/drive/1XPYkvmJSxwKeDGWGuj-MnlvQYwWuZHNg?hl=ja

In [24]:
df.head()

Unnamed: 0_level_0,priority_date,assignee,abstract,claims,description,top_claims,preprocessed_abstract
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
US9858496B2,2016-01-20,Microsoft Technology Licensing LLC,"Abstract Systems, methods, and computer-readab...",Claims ( 17 ) What is claimed is: 1. A method ...,Description BACKGROUND As search engine capabi...,a method comprising receiving an input image g...,systems methods and computerreadable media for...
US20180107866A1,2016-10-19,Snap Inc,"Abstract Systems, devices, media, and methods ...",Claims ( 20 ) What is claimed is: 1 . A method...,Description TECHNICAL FIELD Embodiments of the...,a method comprising receiving by one or more p...,systems devices media and methods are presente...
CN108520229A,2018-04-04,北京旷视科技有限公司,Abstract The present invention provides a kind...,Claims ( 13 ) 1. a kind of image detecting met...,"Description Image detecting method, device, el...",a kind of image detecting method which is char...,the present invention provides a kind of image...
CN108573228A,2018-04-09,杭州华雁云态信息技术有限公司,Abstract A kind of electric line foreign matte...,Claims ( 10 ) 1. a kind of electric line forei...,Description A kind of electric line foreign ma...,a kind of electric line foreign matter intrusi...,a kind of electric line foreign matter intrusi...
US10242294B2,2017-05-01,Intel Corp,Abstract An example apparatus for classifying ...,Claims ( 16 ) What is claimed is: 1. An appara...,Description BACKGROUND Various object classifi...,an apparatus for classifying target objects us...,an example apparatus for classifying target ob...


In [25]:
df_part = df[["priority_date", "assignee", "preprocessed_abstract"]]
df_part.head()

Unnamed: 0_level_0,priority_date,assignee,preprocessed_abstract
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
US9858496B2,2016-01-20,Microsoft Technology Licensing LLC,systems methods and computerreadable media for...
US20180107866A1,2016-10-19,Snap Inc,systems devices media and methods are presente...
CN108520229A,2018-04-04,北京旷视科技有限公司,the present invention provides a kind of image...
CN108573228A,2018-04-09,杭州华雁云态信息技术有限公司,a kind of electric line foreign matter intrusi...
US10242294B2,2017-05-01,Intel Corp,an example apparatus for classifying target ob...


In [26]:
df.shape, df_part.shape

((433, 7), (433, 3))

In [28]:
import pickle
import gzip

def dump(fname, obj):
  with gzip.open(fname, 'wb') as f:
     pickle.dump(obj, f)
        
dump("df_part_20200413.pkl.gz", df_part)

In [30]:
!ls

cpc_20200413.pkl.gz  df_part_20200413.pkl.gz  references_20200413.pkl.gz
df_20200413.pkl.gz   patent_search.ipynb


# Inference

In [1]:
import multiprocessing
multiprocessing.cpu_count()

4

In [2]:
import torch
from transformers import AlbertForSequenceClassification

model = AlbertForSequenceClassification.from_pretrained('./', output_hidden_states=True)
model.eval()

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [3]:
import os

def print_size_of_model(model):
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p")/1e6)
    os.remove('temp.p')

print_size_of_model(model)

Size (MB): 46.746421


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

class PairDataset(Dataset):
    def __init__(self, input_ids, token_type_ids, attention_mask):
        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        one_token = self.input_ids[idx]
        one_token_type = self.token_type_ids[idx]
        one_mask = self.attention_mask[idx]

#         device = torch.device("cuda")
        device = torch.device("cpu")
        sample = {'input_ids': torch.tensor(one_token, device=device), 
                'token_type_ids': torch.tensor(one_token_type, device=device), 
                'attention_mask': torch.tensor(one_mask, device=device)
                }

        return sample

In [10]:
input_sentence = "Prior work on object detection repurposes classifiers to perform detection. Instead, we frame object detection as a regression problem to spatially separated bounding boxes and associated class probabilities. A single neural network predicts bounding boxes and class probabilities directly from full images in one evaluation. Since the whole detection pipeline is a single network, it can be optimized end-to-end directly on detection performance."

input_sentence = abstract_preprocess(input_sentence)
input_sentence

'prior work on object detection repurposes classifiers to perform detection instead we frame object detection as a regression problem to spatially separated bounding boxes and associated class probabilities a single neural network predicts bounding boxes and class probabilities directly from full images in one evaluation since the whole detection pipeline is a single network it can be optimized endtoend directly on detection performance'

In [11]:
def set_pair(input_sentence):
    input_sentence = abstract_preprocess(input_sentence)
    pair = []
    for num in df.index:
        text_a = df.at[num, "preprocessed_abstract"]
        text_b = input_sentence
        pair.append([text_a, text_b])
    
    return pair

In [12]:
test_pair = set_pair(input_sentence)

In [13]:
from transformers import AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

token = tokenizer.batch_encode_plus(test_pair, add_special_tokens=True, 
                                    return_token_type_ids=True, max_length=512, return_attention_masks=True, pad_to_max_length=True)

In [14]:
test_dataset = PairDataset(token["input_ids"], token["token_type_ids"], token["attention_mask"])
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [15]:
torch.set_num_threads(multiprocessing.cpu_count())
print(torch.__config__.parallel_info())

ATen/Parallel:
	at::get_num_threads() : 4
	at::get_num_interop_threads() : 2
OpenMP 201511 (a.k.a. OpenMP 4.5)
	omp_get_max_threads() : 4
Intel(R) Math Kernel Library Version 2019.0.4 Product Build 20190411 for Intel(R) 64 architecture applications
	mkl_get_max_threads() : 4
Intel(R) MKL-DNN v0.21.1 (Git Hash 7d2fd500bc78936d1d648ca713b901012f470dbc)
std::thread::hardware_concurrency() : 4
Environment variables:
	OMP_NUM_THREADS : [not set]
	MKL_NUM_THREADS : [not set]
ATen parallel backend: OpenMP



In [34]:
%%time

import numpy as np

prob = []

with torch.no_grad():
    for i, data in enumerate(test_dataloader, 0):
        input_ids = data["input_ids"]
        token_type_ids = data["token_type_ids"]
        attention_mask = data["attention_mask"]

        outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        print(outputs[1][-1].mean(axis=1).shape)
        break
        logits, = outputs
        prob += torch.softmax(logits, dim=1).tolist()
        
        if i%10==9:
            print("{} done".format(i+1))

torch.Size([4, 768])
CPU times: user 15.8 s, sys: 1.78 s, total: 17.6 s
Wall time: 4.75 s


In [40]:
p = [(i, elm[1]) for i, elm in enumerate(prob)]
p = sorted(p, key=lambda x:x[1], reverse=True)
p[0:10]

[(218, 0.9893274307250977),
 (376, 0.9882809519767761),
 (162, 0.9878667593002319),
 (375, 0.9867842197418213),
 (81, 0.986143946647644),
 (217, 0.986143946647644),
 (186, 0.9860714077949524),
 (250, 0.9858617782592773),
 (288, 0.9857050180435181),
 (219, 0.9846290349960327)]

In [45]:
prob[218]

[0.010672561824321747, 0.9893274307250977]

In [48]:
df.iloc[ [t[0] for t in p[0:10]] ]

Unnamed: 0_level_0,priority_date,assignee,abstract,claims,description,top_claims,preprocessed_abstract
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
US20180211403A1,2017-01-20,Ford Global Technologies LLC,"Abstract According to one embodiment, a system...",Claims ( 20 ) 1 . A method comprising: determi...,Description TECHNICAL FIELD The disclosure rel...,a method comprising determining using one or m...,according to one embodiment a system includes ...
US8396268B2,2010-03-31,Oxford University Innovation Ltd,Abstract A method for processing a sequence of...,Claims ( 39 ) 1. A method of determining a plu...,Description BACKGROUND OF INVENTION Various sy...,a method of determining a plurality of relatio...,a method for processing a sequence of images i...
US20160182874A1,2014-12-22,Conbraco Industries Inc,Abstract An apparatus includes a plurality of ...,"Claims ( 20 ) 1 . An apparatus, comprising: a ...",Description FIELD OF THE DISCLOSURE The presen...,an apparatus comprising a plurality of camera ...,an apparatus includes a plurality of camera un...
US8577130B2,2009-03-16,Siemens Medical Solutions USA Inc,Abstract Described herein is a technology for ...,Claims ( 23 ) The invention claimed is: 1. A m...,Description CROSS-REFERENCE TO RELATED APPLICA...,the invention claimed is 1,described herein is a technology for facilitat...
US10354362B2,2016-09-08,Carnegie Mellon University,Abstract Methods of detecting an object in an ...,Claims ( 20 ) What is claimed is: 1. A method ...,Description RELATED APPLICATION DATA This appl...,a method of processing an image to detect the ...,methods of detecting an object in an image usi...
US20180096457A1,2016-09-08,Carnegie Mellon University,Abstract Methods of detecting an object in an ...,Claims ( 20 ) What is claimed is: 1 . A method...,Description RELATED APPLICATION DATA This appl...,a method of processing an image to detect the ...,methods of detecting an object in an image usi...
WO2016145379A1,2015-03-12,William Marsh Rice University,Abstract A mechanism for compiling a generativ...,Claims CLAIMS What is claimed is: 1. A compute...,Description TITLE: Automated Compilation of Pr...,claims a computerimplemented method for constr...,a mechanism for compiling a generative descrip...
US20190164290A1,2016-08-25,Intel Corp,Abstract Techniques related to implementing fu...,Claims ( 24 ) 1 - 29 . (canceled) 30 . A compu...,Description BACKGROUND Semantic image segmenta...,1 canceled,techniques related to implementing fully convo...
US8687893B2,2011-03-31,Microsoft Corp,Abstract Classification algorithm optimization...,Claims ( 20 ) The invention claimed is: 1. A c...,Description BACKGROUND Classification algorith...,the invention claimed is 1,classification algorithm optimization is descr...
US20180268234A1,2016-10-10,Gyrfalcon Technology Inc,Abstract A deep learning object detection and ...,Claims ( 19 ) What is claimed is: 1 . A deep l...,Description CROSS REFERENCE TO RELATED APPLICA...,a deep learning object detection and recogniti...,a deep learning object detection and recogniti...
