In [2]:
import pandas as pd

### Load the IDs of patents granted in 2012 and of applications from 2017

In [3]:
grants2012id_df = pd.read_csv('../data/grant2012_all.csv')

In [4]:
grants2012id_df = grants2012id_df.drop("Unnamed: 0", axis=1)

In [5]:
grants2012id_df.head()

Unnamed: 0,parsed
0,8166569
1,8166570
2,8166571
3,8166572
4,8166573


In [6]:
app2017id_df = pd.read_csv('../data/application2017_all.csv')

In [7]:
app2017id_df = app2017id_df.drop("Unnamed: 0", axis=1)

In [8]:
app2017id_df.head()

Unnamed: 0,app_id
0,14988955
1,15299629
2,15239362
3,15462276
4,14998443


### Filter office-action citations down to 2017 applications (run only once)


citations.csv and office_actions.csv are large, so filter them by app_id to reduce their size. This only needs to be done once.

The cells below need more than 15 GB of memory.

In [17]:
citations = pd.read_csv("../data/citations.csv")

In [18]:
citations.head()

Unnamed: 0,app_id,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa
0,12000001,20060218340,20060218340,,,,0,1,0
1,12000001,2343564,2343564,,,,1,0,0
2,12000001,6622200,6622200,,,,1,0,0
3,12000001,6182004,6182004,,,,0,1,0
4,12000001,2323432,2323432,,,,1,0,0


In [23]:
citations.shape

(58862278, 9)

In [20]:
citations_2017 = pd.merge(citations, app2017id_df, on="app_id")

In [22]:
citations_2017.shape

(278403, 9)

In [21]:
citations_2017.head()

Unnamed: 0,app_id,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa
0,12022531,20050240098,20050240098,,,,0,1,0
1,12022531,20050148995,20050148995,,,,0,1,0
2,12022531,5873827,5873827,,,,1,0,0
3,12022531,6544185,6544185,,,,1,0,0
4,12022531,6725083,6725083,,,,1,0,0


In [9]:
sum(app2017id_df["app_id"]==12022531)

1

In [24]:
citations_2017.to_pickle("../data/citations_2017.dat")

### Filter office_actions down to app_ids from 2017 (run only once)

In [25]:
office_actions = pd.read_csv("../data/office_actions.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [26]:
office_actions.shape

(4384532, 23)

In [27]:
office_actions.head()

Unnamed: 0,app_id,ifw_number,document_cd,mail_dt,art_unit,uspc_class,uspc_subclass,header_missing,fp_missing,rejection_fp_mismatch,...,rejection_103,rejection_112,rejection_dp,objection,allowed_claims,cite102_gt1,cite103_gt3,cite103_eq1,cite103_max,signature_type
0,14150981,100867762,CTFR,2015-10-15,2632,375,219000,0,0,0,...,1,0,0,0,0,0,0,1,2,1
1,14198961,100867788,CTFR,2015-10-15,2699,345,173000,0,0,0,...,1,0,0,0,0,0,0,0,2,1
2,13796589,100867794,CTNF,2015-10-15,3776,606,159000,0,0,0,...,1,0,0,0,0,0,0,0,3,3
3,14673475,100867844,CTNF,2015-10-15,2627,345,175000,0,0,0,...,1,1,1,0,0,0,1,0,4,1
4,14669113,100867850,CTNF,2015-10-15,2666,382,118000,0,0,0,...,0,1,1,0,0,0,0,0,0,1


In [28]:
office_2017 = pd.merge(office_actions, app2017id_df, on="app_id")

In [29]:
office_2017.shape

(26342, 23)

In [30]:
office_2017.to_pickle("../data/office_2017.dat")

### Load the citations and office actions already filtered to 2017 app_ids

To reduce the memory footprint, restart the kernel here, re-run the import and ID-loading cells at the top, and then load only the filtered pickles below.

In [9]:
citations_2017 = pd.read_pickle("../data/citations_2017.dat")

In [10]:
office_2017 = pd.read_pickle("../data/office_2017.dat")

### Match citations of 2017 applications against 2012 grants

In [11]:
citations_2012_2017 = pd.merge(citations_2017, grants2012id_df.astype("str"), on="parsed")

In [12]:
citations_2012_2017.shape, citations_2017.shape

((6745, 9), (278403, 9))

## Data setup assumption

Download application of 2017 and grants of 2012, unzip xml, and place under data/

From here: https://bulkdata.uspto.gov/#pats

### Grants 2012

Use "Patent Grant Full Text Data (No Images) (JAN 1976 - PRESENT)".

- Download all zip of https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2012/
   - Use shell script to download zip (https://colab.research.google.com/drive/1gJO0VQ72Xfr_SY2dwWyiTvczIv0_gd4L)
       - save to download.sh and run
- place xml under data/grants2012/
   - ex.data/grants2012/ipg120103.xml data/grants2012/ipg120221.xml
   - Total about 24GB

### Application 2017

Use "Patent Application Full Text Data (No Images) (MAR 15, 2001 - PRESENT)"

- Download all zip of https://bulkdata.uspto.gov/data/patent/application/redbook/fulltext/2017/
   - Use above colab shell script again
- place xml under data/application2017/
    - ex. data/application2017/ipa170105.xml data/application2017/ipa170223.xml
    - Total about 35GB

In [13]:
INDEX_PATH="../data/index"

In [10]:
# lindxr is currently updated often, so install it here instead of including it in the Dockerfile

!go get github.com/karino2/lindxr

In [14]:
import subprocess

In [15]:
# lindxr index -indexdest index/grants2012 -pattern "<doc-number>" -target "../data/grants2012/ipg12011*.xml"

def build_index(indexdest, pattern, targetpat):
    """
    Run `lindxr index` to build an index.

    indexdest: value passed to lindxr as -indexdest.
    pattern: value passed to lindxr as -pattern (e.g. "<doc-number>").
    targetpat: value passed to lindxr as -target. No shell is involved, so a
        wildcard in it reaches lindxr unexpanded
        (presumably lindxr expands it itself -- TODO confirm).

    Raises subprocess.CalledProcessError when lindxr exits with a non-zero
    status, so a failed index build is not silently ignored.
    """
    subprocess.check_call(["lindxr", "index", "-indexdest", indexdest, "-pattern", pattern, "-target", targetpat])

In [16]:
def subfile(fpath, start, end):
    """
    Run `lindxr sub` on fpath with the given -start / -end values and
    return its standard output split on "\\n" as a list of strings.
    """
    command = ["lindxr", "sub"]
    command += ["-start", str(start)]
    command += ["-end", str(end)]
    command += ["-input", fpath]
    output = subprocess.check_output(command, universal_newlines=True)
    return output.split("\n")

### Create poor man's index (do it only once)

In [19]:
%%time

build_index(INDEX_PATH+"/grants2012/docnum/", "<doc-number>", "../data/grants2012/ipg*.xml")

CPU times: user 4 ms, sys: 8 ms, total: 12 ms
Wall time: 1min 2s


In [17]:
%%time

build_index(INDEX_PATH+"/grants2012/uspatgra/", "<us-patent-grant", "../data/grants2012/ipg*.xml")

CPU times: user 0 ns, sys: 8 ms, total: 8 ms
Wall time: 59.3 s


In [18]:
%%time

build_index(INDEX_PATH+"/grants2012/close_uspatgra/", "</us-patent-grant>", "../data/grants2012/ipg*.xml")

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 57.1 s


# Load index from files

In [17]:
import glob

In [18]:
indexDNGrants2012 = glob.glob("../data/index/grants2012/docnum/*.idx")

## First load one index from file and filter only real doc-number index

In [19]:
def collect_lines(fpath):
    """
    Read an index file and return, for every line, the integer that
    precedes the first ":" on that line.
    """
    line_numbers = []
    with open(fpath, "r") as index_file:
        for entry in index_file:
            prefix = entry.split(":", 1)[0]
            line_numbers.append(int(prefix))
    return line_numbers

In [20]:
def collect_line_match_tupple(indexfile):
    """
    Read an index file whose lines look like "<number>:<matched text>" and
    return a list of (number as int, matched text) tuples.
    Only the first ":" separates the two parts; the trailing newline is removed.
    """
    pairs = []
    with open(indexfile, "r") as index_file:
        for raw in index_file:
            lnumstr, match = raw.rstrip("\n").split(":", 1)
            pairs.append((int(lnumstr), match))
    return pairs

In [60]:
indexDNGrants2012[0]

'../data/index/grants2012/docnum/ipg120501.xml.idx'

In [20]:
fpath = indexDNGrants2012[0]

In [22]:
open_lines = collect_lines(fpath.replace("docnum", "uspatgra"))
close_lines = collect_lines(fpath.replace("docnum", "close_uspatgra"))

In [23]:
len(open_lines), len(close_lines)

(5520, 5520)

In [25]:
tups = collect_line_match_tupple(fpath)

In [26]:
len(tups)

201316

In [27]:
tups[0]

(8, '<doc-number>D0658346</doc-number>')

Retrieve doc-number. Filter non-number value and convert to int here.

In [21]:
import re

In [22]:
doc_num_pat = re.compile(r'<doc-number>([0-9]+)</doc-number>')

In [23]:
def filer_doc_number_only_number(tups):
    """
    Keep only the (line, text) entries whose text matches doc_num_pat and
    return them as (line, doc-number converted to int) tuples.
    """
    candidates = ((lnum, doc_num_pat.match(docline)) for lnum, docline in tups)
    return [(lnum, int(found.group(1))) for lnum, found in candidates if found]


In [31]:
tups_ids = filer_doc_number_only_number(tups)

In [32]:
len(tups_ids), tups_ids[0]

(126788, (16, 29385429))

### Keep only the doc-numbers located exactly 5 lines after &lt;us-patent-grant...&gt;

In [24]:
DOC_NUMBER_OFFSET_FROM_OPEN=5

In [25]:
def search_closest_at(open_lines, line, candidate):
    """
    return maximum index which satisfy open_lines[idx] <= line.
    Search is started from candidate (values below 0 are treated as 0).

    Returns -1 when the entry at the starting position is already greater
    than line, or when open_lines is empty.
    Assumes open_lines is sorted in ascending order and candidate is not
    past the answer; the scan only moves forward.
    """
    # A negative start would make range() yield negative indices, which wrap
    # around to the end of the list and produce -2, -3, ... (or IndexError).
    start = max(candidate, 0)
    for i in range(start, len(open_lines)):
        if open_lines[i] > line:
            return i-1
    return len(open_lines)-1


In [26]:
def list_matched_open_idx(open_lines, tups_ids):
    """
    return index of open_lines which tups_ids belong to.

    tups_ids: list of (line, doc-number) tuples; only the line is used here.
    The result has one entry per element of tups_ids. An entry is -1 when
    search_closest_at finds no entry of open_lines at or before that line.
    """
    matched_opens = []
    candidate = 0
    for lnum, _docnum in tups_ids:
        res = search_closest_at(open_lines, lnum, candidate)
        matched_opens.append(res)
        # Each search resumes from the previous result. Never hand a negative
        # value back as the start position: it would index open_lines from
        # the end on the next search.
        candidate = max(res, 0)
    return matched_opens

In [36]:
matched_opens = list_matched_open_idx(open_lines, tups_ids)

### Tuple of (line, doc-number, corresponding idx of open_lines)

In [27]:
def filter_offset_match_only(open_lines, tups_ids, matched_opens, expected_offset):
    """
    Keep the entries of tups_ids whose line equals
    open_lines[matched index] + expected_offset.

    tups_ids: list of (line, doc-number) tuples.
    matched_opens: for each entry of tups_ids, an index into open_lines.
    Returns a list of (line, doc-number, index into open_lines) tuples.
    """
    kept = []
    for pos, (lnum, docnum) in enumerate(tups_ids):
        open_idx = matched_opens[pos]
        if open_lines[open_idx] + expected_offset == lnum:
            kept.append((lnum, docnum, open_idx))
    return kept

In [38]:
tups_only_5 = filter_offset_match_only(open_lines, tups_ids, matched_opens, DOC_NUMBER_OFFSET_FROM_OPEN)

In [39]:
len(tups_only_5), tups_only_5[0:5]

(4981,
 [(308158, 8166569, 539),
  (309038, 8166570, 540),
  (309428, 8166571, 541),
  (310710, 8166572, 542),
  (311438, 8166573, 543)])

### Check how many of them match grant2012_all.csv

In [40]:
grants2012id = set(grants2012id_df["parsed"])

In [41]:
sum([id in grants2012id for _, id, _ in tups_only_5])

4981

In [42]:
# All of them seem to match, but filter anyway to be sure.
tups_only_5 = [(lnum, id, openidx) for lnum, id, openidx in tups_only_5 if id in grants2012id]

In [43]:
len(tups_only_5)

4981

### Keep only the doc-numbers that appear in citations_2012_2017

In [89]:
pat_id_set = set(citations_2012_2017["parsed"].astype("int"))

In [88]:
sum(citations_2012_2017["parsed"].astype("int").isna())

0

In [90]:
len(pat_id_set)

5424

In [91]:
final_tups = [(lnum, id, openidx) for lnum, id, openidx in tups_only_5 if id in pat_id_set]

In [92]:
len(final_tups)

98

In [94]:
final_tups[0:5]

[(359894, 8166649, 619),
 (426845, 8166734, 704),
 (445288, 8166761, 731),
 (447021, 8166763, 733),
 (603329, 8166969, 939)]

In [101]:
tups_only_5[0:5]

[(308158, 8166569, 539),
 (309038, 8166570, 540),
 (309428, 8166571, 541),
 (310710, 8166572, 542),
 (311438, 8166573, 543)]

In [102]:
sum(citations_2012_2017["parsed"].astype("int") == 8166649), sum(citations_2012_2017["parsed"].astype("int") == 8166569)

(1, 0)

### Wrap what I did so far into a function

In [28]:
# DocNumIndex bundles the index data of one grants XML file.
#   open_lines:     values read from the "uspatgra" index file
#   close_lines:    values read from the "close_uspatgra" index file
#   docnum_tupples: list of (line, doc-number, corresponding idx of open_lines)
from collections import namedtuple

DocNumIndex = namedtuple('DocNumIndex', ['open_lines', 'close_lines', "docnum_tupples"])

In [29]:
def one_index_to_docid_tupples(fpath, patidset):
    """
    Build a DocNumIndex for one docnum index file.

    fpath: path for docnum index. The paths of the matching "uspatgra" and
        "close_uspatgra" index files are derived from it by replacing "docnum".
    patidset: Patent id set that we want to keep.

    The returned docnum_tupples holds only the (line, doc-number, open idx)
    entries that sit DOC_NUMBER_OFFSET_FROM_OPEN lines after their open line
    and whose doc-number is in patidset.
    """
    open_index_path = fpath.replace("docnum", "uspatgra")
    close_index_path = fpath.replace("docnum", "close_uspatgra")
    open_lines = collect_lines(open_index_path)
    close_lines = collect_lines(close_index_path)

    numeric_entries = filer_doc_number_only_number(collect_line_match_tupple(fpath))
    owner_idx = list_matched_open_idx(open_lines, numeric_entries)
    at_offset = filter_offset_match_only(
        open_lines, numeric_entries, owner_idx, DOC_NUMBER_OFFSET_FROM_OPEN)
    wanted = [entry for entry in at_offset if entry[1] in patidset]
    return DocNumIndex(open_lines, close_lines, wanted)


In [105]:
docnum_ind = one_index_to_docid_tupples(fpath, pat_id_set)

In [108]:
len(final_tups), len(docnum_ind.docnum_tupples)

(98, 98)

In [109]:
docnum_ind.docnum_tupples[0:5]

[(359894, 8166649, 619),
 (426845, 8166734, 704),
 (445288, 8166761, 731),
 (447021, 8166763, 733),
 (603329, 8166969, 939)]

In [110]:
# confirm result by checking xml
_, patid, openidx = docnum_ind.docnum_tupples[0]

In [112]:
openidx

619

In [113]:
fpath

'../data/index/grants2012/docnum/ipg120501.xml.idx'

In [117]:
subfile("../data/grants2012/ipg120501.xml",
        docnum_ind.open_lines[openidx]-2, docnum_ind.close_lines[openidx])[0:10]

['<?xml version="1.0" encoding="UTF-8"?>',
 '<!DOCTYPE us-patent-grant SYSTEM "us-patent-grant-v42-2006-08-23.dtd" [ ]>',
 '<us-patent-grant lang="EN" dtd-version="v4.2 2006-08-23" file="US08166649-20120501.XML" status="PRODUCTION" id="us-patent-grant" country="US" date-produced="20120418" date-publ="20120501">',
 '<us-bibliographic-data-grant>',
 '<publication-reference>',
 '<document-id>',
 '<country>US</country>',
 '<doc-number>08166649</doc-number>',
 '<kind>B2</kind>',
 '<date>20120501</date>']

### Now do it for all files under grants2012

In [30]:
pat_id_set = set(citations_2012_2017["parsed"].astype("int"))

In [31]:
import tqdm

In [32]:
# Map each docnum index path to its DocNumIndex, leaving out (and reporting)
# the files that contain none of the patent ids in pat_id_set.
file_dni_dict = {}
for fpath in tqdm.tqdm(indexDNGrants2012):
    dni = one_index_to_docid_tupples(fpath, pat_id_set)
    if dni.docnum_tupples:
        file_dni_dict[fpath] = dni
        continue
    print("skip {}".format(fpath))

100%|██████████| 52/52 [00:27<00:00,  1.89it/s]


In [122]:
len(file_dni_dict.keys())

52

### Is patent id unique? (A. yes).

In [123]:
all_docnum = [patid for fpath in file_dni_dict.keys() for _, patid, _ in file_dni_dict[fpath].docnum_tupples]

In [124]:
len(all_docnum)

5424

In [125]:
len(set(all_docnum))

5424

### Then create dict of {patent-id: (path, openlinenum, closelinenum)}

In [33]:
def indexpath_to_xmlpath(fpath):
    """
    Convert the path of a docnum index file into the path of the XML file
    it was built from, e.g.
    '../data/index/grants2012/docnum/ipg120821.xml.idx'
    -> '../data/grants2012/ipg120821.xml'
    """
    without_index_dir = fpath.replace("data/index", "data")
    without_docnum_dir = without_index_dir.replace("docnum/", "")
    # Drop the last 4 characters (the ".idx" extension of an index file).
    return without_docnum_dir[:-4]

In [137]:
indexpath_to_xmlpath('../data/index/grants2012/docnum/ipg120821.xml.idx')

'../data/grants2012/ipg120821.xml'

In [35]:
# patent id -> (xml path, open line, close line), gathered over every file.
patid_to_subfile = dict(
    (patid, (indexpath_to_xmlpath(fpath), dni.open_lines[idx], dni.close_lines[idx]))
    for fpath, dni in file_dni_dict.items()
    for _, patid, idx in dni.docnum_tupples
)

In [140]:
len(patid_to_subfile.keys())

5424

In [141]:
list(patid_to_subfile.keys())[0:5]

[8249344, 8126465, 8232962, 8200771, 8118276]

In [142]:
patid_to_subfile[8249344]

('../data/grants2012/ipg120821.xml', 5342754, 5344633)

In [145]:
subfile(*patid_to_subfile[8249344])[0:10]

['<us-patent-grant lang="EN" dtd-version="v4.2 2006-08-23" file="US08249344-20120821.XML" status="PRODUCTION" id="us-patent-grant" country="US" date-produced="20120806" date-publ="20120821">',
 '<us-bibliographic-data-grant>',
 '<publication-reference>',
 '<document-id>',
 '<country>US</country>',
 '<doc-number>08249344</doc-number>',
 '<kind>B2</kind>',
 '<date>20120821</date>',
 '</document-id>',
 '</publication-reference>']

In [38]:
%%time
# Build {patent id: XML text of that grant}.
# The range starts at openl-2; in the sample checked earlier in this notebook
# those two extra lines were the <?xml ...?> and <!DOCTYPE ...> lines that
# precede <us-patent-grant> (assumed to hold for every grant -- TODO confirm).
# NOTE(review): each entry calls subfile(), i.e. one `lindxr sub` process per
# patent id, which is why this cell takes so long to run.
grants_all_xml_dict = {patid: "\n".join(subfile(fpath, openl-2, closel))
                       for patid, (fpath, openl, closel) in tqdm.tqdm(patid_to_subfile.items())}


  0%|          | 0/5424 [00:00<?, ?it/s][A
  0%|          | 1/5424 [00:00<24:44,  3.65it/s][A
  0%|          | 2/5424 [00:04<3:22:15,  2.24s/it][A
  0%|          | 3/5424 [00:08<4:11:37,  2.78s/it][A
  0%|          | 4/5424 [00:09<3:29:34,  2.32s/it][A
  0%|          | 5/5424 [00:16<4:56:01,  3.28s/it][A
  0%|          | 6/5424 [00:20<5:15:10,  3.49s/it][A
  0%|          | 7/5424 [00:26<5:43:31,  3.81s/it][A
  0%|          | 8/5424 [00:30<5:42:12,  3.79s/it][A
  0%|          | 9/5424 [00:32<5:23:05,  3.58s/it][A
  0%|          | 10/5424 [00:32<4:53:56,  3.26s/it][A
  0%|          | 11/5424 [00:32<4:29:03,  2.98s/it][A
  0%|          | 12/5424 [00:33<4:13:33,  2.81s/it][A
  0%|          | 13/5424 [00:37<4:17:49,  2.86s/it][A
  0%|          | 14/5424 [00:37<4:01:35,  2.68s/it][A
  0%|          | 15/5424 [00:40<4:01:42,  2.68s/it][A
  0%|          | 16/5424 [00:45<4:15:49,  2.84s/it][A
  0%|          | 17/5424 [00:46<4:06:21,  2.73s/it][A
  0%|          | 18/5424 [00:47

  3%|▎         | 156/5424 [03:00<1:41:46,  1.16s/it][A
  3%|▎         | 158/5424 [03:01<1:40:36,  1.15s/it][A
  3%|▎         | 159/5424 [03:01<1:40:08,  1.14s/it][A
  3%|▎         | 160/5424 [03:06<1:42:20,  1.17s/it][A
  3%|▎         | 161/5424 [03:06<1:41:46,  1.16s/it][A
  3%|▎         | 162/5424 [03:07<1:41:24,  1.16s/it][A
  3%|▎         | 163/5424 [03:07<1:41:04,  1.15s/it][A
  3%|▎         | 164/5424 [03:08<1:40:32,  1.15s/it][A
  3%|▎         | 165/5424 [03:08<1:40:10,  1.14s/it][A
  3%|▎         | 167/5424 [03:09<1:39:10,  1.13s/it][A
  3%|▎         | 168/5424 [03:09<1:38:42,  1.13s/it][A
  3%|▎         | 169/5424 [03:10<1:38:57,  1.13s/it][A
  3%|▎         | 170/5424 [03:11<1:38:31,  1.13s/it][A
  3%|▎         | 171/5424 [03:12<1:38:31,  1.13s/it][A
  3%|▎         | 172/5424 [03:12<1:38:03,  1.12s/it][A
  3%|▎         | 173/5424 [03:12<1:37:34,  1.11s/it][A
  3%|▎         | 174/5424 [03:13<1:37:11,  1.11s/it][A
  3%|▎         | 175/5424 [03:13<1:36:40,  1.11s

  6%|▌         | 325/5424 [04:28<1:10:14,  1.21it/s][A
  6%|▌         | 326/5424 [04:29<1:10:09,  1.21it/s][A
  6%|▌         | 327/5424 [04:29<1:10:03,  1.21it/s][A
  6%|▌         | 329/5424 [04:30<1:09:45,  1.22it/s][A
  6%|▌         | 330/5424 [04:30<1:09:39,  1.22it/s][A
  6%|▌         | 331/5424 [04:31<1:09:30,  1.22it/s][A
  6%|▌         | 332/5424 [04:31<1:09:22,  1.22it/s][A
  6%|▌         | 333/5424 [04:31<1:09:15,  1.23it/s][A
  6%|▌         | 334/5424 [04:32<1:09:08,  1.23it/s][A
  6%|▌         | 335/5424 [04:33<1:09:17,  1.22it/s][A
  6%|▌         | 336/5424 [04:33<1:09:06,  1.23it/s][A
  6%|▌         | 337/5424 [04:34<1:08:56,  1.23it/s][A
  6%|▌         | 338/5424 [04:34<1:08:47,  1.23it/s][A
  6%|▋         | 340/5424 [04:34<1:08:25,  1.24it/s][A
  6%|▋         | 341/5424 [04:34<1:08:16,  1.24it/s][A
  6%|▋         | 342/5424 [04:35<1:08:06,  1.24it/s][A
  6%|▋         | 343/5424 [04:35<1:07:58,  1.25it/s][A
  6%|▋         | 344/5424 [04:35<1:07:48,  1.25i

  9%|▉         | 488/5424 [06:06<1:01:48,  1.33it/s][A
  9%|▉         | 489/5424 [06:07<1:01:44,  1.33it/s][A
  9%|▉         | 490/5424 [06:07<1:01:40,  1.33it/s][A
  9%|▉         | 491/5424 [06:07<1:01:33,  1.34it/s][A
  9%|▉         | 492/5424 [06:08<1:01:30,  1.34it/s][A
  9%|▉         | 493/5424 [06:08<1:01:25,  1.34it/s][A
  9%|▉         | 494/5424 [06:08<1:01:22,  1.34it/s][A
  9%|▉         | 495/5424 [06:09<1:01:17,  1.34it/s][A
  9%|▉         | 496/5424 [06:09<1:01:13,  1.34it/s][A
  9%|▉         | 497/5424 [06:10<1:01:09,  1.34it/s][A
  9%|▉         | 498/5424 [06:10<1:01:04,  1.34it/s][A
  9%|▉         | 500/5424 [06:10<1:00:50,  1.35it/s][A
  9%|▉         | 501/5424 [06:11<1:00:46,  1.35it/s][A
  9%|▉         | 503/5424 [06:11<1:00:34,  1.35it/s][A
  9%|▉         | 504/5424 [06:11<1:00:28,  1.36it/s][A
  9%|▉         | 505/5424 [06:11<1:00:22,  1.36it/s][A
  9%|▉         | 506/5424 [06:12<1:00:16,  1.36it/s][A
  9%|▉         | 507/5424 [06:12<1:00:12,  1.36i

 12%|█▏        | 662/5424 [07:01<50:33,  1.57it/s][A
 12%|█▏        | 663/5424 [07:02<50:34,  1.57it/s][A
 12%|█▏        | 664/5424 [07:03<50:32,  1.57it/s][A
 12%|█▏        | 665/5424 [07:03<50:29,  1.57it/s][A
 12%|█▏        | 666/5424 [07:03<50:25,  1.57it/s][A
 12%|█▏        | 668/5424 [07:03<50:18,  1.58it/s][A
 12%|█▏        | 670/5424 [07:04<50:10,  1.58it/s][A
 12%|█▏        | 671/5424 [07:04<50:09,  1.58it/s][A
 12%|█▏        | 673/5424 [07:05<50:01,  1.58it/s][A
 12%|█▏        | 674/5424 [07:05<49:59,  1.58it/s][A
 12%|█▏        | 675/5424 [07:05<49:56,  1.58it/s][A
 12%|█▏        | 676/5424 [07:06<49:53,  1.59it/s][A
 12%|█▏        | 677/5424 [07:06<49:50,  1.59it/s][A
 13%|█▎        | 679/5424 [07:06<49:40,  1.59it/s][A
 13%|█▎        | 680/5424 [07:06<49:37,  1.59it/s][A
 13%|█▎        | 681/5424 [07:07<49:34,  1.59it/s][A
 13%|█▎        | 682/5424 [07:07<49:31,  1.60it/s][A
 13%|█▎        | 684/5424 [07:07<49:24,  1.60it/s][A
 13%|█▎        | 685/5424 [0

 15%|█▌        | 840/5424 [07:52<42:59,  1.78it/s][A
 16%|█▌        | 842/5424 [07:53<42:55,  1.78it/s][A
 16%|█▌        | 843/5424 [07:53<42:53,  1.78it/s][A
 16%|█▌        | 844/5424 [07:53<42:51,  1.78it/s][A
 16%|█▌        | 845/5424 [07:54<42:50,  1.78it/s][A
 16%|█▌        | 846/5424 [07:55<42:54,  1.78it/s][A
 16%|█▌        | 847/5424 [07:56<42:52,  1.78it/s][A
 16%|█▌        | 849/5424 [07:56<42:47,  1.78it/s][A
 16%|█▌        | 850/5424 [07:56<42:46,  1.78it/s][A
 16%|█▌        | 851/5424 [07:57<42:44,  1.78it/s][A
 16%|█▌        | 852/5424 [07:57<42:41,  1.78it/s][A
 16%|█▌        | 853/5424 [07:57<42:39,  1.79it/s][A
 16%|█▌        | 854/5424 [07:58<42:38,  1.79it/s][A
 16%|█▌        | 855/5424 [07:58<42:35,  1.79it/s][A
 16%|█▌        | 856/5424 [07:58<42:34,  1.79it/s][A
 16%|█▌        | 857/5424 [07:59<42:32,  1.79it/s][A
 16%|█▌        | 859/5424 [07:59<42:27,  1.79it/s][A
 16%|█▌        | 860/5424 [07:59<42:25,  1.79it/s][A
 16%|█▌        | 863/5424 [0

 19%|█▉        | 1038/5424 [08:56<37:47,  1.93it/s][A
 19%|█▉        | 1040/5424 [08:57<37:43,  1.94it/s][A
 19%|█▉        | 1042/5424 [08:57<37:38,  1.94it/s][A
 19%|█▉        | 1043/5424 [08:57<37:37,  1.94it/s][A
 19%|█▉        | 1044/5424 [08:57<37:34,  1.94it/s][A
 19%|█▉        | 1045/5424 [08:57<37:34,  1.94it/s][A
 19%|█▉        | 1046/5424 [08:58<37:33,  1.94it/s][A
 19%|█▉        | 1047/5424 [08:58<37:31,  1.94it/s][A
 19%|█▉        | 1049/5424 [08:59<37:28,  1.95it/s][A
 19%|█▉        | 1051/5424 [08:59<37:25,  1.95it/s][A
 19%|█▉        | 1052/5424 [08:59<37:24,  1.95it/s][A
 19%|█▉        | 1054/5424 [09:00<37:19,  1.95it/s][A
 19%|█▉        | 1055/5424 [09:00<37:18,  1.95it/s][A
 19%|█▉        | 1056/5424 [09:00<37:16,  1.95it/s][A
 19%|█▉        | 1057/5424 [09:00<37:14,  1.95it/s][A
 20%|█▉        | 1059/5424 [09:01<37:10,  1.96it/s][A
 20%|█▉        | 1060/5424 [09:01<37:08,  1.96it/s][A
 20%|█▉        | 1062/5424 [09:01<37:03,  1.96it/s][A
 20%|█▉   

 23%|██▎       | 1245/5424 [09:47<32:53,  2.12it/s][A
 23%|██▎       | 1246/5424 [09:48<32:52,  2.12it/s][A
 23%|██▎       | 1248/5424 [09:48<32:49,  2.12it/s][A
 23%|██▎       | 1249/5424 [09:48<32:48,  2.12it/s][A
 23%|██▎       | 1251/5424 [09:49<32:45,  2.12it/s][A
 23%|██▎       | 1253/5424 [09:49<32:41,  2.13it/s][A
 23%|██▎       | 1255/5424 [09:49<32:38,  2.13it/s][A
 23%|██▎       | 1257/5424 [09:49<32:34,  2.13it/s][A
 23%|██▎       | 1258/5424 [09:49<32:33,  2.13it/s][A
 23%|██▎       | 1259/5424 [09:50<32:32,  2.13it/s][A
 23%|██▎       | 1260/5424 [09:50<32:31,  2.13it/s][A
 23%|██▎       | 1261/5424 [09:50<32:30,  2.13it/s][A
 23%|██▎       | 1262/5424 [09:51<32:29,  2.13it/s][A
 23%|██▎       | 1263/5424 [09:51<32:28,  2.14it/s][A
 23%|██▎       | 1266/5424 [09:51<32:23,  2.14it/s][A
 23%|██▎       | 1267/5424 [09:52<32:23,  2.14it/s][A
 23%|██▎       | 1268/5424 [09:52<32:21,  2.14it/s][A
 23%|██▎       | 1270/5424 [09:52<32:18,  2.14it/s][A
 23%|██▎  

 27%|██▋       | 1458/5424 [10:33<28:43,  2.30it/s][A
 27%|██▋       | 1460/5424 [10:33<28:40,  2.30it/s][A
 27%|██▋       | 1461/5424 [10:33<28:39,  2.31it/s][A
 27%|██▋       | 1462/5424 [10:34<28:38,  2.31it/s][A
 27%|██▋       | 1463/5424 [10:34<28:37,  2.31it/s][A
 27%|██▋       | 1464/5424 [10:34<28:36,  2.31it/s][A
 27%|██▋       | 1465/5424 [10:34<28:35,  2.31it/s][A
 27%|██▋       | 1466/5424 [10:35<28:34,  2.31it/s][A
 27%|██▋       | 1467/5424 [10:35<28:34,  2.31it/s][A
 27%|██▋       | 1468/5424 [10:35<28:33,  2.31it/s][A
 27%|██▋       | 1470/5424 [10:36<28:31,  2.31it/s][A
 27%|██▋       | 1472/5424 [10:36<28:28,  2.31it/s][A
 27%|██▋       | 1474/5424 [10:36<28:26,  2.32it/s][A
 27%|██▋       | 1475/5424 [10:36<28:25,  2.32it/s][A
 27%|██▋       | 1476/5424 [10:37<28:24,  2.32it/s][A
 27%|██▋       | 1478/5424 [10:37<28:21,  2.32it/s][A
 27%|██▋       | 1479/5424 [10:37<28:20,  2.32it/s][A
 27%|██▋       | 1481/5424 [10:38<28:18,  2.32it/s][A
 27%|██▋  

 31%|███       | 1675/5424 [11:15<25:12,  2.48it/s][A
 31%|███       | 1677/5424 [11:15<25:09,  2.48it/s][A
 31%|███       | 1679/5424 [11:16<25:08,  2.48it/s][A
 31%|███       | 1680/5424 [11:16<25:07,  2.48it/s][A
 31%|███       | 1682/5424 [11:16<25:04,  2.49it/s][A
 31%|███       | 1684/5424 [11:16<25:03,  2.49it/s][A
 31%|███       | 1686/5424 [11:17<25:01,  2.49it/s][A
 31%|███       | 1688/5424 [11:17<24:58,  2.49it/s][A
 31%|███       | 1690/5424 [11:17<24:56,  2.49it/s][A
 31%|███       | 1692/5424 [11:17<24:55,  2.50it/s][A
 31%|███       | 1693/5424 [11:18<24:54,  2.50it/s][A
 31%|███       | 1694/5424 [11:18<24:53,  2.50it/s][A
 31%|███▏      | 1696/5424 [11:18<24:51,  2.50it/s][A
 31%|███▏      | 1698/5424 [11:18<24:49,  2.50it/s][A
 31%|███▏      | 1700/5424 [11:18<24:47,  2.50it/s][A
 31%|███▏      | 1702/5424 [11:19<24:45,  2.51it/s][A
 31%|███▏      | 1704/5424 [11:19<24:43,  2.51it/s][A
 31%|███▏      | 1707/5424 [11:19<24:40,  2.51it/s][A
 32%|███▏ 

 35%|███▍      | 1886/5424 [11:53<22:18,  2.64it/s][A
 35%|███▍      | 1887/5424 [11:53<22:17,  2.64it/s][A
 35%|███▍      | 1888/5424 [11:54<22:17,  2.64it/s][A
 35%|███▍      | 1889/5424 [11:54<22:16,  2.64it/s][A
 35%|███▍      | 1890/5424 [11:54<22:16,  2.64it/s][A
 35%|███▍      | 1891/5424 [11:54<22:15,  2.65it/s][A
 35%|███▍      | 1893/5424 [11:55<22:14,  2.65it/s][A
 35%|███▍      | 1894/5424 [11:55<22:13,  2.65it/s][A
 35%|███▍      | 1895/5424 [11:55<22:12,  2.65it/s][A
 35%|███▍      | 1896/5424 [11:56<22:12,  2.65it/s][A
 35%|███▍      | 1897/5424 [11:56<22:12,  2.65it/s][A
 35%|███▍      | 1898/5424 [11:56<22:11,  2.65it/s][A
 35%|███▌      | 1899/5424 [11:56<22:10,  2.65it/s][A
 35%|███▌      | 1900/5424 [11:57<22:10,  2.65it/s][A
 35%|███▌      | 1901/5424 [11:57<22:09,  2.65it/s][A
 35%|███▌      | 1903/5424 [11:57<22:08,  2.65it/s][A
 35%|███▌      | 1904/5424 [11:57<22:07,  2.65it/s][A
 35%|███▌      | 1905/5424 [11:58<22:06,  2.65it/s][A
 35%|███▌ 

 38%|███▊      | 2075/5424 [12:36<20:21,  2.74it/s][A
 38%|███▊      | 2076/5424 [12:36<20:20,  2.74it/s][A
 38%|███▊      | 2077/5424 [12:37<20:20,  2.74it/s][A
 38%|███▊      | 2078/5424 [12:37<20:19,  2.74it/s][A
 38%|███▊      | 2079/5424 [12:37<20:18,  2.74it/s][A
 38%|███▊      | 2080/5424 [12:37<20:18,  2.74it/s][A
 38%|███▊      | 2081/5424 [12:38<20:17,  2.74it/s][A
 38%|███▊      | 2082/5424 [12:38<20:17,  2.74it/s][A
 38%|███▊      | 2083/5424 [12:38<20:17,  2.75it/s][A
 38%|███▊      | 2084/5424 [12:39<20:16,  2.75it/s][A
 38%|███▊      | 2085/5424 [12:39<20:16,  2.75it/s][A
 38%|███▊      | 2086/5424 [12:39<20:15,  2.75it/s][A
 38%|███▊      | 2087/5424 [12:39<20:15,  2.75it/s][A
 38%|███▊      | 2088/5424 [12:40<20:14,  2.75it/s][A
 39%|███▊      | 2089/5424 [12:40<20:14,  2.75it/s][A
 39%|███▊      | 2090/5424 [12:40<20:13,  2.75it/s][A
 39%|███▊      | 2091/5424 [12:41<20:13,  2.75it/s][A
 39%|███▊      | 2092/5424 [12:41<20:12,  2.75it/s][A
 39%|███▊ 

 42%|████▏     | 2258/5424 [13:17<18:38,  2.83it/s][A
 42%|████▏     | 2259/5424 [13:17<18:37,  2.83it/s][A
 42%|████▏     | 2260/5424 [13:18<18:37,  2.83it/s][A
 42%|████▏     | 2261/5424 [13:18<18:36,  2.83it/s][A
 42%|████▏     | 2262/5424 [13:18<18:35,  2.83it/s][A
 42%|████▏     | 2263/5424 [13:18<18:35,  2.83it/s][A
 42%|████▏     | 2264/5424 [13:18<18:35,  2.83it/s][A
 42%|████▏     | 2266/5424 [13:19<18:33,  2.84it/s][A
 42%|████▏     | 2268/5424 [13:19<18:32,  2.84it/s][A
 42%|████▏     | 2269/5424 [13:19<18:32,  2.84it/s][A
 42%|████▏     | 2270/5424 [13:20<18:31,  2.84it/s][A
 42%|████▏     | 2272/5424 [13:20<18:30,  2.84it/s][A
 42%|████▏     | 2273/5424 [13:21<18:30,  2.84it/s][A
 42%|████▏     | 2275/5424 [13:21<18:29,  2.84it/s][A
 42%|████▏     | 2276/5424 [13:21<18:29,  2.84it/s][A
 42%|████▏     | 2277/5424 [13:22<18:28,  2.84it/s][A
 42%|████▏     | 2278/5424 [13:22<18:27,  2.84it/s][A
 42%|████▏     | 2279/5424 [13:22<18:27,  2.84it/s][A
 42%|████▏

 45%|████▍     | 2438/5424 [14:00<17:09,  2.90it/s][A
 45%|████▍     | 2439/5424 [14:00<17:09,  2.90it/s][A
 45%|████▍     | 2440/5424 [14:01<17:08,  2.90it/s][A
 45%|████▌     | 2441/5424 [14:01<17:08,  2.90it/s][A
 45%|████▌     | 2442/5424 [14:01<17:07,  2.90it/s][A
 45%|████▌     | 2443/5424 [14:01<17:07,  2.90it/s][A
 45%|████▌     | 2444/5424 [14:02<17:06,  2.90it/s][A
 45%|████▌     | 2445/5424 [14:02<17:06,  2.90it/s][A
 45%|████▌     | 2446/5424 [14:02<17:05,  2.90it/s][A
 45%|████▌     | 2447/5424 [14:02<17:05,  2.90it/s][A
 45%|████▌     | 2448/5424 [14:03<17:05,  2.90it/s][A
 45%|████▌     | 2449/5424 [14:03<17:04,  2.90it/s][A
 45%|████▌     | 2450/5424 [14:03<17:04,  2.90it/s][A
 45%|████▌     | 2451/5424 [14:03<17:03,  2.90it/s][A
 45%|████▌     | 2452/5424 [14:04<17:03,  2.90it/s][A
 45%|████▌     | 2453/5424 [14:04<17:02,  2.90it/s][A
 45%|████▌     | 2454/5424 [14:04<17:02,  2.91it/s][A
 45%|████▌     | 2455/5424 [14:04<17:01,  2.91it/s][A
 45%|████▌

 48%|████▊     | 2606/5424 [14:39<15:51,  2.96it/s][A
 48%|████▊     | 2608/5424 [14:39<15:50,  2.96it/s][A
 48%|████▊     | 2609/5424 [14:40<15:49,  2.96it/s][A
 48%|████▊     | 2611/5424 [14:40<15:48,  2.97it/s][A
 48%|████▊     | 2613/5424 [14:40<15:47,  2.97it/s][A
 48%|████▊     | 2614/5424 [14:41<15:47,  2.97it/s][A
 48%|████▊     | 2616/5424 [14:41<15:46,  2.97it/s][A
 48%|████▊     | 2617/5424 [14:41<15:45,  2.97it/s][A
 48%|████▊     | 2618/5424 [14:41<15:45,  2.97it/s][A
 48%|████▊     | 2619/5424 [14:42<15:44,  2.97it/s][A
 48%|████▊     | 2620/5424 [14:42<15:44,  2.97it/s][A
 48%|████▊     | 2621/5424 [14:42<15:43,  2.97it/s][A
 48%|████▊     | 2622/5424 [14:42<15:43,  2.97it/s][A
 48%|████▊     | 2623/5424 [14:43<15:43,  2.97it/s][A
 48%|████▊     | 2624/5424 [14:43<15:42,  2.97it/s][A
 48%|████▊     | 2625/5424 [14:43<15:42,  2.97it/s][A
 48%|████▊     | 2627/5424 [14:43<15:40,  2.97it/s][A
 48%|████▊     | 2628/5424 [14:43<15:40,  2.97it/s][A
 48%|████▊

 51%|█████▏    | 2783/5424 [15:20<14:33,  3.02it/s][A
 51%|█████▏    | 2784/5424 [15:20<14:32,  3.02it/s][A
 51%|█████▏    | 2785/5424 [15:20<14:32,  3.02it/s][A
 51%|█████▏    | 2786/5424 [15:20<14:31,  3.03it/s][A
 51%|█████▏    | 2788/5424 [15:21<14:30,  3.03it/s][A
 51%|█████▏    | 2789/5424 [15:21<14:30,  3.03it/s][A
 51%|█████▏    | 2790/5424 [15:21<14:30,  3.03it/s][A
 51%|█████▏    | 2791/5424 [15:21<14:29,  3.03it/s][A
 51%|█████▏    | 2792/5424 [15:21<14:29,  3.03it/s][A
 51%|█████▏    | 2793/5424 [15:22<14:28,  3.03it/s][A
 52%|█████▏    | 2794/5424 [15:22<14:28,  3.03it/s][A
 52%|█████▏    | 2797/5424 [15:22<14:26,  3.03it/s][A
 52%|█████▏    | 2798/5424 [15:23<14:26,  3.03it/s][A
 52%|█████▏    | 2800/5424 [15:23<14:25,  3.03it/s][A
 52%|█████▏    | 2801/5424 [15:23<14:24,  3.03it/s][A
 52%|█████▏    | 2802/5424 [15:23<14:24,  3.03it/s][A
 52%|█████▏    | 2803/5424 [15:24<14:24,  3.03it/s][A
 52%|█████▏    | 2804/5424 [15:24<14:23,  3.03it/s][A
 52%|█████

 55%|█████▍    | 2958/5424 [15:58<13:18,  3.09it/s][A
 55%|█████▍    | 2959/5424 [15:58<13:18,  3.09it/s][A
 55%|█████▍    | 2961/5424 [15:58<13:17,  3.09it/s][A
 55%|█████▍    | 2962/5424 [15:58<13:16,  3.09it/s][A
 55%|█████▍    | 2963/5424 [15:59<13:16,  3.09it/s][A
 55%|█████▍    | 2964/5424 [15:59<13:16,  3.09it/s][A
 55%|█████▍    | 2965/5424 [15:59<13:15,  3.09it/s][A
 55%|█████▍    | 2966/5424 [15:59<13:15,  3.09it/s][A
 55%|█████▍    | 2967/5424 [16:00<13:15,  3.09it/s][A
 55%|█████▍    | 2968/5424 [16:00<13:14,  3.09it/s][A
 55%|█████▍    | 2969/5424 [16:00<13:14,  3.09it/s][A
 55%|█████▍    | 2970/5424 [16:00<13:14,  3.09it/s][A
 55%|█████▍    | 2971/5424 [16:01<13:13,  3.09it/s][A
 55%|█████▍    | 2972/5424 [16:01<13:13,  3.09it/s][A
 55%|█████▍    | 2974/5424 [16:01<13:12,  3.09it/s][A
 55%|█████▍    | 2975/5424 [16:02<13:12,  3.09it/s][A
 55%|█████▍    | 2978/5424 [16:02<13:10,  3.09it/s][A
 55%|█████▍    | 2979/5424 [16:02<13:10,  3.09it/s][A
 55%|█████

 58%|█████▊    | 3146/5424 [16:37<12:01,  3.16it/s][A
 58%|█████▊    | 3147/5424 [16:37<12:01,  3.16it/s][A
 58%|█████▊    | 3148/5424 [16:37<12:01,  3.16it/s][A
 58%|█████▊    | 3150/5424 [16:37<12:00,  3.16it/s][A
 58%|█████▊    | 3151/5424 [16:37<11:59,  3.16it/s][A
 58%|█████▊    | 3152/5424 [16:38<11:59,  3.16it/s][A
 58%|█████▊    | 3153/5424 [16:38<11:59,  3.16it/s][A
 58%|█████▊    | 3154/5424 [16:38<11:58,  3.16it/s][A
 58%|█████▊    | 3156/5424 [16:38<11:57,  3.16it/s][A
 58%|█████▊    | 3157/5424 [16:39<11:57,  3.16it/s][A
 58%|█████▊    | 3158/5424 [16:39<11:57,  3.16it/s][A
 58%|█████▊    | 3159/5424 [16:39<11:56,  3.16it/s][A
 58%|█████▊    | 3160/5424 [16:39<11:56,  3.16it/s][A
 58%|█████▊    | 3161/5424 [16:39<11:55,  3.16it/s][A
 58%|█████▊    | 3162/5424 [16:40<11:55,  3.16it/s][A
 58%|█████▊    | 3163/5424 [16:40<11:55,  3.16it/s][A
 58%|█████▊    | 3164/5424 [16:40<11:54,  3.16it/s][A
 58%|█████▊    | 3165/5424 [16:40<11:54,  3.16it/s][A
 58%|█████

 61%|██████▏   | 3326/5424 [17:17<10:54,  3.20it/s][A
 61%|██████▏   | 3328/5424 [17:18<10:53,  3.21it/s][A
 61%|██████▏   | 3330/5424 [17:18<10:53,  3.21it/s][A
 61%|██████▏   | 3331/5424 [17:18<10:52,  3.21it/s][A
 61%|██████▏   | 3332/5424 [17:18<10:52,  3.21it/s][A
 61%|██████▏   | 3334/5424 [17:19<10:51,  3.21it/s][A
 61%|██████▏   | 3335/5424 [17:19<10:51,  3.21it/s][A
 62%|██████▏   | 3336/5424 [17:19<10:50,  3.21it/s][A
 62%|██████▏   | 3337/5424 [17:20<10:50,  3.21it/s][A
 62%|██████▏   | 3338/5424 [17:20<10:50,  3.21it/s][A
 62%|██████▏   | 3339/5424 [17:20<10:49,  3.21it/s][A
 62%|██████▏   | 3340/5424 [17:20<10:49,  3.21it/s][A
 62%|██████▏   | 3341/5424 [17:21<10:49,  3.21it/s][A
 62%|██████▏   | 3342/5424 [17:21<10:48,  3.21it/s][A
 62%|██████▏   | 3343/5424 [17:21<10:48,  3.21it/s][A
 62%|██████▏   | 3345/5424 [17:22<10:47,  3.21it/s][A
 62%|██████▏   | 3347/5424 [17:22<10:46,  3.21it/s][A
 62%|██████▏   | 3349/5424 [17:22<10:46,  3.21it/s][A
 62%|█████

 65%|██████▍   | 3512/5424 [17:59<09:47,  3.25it/s][A
 65%|██████▍   | 3513/5424 [17:59<09:47,  3.25it/s][A
 65%|██████▍   | 3514/5424 [17:59<09:46,  3.25it/s][A
 65%|██████▍   | 3515/5424 [18:00<09:46,  3.25it/s][A
 65%|██████▍   | 3516/5424 [18:00<09:46,  3.25it/s][A
 65%|██████▍   | 3517/5424 [18:00<09:45,  3.25it/s][A
 65%|██████▍   | 3518/5424 [18:00<09:45,  3.25it/s][A
 65%|██████▍   | 3519/5424 [18:01<09:45,  3.25it/s][A
 65%|██████▍   | 3520/5424 [18:01<09:44,  3.25it/s][A
 65%|██████▍   | 3521/5424 [18:01<09:44,  3.25it/s][A
 65%|██████▍   | 3522/5424 [18:01<09:44,  3.26it/s][A
 65%|██████▍   | 3523/5424 [18:02<09:43,  3.26it/s][A
 65%|██████▍   | 3524/5424 [18:02<09:43,  3.26it/s][A
 65%|██████▌   | 3526/5424 [18:02<09:42,  3.26it/s][A
 65%|██████▌   | 3527/5424 [18:03<09:42,  3.26it/s][A
 65%|██████▌   | 3528/5424 [18:03<09:42,  3.26it/s][A
 65%|██████▌   | 3529/5424 [18:03<09:41,  3.26it/s][A
 65%|██████▌   | 3530/5424 [18:03<09:41,  3.26it/s][A
 65%|█████

 68%|██████▊   | 3682/5424 [18:38<08:49,  3.29it/s][A
 68%|██████▊   | 3683/5424 [18:38<08:48,  3.29it/s][A
 68%|██████▊   | 3684/5424 [18:38<08:48,  3.29it/s][A
 68%|██████▊   | 3685/5424 [18:39<08:48,  3.29it/s][A
 68%|██████▊   | 3686/5424 [18:39<08:47,  3.29it/s][A
 68%|██████▊   | 3687/5424 [18:39<08:47,  3.29it/s][A
 68%|██████▊   | 3688/5424 [18:39<08:47,  3.29it/s][A
 68%|██████▊   | 3689/5424 [18:40<08:46,  3.29it/s][A
 68%|██████▊   | 3690/5424 [18:40<08:46,  3.29it/s][A
 68%|██████▊   | 3691/5424 [18:40<08:46,  3.29it/s][A
 68%|██████▊   | 3692/5424 [18:41<08:45,  3.29it/s][A
 68%|██████▊   | 3693/5424 [18:41<08:45,  3.29it/s][A
 68%|██████▊   | 3694/5424 [18:41<08:45,  3.29it/s][A
 68%|██████▊   | 3695/5424 [18:42<08:45,  3.29it/s][A
 68%|██████▊   | 3696/5424 [18:42<08:44,  3.29it/s][A
 68%|██████▊   | 3697/5424 [18:42<08:44,  3.29it/s][A
 68%|██████▊   | 3698/5424 [18:43<08:44,  3.29it/s][A
 68%|██████▊   | 3699/5424 [18:43<08:43,  3.29it/s][A
 68%|█████

 71%|███████   | 3856/5424 [19:19<07:51,  3.33it/s][A
 71%|███████   | 3857/5424 [19:19<07:51,  3.33it/s][A
 71%|███████   | 3858/5424 [19:19<07:50,  3.33it/s][A
 71%|███████   | 3860/5424 [19:20<07:50,  3.33it/s][A
 71%|███████   | 3861/5424 [19:20<07:49,  3.33it/s][A
 71%|███████   | 3862/5424 [19:20<07:49,  3.33it/s][A
 71%|███████   | 3863/5424 [19:20<07:49,  3.33it/s][A
 71%|███████   | 3864/5424 [19:21<07:48,  3.33it/s][A
 71%|███████▏  | 3866/5424 [19:21<07:47,  3.33it/s][A
 71%|███████▏  | 3868/5424 [19:21<07:47,  3.33it/s][A
 71%|███████▏  | 3869/5424 [19:21<07:47,  3.33it/s][A
 71%|███████▏  | 3870/5424 [19:22<07:46,  3.33it/s][A
 71%|███████▏  | 3871/5424 [19:22<07:46,  3.33it/s][A
 71%|███████▏  | 3872/5424 [19:23<07:46,  3.33it/s][A
 71%|███████▏  | 3873/5424 [19:23<07:45,  3.33it/s][A
 71%|███████▏  | 3874/5424 [19:23<07:45,  3.33it/s][A
 71%|███████▏  | 3875/5424 [19:23<07:45,  3.33it/s][A
 71%|███████▏  | 3876/5424 [19:23<07:44,  3.33it/s][A
 71%|█████

 75%|███████▍  | 4046/5424 [20:00<06:48,  3.37it/s][A
 75%|███████▍  | 4048/5424 [20:01<06:48,  3.37it/s][A
 75%|███████▍  | 4049/5424 [20:01<06:47,  3.37it/s][A
 75%|███████▍  | 4050/5424 [20:01<06:47,  3.37it/s][A
 75%|███████▍  | 4051/5424 [20:01<06:47,  3.37it/s][A
 75%|███████▍  | 4053/5424 [20:02<06:46,  3.37it/s][A
 75%|███████▍  | 4054/5424 [20:02<06:46,  3.37it/s][A
 75%|███████▍  | 4055/5424 [20:02<06:45,  3.37it/s][A
 75%|███████▍  | 4057/5424 [20:02<06:45,  3.37it/s][A
 75%|███████▍  | 4058/5424 [20:02<06:44,  3.37it/s][A
 75%|███████▍  | 4060/5424 [20:03<06:44,  3.37it/s][A
 75%|███████▍  | 4062/5424 [20:03<06:43,  3.38it/s][A
 75%|███████▍  | 4064/5424 [20:03<06:42,  3.38it/s][A
 75%|███████▍  | 4065/5424 [20:03<06:42,  3.38it/s][A
 75%|███████▍  | 4066/5424 [20:03<06:42,  3.38it/s][A
 75%|███████▍  | 4067/5424 [20:04<06:41,  3.38it/s][A
 75%|███████▌  | 4068/5424 [20:04<06:41,  3.38it/s][A
 75%|███████▌  | 4069/5424 [20:04<06:41,  3.38it/s][A
 75%|█████

 78%|███████▊  | 4228/5424 [20:40<05:51,  3.41it/s][A
 78%|███████▊  | 4230/5424 [20:40<05:50,  3.41it/s][A
 78%|███████▊  | 4231/5424 [20:41<05:49,  3.41it/s][A
 78%|███████▊  | 4232/5424 [20:41<05:49,  3.41it/s][A
 78%|███████▊  | 4233/5424 [20:41<05:49,  3.41it/s][A
 78%|███████▊  | 4234/5424 [20:41<05:49,  3.41it/s][A
 78%|███████▊  | 4236/5424 [20:42<05:48,  3.41it/s][A
 78%|███████▊  | 4237/5424 [20:42<05:48,  3.41it/s][A
 78%|███████▊  | 4238/5424 [20:42<05:47,  3.41it/s][A
 78%|███████▊  | 4239/5424 [20:43<05:47,  3.41it/s][A
 78%|███████▊  | 4241/5424 [20:43<05:46,  3.41it/s][A
 78%|███████▊  | 4243/5424 [20:43<05:46,  3.41it/s][A
 78%|███████▊  | 4244/5424 [20:44<05:45,  3.41it/s][A
 78%|███████▊  | 4245/5424 [20:44<05:45,  3.41it/s][A
 78%|███████▊  | 4246/5424 [20:44<05:45,  3.41it/s][A
 78%|███████▊  | 4247/5424 [20:44<05:45,  3.41it/s][A
 78%|███████▊  | 4249/5424 [20:45<05:44,  3.41it/s][A
 78%|███████▊  | 4250/5424 [20:45<05:44,  3.41it/s][A
 78%|█████

 81%|████████▏ | 4414/5424 [21:24<04:53,  3.44it/s][A
 81%|████████▏ | 4415/5424 [21:24<04:53,  3.44it/s][A
 81%|████████▏ | 4416/5424 [21:24<04:53,  3.44it/s][A
 81%|████████▏ | 4418/5424 [21:24<04:52,  3.44it/s][A
 81%|████████▏ | 4419/5424 [21:25<04:52,  3.44it/s][A
 82%|████████▏ | 4421/5424 [21:25<04:51,  3.44it/s][A
 82%|████████▏ | 4422/5424 [21:25<04:51,  3.44it/s][A
 82%|████████▏ | 4423/5424 [21:26<04:51,  3.44it/s][A
 82%|████████▏ | 4424/5424 [21:26<04:50,  3.44it/s][A
 82%|████████▏ | 4426/5424 [21:26<04:50,  3.44it/s][A
 82%|████████▏ | 4428/5424 [21:26<04:49,  3.44it/s][A
 82%|████████▏ | 4429/5424 [21:26<04:49,  3.44it/s][A
 82%|████████▏ | 4430/5424 [21:27<04:48,  3.44it/s][A
 82%|████████▏ | 4431/5424 [21:27<04:48,  3.44it/s][A
 82%|████████▏ | 4432/5424 [21:28<04:48,  3.44it/s][A
 82%|████████▏ | 4433/5424 [21:28<04:47,  3.44it/s][A
 82%|████████▏ | 4434/5424 [21:28<04:47,  3.44it/s][A
 82%|████████▏ | 4435/5424 [21:29<04:47,  3.44it/s][A
 82%|█████

 85%|████████▍ | 4606/5424 [22:03<03:55,  3.48it/s][A
 85%|████████▍ | 4607/5424 [22:03<03:54,  3.48it/s][A
 85%|████████▍ | 4608/5424 [22:04<03:54,  3.48it/s][A
 85%|████████▍ | 4609/5424 [22:04<03:54,  3.48it/s][A
 85%|████████▍ | 4610/5424 [22:04<03:53,  3.48it/s][A
 85%|████████▌ | 4611/5424 [22:04<03:53,  3.48it/s][A
 85%|████████▌ | 4612/5424 [22:04<03:53,  3.48it/s][A
 85%|████████▌ | 4614/5424 [22:04<03:52,  3.48it/s][A
 85%|████████▌ | 4615/5424 [22:04<03:52,  3.48it/s][A
 85%|████████▌ | 4616/5424 [22:05<03:51,  3.48it/s][A
 85%|████████▌ | 4618/5424 [22:05<03:51,  3.48it/s][A
 85%|████████▌ | 4620/5424 [22:05<03:50,  3.49it/s][A
 85%|████████▌ | 4621/5424 [22:05<03:50,  3.49it/s][A
 85%|████████▌ | 4623/5424 [22:05<03:49,  3.49it/s][A
 85%|████████▌ | 4624/5424 [22:06<03:49,  3.49it/s][A
 85%|████████▌ | 4626/5424 [22:06<03:48,  3.49it/s][A
 85%|████████▌ | 4627/5424 [22:06<03:48,  3.49it/s][A
 85%|████████▌ | 4629/5424 [22:06<03:47,  3.49it/s][A
 85%|█████

 88%|████████▊ | 4796/5424 [22:43<02:58,  3.52it/s][A
 88%|████████▊ | 4798/5424 [22:43<02:57,  3.52it/s][A
 88%|████████▊ | 4799/5424 [22:44<02:57,  3.52it/s][A
 88%|████████▊ | 4800/5424 [22:44<02:57,  3.52it/s][A
 89%|████████▊ | 4801/5424 [22:44<02:57,  3.52it/s][A
 89%|████████▊ | 4802/5424 [22:44<02:56,  3.52it/s][A
 89%|████████▊ | 4803/5424 [22:45<02:56,  3.52it/s][A
 89%|████████▊ | 4805/5424 [22:45<02:55,  3.52it/s][A
 89%|████████▊ | 4806/5424 [22:45<02:55,  3.52it/s][A
 89%|████████▊ | 4807/5424 [22:45<02:55,  3.52it/s][A
 89%|████████▊ | 4808/5424 [22:46<02:55,  3.52it/s][A
 89%|████████▊ | 4810/5424 [22:46<02:54,  3.52it/s][A
 89%|████████▊ | 4812/5424 [22:46<02:53,  3.52it/s][A
 89%|████████▉ | 4814/5424 [22:47<02:53,  3.52it/s][A
 89%|████████▉ | 4816/5424 [22:47<02:52,  3.52it/s][A
 89%|████████▉ | 4818/5424 [22:47<02:52,  3.52it/s][A
 89%|████████▉ | 4819/5424 [22:47<02:51,  3.52it/s][A
 89%|████████▉ | 4820/5424 [22:48<02:51,  3.52it/s][A
 89%|█████

 92%|█████████▏| 4973/5424 [23:22<02:07,  3.55it/s][A
 92%|█████████▏| 4975/5424 [23:22<02:06,  3.55it/s][A
 92%|█████████▏| 4976/5424 [23:22<02:06,  3.55it/s][A
 92%|█████████▏| 4977/5424 [23:23<02:06,  3.55it/s][A
 92%|█████████▏| 4978/5424 [23:23<02:05,  3.55it/s][A
 92%|█████████▏| 4979/5424 [23:23<02:05,  3.55it/s][A
 92%|█████████▏| 4981/5424 [23:23<02:04,  3.55it/s][A
 92%|█████████▏| 4982/5424 [23:23<02:04,  3.55it/s][A
 92%|█████████▏| 4983/5424 [23:24<02:04,  3.55it/s][A
 92%|█████████▏| 4985/5424 [23:24<02:03,  3.55it/s][A
 92%|█████████▏| 4986/5424 [23:24<02:03,  3.55it/s][A
 92%|█████████▏| 4988/5424 [23:24<02:02,  3.55it/s][A
 92%|█████████▏| 4989/5424 [23:25<02:02,  3.55it/s][A
 92%|█████████▏| 4990/5424 [23:25<02:02,  3.55it/s][A
 92%|█████████▏| 4992/5424 [23:25<02:01,  3.55it/s][A
 92%|█████████▏| 4993/5424 [23:25<02:01,  3.55it/s][A
 92%|█████████▏| 4995/5424 [23:25<02:00,  3.55it/s][A
 92%|█████████▏| 4996/5424 [23:26<02:00,  3.55it/s][A
 92%|█████

 95%|█████████▌| 5158/5424 [24:01<01:14,  3.58it/s][A
 95%|█████████▌| 5160/5424 [24:01<01:13,  3.58it/s][A
 95%|█████████▌| 5162/5424 [24:01<01:13,  3.58it/s][A
 95%|█████████▌| 5163/5424 [24:01<01:12,  3.58it/s][A
 95%|█████████▌| 5164/5424 [24:02<01:12,  3.58it/s][A
 95%|█████████▌| 5165/5424 [24:02<01:12,  3.58it/s][A
 95%|█████████▌| 5166/5424 [24:02<01:12,  3.58it/s][A
 95%|█████████▌| 5167/5424 [24:02<01:11,  3.58it/s][A
 95%|█████████▌| 5168/5424 [24:03<01:11,  3.58it/s][A
 95%|█████████▌| 5169/5424 [24:03<01:11,  3.58it/s][A
 95%|█████████▌| 5170/5424 [24:03<01:10,  3.58it/s][A
 95%|█████████▌| 5171/5424 [24:04<01:10,  3.58it/s][A
 95%|█████████▌| 5173/5424 [24:04<01:10,  3.58it/s][A
 95%|█████████▌| 5174/5424 [24:04<01:09,  3.58it/s][A
 95%|█████████▌| 5176/5424 [24:05<01:09,  3.58it/s][A
 95%|█████████▌| 5177/5424 [24:05<01:08,  3.58it/s][A
 95%|█████████▌| 5178/5424 [24:05<01:08,  3.58it/s][A
 95%|█████████▌| 5179/5424 [24:05<01:08,  3.58it/s][A
 96%|█████

 98%|█████████▊| 5340/5424 [24:38<00:23,  3.61it/s][A
 98%|█████████▊| 5342/5424 [24:38<00:22,  3.61it/s][A
 99%|█████████▊| 5343/5424 [24:39<00:22,  3.61it/s][A
 99%|█████████▊| 5344/5424 [24:39<00:22,  3.61it/s][A
 99%|█████████▊| 5345/5424 [24:39<00:21,  3.61it/s][A
 99%|█████████▊| 5346/5424 [24:39<00:21,  3.61it/s][A
 99%|█████████▊| 5348/5424 [24:39<00:21,  3.61it/s][A
 99%|█████████▊| 5349/5424 [24:40<00:20,  3.61it/s][A
 99%|█████████▊| 5351/5424 [24:40<00:20,  3.61it/s][A
 99%|█████████▊| 5352/5424 [24:40<00:19,  3.61it/s][A
 99%|█████████▊| 5353/5424 [24:41<00:19,  3.61it/s][A
 99%|█████████▊| 5355/5424 [24:41<00:19,  3.61it/s][A
 99%|█████████▊| 5356/5424 [24:41<00:18,  3.62it/s][A
 99%|█████████▉| 5357/5424 [24:41<00:18,  3.62it/s][A
 99%|█████████▉| 5358/5424 [24:41<00:18,  3.62it/s][A
 99%|█████████▉| 5359/5424 [24:42<00:17,  3.62it/s][A
 99%|█████████▉| 5360/5424 [24:42<00:17,  3.62it/s][A
 99%|█████████▉| 5361/5424 [24:42<00:17,  3.62it/s][A
 99%|█████

CPU times: user 26.4 s, sys: 1min 9s, total: 1min 35s
Wall time: 24min 56s


In [47]:
list(grants_all_xml_dict.keys())[0:5]

[8249344, 8126465, 8232962, 8148779, 8118276]

In [51]:
grants_all_df = pd.DataFrame({"parsed": list(grants_all_xml_dict.keys()), "xml": list(grants_all_xml_dict.values())})

In [52]:
grants_all_df.head()

Unnamed: 0,parsed,xml
0,8249344,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
1,8126465,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
2,8232962,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
3,8148779,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
4,8118276,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."


In [53]:
grants_all_df.to_pickle("../data/grants_2012_from2017_xmldf.dat")

Check the result.

In [54]:
one_xml = grants_all_df.iloc[1]["xml"].split("\n")

In [55]:
len(one_xml)

539

In [56]:
one_xml[0:10]

['<?xml version="1.0" encoding="UTF-8"?>',
 '<!DOCTYPE us-patent-grant SYSTEM "us-patent-grant-v42-2006-08-23.dtd" [ ]>',
 '<us-patent-grant lang="EN" dtd-version="v4.2 2006-08-23" file="US08126465-20120228.XML" status="PRODUCTION" id="us-patent-grant" country="US" date-produced="20120214" date-publ="20120228">',
 '<us-bibliographic-data-grant>',
 '<publication-reference>',
 '<document-id>',
 '<country>US</country>',
 '<doc-number>08126465</doc-number>',
 '<kind>B2</kind>',
 '<date>20120228</date>']

# Below here is obsolete

In [67]:
xmls = glob.glob("../data/grants2012/*.xml")

In [68]:
xmls[0]

'../data/grants2012/ipg120911.xml'

In [69]:
xmls[0][len("../data/"):]

'grants2012/ipg120911.xml'

In [70]:
xmlrels = [xml[len("../data/"):] for xml in xmls]

In [72]:
xmlrels[0]

'grants2012/ipg120911.xml'

In [73]:
%%time

for xmlrel in xmlrels:
    create_one_index(xmlrel)

CPU times: user 4.49 s, sys: 812 ms, total: 5.3 s
Wall time: 3min 48s


In [9]:
xmls = glob.glob("../data/application2017/*.xml")
xmlrels = [xml[len("../data/"):] for xml in xmls]

In [10]:
%%time

for xmlrel in xmlrels:
    create_one_index(xmlrel)

CPU times: user 9.32 s, sys: 1.43 s, total: 10.8 s
Wall time: 7min 2s


In [None]:
    relxml = indexfile[len("../data/index/"):].rstrip(".idx")
    for tup in tupples:
        key = cut_head_zero_workaround(tup[1])
        dic.setdefault(key, []).append([relxml, int(tup[0])])

In [46]:
common = INDEX_PATH +'/grants2012/'

In [47]:
indexDNGrants2012[0][len(common):]

'docnum/ipg120501.xml.idx'

In [49]:
os.path.dirname(fpath)

'../data/index/grants2012/docnum'

In [50]:
os.path.basename(fpath)

'ipg120501.xml.idx'

In [51]:
indexUSPAGrants2012 = glob.glob("../data/index/grants2012/uspatgra/*.idx")

In [52]:
fpath = indexUSPAGrants2012[0]

In [59]:
with open(fpath, "r") as f:
    open_lines = [int(l.split(":", 1)[0]) for l in f]
with open(fpath.replace("uspatgra", "close_uspatgra"), "r") as f:
    close_lines = [int(l.split(":", 1)[0]) for l in f]
    

In [61]:
len(open_lines), len(close_lines)

(5520, 5520)

In [62]:
open_lines[0], close_lines[0]

(3, 364)

In [53]:
class SubFile:
    """A (start, end) range inside a parent file.

    The constructor stores its arguments verbatim:

    * ``parent`` -- the ``fpath`` argument (path of the containing file)
    * ``start`` / ``end`` -- the range bounds
      (presumably line numbers; confirm against ``subfile``)
    """

    def __init__(self, fpath, start, end):
        self.start = start
        self.end = end
        self.parent = fpath

    def content(self):
        """Return ``subfile(self.parent, self.start, self.end)``.

        The original version had no ``self`` parameter and referenced the
        bare names ``fpath``/``start``/``end``, so it could not be called on
        an instance.
        """
        # NOTE(review): `subfile` is not defined in the visible part of this
        # notebook -- confirm it exists before relying on this method.
        return subfile(self.parent, self.start, self.end)

In [14]:
indexGrants2012 = {}

In [15]:
build_all_index_dict(indexfilesGrants2012, indexGrants2012)

In [7]:
# Workaround: in the *.xml files a doc-number sometimes has an extra leading 0, while citations.csv does not.
# So if the doc-number has 8 digits and starts with 0, drop that first 0.

def cut_head_zero_workaround(key):
    """Drop the leading "0" from an 8-character key that starts with "0".

    Any other key (different length, or not starting with "0") is returned
    unchanged.
    """
    is_zero_padded = len(key) == 8 and key.startswith("0")
    return key[1:] if is_zero_padded else key

In [8]:
# dict format:
# key is doc-number
# val is list of [relxml, linenumber]

def build_one_index_file_to_dict(indexfile, dic):
    """Merge the entries of one ``*.idx`` file into ``dic`` (mutated in place).

    Each line of the index file is split once on ":" into
    ``<line number>:<doc-number>``.  The doc-number, normalised with
    ``cut_head_zero_workaround``, becomes the key; ``[relxml, line number]``
    is appended to the list stored under that key.

    ``relxml`` is ``indexfile`` with the first ``len("../data/index/")``
    characters and a trailing ".idx" removed.

    Parameters
    ----------
    indexfile : str
        Path of the index file; expected to start with "../data/index/".
    dic : dict
        Mapping of doc-number -> list of ``[relxml, line number]``.
    """
    index_root = "../data/index/"
    suffix = ".idx"

    relxml = indexfile[len(index_root):]
    # str.rstrip(".idx") would strip any trailing run of the characters
    # '.', 'i', 'd', 'x' (e.g. "grid.idx" -> "gr"), so remove the suffix
    # explicitly instead.
    if relxml.endswith(suffix):
        relxml = relxml[:-len(suffix)]

    with open(indexfile, "r") as f:
        for line in f:
            tup = line.rstrip("\n").split(":", 1)
            key = cut_head_zero_workaround(tup[1])
            dic.setdefault(key, []).append([relxml, int(tup[0])])

In [9]:
def build_all_index_dict(indexfiles, dic):
    """Feed every index file in ``indexfiles`` into ``dic``, in order.

    ``dic`` is mutated in place by ``build_one_index_file_to_dict``;
    nothing is returned.
    """
    for index_path in indexfiles:
        build_one_index_file_to_dict(index_path, dic)

In [10]:
indexfiles2017 = glob.glob("../data/index/application2017/*.idx")

In [11]:
index2017 = {}

In [12]:
build_all_index_dict(indexfiles2017, index2017)

In [13]:
indexfilesGrants2012 = glob.glob("../data/index/grants2012/*.idx")

In [14]:
indexGrants2012 = {}

In [15]:
build_all_index_dict(indexfilesGrants2012, indexGrants2012)

### Filter office actions to those whose app_id is a 2017 application doc-number.

In [16]:
# Keys of index2017 are strings because a doc-number sometimes contains a non-integer value,
# so we add a string column to the office action data frame in order to use the isin method.

office_15["app_id_str"] = office_15.app_id.map(str)

In [17]:
sum(office_15.app_id_str.isin(index2017) == True)

21820

In [18]:
office2017 = office_15[office_15.app_id_str.isin(index2017)]

In [19]:
len(office2017)

21820

In [20]:
office2017.head()

Unnamed: 0,app_id,ifw_number,document_cd,mail_dt,art_unit,uspc_class,uspc_subclass,header_missing,fp_missing,rejection_fp_mismatch,...,rejection_112,rejection_dp,objection,allowed_claims,cite102_gt1,cite103_gt3,cite103_eq1,cite103_max,signature_type,app_id_str
3561801,15005636,IKSG9XK1RXEAPX1,CTNF,2016-02-22,3766,600,509000,0,0,0,...,1,1,0,0,0,0,0,0,0,15005636
3564336,15001553,IKTT4P0TRXEAPX1,CTNF,2016-02-19,1625,514,279000,0,1,0,...,1,0,0,0,0,0,0,0,1,15001553
3568839,15009367,IKZGL7O1RXEAPX4,CTNF,2016-02-25,2137,711,103000,0,0,0,...,0,1,0,0,0,0,0,0,1,15009367
3569392,15041416,IKZI6SS3RXEAPX1,CTNF,2016-02-25,2618,345,619000,0,0,0,...,0,0,0,0,0,1,0,5,1,15041416
3574015,15014088,IL0U1CK0RXEAPX5,CTNF,2016-02-26,2852,399,85000,0,0,0,...,0,0,1,0,0,0,0,2,1,15014088


In [21]:
ifw_from_OA = set(office2017.ifw_number)

In [22]:
len(list(ifw_from_OA))

21820

In [23]:
citations_15.head()

Unnamed: 0,app_id,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa
57732192,15000008,8291627,8291627,,,,1,0,0
57732193,15000008,2093620,2093620,,,,1,0,0
57732194,15000008,9221406,9221406,,,,1,0,0
57732195,15000008,8544198,8544198,,,,1,0,0
57732196,15000008,7225569,7225569,,,,1,0,0


In [24]:
citations_2017 = citations_15[citations_15.ifw_number.isin(ifw_from_OA)]

In [25]:
len(citations_2017)

43416

In [26]:
citations_2017.head()

Unnamed: 0,app_id,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa
57732253,15000027,8553488,8553488,IRPCOHEPRXEAPX4,102.0,a,1,0,1
57732273,15000034,Dando US 2015/0364400,20150364400,IYD99O6CRXEAPX0,103.0,,0,0,1
57732274,15000034,Sun US 2012/0181696,20120181696,IYD99O6CRXEAPX0,103.0,,0,0,1
57732275,15000034,Sun US 2012/0279760,20120279760,IYD99O6CRXEAPX0,103.0,,0,0,1
57732324,15000048,20140091294,20140091294,IR59S77KRXEAPX5,103.0,,1,0,1


In [27]:
# citations_we_have = citations_2017[citations_2017.citation_pat_pgpub_id.isin(indexGrants2012)]
citations_we_have = citations_2017[citations_2017.parsed.isin(indexGrants2012)]

In [28]:
len(citations_we_have)

8365

In [29]:
citations_we_have.head()

Unnamed: 0,app_id,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa
57733581,15000285,4659845,4659845,IV82WOB1RXEAPX5,103.0,,1,0,1
57733622,15000297,7268118,7268118,IO1DLQ1SRXEAPX3,102.0,b,0,1,1
57733623,15000297,7268118,7268118,IO1DLQ1SRXEAPX3,103.0,a,0,1,1
57733624,15000297,5578570,5578570,IO1DLQ1SRXEAPX3,103.0,a,1,0,1
57733628,15000297,5342620,5342620,IO1DLQ1SRXEAPX3,103.0,a,1,1,1


In [30]:
citations_we_have.to_pickle("../data/citations_2017_2012.dat")

In [31]:
index2017_we_need = {str(key):index2017[str(key)] for key in set(citations_we_have.app_id)}

In [32]:
len(list(index2017_we_need.keys()))

4734

In [33]:
indexGrants2012_we_need = {str(key):indexGrants2012[str(key)] for key in set(citations_we_have.parsed)}

In [34]:
list(indexGrants2012_we_need.keys())[0:10]

['3087414',
 '5010940',
 '2916244',
 '20100012972',
 '20090300770',
 '6550847',
 '6597345',
 '5848419',
 '6415183',
 '6978184']

In [35]:
len(list(indexGrants2012_we_need.keys()))

6352

In [204]:
indexGrants2012_we_need['8227462']

[['grants2012/ipg121030.xml', 5731295],
 ['grants2012/ipg120724.xml', 3034084],
 ['grants2012/ipg120724.xml', 3038802]]

In [68]:
## Cut the lines (target-begin_bound)..(target+end_bound) out of a very large text file.
## If (target-begin_bound) is smaller than 1, it is treated as 1.
## The returned value is a tuple:
## first element: offset of the target line within the returned lines
## second element: list of text, each element standing for one line.

def retrieve_subtext_fast(target, fpath, begin_bound, end_bound):
    """Read a window of lines around line ``target`` of a (large) text file.

    Parameters
    ----------
    target : int
        1-based line number of interest.
    fpath : str
        Path of the file to read.
    begin_bound : int
        Number of lines to include before ``target``.
    end_bound : int
        Number of lines to include after ``target``.

    Returns
    -------
    tuple
        ``(offset, lines)`` where ``lines`` is the list of lines
        ``max(1, target - begin_bound) .. target + end_bound`` (inclusive,
        without trailing newlines; shorter if the file ends first) and
        ``offset`` is the index of line ``target`` inside ``lines``.

    Raises
    ------
    OSError
        If ``fpath`` cannot be opened.

    Notes
    -----
    This replaces the former ``!sed -n 'start,endp'`` shell escape: it is
    plain Python (works outside IPython), does not interpolate ``fpath``
    into a shell command, and stops reading as soon as line ``end`` has
    been collected instead of scanning the rest of the file.
    """
    start = max(1, target - begin_bound)
    end = target + end_bound
    subtext = []
    # newline="\n" keeps line counting based on "\n" only (no "\r" splitting);
    # errors="replace" keeps the read best-effort on stray bytes.
    # NOTE(review): assumes the XML files are UTF-8, as their XML declaration
    # states -- confirm for all inputs.
    with open(fpath, "r", encoding="utf-8", errors="replace", newline="\n") as f:
        for lineno, line in enumerate(f, start=1):
            if lineno > end:
                break
            if lineno >= start:
                subtext.append(line.rstrip("\n"))
    return (target - start, subtext)

In [98]:
#<us-patent-application ...>...</us-patent-application>
#<us-patent-grant ...>....</us-patent-grant>

def find_nearest_root_element(target_offset, subtext_arr, tagname):
    """Locate the last line at or before ``target_offset`` containing ``<tagname``.

    Returns the index of that line inside ``subtext_arr``, or -1 when no line
    in ``subtext_arr[0 .. target_offset]`` contains the opening tag.  The -1
    case is expected: the window before the target is kept small on purpose,
    so a doc-number far away from its root tag (i.e. a wrong candidate) simply
    yields no match.

    Raises ``ValueError`` when ``subtext_arr`` has no line after
    ``target_offset`` (the window ended at or before the target line).
    """
    opening = '<{}'.format(tagname)
    # Number of leading lines that take part in the search.
    window_len = max(target_offset + 1, 0)
    if len(subtext_arr) <= window_len:
        raise ValueError('no target tagname found around target_offset')
    hits = [pos for pos, text in enumerate(subtext_arr[:window_len]) if opening in text]
    return hits[-1] if hits else -1

In [101]:
def retrieve_begin_end(begin_offset, subtext_arr, tagname):
    """Return the lines from ``begin_offset`` through the first closing tag.

    The result is a new list starting at ``subtext_arr[begin_offset]`` and
    ending with the first line (at or after that position) that contains
    ``</tagname``.  Raises ``ValueError`` when no such line exists.
    """
    closing = '</{}'.format(tagname)
    tail = subtext_arr[begin_offset:]
    for count, text in enumerate(tail, start=1):
        if closing in text:
            return list(tail[:count])
    raise ValueError('no end tagname found')

In [39]:
import re

In [102]:
# Captures the text between <doc-number> and </doc-number> on a single line.
DOC_NUMBER_PAT = re.compile(r'<doc-number>([^<]+)</doc-number>')

def find_first_doc_number(text_arr):
    """Return the first doc-number found in ``text_arr``.

    The lines are searched in order for ``DOC_NUMBER_PAT``; the captured
    value of the first match is passed through ``cut_head_zero_workaround``
    and returned.  Raises ``ValueError`` when no line matches.
    """
    matches = (DOC_NUMBER_PAT.search(line) for line in text_arr)
    first_hit = next((m for m in matches if m), None)
    if first_hit is None:
        raise ValueError("No doc-number found.")
    return cut_head_zero_workaround(first_hit.group(1))
        
    

In [210]:
find_first_doc_number(res_pat)

'20170351915'

In [66]:
# tagname is "us-patent-application" or "us-patent-grant"
#<us-patent-application ...>...</us-patent-application>
#<us-patent-grant ...>....</us-patent-grant>

def get_patent(tagname, file_path, target_offset):
    """Extract the patent document surrounding line ``target_offset``.

    A window of 200 lines before and 10000 lines after ``target_offset`` is
    read from ``file_path``.  Inside it, the nearest preceding ``<tagname``
    line is located and the lines up to the matching ``</tagname`` line are
    collected.

    Returns ``(doc_id, patent_lines)``, where ``doc_id`` is the first
    doc-number found in ``patent_lines``.  Returns ``(None, None)`` when no
    opening tag lies within the window before the target line.
    ``ValueError`` raised by the helpers propagates to the caller.
    """
    lines_before, lines_after = 200, 10000
    rel_offset, window = retrieve_subtext_fast(target_offset, file_path, lines_before, lines_after)
    root_pos = find_nearest_root_element(rel_offset, window, tagname)
    if root_pos != -1:
        patent_lines = retrieve_begin_end(root_pos, window, tagname)
        return find_first_doc_number(patent_lines), patent_lines
    return None, None

In [42]:
patent_grants_dic = {}

In [64]:
def get_real_patent(tagname, key_cand, tup_list):
    """Return the text of the patent whose own doc-number equals ``key_cand``.

    Each entry of ``tup_list`` is ``[relative xml path, line number]``.  The
    entries are tried in order via ``get_patent`` (path prefixed with
    '../data/'); the first one whose extracted doc-number equals ``key_cand``
    wins.  Returns ``None`` when no entry matches.
    """
    # Lazy generator: later candidates are not read once a match is found.
    candidates = (
        get_patent(tagname, '../data/' + entry[0], entry[1])
        for entry in tup_list
    )
    return next((text for doc_id, text in candidates if doc_id == key_cand), None)

In [44]:
dict_list = list(indexGrants2012_we_need.items())

# Heuristics for optimization

If the same doc-number occurs too many times, it must be in patcite (a cited patent, not the claimed patent itself).
For now, we skip any doc-number with more than 5 occurrences.

However, I checked a few doc-numbers in grants2012, and they always start with 08.
So we print every doc-number that starts with 8 and has more than 5 occurrences, and check those manually.

In [73]:
import tqdm

In [87]:
patent_grants_dic = {}

In [88]:
# Try to extract the grant XML for every candidate doc-number.
# Candidates with more than 5 index hits are skipped (see the heuristic above);
# the skipped ones that start with "8" are printed for a manual check.
for i, (key_cand, tup_list) in tqdm.tqdm(list(enumerate(dict_list))):
    if len(tup_list) > 5:
        if key_cand.startswith("8"):
            print("Skip 8 beginning key cand: {}, {}".format(key_cand, len(tup_list)))
        continue
    real_pat = get_real_patent("us-patent-grant", key_cand, tup_list)
    if not real_pat:
        continue
    patent_grants_dic[key_cand] = real_pat
    print("found {}, {}, {}".format(i, key_cand, len(patent_grants_dic)))

  0%|          | 20/6352 [00:34<3:04:26,  1.75s/it]

found 19, 8202487, 1


  1%|          | 44/6352 [01:28<3:31:04,  2.01s/it]

found 43, 8114845, 2


  1%|▏         | 84/6352 [03:36<4:29:46,  2.58s/it]

found 83, 8140012, 3


  2%|▏         | 105/6352 [04:26<4:24:24,  2.54s/it]

found 104, 8144683, 4


  2%|▏         | 113/6352 [04:49<4:26:24,  2.56s/it]

found 112, 8138863, 5


  3%|▎         | 197/6352 [10:07<5:16:05,  3.08s/it]

found 196, 8095929, 6


  3%|▎         | 219/6352 [11:19<5:17:12,  3.10s/it]

found 218, 8289665, 7


  4%|▎         | 224/6352 [11:38<5:18:36,  3.12s/it]

found 223, 8087315, 8


  4%|▎         | 227/6352 [11:41<5:15:21,  3.09s/it]

found 226, 8322805, 9


  4%|▍         | 246/6352 [12:40<5:14:31,  3.09s/it]

found 245, 8147894, 10


  4%|▍         | 258/6352 [13:39<5:22:31,  3.18s/it]

found 257, 8118868, 11


  4%|▍         | 280/6352 [15:11<5:29:33,  3.26s/it]

found 279, 8101436, 12


  5%|▌         | 327/6352 [17:34<5:23:42,  3.22s/it]

found 326, 8331747, 13


  6%|▌         | 369/6352 [19:42<5:19:34,  3.20s/it]

found 368, 8239267, 14


  7%|▋         | 435/6352 [23:53<5:25:02,  3.30s/it]

found 434, 8307045, 15


  7%|▋         | 474/6352 [26:29<5:28:30,  3.35s/it]

found 473, 8169523, 16


  8%|▊         | 507/6352 [28:20<5:26:41,  3.35s/it]

found 506, 8270788, 17


  8%|▊         | 528/6352 [29:26<5:24:43,  3.35s/it]

found 527, 8199013, 18


  8%|▊         | 531/6352 [29:28<5:23:11,  3.33s/it]

found 530, 8289837, 19


  9%|▉         | 556/6352 [30:43<5:20:15,  3.32s/it]

found 555, 8123234, 20


  9%|▉         | 575/6352 [31:37<5:17:48,  3.30s/it]

found 574, 8292411, 21


  9%|▉         | 577/6352 [31:38<5:16:42,  3.29s/it]

found 576, 8166718, 22


  9%|▉         | 589/6352 [32:24<5:17:08,  3.30s/it]

found 588, 8297738, 23


 10%|▉         | 621/6352 [33:42<5:11:01,  3.26s/it]

found 620, 8106186, 24


 10%|▉         | 623/6352 [33:42<5:10:01,  3.25s/it]

found 622, 8185850, 25


 10%|▉         | 625/6352 [33:44<5:09:09,  3.24s/it]

found 624, 8201268, 26


 10%|▉         | 628/6352 [33:46<5:07:51,  3.23s/it]

found 627, 8123510, 27


 10%|▉         | 632/6352 [33:54<5:06:54,  3.22s/it]

found 631, 8206358, 28


 11%|█         | 670/6352 [36:10<5:06:47,  3.24s/it]

found 669, 8327128, 29


 11%|█         | 676/6352 [36:19<5:04:57,  3.22s/it]

found 675, 8263886, 30


 11%|█         | 679/6352 [36:23<5:03:59,  3.22s/it]

found 678, 8250085, 31


 11%|█         | 707/6352 [38:36<5:08:15,  3.28s/it]

found 706, 8140786, 32


 12%|█▏        | 750/6352 [41:26<5:09:31,  3.32s/it]

found 749, 8151694, 33


 12%|█▏        | 761/6352 [42:39<5:13:21,  3.36s/it]

found 760, 8291754, 34


 12%|█▏        | 762/6352 [42:46<5:13:49,  3.37s/it]

found 761, 8151323, 35


 12%|█▏        | 774/6352 [43:25<5:12:57,  3.37s/it]

found 773, 8113316, 36


 12%|█▏        | 777/6352 [43:34<5:12:42,  3.37s/it]

found 776, 8226943, 37


 13%|█▎        | 823/6352 [46:18<5:11:07,  3.38s/it]

found 822, 8106402, 38


 13%|█▎        | 829/6352 [46:39<5:10:54,  3.38s/it]

found 828, 8200899, 39


 13%|█▎        | 845/6352 [47:31<5:09:41,  3.37s/it]

found 844, 8163144, 40


 14%|█▎        | 867/6352 [48:42<5:08:06,  3.37s/it]

found 866, 8238313, 41


 14%|█▎        | 868/6352 [48:42<5:07:47,  3.37s/it]

found 867, 8325073, 42


 14%|█▍        | 879/6352 [49:38<5:09:06,  3.39s/it]

found 878, 8145369, 43


 14%|█▍        | 898/6352 [50:48<5:08:37,  3.40s/it]

found 897, 8129836, 44


 15%|█▍        | 922/6352 [51:47<5:04:59,  3.37s/it]

found 921, 8088025, 45


 15%|█▍        | 928/6352 [52:08<5:04:45,  3.37s/it]

found 927, 8132770, 46


 15%|█▍        | 930/6352 [52:23<5:05:25,  3.38s/it]

found 929, 8263863, 47


 15%|█▍        | 931/6352 [52:24<5:05:06,  3.38s/it]

found 930, 8329172, 48


 15%|█▌        | 964/6352 [56:21<5:15:00,  3.51s/it]

found 963, 8112505, 49


 16%|█▌        | 992/6352 [1:00:25<5:26:30,  3.65s/it]

found 991, 8274319, 50


 16%|█▌        | 1018/6352 [1:04:06<5:35:53,  3.78s/it]

found 1017, 8179078, 51


 16%|█▋        | 1045/6352 [1:10:10<5:56:23,  4.03s/it]

found 1044, 8173208, 52


 16%|█▋        | 1046/6352 [1:10:11<5:56:03,  4.03s/it]

found 1045, 8279279, 53


 17%|█▋        | 1063/6352 [1:14:12<6:09:11,  4.19s/it]

found 1062, 8234395, 54


 17%|█▋        | 1072/6352 [1:15:04<6:09:48,  4.20s/it]

found 1071, 8333054, 55


 17%|█▋        | 1080/6352 [1:16:32<6:13:36,  4.25s/it]

found 1079, 8108040, 56


 17%|█▋        | 1097/6352 [1:19:15<6:19:40,  4.34s/it]

found 1096, 8149102, 57


 17%|█▋        | 1103/6352 [1:21:06<6:25:58,  4.41s/it]

found 1102, 8150616, 58


 17%|█▋        | 1110/6352 [1:22:21<6:28:55,  4.45s/it]

found 1109, 8300293, 59


 18%|█▊        | 1131/6352 [1:26:40<6:40:08,  4.60s/it]

found 1130, 8219729, 60


 19%|█▊        | 1180/6352 [1:32:27<6:45:16,  4.70s/it]

found 1179, 8129444, 61


 19%|█▉        | 1208/6352 [1:34:59<6:44:32,  4.72s/it]

found 1207, 8191678, 62


 19%|█▉        | 1222/6352 [1:36:34<6:45:25,  4.74s/it]

found 1221, 8264060, 63


 19%|█▉        | 1234/6352 [1:38:33<6:48:44,  4.79s/it]

found 1233, 8263645, 64


 20%|██        | 1292/6352 [1:47:40<7:01:42,  5.00s/it]

found 1291, 8143726, 65


 20%|██        | 1301/6352 [1:48:41<7:02:00,  5.01s/it]

found 1300, 8289171, 66


 21%|██        | 1313/6352 [1:51:38<7:08:26,  5.10s/it]

found 1312, 8205421, 67


 21%|██        | 1315/6352 [1:51:58<7:08:56,  5.11s/it]

found 1314, 8172617, 68


 21%|██▏       | 1355/6352 [1:58:45<7:17:59,  5.26s/it]

found 1354, 8241251, 69


 21%|██▏       | 1359/6352 [1:59:07<7:17:38,  5.26s/it]

found 1358, 8265321, 70


 22%|██▏       | 1390/6352 [2:07:18<7:34:27,  5.50s/it]

found 1389, 8248851, 71


 22%|██▏       | 1412/6352 [2:11:34<7:40:18,  5.59s/it]

found 1411, 8095417, 72


 23%|██▎       | 1488/6352 [2:22:20<7:45:18,  5.74s/it]

found 1487, 8312484, 73


 23%|██▎       | 1492/6352 [2:22:58<7:45:44,  5.75s/it]

found 1491, 8107663, 74


 24%|██▍       | 1511/6352 [2:26:13<7:48:30,  5.81s/it]

found 1510, 8200626, 75


 24%|██▍       | 1520/6352 [2:28:37<7:52:27,  5.87s/it]

Skip 8 beginning key cand: 8000724, 10


 24%|██▍       | 1535/6352 [2:30:37<7:52:41,  5.89s/it]

found 1534, 8214213, 76


 24%|██▍       | 1550/6352 [2:32:48<7:53:25,  5.92s/it]

found 1549, 8197775, 77


 24%|██▍       | 1556/6352 [2:34:01<7:54:45,  5.94s/it]

found 1555, 8315812, 78


 25%|██▌       | 1590/6352 [2:37:39<7:52:11,  5.95s/it]

found 1589, 8325093, 79


 25%|██▌       | 1600/6352 [2:39:22<7:53:19,  5.98s/it]

found 1599, 8291452, 80


 25%|██▌       | 1603/6352 [2:39:39<7:53:01,  5.98s/it]

found 1602, 8100537, 81


 25%|██▌       | 1608/6352 [2:40:01<7:52:08,  5.97s/it]

found 1607, 8139945, 82


 26%|██▌       | 1624/6352 [2:43:09<7:55:01,  6.03s/it]

Exception: no end tagname found

In [89]:
import pickle

In [90]:
len(patent_grants_dic)

82

In [91]:
# Persist the current contents of patent_grants_dic to disk with pickle
# (the file is opened in binary mode, as pickle.dump requires).
with open("../data/patent_grants_82dic.dat", 'wb') as f:
    pickle.dump(patent_grants_dic, f)

In [93]:
# Persist dict_list (the sequence of (key, location-list) pairs that the
# search loops iterate over) so the same candidate ordering can be reloaded.
with open("../data/patent_grants_candidate_tuplist.dat", 'wb') as f:
    pickle.dump(dict_list, f)

In [94]:
# Keys of every patent resolved so far, as a set for fast membership tests.
founded_keys = set(patent_grants_dic)

In [97]:
# For each candidate in dict_list whose key was resolved, show the key together
# with the number of index locations that candidate had.
[(key, len(tuplist)) for key, tuplist in dict_list if key in founded_keys]

[('8202487', 1),
 ('8114845', 2),
 ('8140012', 1),
 ('8144683', 1),
 ('8138863', 1),
 ('8095929', 2),
 ('8289665', 1),
 ('8087315', 2),
 ('8322805', 1),
 ('8147894', 1),
 ('8118868', 1),
 ('8101436', 1),
 ('8331747', 1),
 ('8239267', 1),
 ('8307045', 1),
 ('8169523', 1),
 ('8270788', 1),
 ('8199013', 1),
 ('8289837', 1),
 ('8123234', 1),
 ('8292411', 1),
 ('8166718', 1),
 ('8297738', 1),
 ('8106186', 1),
 ('8185850', 1),
 ('8201268', 2),
 ('8123510', 1),
 ('8206358', 1),
 ('8327128', 1),
 ('8263886', 1),
 ('8250085', 1),
 ('8140786', 3),
 ('8151694', 1),
 ('8291754', 1),
 ('8151323', 1),
 ('8113316', 1),
 ('8226943', 2),
 ('8106402', 1),
 ('8200899', 1),
 ('8163144', 1),
 ('8238313', 1),
 ('8325073', 1),
 ('8145369', 2),
 ('8129836', 1),
 ('8088025', 2),
 ('8132770', 1),
 ('8263863', 1),
 ('8329172', 1),
 ('8112505', 3),
 ('8274319', 1),
 ('8179078', 1),
 ('8173208', 1),
 ('8279279', 1),
 ('8234395', 2),
 ('8333054', 1),
 ('8108040', 1),
 ('8149102', 1),
 ('8150616', 1),
 ('8300293', 1

In [None]:
# Continue the candidate scan, skipping indices up to 1607 (the last index
# reported as "found" in the earlier run's output).
for i, (key_cand, tup_list) in tqdm.tqdm(list(enumerate(dict_list))):
    if i <= 1607:
        continue

    # Candidates with more than five locations are never resolved here; only
    # those whose key starts with "8" are reported before moving on.
    if len(tup_list) > 5:
        if key_cand.startswith("8"):
            print("Skip 8 beginning key cand: {}, {}".format(key_cand, len(tup_list)))
        continue

    try:
        real_pat = get_real_patent("us-patent-grant", key_cand, tup_list)
        if real_pat:
            patent_grants_dic[key_cand] = real_pat
            print("found {}, {}, {}".format(i, key_cand, len(patent_grants_dic)))
    except ValueError:
        # A ValueError from the lookup is logged and the candidate is skipped.
        print("No end tag found, skip. {}:{}".format(i, key_cand))
            


  0%|          | 0/6352 [00:00<?, ?it/s][A
 25%|██▌       | 1610/6352 [00:01<00:03, 1371.87it/s][A
Exception in thread Thread-8:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



No end tag found, skip. 1625:2705223
No end tag found, skip. 1641:8198448
found 1647, 8126927, 83


 26%|██▌       | 1660/6352 [03:35<10:09,  7.69it/s]  

found 1659, 8098918, 84


 26%|██▌       | 1663/6352 [03:38<10:15,  7.62it/s]

found 1662, 8294418, 85


 26%|██▋       | 1670/6352 [04:00<11:14,  6.94it/s]

found 1669, 8217691, 86


 27%|██▋       | 1700/6352 [05:40<15:32,  4.99it/s]

found 1699, 8244674, 87


 27%|██▋       | 1720/6352 [07:08<19:13,  4.01it/s]

found 1719, 8140358, 88


 28%|██▊       | 1769/6352 [09:36<24:52,  3.07it/s]

found 1768, 8236139, 89


 28%|██▊       | 1780/6352 [10:57<28:07,  2.71it/s]

found 1779, 8337372, 90


 28%|██▊       | 1782/6352 [10:58<28:09,  2.71it/s]

found 1781, 8124477, 91


 28%|██▊       | 1796/6352 [11:13<28:27,  2.67it/s]

found 1795, 8127704, 92


 28%|██▊       | 1800/6352 [11:27<28:57,  2.62it/s]

found 1799, 8114011, 93


 29%|██▊       | 1812/6352 [12:22<30:59,  2.44it/s]

found 1811, 8297173, 94


 29%|██▉       | 1845/6352 [14:22<35:06,  2.14it/s]

found 1844, 8305453, 95


 29%|██▉       | 1849/6352 [14:37<35:37,  2.11it/s]

found 1848, 8097894, 96


 29%|██▉       | 1871/6352 [18:14<43:41,  1.71it/s]

found 1870, 8242476, 97


 30%|██▉       | 1896/6352 [23:50<56:02,  1.33it/s]

found 1895, 8242342, 98


 30%|███       | 1915/6352 [26:09<1:00:36,  1.22it/s]

found 1914, 8326533, 99


 30%|███       | 1922/6352 [27:17<1:02:54,  1.17it/s]

found 1921, 8268141, 100


 31%|███       | 1942/6352 [30:34<1:09:24,  1.06it/s]

found 1941, 8328659, 101


 31%|███       | 1950/6352 [31:43<1:11:36,  1.02it/s]

found 1949, 8204103, 102


 31%|███       | 1956/6352 [33:02<1:14:16,  1.01s/it]

found 1955, 8277780, 103


 31%|███       | 1967/6352 [35:25<1:18:59,  1.08s/it]

found 1966, 8250569, 104


 31%|███▏      | 1992/6352 [40:00<1:27:33,  1.20s/it]

found 1991, 8197048, 105


 32%|███▏      | 2006/6352 [41:51<1:30:40,  1.25s/it]

found 2005, 8187636, 106


 32%|███▏      | 2019/6352 [42:35<1:31:24,  1.27s/it]

found 2018, 8167127, 107


 33%|███▎      | 2069/6352 [51:19<1:46:14,  1.49s/it]

found 2068, 8241077, 108


 33%|███▎      | 2076/6352 [52:57<1:49:05,  1.53s/it]

found 2075, 8326951, 109


 33%|███▎      | 2078/6352 [53:29<1:50:00,  1.54s/it]

found 2077, 8104787, 110


 33%|███▎      | 2084/6352 [53:40<1:49:54,  1.55s/it]

found 2083, 8323070, 111


 33%|███▎      | 2093/6352 [55:15<1:52:27,  1.58s/it]

found 2092, 8171948, 112


 33%|███▎      | 2109/6352 [56:37<1:53:55,  1.61s/it]

found 2108, 8301639, 113


 34%|███▎      | 2134/6352 [1:00:47<2:00:09,  1.71s/it]

found 2133, 8176508, 114


 34%|███▍      | 2169/6352 [1:07:18<2:09:48,  1.86s/it]

found 2168, 8341213, 115


 34%|███▍      | 2191/6352 [1:10:09<2:13:14,  1.92s/it]

found 2190, 8235081, 116


 35%|███▍      | 2204/6352 [1:12:43<2:16:52,  1.98s/it]

found 2203, 8194173, 117


 35%|███▍      | 2210/6352 [1:13:28<2:17:41,  1.99s/it]

found 2209, 8284085, 118


 35%|███▌      | 2249/6352 [1:17:24<2:21:12,  2.07s/it]

found 2248, 8247569, 119


 37%|███▋      | 2333/6352 [1:31:14<2:37:10,  2.35s/it]

found 2332, 8221765, 120


 37%|███▋      | 2342/6352 [1:32:56<2:39:08,  2.38s/it]

found 2341, 8148885, 121


 37%|███▋      | 2375/6352 [1:37:20<2:42:59,  2.46s/it]

found 2374, 8304794, 122


 38%|███▊      | 2398/6352 [1:39:34<2:44:10,  2.49s/it]

found 2397, 8317239, 123


 39%|███▊      | 2451/6352 [1:45:13<2:47:28,  2.58s/it]

found 2450, 8327561, 124


 39%|███▉      | 2463/6352 [1:47:39<2:49:58,  2.62s/it]

found 2462, 8200668, 125


 39%|███▉      | 2472/6352 [1:49:31<2:51:54,  2.66s/it]

found 2471, 8139284, 126


 40%|███▉      | 2516/6352 [1:58:08<3:00:06,  2.82s/it]

found 2515, 8270274, 127


 40%|███▉      | 2519/6352 [1:58:09<2:59:47,  2.81s/it]

found 2518, 8186671, 128


 40%|███▉      | 2539/6352 [2:01:02<3:01:47,  2.86s/it]

found 2538, 8188714, 129


 40%|████      | 2561/6352 [2:03:21<3:02:36,  2.89s/it]

found 2560, 8299588, 130


 41%|████      | 2575/6352 [2:04:56<3:03:15,  2.91s/it]

found 2574, 8254848, 131


 41%|████      | 2585/6352 [2:06:58<3:05:01,  2.95s/it]

found 2584, 8241819, 132


 41%|████      | 2587/6352 [2:07:52<3:06:06,  2.97s/it]

found 2586, 8091547, 133


 41%|████      | 2599/6352 [2:09:00<3:06:17,  2.98s/it]

found 2598, 8213413, 134


 41%|████      | 2605/6352 [2:10:15<3:07:21,  3.00s/it]

found 2604, 8136314, 135


 41%|████      | 2615/6352 [2:12:01<3:08:40,  3.03s/it]

found 2614, 8288831, 136


 42%|████▏     | 2646/6352 [2:17:05<3:12:01,  3.11s/it]

found 2645, 8247940, 137


 42%|████▏     | 2671/6352 [2:21:18<3:14:44,  3.17s/it]

found 2670, 8246726, 138


 42%|████▏     | 2674/6352 [2:21:56<3:15:14,  3.19s/it]

found 2673, 8112794, 139


 42%|████▏     | 2683/6352 [2:22:19<3:14:38,  3.18s/it]

found 2682, 8257317, 140


 43%|████▎     | 2717/6352 [2:26:54<3:16:32,  3.24s/it]

found 2716, 8101434, 141


 43%|████▎     | 2721/6352 [2:26:55<3:16:04,  3.24s/it]

found 2720, 8101279, 142


 43%|████▎     | 2739/6352 [2:29:09<3:16:44,  3.27s/it]

found 2738, 8222936, 143


 44%|████▎     | 2771/6352 [2:34:00<3:19:01,  3.33s/it]

found 2770, 8266698, 144


 44%|████▎     | 2775/6352 [2:34:37<3:19:18,  3.34s/it]

found 2774, 8321497, 145


 44%|████▍     | 2800/6352 [2:38:20<3:20:52,  3.39s/it]

found 2799, 8214943, 146


 44%|████▍     | 2807/6352 [2:38:23<3:20:02,  3.39s/it]

found 2806, 8166155, 147


 44%|████▍     | 2822/6352 [2:40:32<3:20:49,  3.41s/it]

No end tag found, skip. 2821:5166207


 45%|████▍     | 2831/6352 [2:42:07<3:21:37,  3.44s/it]

found 2830, 8246140, 148


 45%|████▍     | 2840/6352 [2:42:41<3:21:10,  3.44s/it]

No end tag found, skip. 2839:20090075333


 45%|████▍     | 2846/6352 [2:42:45<3:20:30,  3.43s/it]

found 2845, 8144997, 149


 45%|████▍     | 2850/6352 [2:43:16<3:20:37,  3.44s/it]

found 2849, 8160701, 150


 45%|████▍     | 2856/6352 [2:43:39<3:20:20,  3.44s/it]

found 2855, 8157790, 151


 45%|████▌     | 2878/6352 [2:46:28<3:20:56,  3.47s/it]

found 2877, 8223033, 152


 45%|████▌     | 2883/6352 [2:47:17<3:21:17,  3.48s/it]

found 2882, 8170705, 153


 45%|████▌     | 2888/6352 [2:47:51<3:21:19,  3.49s/it]

found 2887, 8123375, 154


 45%|████▌     | 2890/6352 [2:47:52<3:21:06,  3.49s/it]

found 2889, 8209628, 155


 46%|████▌     | 2898/6352 [2:50:24<3:23:05,  3.53s/it]

found 2897, 8184908, 156


 46%|████▌     | 2907/6352 [2:51:24<3:23:08,  3.54s/it]

found 2906, 8292984, 157


 46%|████▌     | 2914/6352 [2:52:01<3:22:58,  3.54s/it]

found 2913, 8285833, 158


 46%|████▋     | 2948/6352 [2:58:32<3:26:09,  3.63s/it]

found 2947, 8243749, 159


 46%|████▋     | 2950/6352 [2:59:08<3:26:35,  3.64s/it]

found 2949, 8306922, 160


 47%|████▋     | 2973/6352 [3:02:11<3:27:04,  3.68s/it]

found 2972, 8274671, 161


 47%|████▋     | 2974/6352 [3:02:12<3:26:57,  3.68s/it]

found 2973, 8288090, 162


 47%|████▋     | 2976/6352 [3:02:29<3:27:01,  3.68s/it]

found 2975, 8227462, 163


 47%|████▋     | 2978/6352 [3:02:30<3:26:46,  3.68s/it]

found 2977, 8147165, 164


 47%|████▋     | 2981/6352 [3:02:32<3:26:25,  3.67s/it]

found 2980, 8268828, 165


 47%|████▋     | 2984/6352 [3:02:50<3:26:22,  3.68s/it]

found 2983, 8151016, 166


 47%|████▋     | 3004/6352 [3:06:40<3:28:03,  3.73s/it]

found 3003, 8201370, 167


 47%|████▋     | 3007/6352 [3:07:24<3:28:28,  3.74s/it]

found 3006, 8169016, 168


 48%|████▊     | 3055/6352 [3:14:25<3:29:49,  3.82s/it]

found 3054, 8251233, 169


 49%|████▉     | 3102/6352 [3:21:38<3:31:15,  3.90s/it]

found 3101, 8203685, 170


 50%|████▉     | 3164/6352 [3:32:16<3:33:52,  4.03s/it]

found 3163, 8240614, 171


 50%|████▉     | 3166/6352 [3:32:20<3:33:41,  4.02s/it]

found 3165, 8305196, 172


 50%|█████     | 3180/6352 [3:34:50<3:34:17,  4.05s/it]

found 3179, 8146514, 173


 50%|█████     | 3189/6352 [3:35:46<3:34:01,  4.06s/it]

found 3188, 8193983, 174


 51%|█████     | 3209/6352 [3:38:50<3:34:20,  4.09s/it]

found 3208, 8299555, 175


 51%|█████     | 3210/6352 [3:39:04<3:34:25,  4.09s/it]

found 3209, 8203954, 176


 51%|█████     | 3252/6352 [3:44:59<3:34:28,  4.15s/it]

found 3251, 8260914, 177


 51%|█████     | 3255/6352 [3:45:01<3:34:05,  4.15s/it]

found 3254, 8206243, 178


 51%|█████▏    | 3256/6352 [3:45:01<3:33:58,  4.15s/it]

found 3255, 8218397, 179


 52%|█████▏    | 3310/6352 [3:52:33<3:33:43,  4.22s/it]

found 3309, 8274040, 180


 52%|█████▏    | 3311/6352 [3:52:46<3:33:47,  4.22s/it]

found 3310, 8260725, 181


 52%|█████▏    | 3319/6352 [3:53:16<3:33:10,  4.22s/it]

found 3318, 8104775, 182


 53%|█████▎    | 3347/6352 [4:00:06<3:35:34,  4.30s/it]

found 3346, 8328018, 183


 53%|█████▎    | 3362/6352 [4:04:19<3:37:17,  4.36s/it]

found 3361, 8228112, 184


 53%|█████▎    | 3365/6352 [4:04:22<3:36:55,  4.36s/it]

found 3364, 8184983, 185


 53%|█████▎    | 3392/6352 [4:09:48<3:37:59,  4.42s/it]

found 3391, 8274097, 186


 53%|█████▎    | 3396/6352 [4:10:26<3:37:59,  4.42s/it]

found 3395, 8167280, 187


 54%|█████▎    | 3413/6352 [4:11:41<3:36:44,  4.42s/it]

found 3412, 8297723, 188


 54%|█████▍    | 3429/6352 [4:14:18<3:36:47,  4.45s/it]

found 3428, 8100725, 189


 54%|█████▍    | 3439/6352 [4:15:42<3:36:35,  4.46s/it]

found 3438, 8337293, 190


 54%|█████▍    | 3445/6352 [4:16:17<3:36:15,  4.46s/it]

found 3444, 8210101, 191


 55%|█████▍    | 3472/6352 [4:20:39<3:36:13,  4.50s/it]

found 3471, 8274093, 192


 55%|█████▍    | 3477/6352 [4:22:01<3:36:39,  4.52s/it]

found 3476, 8267812, 193


 55%|█████▌    | 3494/6352 [4:25:36<3:37:15,  4.56s/it]

found 3493, 8201736, 194


 56%|█████▌    | 3541/6352 [4:30:39<3:34:51,  4.59s/it]

found 3540, 8147458, 195


 56%|█████▌    | 3552/6352 [4:31:35<3:34:05,  4.59s/it]

found 3551, 8335703, 196


 56%|█████▌    | 3566/6352 [4:33:21<3:33:33,  4.60s/it]

found 3565, 8263760, 197


 56%|█████▋    | 3581/6352 [4:35:24<3:33:06,  4.61s/it]

found 3580, 8235235, 198


 57%|█████▋    | 3623/6352 [4:41:55<3:32:21,  4.67s/it]

found 3622, 8280525, 199


 57%|█████▋    | 3651/6352 [4:45:56<3:31:32,  4.70s/it]

found 3650, 8215370, 200


 58%|█████▊    | 3667/6352 [4:47:08<3:30:14,  4.70s/it]

found 3666, 8131377, 201


 58%|█████▊    | 3679/6352 [4:48:17<3:29:27,  4.70s/it]

found 3678, 8326468, 202


 58%|█████▊    | 3703/6352 [4:51:31<3:28:32,  4.72s/it]

found 3702, 8283802, 203


 58%|█████▊    | 3707/6352 [4:51:34<3:28:02,  4.72s/it]

found 3706, 8279687, 204


 59%|█████▊    | 3719/6352 [4:52:28<3:27:03,  4.72s/it]

found 3718, 8327784, 205


 59%|█████▊    | 3721/6352 [4:52:48<3:27:01,  4.72s/it]

found 3720, 8093914, 206


 59%|█████▊    | 3723/6352 [4:52:50<3:26:47,  4.72s/it]

found 3722, 8232905, 207


 59%|█████▊    | 3725/6352 [4:53:20<3:26:52,  4.72s/it]

No end tag found, skip. 3724:5604209


 59%|█████▉    | 3750/6352 [4:58:49<3:27:20,  4.78s/it]

found 3749, 8147344, 208


 59%|█████▉    | 3779/6352 [5:02:44<3:26:07,  4.81s/it]

found 3778, 8109469, 209


 60%|█████▉    | 3790/6352 [5:04:05<3:25:33,  4.81s/it]

found 3789, 8186733, 210


 60%|█████▉    | 3800/6352 [5:05:21<3:25:04,  4.82s/it]

No end tag found, skip. 3799:4447001


 61%|██████    | 3853/6352 [5:12:47<3:22:52,  4.87s/it]

found 3852, 8220730, 211


 61%|██████    | 3854/6352 [5:13:04<3:22:55,  4.87s/it]

Skip 8 beginning key cand: 8065598, 6


 61%|██████    | 3856/6352 [5:13:09<3:22:42,  4.87s/it]

found 3855, 8187423, 212


 61%|██████    | 3862/6352 [5:13:44<3:22:17,  4.87s/it]

found 3861, 8203966, 213


 61%|██████▏   | 3892/6352 [5:19:55<3:22:12,  4.93s/it]

found 3891, 8147867, 214


 62%|██████▏   | 3922/6352 [5:26:05<3:22:02,  4.99s/it]

found 3921, 8118466, 215


 62%|██████▏   | 3926/6352 [5:26:09<3:21:32,  4.98s/it]

found 3925, 8108252, 216


 62%|██████▏   | 3931/6352 [5:26:41<3:21:12,  4.99s/it]

found 3930, 8314637, 217


 62%|██████▏   | 3942/6352 [5:27:56<3:20:29,  4.99s/it]

found 3941, 8119202, 218


 62%|██████▏   | 3957/6352 [5:30:06<3:19:48,  5.01s/it]

found 3956, 8094521, 219


 62%|██████▏   | 3965/6352 [5:31:11<3:19:22,  5.01s/it]

found 3964, 8181404, 220


 63%|██████▎   | 3975/6352 [5:32:14<3:18:40,  5.01s/it]

found 3974, 8175425, 221


 63%|██████▎   | 3997/6352 [5:35:12<3:17:30,  5.03s/it]

found 3996, 8169839, 222


 63%|██████▎   | 4003/6352 [5:36:23<3:17:23,  5.04s/it]

found 4002, 8288871, 223


 63%|██████▎   | 4005/6352 [5:36:41<3:17:18,  5.04s/it]

No end tag found, skip. 4004:20090130692


 63%|██████▎   | 4007/6352 [5:37:02<3:17:14,  5.05s/it]

found 4006, 8320023, 224


 63%|██████▎   | 4010/6352 [5:38:04<3:17:27,  5.06s/it]

found 4009, 8118365, 225


 63%|██████▎   | 4019/6352 [5:38:21<3:16:24,  5.05s/it]

found 4018, 8088154, 226


 63%|██████▎   | 4022/6352 [5:38:57<3:16:21,  5.06s/it]

found 4021, 8235086, 227


 64%|██████▍   | 4063/6352 [5:48:38<3:16:25,  5.15s/it]

found 4062, 8189978, 228


 64%|██████▍   | 4084/6352 [5:53:01<3:16:02,  5.19s/it]

found 4083, 8293322, 229


 65%|██████▍   | 4099/6352 [5:53:43<3:14:25,  5.18s/it]

found 4098, 8152776, 230


 65%|██████▍   | 4112/6352 [5:54:52<3:13:19,  5.18s/it]

found 4111, 8180747, 231


 65%|██████▍   | 4120/6352 [5:55:58<3:12:50,  5.18s/it]

found 4119, 8135497, 232


 65%|██████▌   | 4142/6352 [6:01:12<3:12:43,  5.23s/it]

found 4141, 8193031, 233


 65%|██████▌   | 4159/6352 [6:02:58<3:11:23,  5.24s/it]

found 4158, 8099915, 234


 66%|██████▌   | 4163/6352 [6:04:02<3:11:25,  5.25s/it]

found 4162, 8188901, 235


 66%|██████▌   | 4198/6352 [6:09:10<3:09:25,  5.28s/it]

No end tag found, skip. 4197:8314216


 66%|██████▌   | 4200/6352 [6:09:27<3:09:18,  5.28s/it]

found 4199, 8278176, 236


 66%|██████▋   | 4218/6352 [6:13:19<3:08:52,  5.31s/it]

found 4217, 8162214, 237


 66%|██████▋   | 4219/6352 [6:13:19<3:08:44,  5.31s/it]

found 4218, 8203537, 238


 67%|██████▋   | 4246/6352 [6:16:55<3:06:57,  5.33s/it]

found 4245, 8193555, 239


 67%|██████▋   | 4255/6352 [6:17:28<3:06:02,  5.32s/it]

found 4254, 8322863, 240


 67%|██████▋   | 4270/6352 [6:20:21<3:05:27,  5.34s/it]

found 4269, 8229856, 241


 68%|██████▊   | 4305/6352 [6:25:18<3:03:12,  5.37s/it]

found 4304, 8314870, 242


 68%|██████▊   | 4314/6352 [6:26:52<3:02:46,  5.38s/it]

found 4313, 8274098, 243


 69%|██████▊   | 4360/6352 [6:33:56<2:59:59,  5.42s/it]

found 4359, 8103111, 244


 69%|██████▉   | 4375/6352 [6:36:35<2:59:12,  5.44s/it]

found 4374, 8118639, 245


 69%|██████▉   | 4379/6352 [6:37:29<2:59:05,  5.45s/it]

found 4378, 8297743, 246


 69%|██████▉   | 4387/6352 [6:39:29<2:58:56,  5.46s/it]

found 4386, 8257901, 247


 69%|██████▉   | 4392/6352 [6:40:27<2:58:42,  5.47s/it]

found 4391, 8270369, 248


 69%|██████▉   | 4406/6352 [6:43:15<2:58:06,  5.49s/it]

found 4405, 8162603, 249


 69%|██████▉   | 4414/6352 [6:44:34<2:57:37,  5.50s/it]

found 4413, 8150460, 250


 70%|██████▉   | 4422/6352 [6:45:49<2:57:07,  5.51s/it]

found 4421, 8260482, 251


 70%|███████   | 4461/6352 [6:52:47<2:54:58,  5.55s/it]

No end tag found, skip. 4460:6074642


 71%|███████   | 4493/6352 [6:58:00<2:52:57,  5.58s/it]

found 4492, 8175554, 252


 72%|███████▏  | 4551/6352 [7:04:19<2:47:55,  5.59s/it]

found 4550, 8105218, 253


 72%|███████▏  | 4567/6352 [7:06:29<2:46:41,  5.60s/it]

found 4566, 8252304, 254


 72%|███████▏  | 4571/6352 [7:07:11<2:46:26,  5.61s/it]

found 4570, 8191952, 255


 72%|███████▏  | 4576/6352 [7:08:34<2:46:19,  5.62s/it]

found 4575, 8317304, 256


 73%|███████▎  | 4613/6352 [7:14:14<2:43:42,  5.65s/it]

found 4612, 8322832, 257


 73%|███████▎  | 4624/6352 [7:15:30<2:42:45,  5.65s/it]

found 4623, 8194835, 258


 73%|███████▎  | 4627/6352 [7:16:09<2:42:36,  5.66s/it]

found 4626, 8319408, 259


 73%|███████▎  | 4632/6352 [7:16:49<2:42:12,  5.66s/it]

found 4631, 8229384, 260


 73%|███████▎  | 4648/6352 [7:19:01<2:40:57,  5.67s/it]

found 4647, 8254802, 261


 73%|███████▎  | 4649/6352 [7:19:18<2:40:55,  5.67s/it]

found 4648, 8180034, 262


 74%|███████▎  | 4679/6352 [7:22:59<2:38:23,  5.68s/it]

found 4678, 8246272, 263


 74%|███████▎  | 4680/6352 [7:23:00<2:38:16,  5.68s/it]

found 4679, 8283401, 264


 74%|███████▍  | 4694/6352 [7:24:54<2:37:08,  5.69s/it]

found 4693, 8215125, 265


 74%|███████▍  | 4706/6352 [7:26:55<2:36:19,  5.70s/it]

found 4705, 8241932, 266


 74%|███████▍  | 4713/6352 [7:28:06<2:35:50,  5.70s/it]

found 4712, 8153965, 267


 74%|███████▍  | 4715/6352 [7:28:42<2:35:47,  5.71s/it]

found 4714, 8310629, 268


 74%|███████▍  | 4723/6352 [7:29:25<2:35:00,  5.71s/it]

found 4722, 8299862, 269


 75%|███████▍  | 4738/6352 [7:31:15<2:33:43,  5.71s/it]

found 4737, 8200637, 270


 75%|███████▍  | 4748/6352 [7:32:21<2:32:48,  5.72s/it]

found 4747, 8222522, 271


 75%|███████▌  | 4770/6352 [7:36:58<2:31:33,  5.75s/it]

found 4769, 8109216, 272


 75%|███████▌  | 4784/6352 [7:38:22<2:30:14,  5.75s/it]

found 4783, 8263568, 273


 75%|███████▌  | 4785/6352 [7:38:23<2:30:06,  5.75s/it]

found 4784, 8256159, 274


 75%|███████▌  | 4787/6352 [7:38:38<2:29:56,  5.75s/it]

found 4786, 8171211, 275


 76%|███████▌  | 4813/6352 [7:41:37<2:27:36,  5.75s/it]

found 4812, 8113723, 276


 76%|███████▌  | 4827/6352 [7:43:16<2:26:21,  5.76s/it]

found 4826, 8090426, 277


 76%|███████▌  | 4840/6352 [7:45:51<2:25:31,  5.78s/it]

found 4839, 8125025, 278


 76%|███████▋  | 4845/6352 [7:46:27<2:25:05,  5.78s/it]

found 4844, 8202295, 279


 77%|███████▋  | 4884/6352 [7:52:16<2:21:57,  5.80s/it]

found 4883, 8271878, 280


 77%|███████▋  | 4898/6352 [7:55:37<2:21:11,  5.83s/it]

found 4897, 8246292, 281


 78%|███████▊  | 4928/6352 [8:00:57<2:18:58,  5.86s/it]

No end tag found, skip. 4927:6399146


 78%|███████▊  | 4943/6352 [8:02:56<2:17:39,  5.86s/it]

In [None]:
3+4

In [112]:
len(patent_grants_dic)

351

In [113]:
# Persist the full patent_grants_dic to disk with pickle (binary mode is
# required by pickle.dump).
with open("../data/patent_grants_dic_grants12_app17.dat", 'wb') as f:
    pickle.dump(patent_grants_dic, f)

In [114]:
list(patent_grants_dic.keys())[0:5]

['8277780', '8129444', '8299862', '8146514', '8148885']

In [116]:
# patent_grants_dic['8148885']

In [84]:
dict_list[83]

('8140012', [['grants2012/ipg120320.xml', 5011666]])

In [86]:
get_real_patent("us-patent-grant", dict_list[83][0], dict_list[83][1])

In [70]:
# Probe candidates 9..19 one at a time: print each candidate's index, key and
# number of index locations, then try to resolve it.
# NOTE(review): i, key_cand and tup_list stay bound in the kernel after the
# loop, and other cells inspect them.
for i in range(9, 20):
    key_cand, tup_list = dict_list[i]
    print("{}: {}, {}".format(i, key_cand, len(tup_list)))

    # get_real_patent is defined outside this cell; a truthy result is
    # reported as a match.
    real_pat = get_real_patent("us-patent-grant", key_cand, tup_list)
    if real_pat:
        print("found {}".format(i))

9: 6978184, 6
10: 20070233706, 1
11: 6063078, 40
12: 5980676, 1
13: 6365565, 1
14: 7080960, 1
15: 20090059936, 1
16: 20070203910, 1
17: 5907491, 31
18: 7268118, 2
19: 8202487, 1
found 19


In [71]:
# Probe candidates 20..29 one at a time: print each candidate's index, key and
# number of index locations, then try to resolve it.
# NOTE(review): i, key_cand and tup_list stay bound in the kernel after the
# loop, and other cells inspect them.
for i in range(20, 30):
    key_cand, tup_list = dict_list[i]
    print("{}: {}, {}".format(i, key_cand, len(tup_list)))

    # get_real_patent is defined outside this cell; a truthy result is
    # reported as a match.
    real_pat = get_real_patent("us-patent-grant", key_cand, tup_list)
    if real_pat:
        print("found {}".format(i))

20: 7797430, 4
21: 20100045635, 1
22: 7193644, 7
23: 6562906, 15
24: 5648506, 10
25: 6847481, 1
26: 5905777, 19
27: 5848173, 1
28: 4958625, 16
29: 5533521, 11


In [72]:
# Probe candidates 30..69 one at a time: print each candidate's index, key and
# number of index locations, then try to resolve it.
# NOTE(review): i, key_cand and tup_list stay bound in the kernel after the
# loop, and other cells inspect them.
for i in range(30, 70):
    key_cand, tup_list = dict_list[i]
    print("{}: {}, {}".format(i, key_cand, len(tup_list)))

    # get_real_patent is defined outside this cell; a truthy result is
    # reported as a match.
    real_pat = get_real_patent("us-patent-grant", key_cand, tup_list)
    if real_pat:
        print("found {}".format(i))

30: 7252600, 13
31: 7461077, 9
32: 6172344, 4
33: 5756981, 46
34: 6310839, 1
35: 6180415, 14
36: 20070231401, 1
37: 20080104348, 1
38: 20090138245, 1
39: 6064143, 1
40: 7583001, 1
41: 3675597, 3
42: 6054839, 1
43: 8114845, 2
found 43
44: D155065, 1
45: 20110193637, 1
46: 7579197, 1
47: 6222457, 3
48: 20110037114, 1
49: 4965097, 3
50: 5546932, 11
51: 5589033, 5
52: 20100270553, 1
53: 5281018, 4
54: 6458003, 1
55: 3804292, 2
56: 20050033234, 1
57: 6559731, 1
58: 8004092, 4
59: 5451176, 1
60: 7517363, 4
61: 6496766, 8
62: 20120240927, 1
63: 20090161602, 1
64: 5614206, 5
65: 5518597, 2
66: 3843288, 1
67: 4226236, 6
68: 4978144, 2
69: 7751806, 1


In [56]:
i

8

In [57]:
tup_list

[['grants2012/ipg121106.xml', 5342114],
 ['grants2012/ipg121002.xml', 5626822],
 ['grants2012/ipg120508.xml', 5794299],
 ['grants2012/ipg120508.xml', 5803487],
 ['grants2012/ipg120612.xml', 5407506],
 ['grants2012/ipg120522.xml', 5137393],
 ['grants2012/ipg121127.xml', 1444948],
 ['grants2012/ipg121127.xml', 6560493],
 ['grants2012/ipg120320.xml', 5202882],
 ['grants2012/ipg121016.xml', 6215625],
 ['grants2012/ipg120904.xml', 5781479],
 ['grants2012/ipg120814.xml', 5573793],
 ['grants2012/ipg120814.xml', 5576351],
 ['grants2012/ipg120828.xml', 596660],
 ['grants2012/ipg120828.xml', 5851701],
 ['grants2012/ipg120306.xml', 5691746],
 ['grants2012/ipg120306.xml', 5709177],
 ['grants2012/ipg120306.xml', 5720034],
 ['grants2012/ipg120327.xml', 5498384],
 ['grants2012/ipg120327.xml', 5508518],
 ['grants2012/ipg120327.xml', 5519359],
 ['grants2012/ipg120911.xml', 1390981],
 ['grants2012/ipg120911.xml', 5904173],
 ['grants2012/ipg120911.xml', 5944689],
 ['grants2012/ipg120605.xml', 5929238],
 

In [58]:
key_cand

'6415183'

In [69]:
# Re-run the lookup for whichever candidate is currently bound to the kernel
# globals key_cand / tup_list, reporting the global index i on a truthy result.
real_pat = get_real_patent("us-patent-grant", key_cand, tup_list)
if real_pat:
    print("found {}".format(i))

### The cell below takes far too long to run; there must be a bug.

In [None]:
# Try to resolve every candidate in indexGrants2012_we_need and keep each
# truthy result in patent_grants_dic under the candidate's key.
for key_cand, tup_list in indexGrants2012_we_need.items():
    try:
        real_pat = get_real_patent("us-patent-grant", key_cand, tup_list)
        if real_pat:
            patent_grants_dic[key_cand] = real_pat
    except Exception:
        # Best-effort: report the failing key and carry on with the next one.
        # Catching Exception instead of using a bare `except:` means
        # KeyboardInterrupt is no longer swallowed, so this slow loop can
        # actually be stopped from the notebook.
        print("can't find tag. ignore. {}".format(key_cand))

In [None]:
# Number of patents resolved so far.
len(patent_grants_dic)

In [205]:
real_id1, pat_text1 = get_patent("us-patent-grant", '../data/' + 'grants2012/ipg121030.xml', 5731295)
real_id1

['08300926', '12499630']

In [212]:
real_id2, pat_text2 = get_patent("us-patent-grant", '../data/' + 'grants2012/ipg120724.xml', 3034084)
real_id2

'8227462'

In [142]:
real_id3, pat_text3 = get_patent("us-patent-grant", '../data/' + 'grants2012/ipg121204.xml', 6903688)
real_id3

'08326698'

In [146]:
# Write the lines held in pat_text1 to tmp_out.txt, newline-separated and
# newline-terminated.
with open('tmp_out.txt', 'w') as f:
    f.write("\n".join(pat_text1))
    f.write("\n")

In [None]:
['grants2012/ipg120417.xml', 6067926],
 ['grants2012/ipg121204.xml', 6903406],
 ['grants2012/ipg121204.xml', 6903688]

In [147]:
indexGrants2012_we_need['4242108']

[['grants2012/ipg121106.xml', 1672856],
 ['grants2012/ipg120703.xml', 1727841],
 ['grants2012/ipg120724.xml', 1752025]]

In [152]:
real_id1, pat_text1 = get_patent("us-patent-grant", '../data/' + 'grants2012/ipg121106.xml', 1672856)
real_id1

['08303685', '10565466']

In [153]:
real_id2, pat_text2 = get_patent("us-patent-grant", '../data/' + 'grants2012/ipg120703.xml', 1727841)
real_id2

['08211213', '12632244']

In [154]:
real_id3, pat_text3 = get_patent("us-patent-grant", '../data/' + 'grants2012/ipg120724.xml', 1752025)
real_id3

['08226748', '12602887']

In [170]:
indexGrants2012_we_need['20120181696']

KeyError: '20120181696'

In [171]:
indexGrants2012['20120181696']

KeyError: '20120181696'

In [155]:
list(indexGrants2012_we_need.keys())[5:20]

['6510334',
 '7534866',
 '8227462',
 '20110032802',
 '6768196',
 '5235967',
 '20110035525',
 '5708963',
 '8011686',
 '4254145',
 '7010112',
 '4116456',
 '4835565',
 '7500444',
 '20080181288']

In [167]:
'20150364400'.startswith("2015")

True

In [165]:
list(citations_2017.parsed)[0:5]

['8553488', '20150364400', '20120181696', '20120279760', '20140091294']

In [169]:
# Select the rows whose `parsed` id, rendered as text, begins with "2012".
citations_2017[[str(parsed_id).startswith("2012") for parsed_id in citations_2017.parsed]]

Unnamed: 0,app_id,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa
57732274,15000034,Sun US 2012/0181696,20120181696,IYD99O6CRXEAPX0,103.0,,0,0,1
57732275,15000034,Sun US 2012/0279760,20120279760,IYD99O6CRXEAPX0,103.0,,0,0,1
57732330,15000048,Ke U.S. Pub # Pub # 2012/0049221 A1,20120049221,IYEPR2ZURXEAPX4,103.0,,0,0,1
57732335,15000048,Dai U.S. Pub # Pub # 2012/0087105 A1,20120087105,IYEPR2ZURXEAPX4,103.0,,0,0,1
57733582,15000285,20120245377,20120245377,IV82WOB1RXEAPX5,103.0,,1,0,1
57734766,15000519,20120261760,20120261760,J0B89O3HRXEAPX0,103.0,,1,0,1
57737475,15000943,20120064136,20120064136,IOVFB1E5RXEAPX4,103.0,,1,0,1
57739052,15001133,20120113095,20120113095,J3LJNGP5RXEAPX4,103.0,a,0,1,1
57739055,15001133,20120113095,20120113095,J3XHHLA6RXEAPX4,103.0,a,0,1,1
57739057,15001133,20120113095,20120113095,J3XHHLA6RXEAPX4,103.0,a,0,1,1


In [78]:
res_tup = retrieve_subtext_fast(6067926, 10, '../data/grants2012/ipg120417.xml')

In [79]:
res_tup[0]

10

In [80]:
res_tup[1][10]

'<doc-number>7945484</doc-number>'

In [83]:
list(index2017_we_need.keys())[0:5]

['15146848', '15192621', '15334971', '15049456', '15282523']

In [84]:
index2017_we_need['15146848']

[['application2017/ipa171207.xml', 5183637],
 ['application2017/ipa171207.xml', 5183673]]

In [91]:
res_tup = retrieve_subtext_fast(5183637, 5000, '../data/application2017/ipa171207.xml')

In [96]:
find_nearest_root_element(res_tup[0], res_tup[1], 'us-patent-application')

4667

In [93]:
res_tup[0]

5000

In [97]:
res_tup[1][4667]

'<us-patent-application lang="EN" dtd-version="v4.4 2014-04-03" file="US20170351915A1-20171207.XML" status="PRODUCTION" id="us-patent-application" country="US" date-produced="20171121" date-publ="20171207">'

In [103]:
res_pat = retrieve_begin_end(4667, res_tup[1], 'us-patent-application')

In [105]:
res_pat[0:5]

['<us-patent-application lang="EN" dtd-version="v4.4 2014-04-03" file="US20170351915A1-20171207.XML" status="PRODUCTION" id="us-patent-application" country="US" date-produced="20171121" date-publ="20171207">',
 '<us-bibliographic-data-application lang="EN" country="US">',
 '<publication-reference>',
 '<document-id>',
 '<country>US</country>']

In [104]:
res_pat[-1]

'</us-patent-application>'

In [None]:
subtext = !sed -n ''

# Trial and error

In [74]:
# create_one_index("grants2012/ipg120103.xml")

In [21]:
xmlrel = "application2017/ipa170105.xml"

In [22]:
xmlpath = "../data/"+xmlrel

In [29]:
indexfile=INDEX_PATH+xmlrel+".idx"

In [31]:
!mkdir -p {os.path.dirname(indexfile)}

In [32]:
# Build the index file for one XML file: `grep -n` prefixes every line that
# contains "doc-number" with "<line-number>:", and sed then strips the
# <doc-number>...</doc-number> tags and any trailing text, keeping what precedes
# the tag (including the grep prefix) followed by the doc-number itself.
!grep -n "doc-number" {xmlpath} | sed -e 's/\(.*\)<doc-number>\([^<]*\)<\/doc-number>.*/\1\2/' > {indexfile}

In [45]:
# Parse the index file into two-field lists. Splitting on the first ":" only
# leaves any later colons inside the second field.
with open(indexfile, "r") as f:
    tupples = [
        raw_line.rstrip("\n").split(":", 1)
        for raw_line in f
    ]

In [46]:
len(tupples)

31695

In [60]:
dic = {}

In [61]:
# Group the integer line numbers (field 0) under their doc-number (field 1).
for tup in tupples:
    line_numbers = dic.setdefault(tup[1], [])
    line_numbers.append(int(tup[0]))

In [62]:
# Some doc-numbers appear on more than one line of the file; this one maps to
# two line numbers.
dic["15265874"]

[3508, 3648]

In [None]:
# Print each doc-number the first time it is missing from dic, then mark it as
# seen. Note that this overwrites whatever value dic held for that key with 1.
for tup in tupples:
    doc_number = tup[1]
    if doc_number not in dic:
        print(doc_number)
    dic[doc_number] = 1

### Just reading the XML file line by line is too slow, so we give up on doing this in Python.

In [4]:
%%time

# Timing experiment: walk the file one line at a time, counting lines and
# recording the offset reported by f.tell() after every read.
with open("../data/" + xmlpath, "r") as f:
    lnum = 0
    fpos = 0
    line = f.readline()
    while line:
        lnum += 1
        line = f.readline()
        # tell() is called after the read, so fpos is the offset just past the
        # line that was read most recently.
        fpos = f.tell()


KeyboardInterrupt: 

### Whoosh trial

Conclusion: indexing is too slow and we can't use it.

In [6]:
from whoosh.index import create_in
from whoosh.fields import *
import os, os.path


In [14]:
schema = Schema(path=ID(stored=True), line_num=NUMERIC(stored=True), pos=NUMERIC(stored=True), line=TEXT)

In [15]:
# Make sure the on-disk index directory exists (no error if it already does).
os.makedirs("indexdir", exist_ok=True)

# Call create_in directly: the notebook imports it with
# `from whoosh.index import create_in`, so the name `index` is not bound and
# `index.create_in(...)` fails on a fresh kernel.
ix = create_in("indexdir", schema)

In [17]:
writer = ix.writer()

In [18]:
xmlpath = "application2017/ipa170105.xml"

In [21]:
# Add one Whoosh document per line of the XML file, storing the line text, its
# 1-based line number and the file offset at which the line starts.
with open("../data/" + xmlpath, "r") as f:
    lnum = 0
    # Capture the offset BEFORE each read so that `pos` is the start of the
    # line being indexed.  Previously tell() ran after the next readline(), so
    # every line except the first was stored with its end offset instead.
    fpos = f.tell()
    line = f.readline()
    while line:
        lnum += 1
        writer.add_document(path=xmlpath, line=line, line_num=lnum, pos=fpos)
        fpos = f.tell()
        line = f.readline()

# Flush the pending documents to the index, then release it.
writer.commit()
ix.close()

KeyboardInterrupt: 