In [1]:
import csv
import json
import re

import ahocorasick
import requests
import requests_cache
import spacy

In [2]:
print(requests_cache.__file__)

/nix/store/zdh62hs02jwlywk84ybqw26zqws5rlpi-python3-3.8.8-env/lib/python3.8/site-packages/requests_cache/__init__.py


In [3]:
requests_cache.install_cache("pfocr_cache")

In [4]:
hgnc_data_url = "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_pub_acc_ids&col=gd_locus_type&col=gd_date_mod&col=family.id&col=gd_locus_group&col=gd_name_aliases&col=gd_date_sym_change&col=gd_pub_eg_id&col=family.name&col=gd_date_name_change&col=gd_prev_name&col=gd_date2app_or_res&status=Approved&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit"
hgnc_data = []
r = requests.get(hgnc_data_url, stream=True)
lines = (line.decode("utf-8") for line in r.iter_lines())
for record in csv.DictReader(lines, delimiter="\t"):
    hgnc_data.append(record)

TODO: has the `with requests.get...:` format stopped working? Did it never work?

In [5]:
len(hgnc_data)

42549

In [6]:
hgnc_data[10]

{'HGNC ID': 'HGNC:18149',
 'Approved symbol': 'A4GALT',
 'Approved name': 'alpha 1,4-galactosyltransferase (P blood group)',
 'Status': 'Approved',
 'Previous symbols': 'P1',
 'Alias symbols': 'A14GALT, Gb3S, P(k)',
 'Accession numbers': '',
 'Locus type': 'gene with protein product',
 'Date modified': '2021-04-13',
 'Gene group ID': '454|442',
 'Locus group': 'protein-coding gene',
 'Alias names': 'Gb3 synthase, "CD77 synthase", "globotriaosylceramide synthase", "lactosylceramide 4-alpha-galactosyltransferase"',
 'Date symbol changed': '',
 'NCBI Gene ID': '53947',
 'Gene group name': 'Blood group antigens|Alpha 1,4-glycosyltransferases',
 'Date name changed': '2017-03-16',
 'Previous name': 'alpha 1,4-galactosyltransferase (globotriaosylceramide synthase, P blood group), "P one antigen (P blood group)"',
 'Date approved': '2002-02-06'}

In [7]:
protein_hgnc_data = []
for record in hgnc_data:
    if record["Locus group"] == "protein-coding gene":
        protein_hgnc_data.append(record)
len(protein_hgnc_data)

19228

What kinds of characters connect the base to the integer suffix?

In [8]:
connector_re = re.compile("(.*?[a-zA-Z])(\W+?)?(\d+)$")
connectors = set()
for record in protein_hgnc_data:
    symbols = set([record["Approved symbol"]])
    for s in record["Previous symbols"].split(", "):
        symbols.add(s)
    for s in record["Alias symbols"].split(", "):
        symbols.add(s)

    for s in symbols:
        m = connector_re.match(s)
        if m:
            connector = m.group(2)
            int_suffix = int(m.group(3))
            connectors.add(connector)
            if not connector in [None, "-", "."]:
                print(s)
print(connectors)

MGC:14091
TAF(II)170
IMAGE:4839025
IMAGE:4942737
IMAGE:5164497
G(gamma)13
Mt-GrpE#2
IMAGE:4798971
MGC:4834
ES/130
MGC:39976
TAF(I)41
MGC:8964
TAF(II)43
{'-', None, '/', '.', '#', ')', ':'}


In [9]:
int_suffix_re = re.compile("(.*?[a-zA-Z][\-\/\.#\)\:]?)(\d+)$")
m = int_suffix_re.match("M-12219")

if m:
    print(m.group(1))
    print(m.group(2))

M-
12219


In [10]:
gene_terms = set()
gene_term_bases = set()
int_suffixes_by_gene_term_base = dict()
for record in protein_hgnc_data:
    symbols = set([record["Approved symbol"]])
    for s in record["Previous symbols"].split(", "):
        symbols.add(s)
    for s in record["Alias symbols"].split(", "):
        symbols.add(s)

    for s in symbols:
        gene_terms.add(s)
        m = int_suffix_re.match(s)
        if m:
            gene_term_base = m.group(1)
            gene_term_bases.add(gene_term_base)
            int_suffix = int(m.group(2))
            if gene_term_base in int_suffixes_by_gene_term_base:
                int_suffixes_by_gene_term_base[gene_term_base].add(int_suffix)
            else:
                int_suffixes_by_gene_term_base[gene_term_base] = set(
                    [int_suffix]
                )
        else:
            gene_term_bases.add(s)
    approved_name = record["Approved name"]
    gene_term_bases.add(approved_name)
    gene_terms.add(approved_name)
    if "Alias names" in record and record["Alias names"]:
        for name in record["Alias names"].split(', "'):
            gene_terms.add(name)
            gene_term_bases.add(name.strip(' "'))
    if "Previous name" in record and record["Previous name"]:
        for name in record["Previous name"].split(', "'):
            gene_terms.add(name)
            gene_term_bases.add(name.strip(' "'))

In [11]:
len(int_suffixes_by_gene_term_base.keys())

11817

In [12]:
len(gene_term_bases)

75782

In [13]:
len(gene_terms)

108097

In [14]:
ocr_data = list()
with open(
    "/home/ariutta/Documents/pfocr/data/ocr_pfocr20200224_raw_dollar_tail50.json"
) as f:
    for line in f:
        start = line[:2]
        end = line[-2:]
        if start == "{}":
            print("Detected a line representing an empty object {}")
            continue
        elif start != "${":
            print(line[:10])
            raise Exception(f"Line started with {start}, not a dollar sign.")
        elif end != "$\n":
            print(line[-10:])
            raise Exception(f"Line started with {end}, not a dollar sign.")
        ocr_data.append(json.loads(line.strip("$\n")))
print(ocr_data[0])

Detected a line representing an empty object {}
{'textAnnotations': [{'locale': 'en', 'description': 'AmyloidB dleavage\nand degradation\nLysosome\nGoldi body\nEndoplasmic\nreticulum (ER)\nMicroglia\nInflammation\nCeramide synthesis\n', 'boundingPoly': {'vertices': [{'x': 17, 'y': 99}, {'x': 687, 'y': 99}, {'x': 687, 'y': 466}, {'x': 17, 'y': 466}]}}, {'description': 'AmyloidB', 'boundingPoly': {'vertices': [{'x': 119, 'y': 99}, {'x': 207, 'y': 99}, {'x': 207, 'y': 122}, {'x': 119, 'y': 122}]}}, {'description': 'dleavage', 'boundingPoly': {'vertices': [{'x': 211, 'y': 99}, {'x': 303, 'y': 99}, {'x': 303, 'y': 124}, {'x': 211, 'y': 124}]}}, {'description': 'and', 'boundingPoly': {'vertices': [{'x': 129, 'y': 130}, {'x': 162, 'y': 130}, {'x': 162, 'y': 145}, {'x': 129, 'y': 145}]}}, {'description': 'degradation', 'boundingPoly': {'vertices': [{'x': 171, 'y': 127}, {'x': 284, 'y': 127}, {'x': 284, 'y': 150}, {'x': 171, 'y': 150}]}}, {'description': 'Lysosome', 'boundingPoly': {'vertices':

In [15]:
print(ocr_data[1]["fullTextAnnotation"]["text"])

Pancreatic beta cells
Insulin
Kir6.2
Insulin
signalling
Adipocytes
Cyclin D1
p300
Cyclin D3
CDK4
E2F
PPARY
Adipogenesis
Ac
Hepatocytes
pRB
Cyclin D1
HNF40
Lipid
metabolism
pRB
Cyclin D
CDK4
GCN5
PGC10
Gluconeo-
genesis
E2F
NRF1
mtTFA
Cyclin D1
VDAC
Cyclin B1
CDK1
VDAC
Hexokinase 2
↑Mitochondrial
respiration
Glucose metabolism



In [16]:
ocr_data[5]["fullTextAnnotation"]["text"]

'dj Post-Transeriptional\nDegradation\nCytoplasm\nSİRNA\nRISC\nal Transeriptional Block\nTranseriptional\nUpregulation\nc) Post- Transcriptional\nDegradation\nb)\nCe\nIncRNA Gene\nTumor Suppressor\nLncRNA Gene\nASO\nNucleus\ne) Steric Block\nse\nSmall marecules, ASO, Morpholinas\nA\nINCRNA\nSİRNA-RISC complex\nMorpholinos\nASO\nRNA Binding Protein\nSmal Molecules\n'

In [17]:
def normalize_confusable(confusable_set, text):
    first_confusable = list(confusable_set)[0]
    normalized = text
    for confusable in list(confusable_set):
        if confusable in normalized:
            normalized = normalized.replace(confusable, first_confusable)
    return normalized

In [18]:
gene_automaton = ahocorasick.Automaton()
confusable_sets = [set(["I", "l", "|", "1"])]
for idx, gene_term_base in enumerate(gene_term_bases):
    if len(gene_term_base) > 2:
        gene_term_base_cf = gene_term_base.casefold()
        gene_automaton.add_word(
            gene_term_base, (gene_term_base, "none", gene_term_base)
        )
        gene_automaton.add_word(
            gene_term_base_cf, (gene_term_base, "cf", gene_term_base_cf)
        )
        for confusable_set in confusable_sets:
            first_confusable = list(confusable_set)[0]
            normalized = normalize_confusable(confusable_set, gene_term_base)
            if normalized != gene_term_base:
                gene_automaton.add_word(
                    normalized,
                    (
                        gene_term_base,
                        f"{','.join(list(confusable_set))} -> {first_confusable}",
                        normalized,
                    ),
                )
                normalized_cf = normalized.casefold()
                if normalized_cf != normalized:
                    gene_automaton.add_word(
                        normalized_cf,
                        (
                            gene_term_base,
                            f"{','.join(list(confusable_set))} -> {first_confusable},cf",
                            normalized_cf,
                        ),
                    )
gene_automaton.make_automaton()

In [19]:
numeric_suffix_re = re.compile(
    "^[\-\s]?(\d+)(\s?([\-,&\\/]|and|or)\s?(\d+))?([\s\.\|.*]|$)"
)
m = numeric_suffix_re.match("-5,6\nwow")
if m:
    # print(m.group(0))
    print(m.group(1))
    # print(m.group(2))
    print(m.group(3))
    print(m.group(4))

5
,
6


In [20]:
test_re = re.compile("^.+?[\-\s]?(\d+)([A-Z]+)$")
for gene_term in list(gene_terms)[:100]:
    if test_re.match(gene_term):
        print(gene_term)

semaphorin 3B
KMT3C
NKX2E
MCM3AP
FAM176C
kinesin family member 21A


In [21]:
int_associated_gene_terms = int_suffixes_by_gene_term_base.keys()
alpha_re = re.compile("[a-zA-Z]")
numeric_suffix_re = re.compile(
    "^[\-\s]?(\d+)(\s?([\-,&\\/]|and|or)\s?(\d+))?([\s\.\|.*]|$)"
)
# for data in [{"fullTextAnnotation": {"text": "LRP-5,6\nwow"}}]:
for data in ocr_data[:50]:
    matched = set()
    text = data["fullTextAnnotation"]["text"]
    print(f"-----{str(len(text))}------")
    for (
        end_index,
        (original_word, transforms, transformed_word),
    ) in gene_automaton.iter(text):
        matched.add(f"{str(end_index)},{original_word}")

        start_index = end_index - len(transformed_word) + 1
        if (alpha_re.match(text[start_index - 1])) or (
            alpha_re.match(text[end_index + 1])
        ):
            continue

        print("")
        if original_word in int_associated_gene_terms:
            print((end_index, (original_word, transforms, transformed_word)))
            print(
                (
                    text[max(0, start_index - 10) : start_index],
                    text[start_index : end_index + 1],
                    text[end_index + 1 : end_index + 20],
                )
            )
            start_index = end_index - len(transformed_word) + 1
            m = numeric_suffix_re.match(text[end_index + 1 : end_index + 20])
            if m:
                # print(m.group(0))
                # print(m.group(1))
                # print(m.group(2))
                # print(m.group(3))

                first_int = int(m.group(1))
                connector = m.group(3)
                last_int = m.group(4)
                if last_int:
                    last_int = int(last_int)

                print(first_int)
                print(connector)
                print(last_int)

                int_suffixes = int_suffixes_by_gene_term_base[original_word]
                # print(f"has associated ints: {int_suffixes}")

                if first_int in int_suffixes:
                    if (not connector is None) and last_int in int_suffixes:
                        print(
                            original_word
                            + str(first_int)
                            + connector
                            + str(last_int)
                        )
                        if connector in set(["-"]):
                            for i in range(first_int, last_int + 1):
                                all_matches = []
                                if i in int_suffixes:
                                    all_matches.append(original_word + str(i))
                                else:
                                    print(
                                        "This integer is not found in int_suffixes: "
                                        + str(i)
                                    )
                            print("-> " + ", ".join(all_matches))
                        else:
                            print(
                                "-> "
                                + original_word
                                + str(first_int)
                                + ", "
                                + original_word
                                + str(last_int)
                            )
                    else:
                        print(original_word + str(first_int))
                    continue
                else:
                    print(
                        "This integer is not found in int_suffixes: "
                        + str(first_int)
                    )
                    print(int_suffixes)
        if original_word in gene_terms:
            1 + 1
            # print("good on its own")
    text_cf = text.casefold()
    if text != text_cf:
        for (
            end_index,
            (original_word, transforms, transformed_word),
        ) in gene_automaton.iter(text_cf):
            start_index = end_index - len(transformed_word) + 1
            if (alpha_re.match(text[start_index - 1])) or (
                alpha_re.match(text[end_index + 1])
            ):
                continue

            match_str = f"{str(end_index)},{original_word}"
            if not match_str in matched:
                # print("* text_cf")
                matched.add(match_str)
                # print((end_index, (original_word, transforms, transformed_word)))

    for confusable_set in confusable_sets:
        normalized = normalize_confusable(confusable_set, text)
        if normalized != text:
            for (
                end_index,
                (original_word, transforms, transformed_word),
            ) in gene_automaton.iter(normalized):
                start_index = end_index - len(transformed_word) + 1
                if (alpha_re.match(text[start_index - 1])) or (
                    alpha_re.match(text[end_index + 1])
                ):
                    continue

                match_str = f"{str(end_index)},{original_word}"
                if not match_str in matched:
                    # print("* normalized")
                    matched.add(match_str)
                    # print((end_index, (original_word, transforms, transformed_word)))

            normalized_cf = normalized.casefold()
            if normalized_cf != normalized:
                for (
                    end_index,
                    (original_word, transforms, transformed_word),
                ) in gene_automaton.iter(normalized_cf):
                    start_index = end_index - len(transformed_word) + 1
                    if (alpha_re.match(text[start_index - 1])) or (
                        alpha_re.match(text[end_index + 1])
                    ):
                        continue

                    match_str = f"{str(end_index)},{original_word}"
                    if not match_str in matched:
                        # print("* normalized_cf")
                        matched.add(match_str)
                        # print((end_index, (original_word, transforms, transformed_word)))

-----123------
-----328------

(14, ('Beta', 'cf', 'beta'))
('ancreatic ', 'beta', ' cells\nInsulin\nKir6')


(94, ('CDK', 'none', 'CDK'))
('Cyclin D3\n', 'CDK', '4\nE2F\nPPARY\nAdipoge')
4
None
None
CDK4

(99, ('E2F', 'none', 'E2F'))
('n D3\nCDK4\n', 'E2F', '\nPPARY\nAdipogenesis')

(151, ('HNF', 'none', 'HNF'))
('Cyclin D1\n', 'HNF', '40\nLipid\nmetabolism')
40
None
None
This integer is not found in int_suffixes: 40
{1, 4, 6}

(187, ('CDK', 'none', 'CDK'))
('\nCyclin D\n', 'CDK', '4\nGCN5\nPGC10\nGlucon')
4
None
None
CDK4

(192, ('GCN', 'none', 'GCN'))
('in D\nCDK4\n', 'GCN', '5\nPGC10\nGluconeo-\ng')
5
None
None
GCN5

(197, ('PGC', 'none', 'PGC'))
('CDK4\nGCN5\n', 'PGC', '10\nGluconeo-\ngenesi')
10
None
None
This integer is not found in int_suffixes: 10
{1}


(221, ('E2F', 'none', 'E2F'))
('-\ngenesis\n', 'E2F', '\nNRF1\nmtTFA\nCyclin ')

(225, ('NRF', 'none', 'NRF'))
('nesis\nE2F\n', 'NRF', '1\nmtTFA\nCyclin D1\nV')
1
None
None
NRF1

(247, ('VDAC', 'none', 'VDAC'))
('Cyclin D1\n', 

In [22]:
# gene_term_base_re = re.compile("(" + ")|(".join(["wt", "boo", "fus", "ss"]) + ")")

munged_gene_term_bases = set()
for gene_term_base in gene_term_bases:
    munged_gene_term_bases.add(gene_term_base.casefold())

escaped_munged_gene_term_bases = list()
for gene_term_base in sorted(munged_gene_term_bases, key=len, reverse=True):
    if len(gene_term_base) > 2:
        escaped_munged_gene_term_bases.append(re.escape(gene_term_base))

gene_term_base_re = re.compile(
    "(" + ")|(".join(list(escaped_munged_gene_term_bases)) + ")"
)

In [23]:
escaped_munged_gene_term_bases[-1]

'ahd'

In [24]:
for m in re.finditer(
    gene_term_base_re, ocr_data[0]["fullTextAnnotation"]["text"].casefold()
):
    if m.group(0):
        print(m.start(), m.end(), m.group(0))
        # print("")

0 3 amy
10 13 lea
23 27 egra
27 30 dat
34 37 lys
38 41 som
44 47 old
49 52 bod
54 58 endo
58 61 pla
62 65 mic
66 69 ret
70 73 cul
81 84 mic
84 87 rog
91 94 inf
94 98 lamm
104 107 cer
107 111 amid
113 116 syn
116 120 thes


The cells below were previously trying to use `re2`, not `re`. I think this was an experiment with using a fast regular expression library. But if I'm using the Aho-Corasick algo, I may not need to use the code below at all.

In [30]:
gene_term_base_re = re.compile(
    "(" + ")|(".join(list(escaped_munged_gene_term_bases[:5])) + ")"
)
len(escaped_munged_gene_term_bases)

74413

In [31]:
for i in range(0, 74231, 10000):
    print(i)

0
10000
20000
30000
40000
50000
60000
70000


In [32]:
prev = 0
for i in range(0, 90000, 10000):
    for m in re.finditer(
        "(" + ")|(".join(list(escaped_munged_gene_term_bases[prev:i])) + ")",
        ocr_data[0]["fullTextAnnotation"]["text"].casefold(),
    ):
        if m.group(0):
            print(m.start(), m.end(), m.group(0))
            # print("")
    prev = i
print("hello")

26 30 adat
54 58 endo
94 98 lamm
107 111 amid
116 120 thes
0 3 amy
10 13 lea
23 27 egra
27 30 dat
34 37 lys
38 41 som
44 47 old
49 52 bod
54 57 end
58 61 pla
62 65 mic
66 69 ret
70 73 cul
81 84 mic
84 87 rog
91 94 inf
94 97 lam
97 100 mat
104 107 cer
108 111 mid
113 116 syn
117 120 hes
hello


In [35]:
gene_term_base_res = []
prev = 0
for i in range(0, 90000, 10000):
    gene_term_base_re = re.compile(
        "(" + ")|(".join(list(escaped_munged_gene_term_bases[prev:i])) + ")"
    )
    gene_term_base_res.append(gene_term_base_re)
    prev = i

for gene_term_base_re in gene_term_base_res:
    for data in ocr_data[2:4]:
        text = data["fullTextAnnotation"]["text"]
        # print(text)
        print(f"-----{str(len(text))}------")
        for m in gene_term_base_re.finditer(text.casefold()):
            if m.group(0):
                print(m.start(), m.end(), m.group(0))
                # print("")
print("hello")

-----272------
-----322------
-----272------
-----322------
-----272------
-----322------
-----272------
-----322------
-----272------
-----322------
-----272------
-----322------
-----272------
-----322------
-----272------
22 26 sfrp
40 44 rspo
53 57 axin
103 107 znrf
153 157 axin
230 234 targ
237 241 gene
243 247 znrf
255 259 znrf
-----322------
53 57 ende
81 85 ende
98 102 trxr
116 120 stra
150 154 gsh-
158 162 ende
213 217 ende
250 254 gsh-
258 262 ende
268 272 ende
288 292 stra
-----272------
0 3 lrp
7 10 lrp
14 17 wnt
18 21 fzd
22 25 sfr
27 30 fzd
31 34 lgr
40 43 rsp
54 57 xin
58 61 dis
62 65 eve
66 69 led
75 78 cat
79 82 nin
85 88 cat
89 92 nin
95 98 cat
99 102 nin
104 107 nrf
109 112 rnf
117 120 cat
121 124 nin
125 128 lrp
134 137 fzd
141 144 gsa
145 148 cki
154 157 xin
158 161 apc
163 166 urn
174 177 cat
178 181 nin
185 188 trc
193 197 prot
199 202 som
208 211 urn
218 221 cat
222 225 nin
226 229 tcf
230 233 tar
233 236 get
237 240 gen
244 247 nrf
249 252 rnf
256 259 nrf
261 2

In [38]:
gene_term_base_res = []
prev = 0
for i in range(0, 90000, 10000):
    gene_term_base_re = re.compile(
        "(" + ")|(".join(list(escaped_munged_gene_term_bases[prev:i])) + ")"
    )
    gene_term_base_res.append(gene_term_base_re)
    prev = i
for data in ocr_data[2:4]:
    text = data["fullTextAnnotation"]["text"]
    # print(text)
    print(f"-----{str(len(text))}------")
    for gene_term_base_re in gene_term_base_res:
        for m in gene_term_base_re.finditer(text.casefold()):
            if m.group(0):
                print(m.start(), m.end(), m.group(0))
                # print("")
print("hello")

-----272------
22 26 sfrp
40 44 rspo
53 57 axin
103 107 znrf
153 157 axin
230 234 targ
237 241 gene
243 247 znrf
255 259 znrf
0 3 lrp
7 10 lrp
14 17 wnt
18 21 fzd
22 25 sfr
27 30 fzd
31 34 lgr
40 43 rsp
54 57 xin
58 61 dis
62 65 eve
66 69 led
75 78 cat
79 82 nin
85 88 cat
89 92 nin
95 98 cat
99 102 nin
104 107 nrf
109 112 rnf
117 120 cat
121 124 nin
125 128 lrp
134 137 fzd
141 144 gsa
145 148 cki
154 157 xin
158 161 apc
163 166 urn
174 177 cat
178 181 nin
185 188 trc
193 197 prot
199 202 som
208 211 urn
218 221 cat
222 225 nin
226 229 tcf
230 233 tar
233 236 get
237 240 gen
244 247 nrf
249 252 rnf
256 259 nrf
261 264 rnf
267 270 dkk
-----322------
53 57 ende
81 85 ende
98 102 trxr
116 120 stra
150 154 gsh-
158 162 ende
213 217 ende
250 254 gsh-
258 262 ende
268 272 ende
288 292 stra
0 3 nad
6 9 nad
17 20 nad
23 26 trx
30 33 grx
40 43 gss
45 48 trx
50 53 dep
53 56 end
56 59 ent
60 63 fun
73 76 grx
78 81 dep
81 84 end
84 87 ent
88 91 fun
98 101 trx
103 106 trx
113 116 sub
116 119 str
119

In [39]:
gene_term_base_re = re.compile(
    "(" + ")|(".join(list(escaped_munged_gene_term_bases)) + ")"
)
for data in ocr_data[2:4]:
    text = data["fullTextAnnotation"]["text"]
    # print(text)
    print("")
    for m in gene_term_base_re.finditer(text.casefold()):
        if m.group(0):
            print(m.start(), m.end(), m.group(0))
            # print("")


0 3 lrp
7 10 lrp
14 17 wnt
18 21 fzd
22 26 sfrp
27 30 fzd
31 34 lgr
40 44 rspo
53 57 axin
58 61 dis
62 65 eve
66 69 led
75 78 cat
79 82 nin
85 88 cat
89 92 nin
95 98 cat
99 102 nin
103 107 znrf
109 112 rnf
117 120 cat
121 124 nin
125 128 lrp
134 137 fzd
141 144 gsa
145 148 cki
153 157 axin
158 161 apc
163 166 urn
174 177 cat
178 181 nin
185 188 trc
193 197 prot
199 202 som
208 211 urn
218 221 cat
222 225 nin
226 229 tcf
230 234 targ
237 241 gene
243 247 znrf
249 252 rnf
255 259 znrf
261 264 rnf
267 270 dkk

0 3 nad
6 9 nad
17 20 nad
23 26 trx
30 33 grx
40 43 gss
45 48 trx
50 53 dep
53 57 ende
60 63 fun
73 76 grx
78 81 dep
81 85 ende
88 91 fun
98 102 trxr
103 106 trx
113 116 sub
116 120 stra
126 129 pro
139 142 gsh
143 146 grx
150 154 gsh-
155 158 dep
158 162 ende
165 168 fun
175 178 nad
186 189 nad
192 195 gss
197 200 trx
205 208 trx
210 213 dep
213 217 ende
220 223 fun
230 233 tgr
234 237 trx
246 249 gsh
250 254 gsh-
255 258 dep
258 262 ende
265 268 dep
268 272 ende
275 278 fun
285 2

In [40]:
data.keys()

dict_keys(['textAnnotations', 'fullTextAnnotation'])

In [41]:
data["fullTextAnnotation"]["text"]

'NADP+\nNADPH + H*\nNADP*\nTrx-S2\nGrx-(SH)2\nGSSG\nTrx-\ndependent\nfunctions\nGR\nGrx-\ndependent\nfunctions\nTrxR\nTrx-(SH)2\nsubstrate ox productaed\n2 GSH\nGrx-S2\nGSH-\ndependent\nfunctions\nNADPH + H*\nNADP+\nGSSG\nTrx-S2.\nTrx-\ndependent\nfunctions\nTGR\nTrx-(SH)2\n2 GSH\nGSH-\ndependent\ndependent functions\nsubstrate,\nGrx-\nproductred\nfunctions\n'

In [42]:
for item in data["fullTextAnnotation"]["pages"]:
    print(item.keys())

dict_keys(['width', 'blocks', 'height', 'property'])


In [None]:
for item in data["fullTextAnnotation"]["pages"][0]["blocks"]:
    print(item.keys())

In [None]:
for item in data["fullTextAnnotation"]["pages"][0]["blocks"][0]["paragraphs"]:
    print(item.keys())

In [None]:
for item in data["fullTextAnnotation"]["pages"][0]["blocks"][0]["paragraphs"][
    0
]["words"]:
    print(item.keys())

In [None]:
for page in data["fullTextAnnotation"]["pages"]:
    for block in page["blocks"]:
        for paragraph in block["paragraphs"]:
            for word in paragraph["words"]:
                for symbol in word["symbols"]:
                    print(symbol.keys())
                    print(symbol["text"])

In [None]:
for boo in data["textAnnotations"]:
    print(boo.keys())

In [None]:
data["textAnnotations"][0]["description"]

In [None]:
data["textAnnotations"][1]

In [None]:
data["textAnnotations"][1]["description"]