17 changes: 6 additions & 11 deletions deep_reference_parser/__init__.py
@@ -2,9 +2,9 @@
# distracting on the command line. These lines here (while undesirable)
# reduce the level of verbosity.

import os
import sys
import warnings
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

@@ -19,21 +19,16 @@

from .common import download_model_artefact
from .deep_reference_parser import DeepReferenceParser
from .logger import logger
from .model_utils import get_config
from .reference_utils import (
break_into_chunks,
labels_to_prodigy,
load_data,
from .io import (
load_tsv,
prodigy_to_conll,
prodigy_to_lists,
read_jsonl,
read_pickle,
write_json,
write_jsonl,
write_pickle,
write_to_csv,
write_txt,
write_tsv,
)
from .logger import logger
from .model_utils import get_config
from .reference_utils import break_into_chunks
from .tokens_to_references import tokens_to_references
2 changes: 1 addition & 1 deletion deep_reference_parser/__version__.py
@@ -1,5 +1,5 @@
__name__ = "deep_reference_parser"
__version__ = "2020.3.0"
__version__ = "2020.3.1"
__description__ = "Deep learning model for finding and parsing references"
__url__ = "https://github.com/wellcometrust/deep_reference_parser"
__author__ = "Wellcome Trust DataLabs Team"
4 changes: 2 additions & 2 deletions deep_reference_parser/deep_reference_parser.py
@@ -47,7 +47,7 @@
save_confusion_matrix,
word2vec_embeddings,
)
from .reference_utils import load_tsv, read_pickle, write_pickle, write_to_csv
from .io import load_tsv, read_pickle, write_pickle, write_to_csv


class DeepReferenceParser:
@@ -456,7 +456,7 @@ def build_model(

self.model = model

logger.debug(self.model.summary(line_length=150))
# logger.debug(self.model.summary(line_length=150))

def train_model(
self, epochs=25, batch_size=100, early_stopping_patience=5, metric="val_f1"
3 changes: 2 additions & 1 deletion deep_reference_parser/io/__init__.py
@@ -1 +1,2 @@
from .io import read_jsonl, write_jsonl
from .io import (load_tsv, read_jsonl, read_pickle, write_jsonl, write_pickle,
                 write_to_csv, write_tsv)
133 changes: 133 additions & 0 deletions deep_reference_parser/io/io.py
@@ -6,9 +6,74 @@
"""

import json
import pickle
import csv
import os
import pandas as pd

from ..logger import logger

def _split_list_by_linebreaks(tokens):
    """Cycle through a list of tokens (or labels) and yield sublists, splitting
    wherever a non-string value occurs: a None, or more likely a NaN introduced
    by converting pd.DataFrame columns to lists.
    """
    out = []
    tokens_gen = iter(tokens)
    while True:
        try:
            token = next(tokens_gen)
            if isinstance(token, str) and token:
                out.append(token)
            else:
                yield out
                out = []
        except StopIteration:
            if out:
                yield out
            break
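As a quick illustration (not part of the diff), this is how the generator behaves on a NaN-delimited column, assuming the module layout above:

import math

from deep_reference_parser.io.io import _split_list_by_linebreaks

# NaN stands in for the blank line pandas reads between two examples.
tokens = ["WHO", "treatment", math.nan, "guidelines", "2016"]

# Yields [["WHO", "treatment"], ["guidelines", "2016"]].
print(list(_split_list_by_linebreaks(tokens)))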

def load_tsv(filepath, split_char="\t"):
    """
    Load and return the data stored in the given path.

    Expects data in the following format (tab separated):

    References      o       o
                    o       o
    1               o       o
    .               o       o
                    o       o
    WHO             title   b-r
    treatment       title   i-r
    guidelines      title   i-r
    for             title   i-r
    drug            title   i-r
    -               title   i-r
    resistant       title   i-r
    tuberculosis    title   i-r
    ,               title   i-r
    2016            title   i-r

    Args:
        filepath (str): Path to the data.
        split_char (str): Character used to split each line of the document.

    Returns:
        A tuple of lists, one per column provided in filepath.
    """

    df = pd.read_csv(filepath, delimiter=split_char, header=None, skip_blank_lines=False)
    out = [list(_split_list_by_linebreaks(column)) for _, column in df.iteritems()]

    logger.info("Loaded %s training examples", len(out[0]))

    return tuple(out)
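A usage sketch (the path is hypothetical): a file in the three-column format above unpacks into three parallel lists, one entry per example:

from deep_reference_parser.io import load_tsv

# Hypothetical path; any file in the tab-separated format above works.
tokens, task_1_labels, task_2_labels = load_tsv("data/train.tsv")

# Each element is one example: a list of tokens and its aligned labels.
print(tokens[0])
print(task_2_labels[0])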

def write_jsonl(input_data, output_file):
"""
@@ -61,3 +61,71 @@ def read_jsonl(input_file):
    logger.debug("Read %s lines from %s", len(out), input_file)

    return out


def write_to_csv(filename, columns, rows):
    """
    Create a .csv file from data given as columns and rows.

    Args:
        filename (str): Path and name of the output .csv file.
        columns (list): Columns of the csv file (first row of the file).
        rows: Data to write into the csv file, given per row.
    """

    with open(filename, "w") as csvfile:
        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        wr.writerow(columns)

        for row in rows:
            wr.writerow(row)

    logger.info("Wrote results to %s", filename)
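For example (hypothetical filename and data), writing a small results table:

from deep_reference_parser.io import write_to_csv

# Every field is quoted in the output because of csv.QUOTE_ALL.
write_to_csv(
    "results.csv",
    columns=["token", "label"],
    rows=[("WHO", "title"), ("2016", "title")],
)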


def write_pickle(input_data, output_file, path=None):
    """
    Write an object to a pickle file.

    Args:
        input_data(dict): A dict to be written to pickle.
        output_file(str): A filename or path to which the pickle will be saved.
        path(str): A string which will be prepended onto `output_file` with
            `os.path.join()`. Obviates the need for lengthy `os.path.join`
            statements each time this function is called.
    """

    if path:
        output_file = os.path.join(path, output_file)

    with open(output_file, "wb") as fb:
        pickle.dump(input_data, fb)


def read_pickle(input_file, path=None):
    """Load an object from a pickle file.

    Args:
        input_file(str): File to be loaded.
        path(str): A string which will be prepended onto `input_file` with
            `os.path.join()`. Obviates the need for lengthy `os.path.join`
            statements each time this function is called.
    """

    if path:
        input_file = os.path.join(path, input_file)

    with open(input_file, "rb") as fb:
        out = pickle.load(fb)

    logger.debug("Read data from %s", input_file)

    return out
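A round-trip sketch for the two pickle helpers (filenames are hypothetical; `path` is joined on with os.path.join, so the directory must already exist):

from deep_reference_parser.io import read_pickle, write_pickle

embeddings = {"the": [0.1, 0.2], "of": [0.3, 0.4]}

# Writes to models/embeddings.pickle, then reads it back.
write_pickle(embeddings, "embeddings.pickle", path="models")
assert read_pickle("embeddings.pickle", path="models") == embeddings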

def write_tsv(token_label_pairs, output_path):
    """
    Write token/label pairs to a tsv file on disk.
    """
    with open(output_path, "w") as fb:
        writer = csv.writer(fb, delimiter="\t")
        writer.writerows(token_label_pairs)
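A usage sketch with hypothetical data: pairs are written one per row, and a row of Nones comes out as a blank line separating examples:

from deep_reference_parser.io import write_tsv

pairs = [
    ("WHO", "b-r"),
    ("2016", "i-r"),
    (None, None),  # Blank row marking the end of an example.
]
write_tsv(pairs, "predictions.tsv")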
2 changes: 2 additions & 0 deletions deep_reference_parser/prodigy/__init__.py
@@ -6,3 +6,5 @@
from .reach_to_prodigy import ReachToProdigy, reach_to_prodigy
from .reference_to_token_annotations import TokenTagger, reference_to_token_annotations
from .spacy_doc_to_prodigy import SpacyDocToProdigy
from .misc import prodigy_to_conll
from .labels_to_prodigy import labels_to_prodigy
57 changes: 57 additions & 0 deletions deep_reference_parser/prodigy/labels_to_prodigy.py
@@ -0,0 +1,57 @@
def labels_to_prodigy(tokens, labels):
    """
    Convert lists of tokens and labels like those used by Rodrigues et al.
    into prodigy format dicts.

    Args:
        tokens (list): A list of tokens.
        labels (list): A list of labels relating to `tokens`.

    Returns:
        A list of prodigy format dicts containing annotated data.
    """

    prodigy_data = []

    for line_index, line in enumerate(tokens):
        prodigy_example = {}

        prodigy_tokens = []
        spans = []
        token_start_offset = 0

        for token_index, token in enumerate(line):

            token_end_offset = token_start_offset + len(token)

            prodigy_tokens.append(
                {
                    "text": token,
                    "id": token_index,
                    "start": token_start_offset,
                    "end": token_end_offset,
                }
            )

            spans.append(
                {
                    "label": labels[line_index][token_index],
                    "start": token_start_offset,
                    "end": token_end_offset,
                    "token_start": token_index,
                    "token_end": token_index,
                }
            )

            # Account for the single space used to join tokens into "text".
            token_start_offset = token_end_offset + 1

        prodigy_example["text"] = " ".join(line)
        prodigy_example["tokens"] = prodigy_tokens
        prodigy_example["spans"] = spans
        prodigy_example["meta"] = {"line": line_index}

        prodigy_data.append(prodigy_example)

    return prodigy_data
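A sketch of the output shape (offsets are computed over the space-joined line):

from deep_reference_parser.prodigy import labels_to_prodigy

examples = labels_to_prodigy(
    tokens=[["WHO", "guidelines"]],
    labels=[["b-r", "i-r"]],
)

# examples[0]["text"] == "WHO guidelines"
# examples[0]["spans"][0] == {"label": "b-r", "start": 0, "end": 3,
#                             "token_start": 0, "token_end": 0}
print(examples[0])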
38 changes: 38 additions & 0 deletions deep_reference_parser/prodigy/misc.py
@@ -0,0 +1,38 @@
import spacy


def _join_prodigy_tokens(text):
    """Return all prodigy tokens in a single newline-joined string."""

    return "\n".join([str(i) for i in text])


def prodigy_to_conll(docs):
    """
    Expects a list of dicts loaded from a jsonl file.
    """

    nlp = spacy.load("en_core_web_sm")
    texts = [doc["text"] for doc in docs]
    docs = list(nlp.tokenizer.pipe(texts))

    out = [_join_prodigy_tokens(i) for i in docs]

    out_str = "DOCSTART\n\n" + "\n\n".join(out)

    return out_str


def prodigy_to_lists(docs):
    """
    Expects a list of dicts loaded from a jsonl file.
    """

    nlp = spacy.load("en_core_web_sm")
    texts = [doc["text"] for doc in docs]
    docs = list(nlp.tokenizer.pipe(texts))

    out = [[str(token) for token in doc] for doc in docs]

    return out
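A sketch of both converters (requires the en_core_web_sm spaCy model to be installed; prodigy_to_lists is imported from the misc module since it is not re-exported in the package __init__.py above):

from deep_reference_parser.prodigy import prodigy_to_conll
from deep_reference_parser.prodigy.misc import prodigy_to_lists

docs = [{"text": "WHO treatment guidelines, 2016"}]

# CoNLL-style: one token per line, documents separated by blank lines.
print(prodigy_to_conll(docs))

# The same tokens as nested lists, one inner list per document.
print(prodigy_to_lists(docs))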
4 changes: 1 addition & 3 deletions deep_reference_parser/prodigy/prodigy_to_tsv.py
@@ -20,7 +20,7 @@

msg = Printer()

ROWS_TO_PRINT=15
ROWS_TO_PRINT = 15


class TokenLabelPairs:
@@ -375,8 +375,6 @@ def prodigy_to_tsv(

with open(output_file, "w") as fb:
writer = csv.writer(fb, delimiter="\t")
# Write DOCSTART and a blank line
# writer.writerows([("DOCSTART", None), (None, None)])
writer.writerows(merged_pairs)

# Print out the first ten rows as a sense check