From fd72e1445eb41aa9fe732faf06fc6f834c1a39d2 Mon Sep 17 00:00:00 2001 From: Matthew Upson Date: Tue, 17 Mar 2020 17:54:41 -0300 Subject: [PATCH 01/11] new: Add new test data --- tests/common.py | 1 + tests/test_data/test_load_tsv.tsv | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 tests/test_data/test_load_tsv.tsv diff --git a/tests/common.py b/tests/common.py index 21cfb4e..2bf6107 100644 --- a/tests/common.py +++ b/tests/common.py @@ -13,3 +13,4 @@ def get_path(p): TEST_REFERENCES = get_path("test_data/test_references.txt") TEST_TSV_PREDICT = get_path("test_data/test_tsv_predict.tsv") TEST_TSV_TRAIN = get_path("test_data/test_tsv_train.tsv") +TEST_LOAD_TSV = get_path("test_data/test_load_tsv.tsv") diff --git a/tests/test_data/test_load_tsv.tsv b/tests/test_data/test_load_tsv.tsv new file mode 100644 index 0000000..ad64c5d --- /dev/null +++ b/tests/test_data/test_load_tsv.tsv @@ -0,0 +1,18 @@ +the i-r a +focus i-r a +in i-r a +Daloa i-r a +, i-r a +Côte i-r a +d’Ivoire]. i-r a + +Bulletin i-r a +de i-r a +la i-r a +Société i-r a +de i-r a +Pathologie i-r a + +Exotique i-r a +et i-r a + From aa2699212a9bcb0999a9e8fd83f2a67d45107d21 Mon Sep 17 00:00:00 2001 From: Matthew Upson Date: Tue, 17 Mar 2020 17:55:17 -0300 Subject: [PATCH 02/11] chg: Refactor load_tsv to cover multitask case * Also removes defunct load_data --- deep_reference_parser/__init__.py | 1 - deep_reference_parser/reference_utils.py | 171 ++++++----------------- tests/test_reference_utils.py | 80 ++++++++++- 3 files changed, 120 insertions(+), 132 deletions(-) diff --git a/deep_reference_parser/__init__.py b/deep_reference_parser/__init__.py index c18d70b..810d992 100644 --- a/deep_reference_parser/__init__.py +++ b/deep_reference_parser/__init__.py @@ -24,7 +24,6 @@ from .reference_utils import ( break_into_chunks, labels_to_prodigy, - load_data, load_tsv, prodigy_to_conll, prodigy_to_lists, diff --git a/deep_reference_parser/reference_utils.py b/deep_reference_parser/reference_utils.py index 3a2fcd5..c66271f 100644 --- a/deep_reference_parser/reference_utils.py +++ b/deep_reference_parser/reference_utils.py @@ -8,112 +8,56 @@ import json import os import pickle +import pandas as pd import spacy from .logger import logger -def load_data(filepath): +def split_list_by_linebreaks(tokens): + """Cycle through a list of tokens (or labels) and split them into lists + based on the presence of Nones or more likely math.nan caused by converting + pd.DataFrame columns to lists. """ - Load and return the data stored in the given path. - - Adapted from: https://github.com/dhlab-epfl/LinkedBooksDeepReferenceParsing - - The data is structured as follows: - * Each line contains four columns separated by a single space. - * Each word has been put on a separate line and there is an empty line - after each sentence. - * The first item on each line is a word, the second, third and fourth are - tags related to the word. - - Example: - - The sentence "L. Antonielli, Iprefetti dell' Italia napoleonica, Bologna - 1983." is represented in the dataset as: - - ``` - L author b-secondary b-r - . author i-secondary i-r - Antonielli author i-secondary i-r - , author i-secondary i-r - Iprefetti title i-secondary i-r - dell title i-secondary i-r - ’ title i-secondary i-r - Italia title i-secondary i-r - napoleonica title i-secondary i-r - , title i-secondary i-r - Bologna publicationplace i-secondary i-r - 1983 year e-secondary i-r - . year e-secondary e-r - ``` - - Args: - filepath (str): Path to the data. 
- - Returns: - four lists: The first contains tokens, the next three contain - corresponding labels. - - """ - - # Arrays to return - words = [] - tags_1 = [] - tags_2 = [] - tags_3 = [] - - word = tags1 = tags2 = tags3 = [] - with open(filepath, "r") as file: - for line in file: - # Do not take the first line into consideration - - if "DOCSTART" not in line: - # Check if empty line - - if line in ["\n", "\r\n"]: - # Append line - - words.append(word) - tags_1.append(tags1) - tags_2.append(tags2) - tags_3.append(tags3) - - # Reset - word = [] - tags1 = [] - tags2 = [] - tags3 = [] - - else: - # Split the line into words, tag #1 - w = line[:-1].split(" ") - - word.append(w[0]) - tags1.append(w[1]) - tags2.append(w[2]) - tags3.append(w[3]) - - logger.info("Loaded %s training examples", len(words)) - - return words, tags_1, tags_2, tags_3 - + out = [] + tokens_gen = iter(tokens) + while True: + try: + token = next(tokens_gen) + if isinstance(token, str) and token: + out.append(token) + else: + yield out + out = [] + except StopIteration: + if out: + yield out + break def load_tsv(filepath, split_char="\t"): """ Load and return the data stored in the given path. - Adapted from: https://github.com/dhlab-epfl/LinkedBooksDeepReferenceParsing + Expects data in the following format (tab separations). + + References o o + o o + 1 o o + . o o + o o + WHO title b-r + treatment title i-r + guidelines title i-r + for title i-r + drug title i-r + - title i-r + resistant title i-r + tuberculosis title i-r + , title i-r + 2016 title i-r - NOTE: In the current implementation in deep_reference_parser, only one set - of tags is used. The others will be used in a later PR. - The data is structured as follows: - * Each line contains four columns separated by a single space. - * Each word has been put on a separate line and there is an empty line - after each sentence. - * The first item on each line is a word, the second, third and fourth are - tags related to the word. Args: filepath (str): Path to the data. @@ -121,48 +65,17 @@ def load_tsv(filepath, split_char="\t"): document. Returns: - two lists: The first contains tokens, the second contains corresponding - labels. + a series of lists depending on the number of label columns provided in + filepath. 
""" - # Arrays to return - words = [] - tags_1 = [] - - word = [] - tags1 = [] - - with open(filepath, "r") as file: - for line in file: - # Check if empty line - - if line in ["\n", "\r\n", "\t\n"]: - # Append line - - words.append(word) - tags_1.append(tags1) - - # Reset - word = [] - tags1 = [] - - else: - - # Split the line into words, tag #1 - - w = line[:-1].split(split_char) - word.append(w[0]) - - # If tags are passed, (for training) then also add - - if len(w) == 2: - - tags1.append(w[1]) + df = pd.read_csv(filepath, delimiter=split_char, header=None, skip_blank_lines=False) + out = [list(split_list_by_linebreaks(column)) for _, column in df.iteritems()] - logger.info("Loaded %s training examples", len(words)) + logger.info("Loaded %s training examples", len(out[0])) - return words, tags_1 + return tuple(out) def prodigy_to_conll(docs): diff --git a/tests/test_reference_utils.py b/tests/test_reference_utils.py index 118e399..e9e04ba 100644 --- a/tests/test_reference_utils.py +++ b/tests/test_reference_utils.py @@ -12,9 +12,10 @@ prodigy_to_conll, write_tsv, yield_token_label_pairs, + split_list_by_linebreaks, ) -from .common import TEST_TSV_PREDICT, TEST_TSV_TRAIN +from .common import TEST_TSV_PREDICT, TEST_TSV_TRAIN, TEST_LOAD_TSV def test_prodigy_to_conll(): @@ -75,6 +76,14 @@ def test_load_tsv_train(): actual = load_tsv(TEST_TSV_TRAIN) + assert len(actual[0][0]) == len(expected[0][0]) + assert len(actual[0][1]) == len(expected[0][1]) + assert len(actual[0][2]) == len(expected[0][2]) + + assert len(actual[1][0]) == len(expected[1][0]) + assert len(actual[1][1]) == len(expected[1][1]) + assert len(actual[1][2]) == len(expected[1][2]) + assert actual == expected @@ -109,13 +118,59 @@ def test_load_tsv_predict(): ["Bulletin", "de", "la", "Société", "de", "Pathologie"], ["Exotique", "et"], ], - [[], [], [],], ) actual = load_tsv(TEST_TSV_PREDICT) assert actual == expected +def test_load_tsv_train_multiple_labels(): + """ + Text of TEST_TSV_TRAIN: + + ``` + the i-r + focus i-r + in i-r + Daloa i-r + , i-r + Côte i-r + d’Ivoire]. 
i-r + + Bulletin i-r + de i-r + la i-r + Société i-r + de-r + Pathologie i-r + + Exotique i-r + et i-r + ``` + """ + + expected = ( + [ + ["the", "focus", "in", "Daloa", ",", "Côte", "d’Ivoire]."], + ["Bulletin", "de", "la", "Société", "de", "Pathologie"], + ["Exotique", "et"], + ], + [ + ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], + ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], + ["i-r", "i-r"], + ], + [ + ["a", "a", "a", "a", "a", "a", "a"], + ["a", "a", "a", "a", "a", "a"], + ["a", "a"], + ], + ) + + actual = load_tsv(TEST_LOAD_TSV) + + assert actual == expected + def test_yield_toke_label_pairs(): @@ -197,3 +252,24 @@ def test_break_into_chunks(): actual = break_into_chunks(before, max_words=2) assert expected == actual + +def test_split_list_by_linebreaks(): + + lst = ["a", "b", "c", None, "d"] + expected = [["a", "b", "c"], ["d"]] + + actual = split_list_by_linebreaks(lst) + +def test_list_by_linebreaks_ending_in_None(): + + lst = ["a", "b", "c", float("nan"), "d", None] + expected = [["a", "b", "c"], ["d"]] + + actual = split_list_by_linebreaks(lst) + +def test_list_by_linebreaks_starting_in_None(): + + lst = [None, "a", "b", "c", None, "d"] + expected = [["a", "b", "c"], ["d"]] + + actual = split_list_by_linebreaks(lst) From f7ff43315359498fc6bbe88a3da932b7a21475d8 Mon Sep 17 00:00:00 2001 From: Matthew Upson Date: Tue, 17 Mar 2020 18:13:51 -0300 Subject: [PATCH 03/11] chg: Remove commented out code --- deep_reference_parser/prodigy/prodigy_to_tsv.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/deep_reference_parser/prodigy/prodigy_to_tsv.py b/deep_reference_parser/prodigy/prodigy_to_tsv.py index 50488fa..4743530 100644 --- a/deep_reference_parser/prodigy/prodigy_to_tsv.py +++ b/deep_reference_parser/prodigy/prodigy_to_tsv.py @@ -375,8 +375,6 @@ def prodigy_to_tsv( with open(output_file, "w") as fb: writer = csv.writer(fb, delimiter="\t") - # Write DOCSTART and a blank line - # writer.writerows([("DOCSTART", None), (None, None)]) writer.writerows(merged_pairs) # Print out the first ten rows as a sense check From 637abf771a7b39aa92f4ccd1dedacc607f766c89 Mon Sep 17 00:00:00 2001 From: Matthew Upson Date: Tue, 17 Mar 2020 18:14:18 -0300 Subject: [PATCH 04/11] chg: Adjust requirements for tensorflow compatibility --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7e84c93..7e11145 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,9 +42,9 @@ sklearn-crfsuite==0.3.6 spacy==2.1.7 srsly==1.0.1 tabulate==0.8.6 -tensorboard==1.14.0 +tensorboard==1.16.0 tensorflow==1.15.2 -tensorflow-estimator==1.14.0 +tensorflow-estimator==1.15.1 termcolor==1.1.0 thinc==7.0.8 tqdm==4.42.1 From 6c743462939dc70a69e97d01ab6bd07c07fb8da9 Mon Sep 17 00:00:00 2001 From: Matthew Upson Date: Tue, 17 Mar 2020 19:03:24 -0300 Subject: [PATCH 05/11] chg: Fix tensorflow dependency --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7e11145..ca18031 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,7 +42,7 @@ sklearn-crfsuite==0.3.6 spacy==2.1.7 srsly==1.0.1 tabulate==0.8.6 -tensorboard==1.16.0 +tensorboard==1.15.0 tensorflow==1.15.2 tensorflow-estimator==1.15.1 termcolor==1.1.0 From cda8323acf0f7fc1bd2720df4408dfc8a9a50b68 Mon Sep 17 00:00:00 2001 From: Matthew Upson Date: Tue, 17 Mar 2020 22:56:09 -0300 Subject: [PATCH 06/11] chg: Move prodigy functions to prodigy module --- .../prodigy/labels_to_prodigy.py | 58 +++++++++++++++++++ 
deep_reference_parser/prodigy/misc.py | 36 ++++++++++++ tests/{ => prodigy}/test_labels_to_prodigy.py | 2 +- tests/prodigy/test_misc.py | 18 ++++++ 4 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 deep_reference_parser/prodigy/labels_to_prodigy.py create mode 100644 deep_reference_parser/prodigy/misc.py rename tests/{ => prodigy}/test_labels_to_prodigy.py (97%) create mode 100644 tests/prodigy/test_misc.py diff --git a/deep_reference_parser/prodigy/labels_to_prodigy.py b/deep_reference_parser/prodigy/labels_to_prodigy.py new file mode 100644 index 0000000..5425305 --- /dev/null +++ b/deep_reference_parser/prodigy/labels_to_prodigy.py @@ -0,0 +1,58 @@ + +def labels_to_prodigy(tokens, labels): + """ + Converts a list of tokens and labels like those used by Rodrigues et al, + and converts to prodigy format dicts. + + Args: + tokens (list): A list of tokens. + labels (list): A list of labels relating to `tokens`. + + Returns: + A list of prodigy format dicts containing annotated data. + """ + + prodigy_data = [] + + all_token_index = 0 + + for line_index, line in enumerate(tokens): + prodigy_example = {} + + tokens = [] + spans = [] + token_start_offset = 0 + + for token_index, token in enumerate(line): + + token_end_offset = token_start_offset + len(token) + + tokens.append( + { + "text": token, + "id": token_index, + "start": token_start_offset, + "end": token_end_offset, + } + ) + + spans.append( + { + "label": labels[line_index][token_index : token_index + 1][0], + "start": token_start_offset, + "end": token_end_offset, + "token_start": token_index, + "token_end": token_index, + } + ) + + prodigy_example["text"] = " ".join(line) + prodigy_example["tokens"] = tokens + prodigy_example["spans"] = spans + prodigy_example["meta"] = {"line": line_index} + + token_start_offset = token_end_offset + 1 + + prodigy_data.append(prodigy_example) + + return prodigy_data diff --git a/deep_reference_parser/prodigy/misc.py b/deep_reference_parser/prodigy/misc.py new file mode 100644 index 0000000..0ebbd94 --- /dev/null +++ b/deep_reference_parser/prodigy/misc.py @@ -0,0 +1,36 @@ +import spacy + +def _join_prodigy_tokens(text): + """Return all prodigy tokens in a single string + """ + + return "\n".join([str(i) for i in text]) + +def prodigy_to_conll(docs): + """ + Expect list of jsons loaded from a jsonl + """ + + nlp = spacy.load("en_core_web_sm") + texts = [doc["text"] for doc in docs] + docs = list(nlp.tokenizer.pipe(texts)) + + out = [_join_prodigy_tokens(i) for i in docs] + + out_str = "DOCSTART\n\n" + "\n\n".join(out) + + return out_str + + +def prodigy_to_lists(docs): + """ + Expect list of jsons loaded from a jsonl + """ + + nlp = spacy.load("en_core_web_sm") + texts = [doc["text"] for doc in docs] + docs = list(nlp.tokenizer.pipe(texts)) + + out = [[str(token) for token in doc] for doc in docs] + + return out diff --git a/tests/test_labels_to_prodigy.py b/tests/prodigy/test_labels_to_prodigy.py similarity index 97% rename from tests/test_labels_to_prodigy.py rename to tests/prodigy/test_labels_to_prodigy.py index 53b8d77..0ef67c8 100644 --- a/tests/test_labels_to_prodigy.py +++ b/tests/prodigy/test_labels_to_prodigy.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # coding: utf-8 -from deep_reference_parser.reference_utils import labels_to_prodigy +from deep_reference_parser.prodigy import labels_to_prodigy def test_labels_to_prodigy(): diff --git a/tests/prodigy/test_misc.py b/tests/prodigy/test_misc.py new file mode 100644 index 0000000..c4436c0 --- /dev/null +++ 
b/tests/prodigy/test_misc.py @@ -0,0 +1,18 @@ +from deep_reference_parser.prodigy import prodigy_to_conll + +def test_prodigy_to_conll(): + + before = [ + {"text": "References",}, + {"text": "37. No single case of malaria reported in"}, + { + "text": "an essential requirement for the correct labelling of potency for therapeutic" + }, + {"text": "EQAS, quality control for STI"}, + ] + + after = "DOCSTART\n\nReferences\n\n37\n.\nNo\nsingle\ncase\nof\nmalaria\nreported\nin\n\nan\nessential\nrequirement\nfor\nthe\ncorrect\nlabelling\nof\npotency\nfor\ntherapeutic\n\nEQAS\n,\nquality\ncontrol\nfor\nSTI" + + out = prodigy_to_conll(before) + + assert after == out From 9041fb9f44d1212b39726ea6f711ed0e241c545e Mon Sep 17 00:00:00 2001 From: Matthew Upson Date: Tue, 17 Mar 2020 22:56:53 -0300 Subject: [PATCH 07/11] chg: Move io functions to io module --- deep_reference_parser/io/__init__.py | 3 +- deep_reference_parser/io/io.py | 133 +++++++++ deep_reference_parser/reference_utils.py | 341 ----------------------- tests/test_io.py | 230 ++++++++++++++- tests/test_reference_utils.py | 259 +---------------- 5 files changed, 363 insertions(+), 603 deletions(-) diff --git a/deep_reference_parser/io/__init__.py b/deep_reference_parser/io/__init__.py index 613c7e6..4e7eaba 100644 --- a/deep_reference_parser/io/__init__.py +++ b/deep_reference_parser/io/__init__.py @@ -1 +1,2 @@ -from .io import read_jsonl, write_jsonl +from .io import (load_tsv, read_jsonl, read_pickle, write_jsonl, write_pickle, + write_to_csv, write_tsv) diff --git a/deep_reference_parser/io/io.py b/deep_reference_parser/io/io.py index afa2cd4..639b43d 100644 --- a/deep_reference_parser/io/io.py +++ b/deep_reference_parser/io/io.py @@ -6,9 +6,74 @@ """ import json +import pickle +import csv +import os +import pandas as pd from ..logger import logger +def _split_list_by_linebreaks(tokens): + """Cycle through a list of tokens (or labels) and split them into lists + based on the presence of Nones or more likely math.nan caused by converting + pd.DataFrame columns to lists. + """ + out = [] + tokens_gen = iter(tokens) + while True: + try: + token = next(tokens_gen) + if isinstance(token, str) and token: + out.append(token) + else: + yield out + out = [] + except StopIteration: + if out: + yield out + break + +def load_tsv(filepath, split_char="\t"): + """ + Load and return the data stored in the given path. + + Expects data in the following format (tab separations). + + References o o + o o + 1 o o + . o o + o o + WHO title b-r + treatment title i-r + guidelines title i-r + for title i-r + drug title i-r + - title i-r + resistant title i-r + tuberculosis title i-r + , title i-r + 2016 title i-r + + + + Args: + filepath (str): Path to the data. + split_char(str): Character to be used to split each line of the + document. + + Returns: + a series of lists depending on the number of label columns provided in + filepath. 
+ + """ + + df = pd.read_csv(filepath, delimiter=split_char, header=None, skip_blank_lines=False) + out = [list(_split_list_by_linebreaks(column)) for _, column in df.iteritems()] + + logger.info("Loaded %s training examples", len(out[0])) + + return tuple(out) def write_jsonl(input_data, output_file): """ @@ -61,3 +126,71 @@ def read_jsonl(input_file): logger.debug("Read %s lines from %s", len(out), input_file) return out + + +def write_to_csv(filename, columns, rows): + """ + Create a .csv file from data given as columns and rows + + Args: + filename(str): Path and name of the .csv file, without csv extension + columns(list): Columns of the csv file (First row of the file) + rows: Data to write into the csv file, given per row + """ + + with open(filename, "w") as csvfile: + wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL) + wr.writerow(columns) + + for i, row in enumerate(rows): + wr.writerow(row) + logger.info("Wrote results to %s", filename) + + +def write_pickle(input_data, output_file, path=None): + """ + Write an object to pickle + + Args: + input_data(dict): A dict to be written to json. + output_file(str): A filename or path to which the json will be saved. + path(str): A string which will be prepended onto `output_file` with + `os.path.join()`. Obviates the need for lengthy `os.path.join` + statements each time this function is called. + """ + + if path: + + output_file = os.path.join(path, output_file) + + with open(output_file, "wb") as fb: + pickle.dump(input_data, fb) + + +def read_pickle(input_file, path=None): + """Create a list from a jsonl file + + Args: + input_file(str): File to be loaded. + path(str): A string which will be prepended onto `input_file` with + `os.path.join()`. Obviates the need for lengthy `os.path.join` + statements each time this function is called. + """ + + if path: + input_file = os.path.join(path, input_file) + + with open(input_file, "rb") as fb: + out = pickle.load(fb) + + logger.debug("Read data from %s", input_file) + + return out + +def write_tsv(token_label_pairs, output_path): + """ + Write tsv files to disk + """ + with open(output_path, "w") as fb: + writer = csv.writer(fb, delimiter="\t") + writer.writerows(token_label_pairs) diff --git a/deep_reference_parser/reference_utils.py b/deep_reference_parser/reference_utils.py index c66271f..5516b51 100644 --- a/deep_reference_parser/reference_utils.py +++ b/deep_reference_parser/reference_utils.py @@ -4,339 +4,8 @@ """ """ -import csv -import json -import os -import pickle -import pandas as pd - -import spacy - from .logger import logger - -def split_list_by_linebreaks(tokens): - """Cycle through a list of tokens (or labels) and split them into lists - based on the presence of Nones or more likely math.nan caused by converting - pd.DataFrame columns to lists. - """ - out = [] - tokens_gen = iter(tokens) - while True: - try: - token = next(tokens_gen) - if isinstance(token, str) and token: - out.append(token) - else: - yield out - out = [] - except StopIteration: - if out: - yield out - break - -def load_tsv(filepath, split_char="\t"): - """ - Load and return the data stored in the given path. - - Expects data in the following format (tab separations). - - References o o - o o - 1 o o - . o o - o o - WHO title b-r - treatment title i-r - guidelines title i-r - for title i-r - drug title i-r - - title i-r - resistant title i-r - tuberculosis title i-r - , title i-r - 2016 title i-r - - - - Args: - filepath (str): Path to the data. 
- split_char(str): Character to be used to split each line of the - document. - - Returns: - a series of lists depending on the number of label columns provided in - filepath. - - """ - - df = pd.read_csv(filepath, delimiter=split_char, header=None, skip_blank_lines=False) - out = [list(split_list_by_linebreaks(column)) for _, column in df.iteritems()] - - logger.info("Loaded %s training examples", len(out[0])) - - return tuple(out) - - -def prodigy_to_conll(docs): - """ - Expect list of jsons loaded from a jsonl - """ - - nlp = spacy.load("en_core_web_sm") - texts = [doc["text"] for doc in docs] - docs = list(nlp.tokenizer.pipe(texts)) - - out = [_join_prodigy_tokens(i) for i in docs] - - out_str = "DOCSTART\n\n" + "\n\n".join(out) - - return out_str - - -def prodigy_to_lists(docs): - """ - Expect list of jsons loaded from a jsonl - """ - - nlp = spacy.load("en_core_web_sm") - texts = [doc["text"] for doc in docs] - docs = list(nlp.tokenizer.pipe(texts)) - - out = [[str(token) for token in doc] for doc in docs] - - return out - - -def _join_prodigy_tokens(text): - """Return all prodigy tokens in a single string - """ - - return "\n".join([str(i) for i in text]) - - -def write_json(input_data, output_file, path=None): - """ - Write a dict to json - - Args: - input_data(dict): A dict to be written to json. - output_file(str): A filename or path to which the json will be saved. - path(str): A string which will be prepended onto `output_file` with - `os.path.join()`. Obviates the need for lengthy `os.path.join` - statements each time this function is called. - """ - - if path: - - output_file = os.path.join(path, output_file) - - logger.info("Writing data to %s", output_file) - - with open(output_file, "w") as fb: - fb.write(json.dumps(input_data)) - - -def write_jsonl(input_data, output_file, path=None): - """ - Write a dict to jsonl (line delimited json) - - Output format will look like: - - ``` - {"a": 0} - {"b": 1} - {"c": 2} - {"d": 3} - ``` - - Args: - input_data(dict): A dict to be written to json. - output_file(str): A filename or path to which the json will be saved. - path(str): A string which will be prepended onto `output_file` with - `os.path.join()`. Obviates the need for lengthy `os.path.join` - statements each time this function is called. - """ - - if path: - - output_file = os.path.join(path, output_file) - - with open(output_file, "w") as fb: - - # Check if a dict (and convert to list if so) - - if isinstance(input_data, dict): - input_data = [value for key, value in input_data.items()] - - # Write out to jsonl file - - logger.info("Writing %s lines to %s", len(input_data), output_file) - - for i in input_data: - json_ = json.dumps(i) + "\n" - fb.write(json_) - - -def read_jsonl(input_file, path=None): - """Create a list from a jsonl file - - Args: - input_file(str): File to be loaded. - path(str): A string which will be prepended onto `input_file` with - `os.path.join()`. Obviates the need for lengthy `os.path.join` - statements each time this function is called. 
- """ - - if path: - input_file = os.path.join(path, input_file) - - out = [] - with open(input_file, "r") as fb: - - logger.info("Reading contents of %s", input_file) - - for i in fb: - out.append(json.loads(i)) - - logger.info("Read %s lines from %s", len(out), input_file) - - return out - - -def write_txt(input_data, output_file): - """Write a text string to a file - - Args: - input_file (str): String to be written - output_file (str): File to be saved to - """ - - with open(output_file, "w") as fb: - fb.write(input_data) - - logger.info("Read %s characters to file: %s", len(input_data), output_file) - - -def labels_to_prodigy(tokens, labels): - """ - Converts a list of tokens and labels like those used by Rodrigues et al, - and converts to prodigy format dicts. - - Args: - tokens (list): A list of tokens. - labels (list): A list of labels relating to `tokens`. - - Returns: - A list of prodigy format dicts containing annotated data. - """ - - prodigy_data = [] - - all_token_index = 0 - - for line_index, line in enumerate(tokens): - prodigy_example = {} - - tokens = [] - spans = [] - token_start_offset = 0 - - for token_index, token in enumerate(line): - - token_end_offset = token_start_offset + len(token) - - tokens.append( - { - "text": token, - "id": token_index, - "start": token_start_offset, - "end": token_end_offset, - } - ) - - spans.append( - { - "label": labels[line_index][token_index : token_index + 1][0], - "start": token_start_offset, - "end": token_end_offset, - "token_start": token_index, - "token_end": token_index, - } - ) - - prodigy_example["text"] = " ".join(line) - prodigy_example["tokens"] = tokens - prodigy_example["spans"] = spans - prodigy_example["meta"] = {"line": line_index} - - token_start_offset = token_end_offset + 1 - - prodigy_data.append(prodigy_example) - - return prodigy_data - - -def write_to_csv(filename, columns, rows): - """ - Create a .csv file from data given as columns and rows - - Args: - filename(str): Path and name of the .csv file, without csv extension - columns(list): Columns of the csv file (First row of the file) - rows: Data to write into the csv file, given per row - """ - - with open(filename, "w") as csvfile: - wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL) - wr.writerow(columns) - - for i, row in enumerate(rows): - wr.writerow(row) - logger.info("Wrote results to %s", filename) - - -def write_pickle(input_data, output_file, path=None): - """ - Write an object to pickle - - Args: - input_data(dict): A dict to be written to json. - output_file(str): A filename or path to which the json will be saved. - path(str): A string which will be prepended onto `output_file` with - `os.path.join()`. Obviates the need for lengthy `os.path.join` - statements each time this function is called. - """ - - if path: - - output_file = os.path.join(path, output_file) - - with open(output_file, "wb") as fb: - pickle.dump(input_data, fb) - - -def read_pickle(input_file, path=None): - """Create a list from a jsonl file - - Args: - input_file(str): File to be loaded. - path(str): A string which will be prepended onto `input_file` with - `os.path.join()`. Obviates the need for lengthy `os.path.join` - statements each time this function is called. 
- """ - - if path: - input_file = os.path.join(path, input_file) - - with open(input_file, "rb") as fb: - out = pickle.load(fb) - - logger.debug("Read data from %s", input_file) - - return out - - def yield_token_label_pairs(tokens, labels): """ Convert matching lists of tokens and labels to tuples of (token, label) but @@ -355,16 +24,6 @@ def yield_token_label_pairs(tokens, labels): else: yield (None, None) - -def write_tsv(token_label_pairs, output_path): - """ - Write tsv files to disk - """ - with open(output_path, "w") as fb: - writer = csv.writer(fb, delimiter="\t") - writer.writerows(token_label_pairs) - - def break_into_chunks(doc, max_words=250): """ Breaks a list into lists of lists of length max_words diff --git a/tests/test_io.py b/tests/test_io.py index 3799131..931ad35 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -2,13 +2,13 @@ # coding: utf-8 import os -import tempfile import pytest -from deep_reference_parser.io import read_jsonl, write_jsonl +from deep_reference_parser.io.io import read_jsonl, write_jsonl, load_tsv, write_tsv, _split_list_by_linebreaks +from deep_reference_parser.reference_utils import yield_token_label_pairs -from .common import TEST_JSONL +from .common import TEST_JSONL, TEST_TSV_TRAIN, TEST_TSV_PREDICT, TEST_LOAD_TSV @pytest.fixture(scope="module") @@ -16,6 +16,209 @@ def tmpdir(tmpdir_factory): return tmpdir_factory.mktemp("data") +def test_write_tsv(tmpdir): + + expected = ( + [ + [], + ["the", "focus", "in", "Daloa", ",", "Côte", "d’Ivoire]."], + ["Bulletin", "de", "la", "Société", "de", "Pathologie"], + ["Exotique", "et"], + ], + [ + [], + ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], + ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], + ["i-r", "i-r"], + ], + ) + + token_label_tuples = list(yield_token_label_pairs(expected[0], expected[1])) + + PATH = os.path.join(tmpdir, "test_tsv.tsv") + write_tsv(token_label_tuples, PATH) + actual = load_tsv(os.path.join(PATH)) + + assert expected == actual + +def test_load_tsv_train(): + """ + Text of TEST_TSV_TRAIN: + + ``` + the i-r + focus i-r + in i-r + Daloa i-r + , i-r + Côte i-r + d’Ivoire]. i-r + + Bulletin i-r + de i-r + la i-r + Société i-r + de i-r + Pathologie i-r + + Exotique i-r + et i-r + ``` + """ + + expected = ( + [ + ["the", "focus", "in", "Daloa", ",", "Côte", "d’Ivoire]."], + ["Bulletin", "de", "la", "Société", "de", "Pathologie"], + ["Exotique", "et"], + ], + [ + ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], + ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], + ["i-r", "i-r"], + ], + ) + + actual = load_tsv(TEST_TSV_TRAIN) + + assert len(actual[0][0]) == len(expected[0][0]) + assert len(actual[0][1]) == len(expected[0][1]) + assert len(actual[0][2]) == len(expected[0][2]) + + assert len(actual[1][0]) == len(expected[1][0]) + assert len(actual[1][1]) == len(expected[1][1]) + assert len(actual[1][2]) == len(expected[1][2]) + + assert actual == expected + + +def test_load_tsv_predict(): + """ + Text of TEST_TSV_PREDICT: + + ``` + the + focus + in + Daloa + , + Côte + d’Ivoire]. + + Bulletin + de + la + Société + de + Pathologie + + Exotique + et + ``` + """ + + expected = ( + [ + ["the", "focus", "in", "Daloa", ",", "Côte", "d’Ivoire]."], + ["Bulletin", "de", "la", "Société", "de", "Pathologie"], + ["Exotique", "et"], + ], + ) + + actual = load_tsv(TEST_TSV_PREDICT) + + assert actual == expected + +def test_load_tsv_train_multiple_labels(): + """ + Text of TEST_TSV_TRAIN: + + ``` + the i-r + focus i-r + in i-r + Daloa i-r + , i-r + Côte i-r + d’Ivoire]. 
i-r + + Bulletin i-r + de i-r + la i-r + Société i-r + de-r + Pathologie i-r + + Exotique i-r + et i-r + ``` + """ + + expected = ( + [ + ["the", "focus", "in", "Daloa", ",", "Côte", "d’Ivoire]."], + ["Bulletin", "de", "la", "Société", "de", "Pathologie"], + ["Exotique", "et"], + ], + [ + ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], + ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], + ["i-r", "i-r"], + ], + [ + ["a", "a", "a", "a", "a", "a", "a"], + ["a", "a", "a", "a", "a", "a"], + ["a", "a"], + ], + ) + + actual = load_tsv(TEST_LOAD_TSV) + + assert actual == expected + + +def test_yield_toke_label_pairs(): + + tokens = [ + [], + ["the", "focus", "in", "Daloa", ",", "Côte", "d’Ivoire]."], + ["Bulletin", "de", "la", "Société", "de", "Pathologie"], + ["Exotique", "et"], + ] + + labels = [ + [], + ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], + ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], + ["i-r", "i-r"], + ] + + expected = [ + (None, None), + ("the", "i-r"), + ("focus", "i-r"), + ("in", "i-r"), + ("Daloa", "i-r"), + (",", "i-r"), + ("Côte", "i-r"), + ("d’Ivoire].", "i-r"), + (None, None), + ("Bulletin", "i-r"), + ("de", "i-r"), + ("la", "i-r"), + ("Société", "i-r"), + ("de", "i-r"), + ("Pathologie", "i-r"), + (None, None), + ("Exotique", "i-r"), + ("et", "i-r"), + (None, None), + ] + + actual = list(yield_token_label_pairs(tokens, labels)) + + assert expected == actual + def test_read_jsonl(): expected = [ @@ -76,3 +279,24 @@ def test_write_jsonl(tmpdir): actual = read_jsonl(temp_file) assert expected == actual + +def test_split_list_by_linebreaks(): + + lst = ["a", "b", "c", None, "d"] + expected = [["a", "b", "c"], ["d"]] + + actual = _split_list_by_linebreaks(lst) + +def test_list_by_linebreaks_ending_in_None(): + + lst = ["a", "b", "c", float("nan"), "d", None] + expected = [["a", "b", "c"], ["d"]] + + actual = _split_list_by_linebreaks(lst) + +def test_list_by_linebreaks_starting_in_None(): + + lst = [None, "a", "b", "c", None, "d"] + expected = [["a", "b", "c"], ["d"]] + + actual = _split_list_by_linebreaks(lst) diff --git a/tests/test_reference_utils.py b/tests/test_reference_utils.py index e9e04ba..4b329d8 100644 --- a/tests/test_reference_utils.py +++ b/tests/test_reference_utils.py @@ -1,249 +1,12 @@ #!/usr/bin/env python3 # coding: utf-8 -import os -import tempfile - import pytest from deep_reference_parser.reference_utils import ( - break_into_chunks, - load_tsv, - prodigy_to_conll, - write_tsv, - yield_token_label_pairs, - split_list_by_linebreaks, + break_into_chunks ) -from .common import TEST_TSV_PREDICT, TEST_TSV_TRAIN, TEST_LOAD_TSV - - -def test_prodigy_to_conll(): - - before = [ - {"text": "References",}, - {"text": "37. No single case of malaria reported in"}, - { - "text": "an essential requirement for the correct labelling of potency for therapeutic" - }, - {"text": "EQAS, quality control for STI"}, - ] - - after = "DOCSTART\n\nReferences\n\n37\n.\nNo\nsingle\ncase\nof\nmalaria\nreported\nin\n\nan\nessential\nrequirement\nfor\nthe\ncorrect\nlabelling\nof\npotency\nfor\ntherapeutic\n\nEQAS\n,\nquality\ncontrol\nfor\nSTI" - - out = prodigy_to_conll(before) - - assert after == out - - -def test_load_tsv_train(): - """ - Text of TEST_TSV_TRAIN: - - ``` - the i-r - focus i-r - in i-r - Daloa i-r - , i-r - Côte i-r - d’Ivoire]. 
i-r - - Bulletin i-r - de i-r - la i-r - Société i-r - de i-r - Pathologie i-r - - Exotique i-r - et i-r - ``` - """ - - expected = ( - [ - ["the", "focus", "in", "Daloa", ",", "Côte", "d’Ivoire]."], - ["Bulletin", "de", "la", "Société", "de", "Pathologie"], - ["Exotique", "et"], - ], - [ - ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], - ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], - ["i-r", "i-r"], - ], - ) - - actual = load_tsv(TEST_TSV_TRAIN) - - assert len(actual[0][0]) == len(expected[0][0]) - assert len(actual[0][1]) == len(expected[0][1]) - assert len(actual[0][2]) == len(expected[0][2]) - - assert len(actual[1][0]) == len(expected[1][0]) - assert len(actual[1][1]) == len(expected[1][1]) - assert len(actual[1][2]) == len(expected[1][2]) - - assert actual == expected - - -def test_load_tsv_predict(): - """ - Text of TEST_TSV_PREDICT: - - ``` - the - focus - in - Daloa - , - Côte - d’Ivoire]. - - Bulletin - de - la - Société - de - Pathologie - - Exotique - et - ``` - """ - - expected = ( - [ - ["the", "focus", "in", "Daloa", ",", "Côte", "d’Ivoire]."], - ["Bulletin", "de", "la", "Société", "de", "Pathologie"], - ["Exotique", "et"], - ], - ) - - actual = load_tsv(TEST_TSV_PREDICT) - - assert actual == expected - -def test_load_tsv_train_multiple_labels(): - """ - Text of TEST_TSV_TRAIN: - - ``` - the i-r - focus i-r - in i-r - Daloa i-r - , i-r - Côte i-r - d’Ivoire]. i-r - - Bulletin i-r - de i-r - la i-r - Société i-r - de-r - Pathologie i-r - - Exotique i-r - et i-r - ``` - """ - - expected = ( - [ - ["the", "focus", "in", "Daloa", ",", "Côte", "d’Ivoire]."], - ["Bulletin", "de", "la", "Société", "de", "Pathologie"], - ["Exotique", "et"], - ], - [ - ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], - ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], - ["i-r", "i-r"], - ], - [ - ["a", "a", "a", "a", "a", "a", "a"], - ["a", "a", "a", "a", "a", "a"], - ["a", "a"], - ], - ) - - actual = load_tsv(TEST_LOAD_TSV) - - assert actual == expected - - -def test_yield_toke_label_pairs(): - - tokens = [ - [], - ["the", "focus", "in", "Daloa", ",", "Côte", "d’Ivoire]."], - ["Bulletin", "de", "la", "Société", "de", "Pathologie"], - ["Exotique", "et"], - ] - - labels = [ - [], - ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], - ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], - ["i-r", "i-r"], - ] - - expected = [ - (None, None), - ("the", "i-r"), - ("focus", "i-r"), - ("in", "i-r"), - ("Daloa", "i-r"), - (",", "i-r"), - ("Côte", "i-r"), - ("d’Ivoire].", "i-r"), - (None, None), - ("Bulletin", "i-r"), - ("de", "i-r"), - ("la", "i-r"), - ("Société", "i-r"), - ("de", "i-r"), - ("Pathologie", "i-r"), - (None, None), - ("Exotique", "i-r"), - ("et", "i-r"), - (None, None), - ] - - actual = list(yield_token_label_pairs(tokens, labels)) - - assert expected == actual - - -def test_write_tsv(): - - expected = ( - [ - [], - ["the", "focus", "in", "Daloa", ",", "Côte", "d’Ivoire]."], - ["Bulletin", "de", "la", "Société", "de", "Pathologie"], - ["Exotique", "et"], - ], - [ - [], - ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], - ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r"], - ["i-r", "i-r"], - ], - ) - - _, path = tempfile.mkstemp() - - token_label_tuples = list(yield_token_label_pairs(expected[0], expected[1])) - - write_tsv(token_label_tuples, path) - actual = load_tsv(path) - - assert expected == actual - - os.remove(path) - - def test_break_into_chunks(): before = ["a", "b", "c", "d", "e"] @@ -253,23 +16,3 @@ def test_break_into_chunks(): assert expected == actual -def test_split_list_by_linebreaks(): - - 
lst = ["a", "b", "c", None, "d"] - expected = [["a", "b", "c"], ["d"]] - - actual = split_list_by_linebreaks(lst) - -def test_list_by_linebreaks_ending_in_None(): - - lst = ["a", "b", "c", float("nan"), "d", None] - expected = [["a", "b", "c"], ["d"]] - - actual = split_list_by_linebreaks(lst) - -def test_list_by_linebreaks_starting_in_None(): - - lst = [None, "a", "b", "c", None, "d"] - expected = [["a", "b", "c"], ["d"]] - - actual = split_list_by_linebreaks(lst) From 0480d5aff4486032ab7a4ed99401129843c9741d Mon Sep 17 00:00:00 2001 From: Matthew Upson Date: Tue, 17 Mar 2020 22:57:33 -0300 Subject: [PATCH 08/11] chg: Update __init__.py files and imports --- deep_reference_parser/__init__.py | 22 ++++++------------- .../deep_reference_parser.py | 2 +- deep_reference_parser/prodigy/__init__.py | 2 ++ 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/deep_reference_parser/__init__.py b/deep_reference_parser/__init__.py index 810d992..6009dee 100644 --- a/deep_reference_parser/__init__.py +++ b/deep_reference_parser/__init__.py @@ -2,9 +2,9 @@ # distracting on the command line. These lines here (while undesireable) # reduce the level of verbosity. +import os import sys import warnings -import os os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" @@ -19,20 +19,12 @@ from .common import download_model_artefact from .deep_reference_parser import DeepReferenceParser +from .io import (load_tsv, read_jsonl, read_pickle, write_jsonl, write_pickle, + write_to_csv, write_tsv) from .logger import logger from .model_utils import get_config -from .reference_utils import ( - break_into_chunks, - labels_to_prodigy, - load_tsv, - prodigy_to_conll, - prodigy_to_lists, - read_jsonl, - read_pickle, - write_json, - write_jsonl, - write_pickle, - write_to_csv, - write_txt, -) +from .reference_utils import break_into_chunks from .tokens_to_references import tokens_to_references + + + diff --git a/deep_reference_parser/deep_reference_parser.py b/deep_reference_parser/deep_reference_parser.py index eeaa0c0..74c3580 100644 --- a/deep_reference_parser/deep_reference_parser.py +++ b/deep_reference_parser/deep_reference_parser.py @@ -47,7 +47,7 @@ save_confusion_matrix, word2vec_embeddings, ) -from .reference_utils import load_tsv, read_pickle, write_pickle, write_to_csv +from .io import load_tsv, read_pickle, write_pickle, write_to_csv class DeepReferenceParser: diff --git a/deep_reference_parser/prodigy/__init__.py b/deep_reference_parser/prodigy/__init__.py index f90ce43..c582cc9 100644 --- a/deep_reference_parser/prodigy/__init__.py +++ b/deep_reference_parser/prodigy/__init__.py @@ -6,3 +6,5 @@ from .reach_to_prodigy import ReachToProdigy, reach_to_prodigy from .reference_to_token_annotations import TokenTagger, reference_to_token_annotations from .spacy_doc_to_prodigy import SpacyDocToProdigy +from .misc import prodigy_to_conll +from .labels_to_prodigy import labels_to_prodigy From 5baef9fce22348729f1d107d8fb7c05fe30e4f3d Mon Sep 17 00:00:00 2001 From: Matthew Upson Date: Tue, 17 Mar 2020 23:04:09 -0300 Subject: [PATCH 09/11] chg: Linting with black --- deep_reference_parser/__init__.py | 14 +++++++++----- deep_reference_parser/prodigy/labels_to_prodigy.py | 1 - deep_reference_parser/prodigy/misc.py | 2 ++ deep_reference_parser/prodigy/prodigy_to_tsv.py | 2 +- deep_reference_parser/reference_utils.py | 2 ++ tests/prodigy/test_misc.py | 1 + tests/test_io.py | 14 +++++++++++++- tests/test_reference_utils.py | 6 ++---- 8 files changed, 30 insertions(+), 12 deletions(-) diff --git 
a/deep_reference_parser/__init__.py b/deep_reference_parser/__init__.py index 6009dee..18a418c 100644 --- a/deep_reference_parser/__init__.py +++ b/deep_reference_parser/__init__.py @@ -19,12 +19,16 @@ from .common import download_model_artefact from .deep_reference_parser import DeepReferenceParser -from .io import (load_tsv, read_jsonl, read_pickle, write_jsonl, write_pickle, - write_to_csv, write_tsv) +from .io import ( + load_tsv, + read_jsonl, + read_pickle, + write_jsonl, + write_pickle, + write_to_csv, + write_tsv, +) from .logger import logger from .model_utils import get_config from .reference_utils import break_into_chunks from .tokens_to_references import tokens_to_references - - - diff --git a/deep_reference_parser/prodigy/labels_to_prodigy.py b/deep_reference_parser/prodigy/labels_to_prodigy.py index 5425305..b6107d4 100644 --- a/deep_reference_parser/prodigy/labels_to_prodigy.py +++ b/deep_reference_parser/prodigy/labels_to_prodigy.py @@ -1,4 +1,3 @@ - def labels_to_prodigy(tokens, labels): """ Converts a list of tokens and labels like those used by Rodrigues et al, diff --git a/deep_reference_parser/prodigy/misc.py b/deep_reference_parser/prodigy/misc.py index 0ebbd94..c1f8d5c 100644 --- a/deep_reference_parser/prodigy/misc.py +++ b/deep_reference_parser/prodigy/misc.py @@ -1,11 +1,13 @@ import spacy + def _join_prodigy_tokens(text): """Return all prodigy tokens in a single string """ return "\n".join([str(i) for i in text]) + def prodigy_to_conll(docs): """ Expect list of jsons loaded from a jsonl diff --git a/deep_reference_parser/prodigy/prodigy_to_tsv.py b/deep_reference_parser/prodigy/prodigy_to_tsv.py index 4743530..41a8716 100644 --- a/deep_reference_parser/prodigy/prodigy_to_tsv.py +++ b/deep_reference_parser/prodigy/prodigy_to_tsv.py @@ -20,7 +20,7 @@ msg = Printer() -ROWS_TO_PRINT=15 +ROWS_TO_PRINT = 15 class TokenLabelPairs: diff --git a/deep_reference_parser/reference_utils.py b/deep_reference_parser/reference_utils.py index 5516b51..fc8e8ab 100644 --- a/deep_reference_parser/reference_utils.py +++ b/deep_reference_parser/reference_utils.py @@ -6,6 +6,7 @@ from .logger import logger + def yield_token_label_pairs(tokens, labels): """ Convert matching lists of tokens and labels to tuples of (token, label) but @@ -24,6 +25,7 @@ def yield_token_label_pairs(tokens, labels): else: yield (None, None) + def break_into_chunks(doc, max_words=250): """ Breaks a list into lists of lists of length max_words diff --git a/tests/prodigy/test_misc.py b/tests/prodigy/test_misc.py index c4436c0..5ed18df 100644 --- a/tests/prodigy/test_misc.py +++ b/tests/prodigy/test_misc.py @@ -1,5 +1,6 @@ from deep_reference_parser.prodigy import prodigy_to_conll + def test_prodigy_to_conll(): before = [ diff --git a/tests/test_io.py b/tests/test_io.py index 931ad35..dd86061 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -5,7 +5,13 @@ import pytest -from deep_reference_parser.io.io import read_jsonl, write_jsonl, load_tsv, write_tsv, _split_list_by_linebreaks +from deep_reference_parser.io.io import ( + read_jsonl, + write_jsonl, + load_tsv, + write_tsv, + _split_list_by_linebreaks, +) from deep_reference_parser.reference_utils import yield_token_label_pairs from .common import TEST_JSONL, TEST_TSV_TRAIN, TEST_TSV_PREDICT, TEST_LOAD_TSV @@ -41,6 +47,7 @@ def test_write_tsv(tmpdir): assert expected == actual + def test_load_tsv_train(): """ Text of TEST_TSV_TRAIN: @@ -129,6 +136,7 @@ def test_load_tsv_predict(): assert actual == expected + def test_load_tsv_train_multiple_labels(): """ 
Text of TEST_TSV_TRAIN: @@ -219,6 +227,7 @@ def test_yield_toke_label_pairs(): assert expected == actual + def test_read_jsonl(): expected = [ @@ -280,6 +289,7 @@ def test_write_jsonl(tmpdir): assert expected == actual + def test_split_list_by_linebreaks(): lst = ["a", "b", "c", None, "d"] @@ -287,6 +297,7 @@ def test_split_list_by_linebreaks(): actual = _split_list_by_linebreaks(lst) + def test_list_by_linebreaks_ending_in_None(): lst = ["a", "b", "c", float("nan"), "d", None] @@ -294,6 +305,7 @@ def test_list_by_linebreaks_ending_in_None(): actual = _split_list_by_linebreaks(lst) + def test_list_by_linebreaks_starting_in_None(): lst = [None, "a", "b", "c", None, "d"] diff --git a/tests/test_reference_utils.py b/tests/test_reference_utils.py index 4b329d8..c6d2091 100644 --- a/tests/test_reference_utils.py +++ b/tests/test_reference_utils.py @@ -3,9 +3,8 @@ import pytest -from deep_reference_parser.reference_utils import ( - break_into_chunks -) +from deep_reference_parser.reference_utils import break_into_chunks + def test_break_into_chunks(): @@ -15,4 +14,3 @@ def test_break_into_chunks(): actual = break_into_chunks(before, max_words=2) assert expected == actual - From 519937f287a7f02cbca713a39a10b338b3182915 Mon Sep 17 00:00:00 2001 From: Matthew Upson Date: Tue, 17 Mar 2020 23:11:49 -0300 Subject: [PATCH 10/11] chg: Suppress model summary log --- deep_reference_parser/deep_reference_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_reference_parser/deep_reference_parser.py b/deep_reference_parser/deep_reference_parser.py index 74c3580..b9b28cd 100644 --- a/deep_reference_parser/deep_reference_parser.py +++ b/deep_reference_parser/deep_reference_parser.py @@ -456,7 +456,7 @@ def build_model( self.model = model - logger.debug(self.model.summary(line_length=150)) +# logger.debug(self.model.summary(line_length=150)) def train_model( self, epochs=25, batch_size=100, early_stopping_patience=5, metric="val_f1" From e45bc363ba45b01ad960c30ab564f45a4ac0eb98 Mon Sep 17 00:00:00 2001 From: Matthew Upson Date: Tue, 17 Mar 2020 23:14:02 -0300 Subject: [PATCH 11/11] chg: Bump version to 2020.3.1 --- deep_reference_parser/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_reference_parser/__version__.py b/deep_reference_parser/__version__.py index 0a989eb..8627b22 100644 --- a/deep_reference_parser/__version__.py +++ b/deep_reference_parser/__version__.py @@ -1,5 +1,5 @@ __name__ = "deep_reference_parser" -__version__ = "2020.3.0" +__version__ = "2020.3.1" __description__ = "Deep learning model for finding and parsing references" __url__ = "https://github.com/wellcometrust/deep_reference_parser" __author__ = "Wellcome Trust DataLabs Team"
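
As a quick illustration of the loading behaviour the series above introduces: below is a minimal sketch, assuming a tab-separated token/label file laid out like tests/test_data/test_load_tsv.tsv, of how the refactored loader is expected to split each column into per-reference lists on blank lines. The standalone helper `split_on_blank_rows` and the hard-coded path are illustrative stand-ins rather than the library API; the real entry point after these patches is `deep_reference_parser.io.load_tsv`.

```python
# Sketch only: mirrors the pandas-based TSV loading added in these patches.
import pandas as pd


def split_on_blank_rows(column):
    """Split one DataFrame column into per-reference lists, breaking on the
    NaN values that pandas produces for blank separator lines."""
    out, current = [], []
    for value in column:
        if isinstance(value, str) and value:
            current.append(value)
        else:  # NaN or empty string marks the end of a reference
            if current:
                out.append(current)
            current = []
    if current:
        out.append(current)
    return out


# Hypothetical path: any token/label TSV with blank lines between references.
df = pd.read_csv(
    "tests/test_data/test_load_tsv.tsv",
    delimiter="\t",
    header=None,
    skip_blank_lines=False,  # keep blank rows so they survive as NaN separators
)

# One list of lists per column: tokens first, then one per label column.
tokens, labels_1, labels_2 = [split_on_blank_rows(df[c]) for c in df.columns]
print(tokens[0])    # ['the', 'focus', 'in', 'Daloa', ',', 'Côte', 'd’Ivoire].']
print(labels_1[0])  # ['i-r', 'i-r', 'i-r', 'i-r', 'i-r', 'i-r', 'i-r']
```

Reading with `skip_blank_lines=False` is the key choice: blank separator lines come through as all-NaN rows, which is exactly what the column splitter keys on, so the number of returned lists follows the number of columns in the file rather than being hard-coded for two or four tags.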