While Sysrev is mainly used to help users gather relevant literature for a research question, there are many other applications for the tool. This example shows how we can use NER to detect gene names in each article. Using the Spacy library, we first process each text to format it properly for Spacy to read. Next, we run the processed file through the NER pipeline for training. With the trained model, we can test it on our project's texts to detect gene names. Now we can add our findings as a column to our DataFrame for the project. Hopefully, you can also find additional applications for the Sysrev tool.

In [9]:
# Retrieve project annotations
! curl -X GET -d project-id=3144 -G https://sysrev.com/web-api/project-annotations > sysrev_output.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10.3M  100 10.3M    0     0  4200k      0  0:00:02  0:00:02 --:--:-- 4199k


In [14]:
# Some annotations are missing start-offsets and we will remove them
with open('sysrev_output.json') as f:
    SYSREV_DATA = json.load(f)
annotations = [x for x in SYSREV_DATA['result'] if 'start-offset' in x['context'].keys()]

In [15]:
# Process annotations to proper format for Spacy

import json

label = 'GENE'

def process_annotation(annotation):
    return [annotation["context"]["text-context"],
            {"entities": [(annotation["context"]["start-offset"],annotation["context"]["end-offset"],label)]}]

processed_annotations = map(process_annotation, annotations)

def combine_annotations(processed_annotations):
    combined_annotations = {}
    for text,entities in processed_annotations:
        if combined_annotations.get(text) is None:
            combined_annotations[text] = []
        combined_annotations[text].append(entities["entities"][0])
    for key in combined_annotations:
        combined_annotations[key] = list(set(combined_annotations[key]))            
    return combined_annotations

combined_processed_annotations = combine_annotations(processed_annotations)
final_json = []
for k in combined_processed_annotations:
    final_json.append([k,{"entities":combined_processed_annotations[k]}])

with open('processed_output.json', 'w') as fout:
    json.dump(final_json, fout)

In [16]:
# Train on our processed data

from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
import json
import io
import pprint

nlp = spacy.blank('en')  # create blank Language class

model=None
new_model_name='gene'
output_dir='sysrev_gene'
n_iter=20
training_data='training_data.json'
label='GENE'
max_steps_since_min=10

with open('processed_output.json') as f:
    TRAIN_DATA = json.load(f)

# Add entity recognizer to model if it's not in the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
# otherwise, get it, so we can add labels to it
else:
    ner = nlp.get_pipe('ner')

ner.add_label(label)   # add new entity label to entity recognizer
if model is None:
    optimizer = nlp.begin_training()
else:
    # Note that 'begin_training' initializes the models, so it'll zero out
    # existing entity types.
    optimizer = nlp.entity.create_optimizer()

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
min_loss = 5000.0
steps_since_last_min = 0
with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                       losses=losses)
        steps_since_last_min += 1
        if losses["ner"] < min_loss:
            min_loss = losses["ner"]
            steps_since_last_min = 0
        print("current loss: {} | min_loss: {} | Steps since last min: {}".format(losses["ner"],min_loss,steps_since_last_min))    
        if steps_since_last_min > max_steps_since_min:
            print("Maximum steps since last min loss exceeded")
            break




# test the trained model

test_text = "Depletion of Nup98 or Wdr82 abolishes Set1A recruitment to chromatin and subsequently ablates H3K4me3 at adjacent promoters."

# save model to output directory
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.meta['name'] = new_model_name  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

# test the saved model
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
doc2 = nlp2(test_text)
for ent in doc2.ents:
    print(ent.label_, ent.text)

current loss: 8352.80908776 | min_loss: 5000.0 | Steps since last min: 1
current loss: 5233.20488424 | min_loss: 5000.0 | Steps since last min: 2
current loss: 4245.75026934 | min_loss: 4245.75026934 | Steps since last min: 0
current loss: 3863.74363356 | min_loss: 3863.74363356 | Steps since last min: 0
current loss: 3395.230973 | min_loss: 3395.230973 | Steps since last min: 0
current loss: 3206.04109392 | min_loss: 3206.04109392 | Steps since last min: 0
current loss: 3009.51124628 | min_loss: 3009.51124628 | Steps since last min: 0
current loss: 2577.49075437 | min_loss: 2577.49075437 | Steps since last min: 0
current loss: 2578.52411763 | min_loss: 2577.49075437 | Steps since last min: 1
current loss: 2230.60432886 | min_loss: 2230.60432886 | Steps since last min: 0
current loss: 2085.05889421 | min_loss: 2085.05889421 | Steps since last min: 0
current loss: 1960.22309058 | min_loss: 1960.22309058 | Steps since last min: 0
current loss: 1912.14955398 | min_loss: 1912.14955398 | St

In [22]:
# Use Python client to get data for the project
import PySysrev
df = PySysrev.getAnnotations(3144)

In [23]:
# Our initial text
df.head(5)

Unnamed: 0,annotation,datasource,end,external_id,selection,semantic_class,start,sysrev_id,text
0,α-KGDH,pubmed,286.0,29211711,α-KGDH,gene,280.0,1524023,"Histone modifications, such as the frequently ..."
1,KAT2A,pubmed,391.0,29211711,KAT2A,gene,386.0,1524023,"Histone modifications, such as the frequently ..."
2,GCN5,pubmed,411.0,29211711,GCN5,gene,407.0,1524023,"Histone modifications, such as the frequently ..."
3,succinyl-CoA,pubmed,493.0,29211711,succinyl-CoA,gene,481.0,1524023,"Histone modifications, such as the frequently ..."
4,KAT2A,pubmed,509.0,29211711,KAT2A,gene,504.0,1524023,"Histone modifications, such as the frequently ..."


In [19]:
# Run our texts through our trained model to get entities
nlp2 = spacy.load('sysrev_gene')
txt_list = []
for txt in list(df['text']):
    if txt is None:
        txt_list.append(None)
    else:
        doc2 = nlp2(txt)
        txt_list.append(doc2.ents)

In [24]:
# Add entities to DataFrame
df['entities'] = txt_list

In [25]:
# Identified genes shown as new column
df.head(5)

Unnamed: 0,annotation,datasource,end,external_id,selection,semantic_class,start,sysrev_id,text,entities
0,α-KGDH,pubmed,286.0,29211711,α-KGDH,gene,280.0,1524023,"Histone modifications, such as the frequently ...","((α, -, KGDH), (KAT2A), (GCN5), (succinyl, -, ..."
1,KAT2A,pubmed,391.0,29211711,KAT2A,gene,386.0,1524023,"Histone modifications, such as the frequently ...","((α, -, KGDH), (KAT2A), (GCN5), (succinyl, -, ..."
2,GCN5,pubmed,411.0,29211711,GCN5,gene,407.0,1524023,"Histone modifications, such as the frequently ...","((α, -, KGDH), (KAT2A), (GCN5), (succinyl, -, ..."
3,succinyl-CoA,pubmed,493.0,29211711,succinyl-CoA,gene,481.0,1524023,"Histone modifications, such as the frequently ...","((α, -, KGDH), (KAT2A), (GCN5), (succinyl, -, ..."
4,KAT2A,pubmed,509.0,29211711,KAT2A,gene,504.0,1524023,"Histone modifications, such as the frequently ...","((α, -, KGDH), (KAT2A), (GCN5), (succinyl, -, ..."
