# Creating work units for CrowdFlower job #733049 using Ben's partially processed data

2015-06-01 Tong Shu Li

Ben has already processed the first SemMedDB gold standard into a format for his CrowdFlower task. To keep things consistent, I will use his data for my job. Since my job differs from his, I will need to adapt the data to my needs.

In [1]:
import pandas as pd
import xml.etree.cElementTree as ET

---

In [2]:
def add_simple_tag(tag_name, tag_class, text):
    """Wrap `text` in an HTML element `tag_name` carrying class `tag_class`.

    The text is inserted as-is; no HTML escaping is performed.
    """
    template = '<{0} class="{1}">{2}</{0}>'
    return template.format(tag_name, tag_class, text)

def get_semtype_defn():
    """Load the pipe-delimited file data/srdef.txt into a DataFrame."""
    definitions = pd.read_csv("data/srdef.txt", sep='|')
    return definitions

### Read the original file:

In [3]:
# Parse the adjudicated gold-standard XML; `root` is the tree's root element.
tree = ET.parse("gold/adjudicated_semmed_gold.xml")
root = tree.getroot()

### Read Ben's file:

In [4]:
# Load Ben's tab-separated file into a DataFrame.
res = pd.read_csv("gold/dev_gold.tsv", sep = '\t')

In [5]:
# Preview the first rows of the table.
res.head()

Unnamed: 0,predication_id,indicator_type,predicate_start_index,predicate_end_index,predicate,predicate_definition,subject_start_index,subject_end_index,subject_score,subject_text,...,object_score,object_text,object_pref_name,object_CUI,object_stype_name,object_stype_desc,object_desc,sentence,sentence_html,pmid
0,0,,41,43,ISA,,30,40,0,meditation,...,0,intervention,Intervention regimes,C1273869,Health Care Activity,"null,",Health Care Activity. null,Findings suggest the value of meditation as an...,<div>Findings suggest the value of <strong>med...,16617173
1,0,,29,34,TREATS,,38,53,0,steroid therapy,...,0,patient,Patients,C0030705,"Patient or Disabled Group,Human","null,null,",Individuals participating in the health care s...,One patient was successfully cured by steroid ...,<div>One <strong>patient</strong> was successf...,16415554
2,0,,43,54,COEXISTS_WITH,,0,28,0,Systemic lupus erythematosus,...,0,gastrointestinal symptoms,Gastrointestinal symptoms NOS,C0426576,Sign or Symptom,"null,",Sign or Symptom. null,Systemic lupus erythematosus is frequently acc...,<div><strong>Systemic lupus erythematosus</str...,16415554
3,0,,0,0,LOCATION_OF,,89,96,0,colonic,...,0,lesions,Lesion,C0221198,Finding,"null,",A localized pathological or traumatic structur...,Systemic lupus erythematosus is frequently acc...,<div>Systemic lupus erythematosus is frequentl...,16415554
4,0,,12,14,ISA,,0,11,0,Fasciolosis,...,0,disease,Disease,C0012634,Disease or Syndrome,"null,",top term heading for all specific disorders an...,Fasciolosis is an uncommon disease in this reg...,<div><strong>Fasciolosis</strong> is an uncomm...,16024249


### Get the definitions for the semantic types:

In [6]:
# Table of semantic type definitions (read from data/srdef.txt by get_semtype_defn).
semtype_defn = get_semtype_defn()

In [7]:
# Preview the first rows of the definitions table.
semtype_defn.head()

Unnamed: 0,code,name,definition
0,orgm,Organism,"Generally, a living individual, including all ..."
1,plnt,Plant,"An organism having cellulose cell walls, growi..."
2,fngs,Fungus,A eukaryotic organism characterized by the abs...
3,virs,Virus,An organism consisting of a core of a single n...
4,bact,Bacterium,"A small, typically one-celled, prokaryotic mic..."


In [8]:
def get_defn(semtype, defn_table=None):
    """Return the definition text of the semantic type named `semtype`.

    Args:
        semtype: value to look for in the table's "name" column.
        defn_table: DataFrame with "name" and "definition" columns. When
            omitted, the module-level `semtype_defn` table is used.

    Returns:
        The "definition" value of the first row whose name equals `semtype`.

    Raises:
        AssertionError: if no row has that name; the exception argument is
            the name that was looked up.
    """
    table = semtype_defn if defn_table is None else defn_table

    # Compare the column directly instead of interpolating the name into a
    # query() expression, so a name containing a quote cannot break it.
    matches = table[table["name"] == semtype]

    if matches.empty:
        # Raised explicitly rather than via `assert` so the check is not
        # stripped under -O; exception type and argument are unchanged.
        raise AssertionError(semtype)

    return matches.iloc[0]["definition"]

### For each row of the table, replace the subject/object semantic types with the "relation semantic type" from the original gold-standard file:

In [9]:
def find_pmid(pmid):
    """Return the first MedlineCitation element whose "pmid" attribute is str(pmid).

    Searches the module-level `root` element. Raises Exception if no
    citation matches.
    """
    wanted = str(pmid)
    citations = (cit for cit in root.iter("MedlineCitation")
                 if cit.attrib["pmid"] == wanted)

    # Compare against None explicitly: an Element without children is falsy.
    hit = next(citations, None)
    if hit is None:
        raise Exception("Couldn't find pmid")

    return hit
        
def find_sentence(node, sentence):
    """Return the first Sentence element under `node` whose "text" attribute equals `sentence`.

    Raises Exception if no such element exists.
    """
    match = None
    for candidate in node.iter("Sentence"):
        if candidate.attrib["text"] != sentence:
            continue
        match = candidate
        break

    # Compare against None explicitly: an Element without children is falsy.
    if match is None:
        raise Exception("Couldn't find sentence")

    return match

def get_semtypes(pmid, sentence, predicate, s_text, o_text, s_off, o_off):
    """Look up the relation semantic types of one predication in the gold XML.

    Locates the citation for `pmid` (via find_pmid) and the sentence whose
    text equals `sentence` (via find_sentence), then scans that sentence's
    Predication elements for those whose Predicate "type" equals `predicate`
    (with any leading "NEG_" removed) and whose Subject and Object "text"
    equal `s_text` and `o_text`.

    Args:
        pmid: PubMed identifier of the citation.
        sentence: exact sentence text.
        predicate: predicate name; a "NEG_" prefix is ignored.
        s_text: subject text to match.
        o_text: object text to match.
        s_off: subject start offset. Accepted but NOT used for matching.
        o_off: object start offset. Accepted but NOT used for matching.

    Returns:
        dict mapping the tag of each non-Predicate child of a matching
        predication to the text of its RelationSemanticType element. Empty
        when nothing matches. Because offsets are ignored, several
        predications can match; a warning is then printed and values from
        the last matching predication win.

    Raises:
        Exception: propagated from find_pmid / find_sentence when the
            citation or sentence cannot be found.
    """
    citation = find_pmid(pmid)
    sent_node = find_sentence(citation, sentence)

    if predicate.startswith("NEG_"):
        predicate = predicate[4:]

    # For each child tag we care about: the attribute to read and the value
    # it must have for the predication to count as a match.
    expected = {
        "Predicate": ("type", predicate),
        "Subject": ("text", s_text),
        "Object": ("text", o_text),
    }

    def correct_pred(predication):
        # A predication matches unless one of its children contradicts the
        # expected value; children with other tags are ignored.
        for child in predication:
            if child.tag in expected:
                attr_name, wanted = expected[child.tag]
                if child.attrib[attr_name] != wanted:
                    return False

        return True

    n_found = 0
    semtype = dict()
    for pred in sent_node.iter("Predication"):
        if not correct_pred(pred):
            continue

        n_found += 1
        for child in pred:
            if child.tag == "Predicate":
                continue

            for grandchild in child:
                if grandchild.tag == "RelationSemanticType":
                    semtype[child.tag] = grandchild.text

    if n_found > 1:
        # Single-argument print(...) yields the same output under the
        # Python 2 print statement and the Python 3 print function.
        print("Found more than one predicate!!!")
        print(pmid)
        print(sentence)
        print(s_text)
        print(o_text)
        print("")

    return semtype

In [10]:
# Overwrite each row's subject/object semantic type name and definition with
# the relation semantic type found in the gold-standard XML.
name_cols = (("Subject", "subject_stype_name"), ("Object", "object_stype_name"))
desc_cols = (("Subject", "subject_stype_desc"), ("Object", "object_stype_desc"))

for idx, row in res.iterrows():
    rel_types = get_semtypes(row["pmid"], row["sentence"], row["predicate"],
                             row["subject_text"], row["object_text"],
                             row["subject_start_index"], row["object_start_index"])

    # Names first, then definitions, for subject and then object.
    for role, column in name_cols:
        res.loc[idx, column] = rel_types[role]

    for role, column in desc_cols:
        res.loc[idx, column] = get_defn(rel_types[role])

Found more than one predicate!!!
16332961
The main site for ROS production is the respiratory chain inside the mitochondria and accumulation of mtDNA mutations, and impaired respiratory chain function have been associated with degenerative diseases and aging.
mitochondria
respiratory chain

Found more than one predicate!!!
16332961
The main site for ROS production is the respiratory chain inside the mitochondria and accumulation of mtDNA mutations, and impaired respiratory chain function have been associated with degenerative diseases and aging.
mitochondria
respiratory chain

Found more than one predicate!!!
15750284
Since mRNAs of both GH and GH receptor were present in stem cells and B-cell precursors in bone marrow, GH may modulate B-lymphoid precursors development in an autocrine or paracrine manner in bone marrows.
stem cells
mRNAs

Found more than one predicate!!!
15750284
Since mRNAs of both GH and GH receptor were present in stem cells and B-cell precursors in bone marrow, GH 

### Add a unique identifier for easy reference:

In [11]:
# One identifier per row: uniq_id_b0, uniq_id_b1, ...
n_rows = len(res)
uniq_ids = ["uniq_id_b{0}".format(n) for n in range(n_rows)]
res["uniq_id"] = pd.Series(uniq_ids)

### Format the sentences with HTML tags for easy text highlighting:

In [12]:
def format_sent(row):
    """Return the row's sentence with subject and object wrapped in <span> tags.

    The subject gets class "subject_text" and the object class "object_text".
    Each span starts at the row's start index and covers len(text) characters
    of the sentence, which are replaced by the tagged text. Raises
    AssertionError when the subject offset is not strictly before or after
    the object offset (for example, when the two start offsets are equal).
    """
    sentence = row["sentence"]

    # (start offset, text, css class) for each argument of the predication.
    subj = (row["subject_start_index"], row["subject_text"], "subject_text")
    obj = (row["object_start_index"], row["object_text"], "object_text")

    if subj[0] < obj[0]:
        first, second = subj, obj
    else:
        assert subj[0] > obj[0]
        first, second = obj, subj

    first_off, first_text, first_cls = first
    second_off, second_text, second_cls = second

    pieces = [
        sentence[:first_off],
        add_simple_tag("span", first_cls, first_text),
        sentence[first_off + len(first_text):second_off],
        add_simple_tag("span", second_cls, second_text),
        sentence[second_off + len(second_text):],
    ]
    return "".join(pieces)

In [13]:
# Build the highlighted sentence for every row, in row order.
form_sent = [format_sent(row) for _, row in res.iterrows()]

res["form_sent"] = form_sent

In [14]:
def format_subject(text):
    """Return `text` in a "subject_text" span, surrounded by double quotes."""
    tagged = add_simple_tag("span", "subject_text", text)
    return '"{0}"'.format(tagged)

def format_object(text):
    """Return `text` in an "object_text" span, surrounded by double quotes."""
    tagged = add_simple_tag("span", "object_text", text)
    return '"{0}"'.format(tagged)

In [15]:
# Add quoted, span-tagged versions of the subject and object text.
for src_col, dest_col, formatter in (
        ("subject_text", "form_sub_text", format_subject),
        ("object_text", "form_obj_text", format_object)):
    res[dest_col] = res[src_col].map(formatter)

In [19]:
# Write the finished table as a tab-separated file, without the index column.
res.to_csv("data/data_for_cf_job_733049.tsv", sep = '\t', index = False)