This is the TF-IDF baseline model from the starter script, using scikit-learn's `TfidfVectorizer` and cosine distances.

In [1]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
import numpy
from sklearn.metrics import f1_score,precision_score,recall_score

In [2]:
import json
# Read the development split into a dict keyed by instance uuid.
dev_path = "./training_data/dev.json"
with open(dev_path) as json_file:
    dev = json.load(json_file)

# Print the second instance as a sample of the record layout.
sample_uuid = list(dev)[1]
print(dev[sample_uuid])

{'Type': 'Comparison', 'Section_id': 'Eligibility', 'Primary_id': 'NCT00425854', 'Secondary_id': 'NCT01224678', 'Statement': 'Patients with significantly elevated ejection fraction are excluded from the primary trial, but can still be eligible for the secondary trial if they are 55 years of age or over', 'Label': 'Contradiction'}


In [3]:
# Instance identifiers, in the order they appear in the dev file.
uuid_list = list(dev)
# Statement text of every instance, aligned index-for-index with uuid_list.
statements = [dev[uuid]["Statement"] for uuid in uuid_list]
# Placeholders for gold evidence; not filled in by this cell.
gold_dev_primary_evidence = []
gold_dev_secondary_evidence = []

In [4]:
# Directory holding one JSON file per clinical trial report (CTR).
CTR_DIR = "./training_data/CT json"
# Mean cosine distance above which a statement is predicted "Contradiction".
DISTANCE_THRESHOLD = 0.9


def _load_section(trial_id, section_id):
  """Return the entries of section `section_id` from the CTR file of `trial_id`."""
  with open(os.path.join(CTR_DIR, trial_id + ".json")) as json_file:
    return json.load(json_file)[section_id]


def _statement_distances(statement, section):
  """Return the cosine distance between `statement` and each entry of `section`.

  The TF-IDF vocabulary is fitted on the section entries only, so the
  statement is represented using the section's vocabulary.
  """
  vectorizer = TfidfVectorizer().fit(section)
  X_s = vectorizer.transform([statement])
  X_p = vectorizer.transform(section)
  # cosine_distances returns a (1, n_entries) matrix; keep its single row.
  return cosine_distances(X_s, X_p)[0]


# Maps each instance uuid to {"Prediction": "Entailment" | "Contradiction"}.
Results = {}
for uuid, statement in zip(uuid_list, statements):
  instance = dev[uuid]
  section_id = instance["Section_id"]

  # Distances between the statement and every entry of the primary trial's section.
  distances = list(_statement_distances(statement, _load_section(instance["Primary_id"], section_id)))

  if instance["Type"] == "Comparison":
    # Comparison statements also refer to a secondary trial: pool its entries
    # with the primary ones (secondary first) before averaging.
    secondary_distances = _statement_distances(statement, _load_section(instance["Secondary_id"], section_id))
    distances = list(secondary_distances) + distances

  # Average cosine DISTANCE over all pooled entries: a high value means little
  # lexical overlap between the statement and the trial text.
  score = numpy.average(distances)
  Prediction = "Contradiction" if score > DISTANCE_THRESHOLD else "Entailment"
  Results[str(uuid)] = {"Prediction": Prediction}

In [6]:

# Persist the predictions as pretty-printed JSON (4-space indent).
with open("TF-IDF_results.json", mode='w') as jsonFile:
    serialized_results = json.dumps(Results, indent=4)
    jsonFile.write(serialized_results)

In [7]:
def main():
    """Score the predictions in `Results` against the labels in `dev`.

    Both label sets are binarised ("Entailment" -> 1, anything else -> 0)
    and the F1, precision and recall of the positive class are printed.
    """
    gold, results = dev, Results

    def encode(label):
        # Entailment is the positive class; every other label counts as 0.
        return 1 if label == "Entailment" else 0

    # Only instances that received a prediction are evaluated.
    uuid_list = list(results)
    results_pred = [encode(results[uuid]["Prediction"]) for uuid in uuid_list]
    gold_labels = [encode(gold[uuid]["Label"]) for uuid in uuid_list]

    f_score = f1_score(gold_labels, results_pred)
    p_score = precision_score(gold_labels, results_pred)
    r_score = recall_score(gold_labels, results_pred)

    for template, value in (
        ('F1:{:f}', f_score),
        ('precision_score:{:f}', p_score),
        ('recall_score:{:f}', r_score),
    ):
        print(template.format(value))


if __name__ == '__main__':
    main()

F1:0.502415
precision_score:0.485981
recall_score:0.520000


In [1]:
import os

# data set
import json

# Load the training split. The dict keeps the name `dev` because the
# following cells read it under that name.
data_path = "./training_data/train.json"
with open(data_path) as json_file:
    dev = json.load(json_file)

# Instance identifiers, in file order.
uuid_list = list(dev)
# Statement text of every training instance, aligned with uuid_list.
statements = [dev[uuid]["Statement"] for uuid in uuid_list]
# Placeholders for gold evidence; not filled in by this cell.
gold_dev_primary_evidence = []
gold_dev_secondary_evidence = []

# Attach the text of the referenced section to every instance, in place.
for uuid in uuid_list:
    instance = dev[uuid]
    section_id = instance["Section_id"]

    # Every instance has a primary trial: store its full section.
    primary_ctr_path = os.path.join("./training_data/CT json", instance["Primary_id"] + ".json")
    with open(primary_ctr_path) as json_file:
        primary_ctr = json.load(json_file)
    primary_section = primary_ctr[section_id]
    instance["Primary_section"] = primary_section

    # Only "Comparison" instances reference a secondary trial.
    if instance["Type"] != "Comparison":
        continue

    secondary_ctr_path = os.path.join("./training_data/CT json", instance["Secondary_id"] + ".json")
    with open(secondary_ctr_path) as json_file:
        secondary_ctr = json.load(json_file)
    secondary_section = secondary_ctr[section_id]
    instance["Secondary_section"] = secondary_section

In [6]:
import pandas as pd
# pd.DataFrame(dev) puts one instance per column; transpose so that each
# row is an instance (indexed by uuid) and each column is a field.
df = pd.DataFrame(dev).transpose()
df.head(n=10)

Unnamed: 0,Type,Section_id,Primary_id,Secondary_id,Statement,Label,Primary_section,Secondary_section
5bc844fc-e852-4270-bfaf-36ea9eface3d,Comparison,Intervention,NCT01928186,NCT00684983,All the primary trial participants do not rece...,Contradiction,"[INTERVENTION 1: , Diagnostic (FLT PET), P...","[INTERVENTION 1: , Arm A, Patients receive..."
86b7cb3d-6186-4a04-9aa6-b174ab764eed,Single,Eligibility,NCT00662129,,"Patients with Platelet count over 100,000/mm¬¨...",Contradiction,"[DISEASE CHARACTERISTICS:, Histologically or...",
dbed5471-c2fc-45b5-b26f-430c9fa37a37,Comparison,Adverse Events,NCT00093145,NCT00703326,Heart-related adverse events were recorded in ...,Entailment,"[Adverse Events 1:, Total: 5/32 (15.63%), ...","[Adverse Events 1:, Total: 285/752 (37.90%),..."
20c35c89-8d23-4be3-b603-ac0ee0f3b4de,Single,Eligibility,NCT01097642,,Adult Patients with histologic confirmation of...,Contradiction,"[Inclusion Criteria:, Patients with histolog...",
f17cb242-419d-4f5d-bfa4-41494ed5ac0e,Comparison,Intervention,NCT00852930,NCT02308020,Laser Therapy is in each cohort of the primary...,Contradiction,"[INTERVENTION 1: , Laser Therapy Alone, th...","[INTERVENTION 1: , Part A Abemaciclib: HR+, ..."
fc5c4554-7ce9-4c16-b374-a3cd9d15b021,Comparison,Eligibility,NCT00971945,NCT01027416,Patients must have already participated in a s...,Contradiction,"[Inclusion Criteria:, Subjects who were conf...","[Inclusion Criteria:, The patient must conse..."
96b77cdd-aa9f-4770-8447-8a04d9ca5da7,Single,Eligibility,NCT00633750,,Patients with Clinical stage II (T2 N1) invasi...,Contradiction,"[Inclusion Criteria:, Clinical stage I or II...",
c73faed2-371b-4238-bf7d-293fae380203,Comparison,Intervention,NCT00003404,NCT00711529,the primary trial and the secondary trial have...,Contradiction,"[INTERVENTION 1: , Adjuvant Radiotherapy, ...","[INTERVENTION 1: , Hypnotherapy, Patients ..."
8765009d-ffc4-4395-ab7a-11ecdfd43a40,Single,Eligibility,NCT00201773,,Adele is an 85 year old woman with Stage II hi...,Entailment,"[Inclusion Criteria:, Must be female with hi...",
0ad7293d-df35-42e8-881d-f2afc3f7d3fd,Comparison,Intervention,NCT02606708,NCT02504424,Only patients in the primary trial receive 40....,Contradiction,"[INTERVENTION 1: , Accelerated Intensity Mod...","[INTERVENTION 1: , AeroForm Tissue Expander,..."


In [None]:
# Export the table to CSV in the working directory.
df.to_csv(path_or_buf="train.csv")