In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!ls
%cd "/content/drive/MyDrive/Colab Notebooks/skema/data"

drive  sample_data
/content/drive/MyDrive/Colab Notebooks/skema/data


In [3]:
import numpy as np
import pandas as pd
import os
import json
import io

In [4]:
path = "/content/drive/MyDrive/Colab Notebooks/skema/data/"
extractions_path = "cosmos-and-extractions-jsons-for-3-papers"
annotated_files = ["data-sars-double.json", "data_modeling_covid_italy.json", "data-response-to-covid-19-was-italy-unprepared.json"]
extraction_files =["extractions_sarsdouble.json", "extractions_modeling_covid_italy--COSMOS-data.json","extractions_response-to-covid-19-was-italy-unprepared--COSMOS-data.json"]
paper_names = ["sarsdouble.pdf", "modeling_covid_italy.pdf", "response-to-covid-19-was-italy-unprepared.pdf"]
ann_extr_file_pairs = {}
for name,ann, extr in zip(paper_names,annotated_files, extraction_files):
  ann_extr_file_pairs[name] = [ann, extr]

In [5]:
ann_extr_file_pairs

{'sarsdouble.pdf': ['data-sars-double.json', 'extractions_sarsdouble.json'],
 'modeling_covid_italy.pdf': ['data_modeling_covid_italy.json',
  'extractions_modeling_covid_italy--COSMOS-data.json'],
 'response-to-covid-19-was-italy-unprepared.pdf': ['data-response-to-covid-19-was-italy-unprepared.json',
  'extractions_response-to-covid-19-was-italy-unprepared--COSMOS-data.json']}

### Save Combined annotations and extractions details in csv and json

In [6]:
def save_extr_ann_file(path, filename, map):
  pd.DataFrame.from_records(map).to_csv(os.path.join(path, filename+".xlsx"))

  with io.open(os.path.join(path, filename+".json"), 'w', encoding='utf-8') as f:
    f.write(json.dumps(map, ensure_ascii=False))

### Generate combined annotations and extractions file

In [7]:
def combine_ann_extr(path, extractions_path, extr, ann,filename ):
  with open(os.path.join(path,extractions_path, extr ), "r", encoding='UTF-8') as f:
    contents = f.readlines()
    extractions = json.loads(contents[0])
  with open(os.path.join(path, ann ), "r", encoding='UTF-8') as f:
    contents = f.read()
    annotations = json.loads(contents)

  print(len(annotations), len(extractions["mentions"]), len(extractions["documents"]))

  doc_event_map = []
  event_doc_map = {}
  for mention in extractions['mentions']:
    if mention['id'].startswith("E:"):
      for att in mention["attachments"]:
          if "pageNum" in att.keys():
            doc_event_map.append({"doc_id":mention['document'], "pg_num":att["pageNum"][0],"blk_id":att["blockIdx"][0],"sentence_id":mention['sentence'], 
                                  "doc_sentence_count":len(extractions["documents"][mention['document']]["sentences"]), 
                                  "event_id":mention["id"], "event":mention["text"] }) #corrected_sent_number -> 1,2,5,4,6,7, => 1,2,3,4,5,6
            event_doc_map[mention["id"]] = {"doc_id":mention['document'],"pg_num":att["pageNum"][0],"blk_id":att["blockIdx"][0],"sentence_id":mention['sentence'], 
                                  "doc_sentence_count":len(extractions["documents"][mention['document']]["sentences"]), 
                                  "event_id":mention["id"], "event":mention["text"] }
  event_ann_map = {}
  for ann in annotations:
    event_ann_map[ann['eventId']] = {"annotated_page_num":ann["page_num"],"para_num":ann["para_num"], "event":ann["event"], 'locationContext': ann['locationContext'],
    'temporalContext': ann['temporalContext'],'explanation': ann['explanation']}
  
  event_extr_ann_map = []
  for event, values in event_doc_map.items():
    this_event = event_ann_map[event]
    event_extr_ann_map.append({"doc_id":values['doc_id'],"annotated_page_num":this_event["page_num"],"para_num":this_event["para_num"], "event_id":values["event_id"],
                               "event":this_event["event"], 'locationContext': ",".join(this_event['locationContext']),
    'temporalContext': ",".join(this_event['temporalContext']),'explanation': this_event['explanation'], 'pg_num':values['pg_num'], 'blk_id':values['blk_id'], 
    'sentence_id':values['sentence_id'], 'doc_sentence_count':values['doc_sentence_count']})
  
  save_extr_ann_file(path, filename, event_extr_ann_map)
  

### Covid papers, Text Reading Pipeline Extractions & Annotations

In [8]:
# for key in paper_names:
#   name = key
#   ann, extr = ann_extr_file_pairs[name]
#   combine_ann_extr(path, extractions_path, extr, ann, name[:-4])

In [24]:
def combine_ann_extr_all(path, extractions_path, extr, ann,filename ):
  with open(os.path.join(path,extractions_path, extr ), "r", encoding='UTF-8') as f:
    contents = f.readlines()
    extractions = json.loads(contents[0])
  with open(os.path.join(path, ann ), "r", encoding='UTF-8') as f:
    contents = f.read()
    annotations = json.loads(contents)

  print(len(annotations), len(extractions["mentions"]), len(extractions["documents"]))

  doc_event_map = []
  event_doc_map = {}
  for mention in extractions['mentions']:
    # if mention['id'].startswith("E:"):
    for att in mention["attachments"]:
      if "pageNum" in att.keys():
        doc_event_map.append({"doc_id":mention['document'], "pg_num":att["pageNum"][0],"blk_id":att["blockIdx"][0],"sentence_id":mention['sentence'], 
                              "doc_sentence_count":len(extractions["documents"][mention['document']]["sentences"]), 
                              "event_id":mention["id"], "event":mention["text"] }) #corrected_sent_number -> 1,2,5,4,6,7, => 1,2,3,4,5,6
        event_doc_map[mention["id"]] = {"doc_id":mention['document'],"pg_num":att["pageNum"][0],"blk_id":att["blockIdx"][0],"sentence_id":mention['sentence'], 
                              "doc_sentence_count":len(extractions["documents"][mention['document']]["sentences"]), 
                              "event_id":mention["id"], "event":mention["text"] }
  event_ann_map = {}
  for ann in annotations:
    event_ann_map[ann['eventId']] = {"annotated_page_num":ann["page_num"],"para_num":ann["para_num"], "event":ann["event"], 'locationContext': ann['locationContext'],
    'temporalContext': ann['temporalContext'],'explanation': ann['explanation']}
  
  empty_map = {"annotated_page_num":"","para_num":"", "event":"", 'locationContext': "",
    'temporalContext': "",'explanation': ""}
  event_extr_ann_map = []
  for event, values in event_doc_map.items():
    this_event = event_ann_map[event] if event in event_ann_map.keys() else empty_map
    event_extr_ann_map.append({"doc_id":values['doc_id'],"annotated_page_num":this_event["annotated_page_num"],"para_num":this_event["para_num"], "event_id":values["event_id"],
                               "event":this_event["event"], 'locationContext': ",".join(this_event['locationContext']),
    'temporalContext': ",".join(this_event['temporalContext']),'explanation': this_event['explanation'], 'pg_num':values['pg_num'], 'blk_id':values['blk_id'], 
    'sentence_id':values['sentence_id'], 'doc_sentence_count':values['doc_sentence_count']})

  df = pd.DataFrame.from_records(event_extr_ann_map)
  df = df[df.columns]
  print(df.columns.to_list())
  print("\n")
  df['locationContext'] = df['locationContext'].replace({"^'|'$": ""}, regex=True)
  df['temporalContext'] = df['temporalContext'].replace({"^'|'$": ""}, regex=True)
  df.sort_values(by=['doc_id', 'pg_num','blk_id', 'sentence_id', 'doc_sentence_count'], inplace=True,
               ascending = [True, True, True, True, True])
  d = df[['doc_id', 'pg_num','blk_id','sentence_id', 'doc_sentence_count']]
  d['linear_order'] = [i for i in range( 1,len(d)+1, 1)]
  d.to_csv(os.path.join(path, filename+"_linear_order"+".xlsx"))
  save_extr_ann_file(path, filename, event_extr_ann_map)
  

In [25]:
for key in paper_names:
  name = key
  ann, extr = ann_extr_file_pairs[name]
  combine_ann_extr_all(path, extractions_path, extr, ann, name[:-4]+"_all")

174 6212 91
['doc_id', 'annotated_page_num', 'para_num', 'event_id', 'event', 'locationContext', 'temporalContext', 'explanation', 'pg_num', 'blk_id', 'sentence_id', 'doc_sentence_count']




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['linear_order'] = [i for i in range( 1,len(d)+1, 1)]


302 10045 76
['doc_id', 'annotated_page_num', 'para_num', 'event_id', 'event', 'locationContext', 'temporalContext', 'explanation', 'pg_num', 'blk_id', 'sentence_id', 'doc_sentence_count']




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['linear_order'] = [i for i in range( 1,len(d)+1, 1)]


42 4429 47
['doc_id', 'annotated_page_num', 'para_num', 'event_id', 'event', 'locationContext', 'temporalContext', 'explanation', 'pg_num', 'blk_id', 'sentence_id', 'doc_sentence_count']




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['linear_order'] = [i for i in range( 1,len(d)+1, 1)]
