In [None]:
# Inject a CSS rule that forces `.container` elements to 100% width.
# `display` and `HTML` come from the public `IPython.display` module: importing
# `display` from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import json
from sqlalchemy import create_engine, inspect, Table, Column
import pandas as pd
from args_reader import read_args
from scicrunch_client_fast_mod import scicrunch
import sys
import time
from interlex_sql import interlex_sql
import re
from collections import defaultdict

# Read the run arguments (key-file paths, production=True), then build the two
# helper objects used by the rest of the notebook: `sql` and `sci`.
args = read_args(
    api_key='../production_api_scicrunch_key.txt',
    engine_key='../production_engine_scicrunch_key.txt',
    production=True,
)
sql = interlex_sql(engine_key=args.engine_key)
sci = scicrunch(
    api_key=args.api_key,
    base_path=args.base_path,
    engine_key=args.engine_key,
)

## Creation of Ref files

In [None]:
# Lookup table obtained from the interlex_sql helper. Later cells index it as
# label_to_id[<lower-cased, stripped label>]['cde'] to resolve a term id.
label_to_id = sql.get_labels_to_ids_dict()

In [None]:
## The shortNames list is currently the only complete data set (for some unknown reason),
## so it is used to build pure annotations and to update definitions.
# Read through a context manager so the file handle is closed deterministically.
with open('/home/troy/Desktop/NDA/nda_shortName_data.json', 'r') as infile:
    tier2 = json.load(infile)

In [None]:
# Keys of a data-element record that are turned into annotations; every other key is ignored.
annotation_items = ['required', 'condition', 'filterElement', 'type', 'size', 'valueRange', 'notes', 'id']

def collect_annotations(tid, anno_type, memo):
    """Build one annotation record per whitelisted key found in ``memo``.

    Args:
        tid: Value stored under ``'tid'`` in every produced record.
        anno_type: Value stored under ``'annotation_tid'`` in every produced record.
        memo: Mapping of key -> value; only keys listed in ``annotation_items``
            are used, in ``memo``'s iteration order.

    Returns:
        A list of dicts ``{'tid', 'annotation_tid', 'value'}`` where ``value`` is
        the string ``'<key> : <rendered value>'``. Falsy values (``None``, ``''``,
        ``0``, ``False``, empty list) render as ``'<key> : None'``; lists render as
        their items joined with ``'; '``; anything else is rendered with ``str()``.
    """
    annotations = []
    for annotation_key, annotations_values in memo.items():
        if annotation_key not in annotation_items:
            continue
        if not annotations_values:
            value = annotation_key + ' : None'
        elif isinstance(annotations_values, list):
            # str() every item first: str.join raises TypeError on non-string
            # elements (e.g. a list of numbers).
            value = annotation_key + ' : ' + '; '.join(map(str, annotations_values))
        else:
            value = annotation_key + ' : ' + str(annotations_values)
        annotations.append({'tid':tid, 'annotation_tid':anno_type, 'value':value})
    return annotations

# tier2 key -> list of reference records ({annotations, label, definition, id}),
# one record per data element of that entry.
complete_nda_tier2_production_ref = defaultdict(list)
# tier2 keys whose entry has no usable 'dataElements' collection.
cdes_with_no_data = []
# Flat list of every resolved term id, in processing order.
tier2_ids = []
for k, v in tier2.items():
    # Only a missing or non-iterable 'dataElements' counts as "no data". The
    # try is kept this narrow on purpose: a bare `except:` around the whole
    # loop body also catches the SystemExit raised by the sys.exit() calls
    # below, so the intended aborts were silently recorded as "no data" and
    # half-processed entries were left behind in the reference.
    try:
        data_elements = list(v['dataElements'])
    except (KeyError, TypeError):
        cdes_with_no_data.append(k)
        continue

    for e in data_elements:
        label = e['name'] + ' (' + k + ')'
        try:
            tid = label_to_id[label.lower().strip().replace("'", '')]['cde']
        except (KeyError, TypeError):
            # Abort with the offending label instead of repeating a lookup
            # inside the handler.
            sys.exit('label not found in label_to_id: ' + label)
        if not tid: # a defaultdict-backed lookup yields an empty value instead of raising KeyError
            sys.exit(k + e['name'])
        definition = e['description']
        annotations = collect_annotations(tid, 15074, e)

        complete_nda_tier2_production_ref[k].append({
                'annotations':annotations,
                'label':label,
                'definition':definition,
                'id':tid,
        })
        tier2_ids.append(tid)

# Context managers guarantee each file is flushed and closed before the next step.
with open('/home/troy/Desktop/NDA/cdes_with_no_data.json', 'w') as outfile:
    json.dump(cdes_with_no_data, outfile, indent=4)
with open('/home/troy/Desktop/NDA/complete_nda_tier2_production_ref.json', 'w') as outfile:
    json.dump(complete_nda_tier2_production_ref, outfile, indent=4)
with open('/home/troy/Desktop/NDA/tier2_ids.json', 'w') as outfile:
    json.dump(tier2_ids, outfile, indent=4)

## Analyzing what is missing or broken for annotations

In [None]:
# Annotation table obtained from the interlex_sql helper. The analysis cell below
# filters it with `.loc` on its `tid` column and reads its `value` column.
annotations_df = sql.get_annotations()

In [None]:
# Lookup used below as id_to_label[<term id>] to attach a label to each reported id.
id_to_label = sql.get_ids_to_labels_dict()

In [None]:
# Classify every tier-2 term id by the state of its stored annotations:
#   * no annotation rows at all          -> cdes_no_annotations
#   * rows exist but none is the 'id' one -> cdes_incomplete_annotations
# Both kinds are also collected in ids_to_fix.
cdes_incomplete_annotations = []
cdes_no_annotations = []
ids_to_fix = []
for term_id in tier2_ids:

    hit = annotations_df.loc[annotations_df.tid == term_id]

    if hit.empty:
        cdes_no_annotations.append({'id':term_id, 'label':id_to_label[term_id]})
        ids_to_fix.append(term_id)
        continue

    # Annotation values are written as '<key> : <value>', so the 'id'
    # annotation is a value that *starts* with 'id :'. Searching for the
    # substring 'id' in the concatenation of all values also matched any value
    # that merely contains those letters (e.g. 'valid'), hiding incomplete
    # terms; it also raised on non-string values.
    has_id_annotation = any(
        re.match(r'\s*id\s*:', str(value)) for value in hit['value']
    )
    if not has_id_annotation:
        cdes_incomplete_annotations.append({'id':term_id, 'label':id_to_label[term_id]})
        ids_to_fix.append(term_id)

with open('/home/troy/Desktop/NDA/missing_annotaions.json', 'w') as f:
    json.dump(cdes_no_annotations, f, indent=4)
print(len(cdes_no_annotations))
with open('/home/troy/Desktop/NDA/incomplete_annotaions.json', 'w') as f:
    json.dump(cdes_incomplete_annotations, f, indent=4)
print(len(cdes_incomplete_annotations))
with open('/home/troy/Desktop/NDA/ids_that_were_missing_annotations_or_had_them_incomplete.txt', 'w') as f:
    f.write('\n'.join(map(str, ids_to_fix)))
print(len(ids_to_fix))

In [None]:
# Re-bind ids_to_fix as a membership table: every id maps to True.
ids_to_fix = dict.fromkeys(ids_to_fix, True)

In [None]:
# Collect {'id', 'definition'} for every reference record whose id is flagged in
# ids_to_fix and write the result to definition_fix_4_2018.json.
annotations = []
term_data = []
# NOTE(review): this hard-coded pair replaces whatever ids_to_fix held before,
# so only these two ids are exported - confirm this is still intended.
ids_to_fix = {42722:True, 208982:True}
for polycde2 in complete_nda_tier2_production_ref.values():
    for cde2 in polycde2:
        if ids_to_fix.get(cde2['id']):
            # Annotation export is currently disabled:
            #annotations += cde2['annotations']
            term_data.append({'id':cde2['id'], 'definition':cde2['definition']})
            print(cde2)

#json.dump(annotations, open('/home/troy/Desktop/NDA/annotations_missing_or_incomplete_4_2018.json', 'w'), indent=4)
# Context manager so the file is flushed and closed as soon as the dump finishes.
with open('/home/troy/Desktop/NDA/definition_fix_4_2018.json', 'w') as outfile:
    json.dump(term_data, outfile, indent=4)

## Analyzing what hasn't been updated for definitions

In [None]:
# Terms table obtained from the interlex_sql helper. The cell below filters it with
# `.loc` on its `id` column and reads the `definition` attribute of each row.
terms_df = sql.get_terms()

In [None]:
# Report every tier-2 term whose stored definition is missing, empty, or still
# contains the text 'filterElement'.
print('start')
broken_definitions = []
for term_id in tier2_ids:

    hit = terms_df.loc[terms_df.id == term_id]

    if hit.empty:
        # Pass a message: an integer argument to sys.exit() is used as the
        # exit status, so the offending id would not be reported readably.
        sys.exit('tier2 id not found in terms_df: ' + str(term_id))

    for row in hit.itertuples():
        definition = row.definition
        # A NULL definition can surface as None or as NaN; NaN is truthy and has
        # no len(), so it must be detected explicitly.
        is_missing = pd.isna(definition)
        if is_missing or not definition or 'filterElement' in str(definition):
            broken_definitions.append({
                'id':row.id,
                # Store None for missing values so the JSON output stays valid (null, not NaN).
                'definition':None if is_missing else definition,
            })

# NOTE(review): this is the same path the definition-fix export cell writes to,
# so that file is overwritten here - confirm this is intended.
with open('/home/troy/Desktop/NDA/definition_fix_4_2018.json', 'w') as outfile:
    json.dump(broken_definitions, outfile, indent=4)
print(len(broken_definitions))