In [2]:
import json
import gzip
from collections import defaultdict

In [3]:
def clinvar_jsons(path="clinvar.json.gz"):
    """Yield one parsed JSON document per non-blank line of a gzipped file.

    Args:
        path: Location of the gzip-compressed, one-JSON-document-per-line
            file. Defaults to "clinvar.json.gz" in the working directory.

    Yields:
        The object returned by ``json.loads`` for each non-blank line.

    Raises:
        OSError: If the file cannot be opened or read as gzip data.
        UnicodeDecodeError: If a line is not valid UTF-8.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with gzip.open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline at end of file) holds
                # no record; json.loads would raise on the empty string.
                continue
            yield json.loads(line.decode("utf-8"))

In [47]:
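# NOTE(review): dead code -- this looks like an earlier draft of get_trait_names
# (defined in a later cell); confirm it is no longer needed before removing it.
# def get_trait_name(trait_names):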
#     trait_name_list = []
#     for name in trait_names:
#         # First trait name in the list will always be the "Preferred" one
#         if name['elementValue']['type'] == 'Preferred':
#             trait_name_list = [name['elementValue']['value']] + trait_name_list
#         elif name['elementValue']['type'] in ["EFO URL", "EFO id", "EFO name"]:
#             continue
#         else:
#             trait_name_list = trait_name_list + [name['elementValue']['value']]
#     return trait_name_list[0]

In [14]:
def get_trait_names(trait_set):
    """Return one lower-cased representative name per trait in ``trait_set``.

    For every entry in ``trait_set['trait']`` the names under ``'name'`` are
    scanned. Names whose value is "not provided" (case-insensitive) and names
    typed "EFO URL", "EFO id" or "EFO name" are ignored. The representative
    name is the last "Preferred" name seen, or, if there is none, the first
    remaining name. Traits left with no usable name contribute nothing.

    Args:
        trait_set: Mapping with a ``'trait'`` list; each trait has a ``'name'``
            list of mappings holding ``['elementValue']['type']`` and
            ``['elementValue']['value']``.

    Returns:
        List of lower-cased names, in trait order.
    """
    # Name types skipped because, per the original author's note, they do not
    # originate from ClinVar.
    skipped_types = ("EFO URL", "EFO id", "EFO name")

    representative_names = []
    for trait in trait_set['trait']:
        preferred = []
        alternates = []
        for entry in trait['name']:
            element = entry['elementValue']
            value = element['value']
            if value.lower() == "not provided":
                continue
            kind = element['type']
            if kind == 'Preferred':
                preferred.append(value)
            elif kind not in skipped_types:
                alternates.append(value)

        if preferred:
            # The most recently seen "Preferred" name takes precedence.
            representative_names.append(preferred[-1].lower())
        elif alternates:
            representative_names.append(alternates[0].lower())

    return representative_names

In [15]:
# Tally trait-name occurrences over the records that have at least one entry
# under "clinVarAssertion" whose clinical significance description includes
# "Pathogenic" or "Likely pathogenic".
trait_names = defaultdict(int)

for clinvar_json in clinvar_jsons():
    # Collect every significance description found in this record.
    clin_sigs = set()
    for clinvar_assertion in clinvar_json["clinvarSet"]["clinVarAssertion"]:
        significance = clinvar_assertion["clinicalSignificance"]
        if "description" in significance:
            clin_sigs.update(significance["description"])

    # Only records with a pathogenic / likely pathogenic description are counted;
    # trait names come from the record's reference assertion.
    if not clin_sigs.isdisjoint({"Pathogenic", "Likely pathogenic"}):
        reference_assertion = clinvar_json["clinvarSet"]["referenceClinVarAssertion"]
        for name in get_trait_names(reference_assertion["traitSet"]):
            trait_names[name] += 1

In [17]:
# Write one "<trait name><TAB><count>" line per tallied trait name.
# The encoding is pinned to UTF-8 so that non-ASCII trait names are written the
# same way on every platform instead of depending on the locale default (which
# can raise UnicodeEncodeError).
with open("traitnames_counts_pathlikelypath_remnp.txt", "wt", encoding="utf-8") as f:
    for name, count in trait_names.items():
        f.write(name + "\t" + str(count) + "\n")

In [17]:
# Echo every tallied trait name followed by its count, one pair per line.
for trait_name, n_records in trait_names.items():
    print(trait_name, n_records)

ungueal dystrophy 1
amyotrophic lateral sclerosis type 2 1
ectodermal dysplasia 7, hair/nail type 1
hydatidiform mole 1
deafness, autosomal recessive 7 1
hyperphosphatasia with mental retardation syndrome 5 1
niemann-pick disease, type a 1
luscan-lumish syndrome 1
neurofibromatosis, type 1 1
myopathy, centronuclear, 5 1
mucopolysaccharidosis, type vi, intermediate 1
pontocerebellar hypoplasia, type 1c 1
congenital central hypoventilation 1
breast cancer, early-onset 1
preeclampsia/eclampsia 4 1
cerebroretinal microangiopathy with calcifications and cysts 1
bleeding disorder platelet type macrothrombocytopenia 1
wolcott-rallison dysplasia 1
ehlers-danlos syndrome type 7, autosomal recessive 1
homocystinuria due to mthfr deficiency 1
intrinsic factor deficiency 1
cranioosteoarthropathy 1
cyclical vomiting syndrome 1
ciliary dyskinesia, primary, 21 1
marfan syndrome, mild variable 1
mitochondrial complex iii deficiency, nuclear type 8 1
primary familial hypertrophic cardiomyopathy 1
metac

In [16]:
# Report how many distinct trait names were tallied.
print(len(trait_names))

5201
