In [1]:
import os
import re

import pandas as pd

In [2]:
# Data directory: defaults to the original local path, but can be redirected by
# setting the CLINVAR_DATA_DIR environment variable. os.path.join(..., '')
# guarantees a trailing separator, because file paths in this notebook are
# built by plain string concatenation (datadir + filename).
datadir = os.path.join(
    os.environ.get('CLINVAR_DATA_DIR', '/Users/xthomas/Documents/data/guide/'), ''
)
clinvar_file = 'variant_summary_02.txt' # Updated 5/21/2018

# First, download and survey the ClinVar data

## ➜ ClinVar is a freely accessible, public archive of reports of the relationships among human variations and phenotypes, with supporting evidence.

+ A phenotype results from the expression of an organism's genetic code, its genotype, as well as the influence of environmental factors and the interactions between the two. [Wikipedia](https://en.wikipedia.org/wiki/Phenotype)
+ File is from **[THIS FTP DIRECTORY](ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/)** (which is updated monthly)
+ **Download this file: [Variant_summary.txt](ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz)**, which is an ~809K-row dataset listing variants and their reported relationships to phenotypes.

In [3]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids columns ending up with mixed types
# (and the DtypeWarning pandas emits for them) on a file of this size.
clinvar = pd.read_csv(
    datadir + clinvar_file,
    sep='\t',
    lineterminator='\n',
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Column labels of the raw ClinVar table, as a plain list
clinvar.columns.tolist()

['#AlleleID',
 'Type',
 'Name',
 'GeneID',
 'GeneSymbol',
 'HGNC_ID',
 'ClinicalSignificance',
 'ClinSigSimple',
 'LastEvaluated',
 'RS# (dbSNP)',
 'nsv/esv (dbVar)',
 'RCVaccession',
 'PhenotypeIDS',
 'PhenotypeList',
 'Origin',
 'OriginSimple',
 'Assembly',
 'ChromosomeAccession',
 'Chromosome',
 'Start',
 'Stop',
 'ReferenceAllele',
 'AlternateAllele',
 'Cytogenetic',
 'ReviewStatus',
 'NumberSubmitters',
 'Guidelines',
 'TestedInGTR',
 'OtherIDs',
 'SubmitterCategories',
 'VariationID']

In [5]:
# Size of the raw table as a (rows, columns) tuple
clinvar.shape

(811417, 31)

In [6]:
# Keep only the records whose 'Assembly' is GRCh38 (Human Reference Genome #38)
latest = clinvar.loc[clinvar['Assembly'].eq('GRCh38')]
latest.shape

(389089, 31)

In [7]:
# Narrow the GRCh38 records to those whose 'ReviewStatus' is one of the
# three 'authoritative' levels listed below.
authoritative = latest[
    latest['ReviewStatus'].isin([
        'criteria provided, multiple submitters, no conflicts',
        'practice guideline',
        'reviewed by expert panel',
    ])
]
authoritative.shape

(62166, 31)

In [8]:
# Working sample: the three phenotype/review columns of the first 1500
# authoritative rows.
authstub = authoritative.filter(
    items=['PhenotypeIDS', 'PhenotypeList', 'ReviewStatus']
).head(1500)
authstub.shape

(1500, 3)

In [9]:
# Preview the first rows only rather than rendering the whole 1500-row sample
authstub.head(10)

Unnamed: 0,PhenotypeIDS,PhenotypeList,ReviewStatus
11,"MeSH:D030342,MedGen:C0950123;MedGen:C1838979,O...",Inborn genetic diseases;Mitochondrial complex ...,"criteria provided, multiple submitters, no con..."
20,"MedGen:C3469186,OMIM:235200;MedGen:C0392514,SN...",Hemochromatosis type 1;Hereditary hemochromato...,"criteria provided, multiple submitters, no con..."
57,"MedGen:C3150878,OMIM:613616,Orphanet:ORPHA93600","Primary hyperoxaluria, type III","criteria provided, multiple submitters, no con..."
70,"MedGen:C1419614,OMIM:606068;MedGen:CN517202",Retinitis pigmentosa 28;not provided,"criteria provided, multiple submitters, no con..."
72,"MedGen:C1419614,OMIM:606068;MedGen:CN517202",Retinitis pigmentosa 28;not provided,"criteria provided, multiple submitters, no con..."
74,"MedGen:C1419614,OMIM:606068",Retinitis pigmentosa 28,"criteria provided, multiple submitters, no con..."
105,"MedGen:C3150801,OMIM:613559,Orphanet:ORPHA2549...",Combined oxidative phosphorylation deficiency ...,"criteria provided, multiple submitters, no con..."
107,"MedGen:C3553886,OMIM:614852;MedGen:CN517202",Primary autosomal recessive microcephaly 9;not...,"criteria provided, multiple submitters, no con..."
184,"na;MedGen:C0342701,OMIM:275350,Orphanet:ORPHA8...",TCN2 POLYMORPHISM;Transcobalamin II deficiency...,"criteria provided, multiple submitters, no con..."
223,MedGen:CN230736;na;Human Phenotype Ontology:HP...,"Cardiovascular phenotype;HYPERHOMOCYSTEINEMIA,...","criteria provided, multiple submitters, no con..."


## Time to start matching the OMIM tags

+ The first order of business is to find the OMIM tags that match the **59 ACMG Recommendations Diseases**
+ Import this file: [ACMG Recommendations for Reporting of Incidental Findings in Clinical Exome and Genome Sequencing](https://www.ncbi.nlm.nih.gov/clinvar/docs/acmg/)

In [10]:
# Load the ACMG conditions table (disease names and OMIM tags) and pull the
# 'ACMGTAGS' column out as a plain Python list.
acmg = pd.read_csv(datadir + 'ACMG_Conditions_OMIM.csv')
acmgtags = acmg['ACMGTAGS'].tolist()

In [11]:
# function to write OMIM TAGS to a new Column
def omim_tags(x):
    """Extract the OMIM identifiers from one ``PhenotypeIDS`` value.

    Parameters
    ----------
    x : str
        A phenotype-ID string such as ``"MedGen:C1419614,OMIM:606068"``.
        Non-string values (for example the float NaN pandas uses for an
        empty field) are treated as containing no tags.

    Returns
    -------
    list of str or str
        The digits following each ``OMIM:`` prefix, in order of appearance
        (duplicates are kept), or the sentinel string ``"NA"`` when no tag
        is found.
    """
    # re.findall raises TypeError on non-strings, so screen those out first.
    if not isinstance(x, str):
        return "NA"
    # A tag is "OMIM:" followed by 1 to 6 digits; only the digits are captured.
    tags = re.findall(r"OMIM:(\d{1,6})", x)
    return tags if tags else "NA"

In [26]:
# Tag only the sample's own rows; computing tags for every row of
# `authoritative` and letting index alignment drop the rest is wasted work.
authstub['OMIMTAGS'] = authstub['PhenotypeIDS'].apply(omim_tags)

In [27]:
# Preview the first rows only rather than rendering the whole 1500-row sample
authstub.head(10)

Unnamed: 0,PhenotypeIDS,PhenotypeList,ReviewStatus,OMIMTAGS
11,"MeSH:D030342,MedGen:C0950123;MedGen:C1838979,O...",Inborn genetic diseases;Mitochondrial complex ...,"criteria provided, multiple submitters, no con...",[252010]
20,"MedGen:C3469186,OMIM:235200;MedGen:C0392514,SN...",Hemochromatosis type 1;Hereditary hemochromato...,"criteria provided, multiple submitters, no con...","[235200, 612635]"
57,"MedGen:C3150878,OMIM:613616,Orphanet:ORPHA93600","Primary hyperoxaluria, type III","criteria provided, multiple submitters, no con...",[613616]
70,"MedGen:C1419614,OMIM:606068;MedGen:CN517202",Retinitis pigmentosa 28;not provided,"criteria provided, multiple submitters, no con...",[606068]
72,"MedGen:C1419614,OMIM:606068;MedGen:CN517202",Retinitis pigmentosa 28;not provided,"criteria provided, multiple submitters, no con...",[606068]
74,"MedGen:C1419614,OMIM:606068",Retinitis pigmentosa 28,"criteria provided, multiple submitters, no con...",[606068]
105,"MedGen:C3150801,OMIM:613559,Orphanet:ORPHA2549...",Combined oxidative phosphorylation deficiency ...,"criteria provided, multiple submitters, no con...",[613559]
107,"MedGen:C3553886,OMIM:614852;MedGen:CN517202",Primary autosomal recessive microcephaly 9;not...,"criteria provided, multiple submitters, no con...",[614852]
184,"na;MedGen:C0342701,OMIM:275350,Orphanet:ORPHA8...",TCN2 POLYMORPHISM;Transcobalamin II deficiency...,"criteria provided, multiple submitters, no con...",[275350]
223,MedGen:CN230736;na;Human Phenotype Ontology:HP...,"Cardiovascular phenotype;HYPERHOMOCYSTEINEMIA,...","criteria provided, multiple submitters, no con...",[236200]


In [29]:
# authstub.to_csv(datadir + 'auth_1500_v1.csv')

In [72]:
# function to write "OMIM > ACMG" matches to a new Column
# you want this to match on ROW 246 of the Google Sheet
def acmg_matches(x, tags=None):
    """Report and return the OMIM tags in one ``PhenotypeIDS`` value that are ACMG tags.

    Prints a per-row trace (how many OMIM tags were found, and for each one
    whether it is in the ACMG list) and returns the matches so the result of
    ``Series.apply`` can be stored as a column.

    Parameters
    ----------
    x : str
        A phenotype-ID string such as ``"MedGen:C1419614,OMIM:606068"``.
        Non-string values (e.g. NaN) are treated as containing no OMIM tags.
    tags : collection of int, optional
        The ACMG OMIM numbers to match against. Defaults to the notebook's
        module-level ``acmgtags`` list.

    Returns
    -------
    list of str or str
        The OMIM numbers (as digit strings, in order of appearance) that are
        in ``tags``, or the sentinel string ``"NA"`` when there are none.
    """
    if tags is None:
        tags = acmgtags
    # Find anything that has an OMIM tag, from 4 to 6 numerals long.
    # re.findall raises TypeError on non-strings, so treat those as tag-less.
    res = re.findall(r"OMIM:(\d{4,6})", x) if isinstance(x, str) else []
    print ("On this line, there are {} OMIM Tags".format(len(res)))
    if not res:
        print ("NO OMIM")
        return "NA"
    matched = []
    for match in res:
        # The regex yields digit strings; compare numerically against the list.
        if int(match) in tags:
            print ("ACMG Match Found here: {}".format(match))
            matched.append(match)
        else:
            print ("{} (NOT ACMG)".format(match))
    return matched if matched else "NA"

In [73]:
# Trial run of acmg_matches on ten rows of the sample (positions 275 to 284)
test = authstub['PhenotypeIDS'].iloc[275:285].apply(acmg_matches)

On this line, there are 2 OMIM Tags
252500 NO ACMG
252600 NO ACMG
On this line, there are 3 OMIM Tags
232500 NO ACMG
232500 NO ACMG
263570 NO ACMG
On this line, there are 1 OMIM Tags
216550 NO ACMG
On this line, there are 1 OMIM Tags
203750 NO ACMG
On this line, there are 2 OMIM Tags
ACMG Match Found here: 143890
ACMG Match Found here: 603776
On this line, there are 1 OMIM Tags
273800 NO ACMG
On this line, there are 1 OMIM Tags
246700 NO ACMG
On this line, there are 1 OMIM Tags
257220 NO ACMG
On this line, there are 1 OMIM Tags
257220 NO ACMG
On this line, there are 1 OMIM Tags
257220 NO ACMG


In [None]:
# In the cells shown, `authoritative` never receives an OMIMTAGS or ACMGTAGS
# column (only the `authstub` sample gets OMIMTAGS), so indexing those columns
# on it raises KeyError. Build the tags here before filtering.
authoritative_tagged = authoritative.assign(
    OMIMTAGS=authoritative['PhenotypeIDS'].apply(omim_tags)
)

# Rows with at least one OMIM tag (omim_tags returns the string "NA" otherwise).
has_omim = authoritative_tagged['OMIMTAGS'].apply(
    lambda tags: tags != "NA"
).astype(bool)
onlyomim = authoritative_tagged[has_omim]

# Rows where at least one of those OMIM tags appears in the ACMG list.
has_acmg = onlyomim['OMIMTAGS'].apply(
    lambda tags: any(int(tag) in acmgtags for tag in tags)
).astype(bool)
onlyacmg = onlyomim[has_acmg]

## NOTES FROM ALICE

+ Expert Panel, Practice Guideline, multiple submitters, no conflict
+ Check out information on drug response
    + Add Drug Response (clinical significance > Drug Response)
+ Assembly
    + Only Keep 38


