In [1]:
import pandas as pd
import re

In [2]:
datadir = '/home/xavier/data/'
clinvar_file = 'variant_summary_02.txt' # Updated 5/21/2018

# First thing, download and survey the 'Clinvar' data

## ➜ ClinVar is a freely accessible, public archive of reports of the relationships among human variations and phenotypes, with supporting evidence.

+ A phenotype results from the expression of an organism's genetic code, its genotype, as well as the influence of environmental factors and the interactions between the two. [Wikipedia](https://en.wikipedia.org/wiki/Phenotype)
+ File is from **[THIS FTP DIRECTORY](ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/)** (which is updated monthly)
+ ** Download this file [Variant_summary.txt](ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz)** which is a 809K row dataset listing 

In [3]:
clinvar = pd.read_csv(datadir + clinvar_file, sep='\t', lineterminator='\n')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# count the rows of data
clinvar.shape

(811417, 31)

In [5]:
# Work with Human Reference Genome #38
latest = clinvar[clinvar['Assembly'] == 'GRCh38']
latest.shape

(389089, 31)

In [6]:
# Further filter the dataset down by these 'authoritative' flags
# from the 'Review Status' column

authoritative = latest[
    (latest['ReviewStatus'] == 'criteria provided, multiple submitters, no conflicts') | 
    (latest['ReviewStatus'] == 'practice guideline') | 
    (latest['ReviewStatus'] == 'reviewed by expert panel') 
]
authoritative.shape

(62166, 31)

## Time to start matching the OMIM tags

+ First order is to find OMIM tags that match the **59 ACMG Recommendations Diseases**
+ Import this file: [ACMG Recommendations for Reporting of Incidental Findings in Clinical Exome and Genome Sequencing](https://www.ncbi.nlm.nih.gov/clinvar/docs/acmg/)

In [7]:
# Import Disease names and OMIM tags and convert to a list
acmg = pd.read_csv(datadir + 'ACMG_Conditions_OMIM.csv')
acmgtags = acmg.ACMGTAGS.tolist()

In [8]:
acmg

Unnamed: 0.1,Unnamed: 0,DISEASENAME,ACMGTAGS,OMIMURL
0,0,Adenomatous polyposis coli,175100,http://omim.org/entry/175100
1,1,"Aortic aneurysm, familial thoracic 4",132900,http://omim.org/entry/132900
2,2,"Aortic aneurysm, familial thoracic 6",611788,http://omim.org/entry/611788
3,3,Arrhythmogenic right ventricular cardiomyopath...,604400,http://omim.org/entry/604400
4,4,Arrhythmogenic right ventricular cardiomyopath...,607450,http://omim.org/entry/607450
5,5,Arrhythmogenic right ventricular cardiomyopath...,609040,http://omim.org/entry/609040
6,6,Arrhythmogenic right ventricular cardiomyopath...,610193,http://omim.org/entry/610193
7,7,Arrhythmogenic right ventricular cardiomyopath...,610476,http://omim.org/entry/610476
8,8,"Breast-ovarian cancer, familial 1",604370,http://omim.org/entry/604370
9,9,"Breast-ovarian cancer, familial 2",612555,http://omim.org/entry/612555


In [9]:
# function to write OMIM TAGS to a new Column
def acmg_matches (x):
    # Find anything that has an OMIM tag, from 1 to 6 numerals long
    res = re.finditer(r"(?:OMIM:(\d{4,6}))",x)
    if res:
        for hit in res:
            if int(hit.group(1)) in acmgtags:
                return int(hit.group(1))
        else:
            return("NOACMG")

In [10]:
authoritative['ACMGTAGS'] = authoritative.PhenotypeIDS.apply(acmg_matches)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [11]:
authoritative.shape

(62166, 32)

In [12]:
onlyacmg = authoritative[authoritative['ACMGTAGS'] != "NOACMG"]

In [13]:
onlyacmg.shape

(12138, 32)

In [14]:
onlyacmg.head()

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,AlternateAllele,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories,VariationID,ACMGTAGS
761,15440,single nucleotide variant,NM_017841.2(SDHAF2):c.232G>A (p.Gly78Arg),54949,SDHAF2,HGNC:26034,Pathogenic,1,"Sep 06, 2017",113560320,...,A,11q12.2,"criteria provided, multiple submitters, no con...",4,"ACMG2013,ACMG2016",N,"OMIM Allelic Variant:613019.0001,UniProtKB (pr...",3,401,601650
1366,15773,single nucleotide variant,NM_024334.2(TMEM43):c.1073C>T (p.Ser358Leu),79188,TMEM43,HGNC:28472,Pathogenic,1,"Jun 09, 2017",63750743,...,T,3p25.1,"criteria provided, multiple submitters, no con...",8,"ACMG2013,ACMG2016",N,"OMIM Allelic Variant:612048.0001,UniProtKB (pr...",3,734,604400
1484,15837,single nucleotide variant,NM_000038.5(APC):c.904C>T (p.Arg302Ter),324,APC,HGNC:583,Pathogenic,1,"Apr 22, 2017",137854568,...,T,5q22.2,"criteria provided, multiple submitters, no con...",8,"ACMG2013,ACMG2016",N,"HGMD:CM910029,OMIM Allelic Variant:611731.0002...",3,798,175100
1490,15840,single nucleotide variant,NM_000038.5(APC):c.4012C>T (p.Gln1338Ter),324,APC,HGNC:583,Pathogenic/Likely pathogenic,1,"Mar 22, 2016",121913327,...,T,5q22.2,"criteria provided, multiple submitters, no con...",4,"ACMG2013,ACMG2016",N,OMIM Allelic Variant:611731.0009,3,801,175100
1501,15845,single nucleotide variant,NM_000038.5(APC):c.1621C>T (p.Gln541Ter),324,APC,HGNC:583,Pathogenic,1,"May 24, 2017",137854572,...,T,5q22.2,"criteria provided, multiple submitters, no con...",4,"ACMG2013,ACMG2016",N,OMIM Allelic Variant:611731.0014,3,806,175100


In [15]:
latlong = pd.read_csv(datadir + '380K-rows.csv')

In [16]:
latlong.VariationID.isin(onlyacmg.VariationID).value_counts()

False    376951
True      12138
Name: VariationID, dtype: int64

In [17]:
result = pd.merge(onlyacmg,
                latlong[["VariationID","x","y"]],
                on='VariationID',
                how='left')

In [18]:
result.head()

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories,VariationID,ACMGTAGS,x,y
0,15440,single nucleotide variant,NM_017841.2(SDHAF2):c.232G>A (p.Gly78Arg),54949,SDHAF2,HGNC:26034,Pathogenic,1,"Sep 06, 2017",113560320,...,"criteria provided, multiple submitters, no con...",4,"ACMG2013,ACMG2016",N,"OMIM Allelic Variant:613019.0001,UniProtKB (pr...",3,401,601650,7.802864,1.796336
1,15773,single nucleotide variant,NM_024334.2(TMEM43):c.1073C>T (p.Ser358Leu),79188,TMEM43,HGNC:28472,Pathogenic,1,"Jun 09, 2017",63750743,...,"criteria provided, multiple submitters, no con...",8,"ACMG2013,ACMG2016",N,"OMIM Allelic Variant:612048.0001,UniProtKB (pr...",3,734,604400,-70.133587,50.879464
2,15837,single nucleotide variant,NM_000038.5(APC):c.904C>T (p.Arg302Ter),324,APC,HGNC:583,Pathogenic,1,"Apr 22, 2017",137854568,...,"criteria provided, multiple submitters, no con...",8,"ACMG2013,ACMG2016",N,"HGMD:CM910029,OMIM Allelic Variant:611731.0002...",3,798,175100,57.979702,40.916274
3,15840,single nucleotide variant,NM_000038.5(APC):c.4012C>T (p.Gln1338Ter),324,APC,HGNC:583,Pathogenic/Likely pathogenic,1,"Mar 22, 2016",121913327,...,"criteria provided, multiple submitters, no con...",4,"ACMG2013,ACMG2016",N,OMIM Allelic Variant:611731.0009,3,801,175100,58.0013,40.916274
4,15845,single nucleotide variant,NM_000038.5(APC):c.1621C>T (p.Gln541Ter),324,APC,HGNC:583,Pathogenic,1,"May 24, 2017",137854572,...,"criteria provided, multiple submitters, no con...",4,"ACMG2013,ACMG2016",N,OMIM Allelic Variant:611731.0014,3,806,175100,57.990875,40.916274


In [19]:
disease = pd.merge(result,
                acmg[["DISEASENAME","ACMGTAGS","OMIMURL"]],
                on='ACMGTAGS',
                how='left')

In [20]:
disease

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories,VariationID,ACMGTAGS,x,y,DISEASENAME,OMIMURL
0,15440,single nucleotide variant,NM_017841.2(SDHAF2):c.232G>A (p.Gly78Arg),54949,SDHAF2,HGNC:26034,Pathogenic,1,"Sep 06, 2017",113560320,...,"ACMG2013,ACMG2016",N,"OMIM Allelic Variant:613019.0001,UniProtKB (pr...",3,401,601650,7.802864,1.796336,Paragangliomas 2,http://omim.org/entry/601650
1,15773,single nucleotide variant,NM_024334.2(TMEM43):c.1073C>T (p.Ser358Leu),79188,TMEM43,HGNC:28472,Pathogenic,1,"Jun 09, 2017",63750743,...,"ACMG2013,ACMG2016",N,"OMIM Allelic Variant:612048.0001,UniProtKB (pr...",3,734,604400,-70.133587,50.879464,Arrhythmogenic right ventricular cardiomyopath...,http://omim.org/entry/604400
2,15837,single nucleotide variant,NM_000038.5(APC):c.904C>T (p.Arg302Ter),324,APC,HGNC:583,Pathogenic,1,"Apr 22, 2017",137854568,...,"ACMG2013,ACMG2016",N,"HGMD:CM910029,OMIM Allelic Variant:611731.0002...",3,798,175100,57.979702,40.916274,Adenomatous polyposis coli,http://omim.org/entry/175100
3,15840,single nucleotide variant,NM_000038.5(APC):c.4012C>T (p.Gln1338Ter),324,APC,HGNC:583,Pathogenic/Likely pathogenic,1,"Mar 22, 2016",121913327,...,"ACMG2013,ACMG2016",N,OMIM Allelic Variant:611731.0009,3,801,175100,58.001300,40.916274,Adenomatous polyposis coli,http://omim.org/entry/175100
4,15845,single nucleotide variant,NM_000038.5(APC):c.1621C>T (p.Gln541Ter),324,APC,HGNC:583,Pathogenic,1,"May 24, 2017",137854572,...,"ACMG2013,ACMG2016",N,OMIM Allelic Variant:611731.0014,3,806,175100,57.990875,40.916274,Adenomatous polyposis coli,http://omim.org/entry/175100
5,15846,single nucleotide variant,NM_000038.5(APC):c.1660C>T (p.Arg554Ter),324,APC,HGNC:583,Pathogenic,1,"Aug 20, 2017",137854573,...,"ACMG2013,ACMG2016",N,OMIM Allelic Variant:611731.0015,3,807,175100,57.991672,40.916274,Adenomatous polyposis coli,http://omim.org/entry/175100
6,15847,single nucleotide variant,NM_000038.5(APC):c.1690C>T (p.Arg564Ter),324,APC,HGNC:583,Pathogenic,1,"Jul 07, 2017",137854574,...,"ACMG2013,ACMG2016",N,OMIM Allelic Variant:611731.0016,3,808,175100,57.991699,40.916274,Adenomatous polyposis coli,http://omim.org/entry/175100
7,15849,single nucleotide variant,NM_000038.5(APC):c.2805C>A (p.Tyr935Ter),324,APC,HGNC:583,Pathogenic,1,"Aug 01, 2017",137854575,...,"ACMG2013,ACMG2016",N,OMIM Allelic Variant:611731.0018,3,810,175100,58.000215,40.916274,Adenomatous polyposis coli,http://omim.org/entry/175100
8,15851,deletion,NM_000038.5(APC):c.4391_4394delAGAG (p.Glu1464...,324,APC,HGNC:583,Pathogenic,1,"Jul 20, 2017",387906235,...,"ACMG2013,ACMG2016",N,OMIM Allelic Variant:611731.0020,3,812,175100,58.001640,40.916274,Adenomatous polyposis coli,http://omim.org/entry/175100
9,15853,single nucleotide variant,NM_000038.5(APC):c.643C>T (p.Gln215Ter),324,APC,HGNC:583,Pathogenic,1,"Nov 15, 2016",137854577,...,"ACMG2013,ACMG2016",N,OMIM Allelic Variant:611731.0022,3,814,175100,57.948564,40.916274,Adenomatous polyposis coli,http://omim.org/entry/175100


In [78]:
ncbiurl = 'https://www.ncbi.nlm.nih.gov/clinvar/variation/'

In [79]:
def new_url (x):
    return (ncbiurl + str(x))
disease["VARIATIONURL"] = disease["VariationID"].apply(new_url)

In [81]:
disease.to_csv(datadir + 'acmg_latlong_urls.csv')

In [82]:
disease.shape

(12138, 37)