In [1]:
import pandas as pd
from google.cloud import storage
from io import BytesIO
import re

In [2]:
# Pull the hosted ClinVar export out of Google Cloud Storage into memory.
GCS_BUCKET_NAME = 'datacamp-202518.appspot.com'
GCS_OBJECT_PATH = 'data/guide/variant_summary.txt'

storage_client = storage.Client()
bucket = storage_client.get_bucket(GCS_BUCKET_NAME)
blob = storage.Blob(GCS_OBJECT_PATH, bucket)

# NOTE(review): newer google-cloud-storage releases document
# download_as_bytes() as the preferred spelling of this call -- confirm
# against the installed version before switching.
content = blob.download_as_string()

# Sanity check: the payload is handed to pandas as an in-memory buffer later.
type(content)

# datadir = '/home/xavier/data/guide/'
# clinvar_file = 'variant_summary.txt'

bytes

# First thing, download and survey the 'Clinvar' data
## ➜ ClinVar is a freely accessible, public archive of reports of the relationships among human variations and phenotypes, with supporting evidence.
## We are interested in the Phenotype data.

+ A phenotype results from the expression of an organism's genetic code, its genotype, as well as the influence of environmental factors and the interactions between the two. [Wikipedia](https://en.wikipedia.org/wiki/Phenotype)
+ The data comes from **[this FTP directory](ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/)** (which is updated monthly)
+ **The file [variant_summary.txt](ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz)** (an 809K-row dataset) is hosted on Google Cloud Storage



In [3]:
# Parse the tab-separated ClinVar export straight from the downloaded bytes.
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. Chunked inference is what produces the
# "mixed types" DtypeWarning on columns that hold both numeric-looking and
# text values (the saved output of this cell looks like the tail of that
# warning -- TODO confirm on a fresh run).
clinvar = pd.read_csv(
    BytesIO(content),
    sep='\t',
    lineterminator='\n',
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# count the rows of data
# (.shape is a (rows, columns) tuple, so the column count shows up as well)
clinvar.shape

(809509, 31)

# This has 809K Rows

In [5]:
# Show every column label of the ClinVar table as a plain Python list
list(clinvar.columns)

['#AlleleID',
 'Type',
 'Name',
 'GeneID',
 'GeneSymbol',
 'HGNC_ID',
 'ClinicalSignificance',
 'ClinSigSimple',
 'LastEvaluated',
 'RS# (dbSNP)',
 'nsv/esv (dbVar)',
 'RCVaccession',
 'PhenotypeIDS',
 'PhenotypeList',
 'Origin',
 'OriginSimple',
 'Assembly',
 'ChromosomeAccession',
 'Chromosome',
 'Start',
 'Stop',
 'ReferenceAllele',
 'AlternateAllele',
 'Cytogenetic',
 'ReviewStatus',
 'NumberSubmitters',
 'Guidelines',
 'TestedInGTR',
 'OtherIDs',
 'SubmitterCategories',
 'VariationID']

# Let's find all the unique values in the PhenotypeIDS column

In [6]:
# Distinct values found in the PhenotypeIDS column (returned as an array)
uniques = clinvar['PhenotypeIDS'].unique()
uniques

array(['MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511',
       'MedGen:C0795949,OMIM:251300',
       'MedGen:C1838979,OMIM:252010;MedGen:CN517202', ..., 'OMIM:301013',
       'Gene:261,MedGen:C1859721,OMIM:208100,Orphanet:ORPHA1143',
       'MedGen:C0809936,OMIM:211380'], dtype=object)

In [7]:
# How many distinct PhenotypeIDS values are there?
len(uniques)


21161

# There are 21K unique values in the PhenotypeIDS column
## What are the main types of data?
+ (they seem to be listed as key:value pairs)

In [8]:
# Spot-check a single raw value to see how the IDs are laid out
uniques[2]

'MedGen:C1838979,OMIM:252010;MedGen:CN517202'

In [10]:
# Another sample value, for comparison with the previous one
uniques[3]

'MedGen:C1838979,OMIM:252010'

In [12]:
# A longer sample value that mixes several ID sources
uniques[4]

'MeSH:D030342,MedGen:C0950123;MedGen:C1838979,OMIM:252010;MedGen:CN169374'

In [23]:
# First attempt at a "key:value," matcher.
# Under re.VERBOSE the whitespace and the '#' notes inside the pattern are
# ignored, so the effective expression is:  ((\b[a-zA-Z]+\b):.+),
#   group 1 -> everything from the key up to (not including) the last comma
#   group 2 -> the alphabetic key in front of the first colon
numpairs = re.compile(
    r"""
((\b[a-zA-Z]+\b):.+)     # Word before colon
,                # Skip the Value after colon
""",
    flags=re.VERBOSE,
)

In [30]:
# Second attempt: one capturing group, repeated, that must end on a comma.
# Under re.VERBOSE the effective expression is:  (\b[a-zA-Z]+\b:\b.+\b,)+
# NOTE(review): '.+' is greedy and a trailing comma is required, so a pair
# that is last in the string or followed by ';' is not returned on its own.
numpairs = re.compile(
    r"""
(\b[a-zA-Z]+\b:\b.+\b,)+     # Word before colon
""",
    flags=re.VERBOSE,
)

In [31]:
# Try the compiled pattern on one sample value; findall returns the captured group(s)
numpairs.findall(uniques[2])

['MedGen:C1838979,']

In [32]:
# Same sample via search(): .group() is the full text of the first match
numpairs.search(uniques[2]).group()

'MedGen:C1838979,'

In [None]:
# One-off, uncompiled try-out: capture only the alphabetic key before the colon
leading_keys = re.findall(r'(\b[a-zA-Z]+\b):.+,', uniques[1])
print(leading_keys)

In [None]:
# Compiled form of the "key before the colon" expression, for reuse.
# Only the key is captured; the value and the trailing comma are matched
# but not returned by findall().
_KEY_BEFORE_COLON = r'(\b[a-zA-Z]+\b):.+,'
p = re.compile(_KEY_BEFORE_COLON)

In [None]:
# Apply the compiled key pattern to a sample value
p.findall(uniques[1])

In [None]:
# Count how many rows share each PhenotypeIDS value, largest groups first
phenotypes = (
    clinvar
    .groupby('PhenotypeIDS')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
phenotypes

## ...⇧
> **FINDING:** 21K unique groupings of Phenotypes

+ A lot of variety in the Phenotype IDs: they are key:value pairs (dict-like), with keys including:
    + MedGen
    + SNOMED CT
    + Human Phenotype Ontology
+ Disparate ways of listing the code
    + Human Phenotype Ontology:HP:0007018 (here the value itself contains a colon)
+ Disparate separators
    + comma
    + semicolon


In [None]:
# With a small set of rows, look at a typical 'raw' row of data, to
# understand what a typical PhenotypeIDS cell contains.

# Columns hidden from the simplified 'preview' (remember, this is a throwaway
# snapshot, just trying to get a view of the phenotype data).
preview_hidden_columns = [
    '#AlleleID',
    'RS# (dbSNP)',
    'ClinSigSimple',
    'nsv/esv (dbVar)',
    'RCVaccession',
    'Origin',
    'Start',
    'Stop',
    'Assembly',
    'ChromosomeAccession',
    'Chromosome',
    'ReferenceAllele',
    'AlternateAllele',
    'Cytogenetic',
    'NumberSubmitters',
    'Guidelines',
    'TestedInGTR',
    'OtherIDs',
    'SubmitterCategories',
    'VariationID',
]

# The needle is a literal ID, not a pattern, so match it as plain text;
# na=False treats rows with a missing PhenotypeIDS as non-matches.
has_sample_id = clinvar['PhenotypeIDS'].str.contains(
    "MedGen:C2673611", na=False, regex=False
)

# Filter and drop in one non-mutating chain. Calling drop(..., inplace=True)
# on the filtered frame mutates an object derived from `clinvar`, which
# pandas can flag with SettingWithCopyWarning; building a new frame avoids
# that and leaves `clinvar` untouched.
typical = clinvar.loc[has_sample_id].drop(columns=preview_hidden_columns)
typical

## ...⇧
> **FINDING:** It looks like a typical cell contains a list of MedGen references

+ each written as MedGen:ID (e.g. MedGen:C2673611)
+ separated by semicolons

In [None]:
# typical.to_csv(datadir + 'typical_clinvar_rows.csv')