In [1]:
import pandas as pd
from google.cloud import storage
from io import BytesIO
import re

In [2]:
# Instantiates a client
storage_client = storage.Client()
bucket = storage_client.get_bucket('datacamp-202518.appspot.com')
blob = storage.Blob('data/guide/variant_summary.txt', bucket)
content = blob.download_as_string()
type(content)

# datadir = '/home/xavier/data/guide/'
# clinvar_file = 'variant_summary.txt'

bytes

# First thing, download and survey the 'Clinvar' data
## ➜ ClinVar is a freely accessible, public archive of reports of the relationships among human variations and phenotypes, with supporting evidence.
## We are interested in the Phenotype data.

+ A phenotype results from the expression of an organism's genetic code, its genotype, as well as the influence of environmental factors and the interactions between the two. [Wikipedia](https://en.wikipedia.org/wiki/Phenotype)
+ from **[THIS FTP DIRECTORY](ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/)** (which is updated monthly)
+ ** Hosted the file [Variant_summary.txt](ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz)** (which is a 809K row dataset) on Google Cloud Storage



In [3]:
clinvar = pd.read_csv(BytesIO(content), sep='\t', lineterminator='\n')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# count the rows of data
clinvar.shape

(809509, 31)

# This has 809K Rows

In [6]:
# Look at the column names in this data
list(clinvar)

['#AlleleID',
 'Type',
 'Name',
 'GeneID',
 'GeneSymbol',
 'HGNC_ID',
 'ClinicalSignificance',
 'ClinSigSimple',
 'LastEvaluated',
 'RS# (dbSNP)',
 'nsv/esv (dbVar)',
 'RCVaccession',
 'PhenotypeIDS',
 'PhenotypeList',
 'Origin',
 'OriginSimple',
 'Assembly',
 'ChromosomeAccession',
 'Chromosome',
 'Start',
 'Stop',
 'ReferenceAllele',
 'AlternateAllele',
 'Cytogenetic',
 'ReviewStatus',
 'NumberSubmitters',
 'Guidelines',
 'TestedInGTR',
 'OtherIDs',
 'SubmitterCategories',
 'VariationID']

# Let's find all the unique values in the PhenotypeIDS column
### Starting with just the Phenotype data

In [7]:
# Export the data for messing around with REGEX
# datadir = '/home/xavier/data/guide/'
# pheno = ['PhenotypeIDS','PhenotypeList']
# clinvar.to_csv(datadir + 'pheno-data.csv', columns = pheno)

In [8]:
datadir = '/home/xavier/data/guide/'
phenos = pd.read_csv(datadir + 'pheno-data.csv')

In [9]:
phenos.shape

(809509, 3)

In [10]:
phenos

Unnamed: 0.1,Unnamed: 0,PhenotypeIDS,PhenotypeList
0,0,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511","Spastic paraplegia 48, autosomal recessive"
1,1,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511","Spastic paraplegia 48, autosomal recessive"
2,2,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511","Spastic paraplegia 48, autosomal recessive"
3,3,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511","Spastic paraplegia 48, autosomal recessive"
4,4,"MedGen:C0795949,OMIM:251300",Galloway-Mowat syndrome
5,5,"MedGen:C0795949,OMIM:251300",Galloway-Mowat syndrome
6,6,"MedGen:C1838979,OMIM:252010;MedGen:CN517202",Mitochondrial complex I deficiency;not provided
7,7,"MedGen:C1838979,OMIM:252010;MedGen:CN517202",Mitochondrial complex I deficiency;not provided
8,8,"MedGen:C1838979,OMIM:252010",Mitochondrial complex I deficiency
9,9,"MedGen:C1838979,OMIM:252010",Mitochondrial complex I deficiency


In [11]:
# Look at observations in Row #2
phenos.iloc[2]

Unnamed: 0                                                      2
PhenotypeIDS     MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511
PhenotypeList          Spastic paraplegia 48, autosomal recessive
Name: 2, dtype: object

In [12]:
# This returns both columns
# Let's just look at typical data in the IDs columns:
phenos.iloc[18].PhenotypeIDS

'na;Human Phenotype Ontology:HP:0000992,MedGen:C0349506;MedGen:C3469186,OMIM:235200;MedGen:C3150862;MedGen:C0027672,SNOMED CT:699346009;MedGen:C0392514,SNOMED CT:35400008;MedGen:C2673520,OMIM:612635;na;na;Human Phenotype Ontology:HP:0010473,MedGen:C0151861;MedGen:C3280096,OMIM:614193;MedGen:CN517202;MedGen:CN169374'

In [13]:
phenos.iloc[4].PhenotypeIDS

'MedGen:C0795949,OMIM:251300'

### OK, so now let's count the unique number different registry types
+ e.g. MedGen
+ e.g. OMIM
+ etc.

In [33]:
# first, just look at the registry types:
phenos.PhenotypeIDS.str.findall(r'(\w+):\w+')

0                                  [MedGen, OMIM, Orphanet]
1                                  [MedGen, OMIM, Orphanet]
2                                  [MedGen, OMIM, Orphanet]
3                                  [MedGen, OMIM, Orphanet]
4                                            [MedGen, OMIM]
5                                            [MedGen, OMIM]
6                                    [MedGen, OMIM, MedGen]
7                                    [MedGen, OMIM, MedGen]
8                                            [MedGen, OMIM]
9                                            [MedGen, OMIM]
10                     [MeSH, MedGen, MedGen, OMIM, MedGen]
11                     [MeSH, MedGen, MedGen, OMIM, MedGen]
12                                           [MedGen, OMIM]
13                                           [MedGen, OMIM]
14                                           [MedGen, OMIM]
15                                                       []
16                                      

### Looks like the findall is returning a TUPLE

In [34]:
# create an array to hold all the vales from 809K rows
alltypes = []

# create a function to match just the registry identifier
# (the bit _before_ the colon)

def extract_types (x):
    res = re.findall(r'(\w+):\w+', x)
    if res:
        for i in res:
            alltypes.append(i)

In [36]:
phenos.PhenotypeIDS.apply(extract_types)
len(alltypes)

4591448

In [37]:
# see how many unique types of registries
set(alltypes)

{'CT', 'EFO', 'Gene', 'MeSH', 'MedGen', 'OMIM', 'Ontology', 'Orphanet'}

## ...⇧
> **FINDING:** there are **8 basic 'types'** of phenotype registries. 

+ Types like CT, EFO, and Ontology need to include the whole name 


# Thinking about the new dataframe design
## where to add columns...and where to add rows?

+ Should consider a multindex (but not sure if that is easily read by ESRI
+ Should also consider the design below:
    + with 8 basic column types
    + each phenotype is listed in the approapriate 'registry column'
    + (usually in either one or two registries)
    + and the row information is repeated.

![Splitting Phenotypes into rows](https://i.imgur.com/5XqsxYD.png)

# There are 21K unique groups in the PhenotypeID column
## What are the main types of data?
+ (they seem to be listed in key pairs)

In [None]:
# # Group By Phenotype and roll up the data in this column 
# phenotypes = clinvar.groupby('PhenotypeIDS').size().reset_index(name='count')
# phenotypes.sort_values(by='count', ascending=False,inplace=True)
# phenotypes

## ...⇧
> **FINDING:** 21K unique groupings of Phenotypes

+ A lot of variance in the Phenotype IDs, pairs (dict) including:
    + MedGen
    + SNOMED CT
    + Human Phenotype Ontology
+ Disparate ways of listing the code
    + Human Phenotype Ontology:HP:0007018
+ Disparate separators
    + comma
    + semicolon
