In [1]:
import os
import re

import pandas as pd

In [2]:
# Setup (local) directories
#
# The data directory defaults to the original local path but can be
# overridden without editing the notebook by setting CLINVAR_DATADIR.
# os.path.join(..., '') guarantees a trailing separator, because later
# cells build file paths by plain string concatenation (datadir + name).

datadir = os.path.join(os.environ.get('CLINVAR_DATADIR', '/home/xavier/data/guide/'), '')
clinvar_file = 'variant_summary.txt'

# First thing, download and survey the 'Clinvar' data
## ➜ ClinVar is a freely accessible, public archive of reports of the relationships among human variations and phenotypes, with supporting evidence.
## We are interested in the Phenotype data.

+ A phenotype results from the expression of an organism's genetic code, its genotype, as well as the influence of environmental factors and the interactions between the two. [Wikipedia](https://en.wikipedia.org/wiki/Phenotype)
+ The data comes from **[this FTP directory](ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/)** (which is updated monthly).
+ **Hosted the file [variant_summary.txt](ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz)** (an 809K-row dataset) on Google Cloud Storage.



In [3]:
# low_memory=False makes pandas infer each column's dtype from the whole
# file instead of chunk by chunk, which avoids the mixed-type DtypeWarning
# pandas can emit for a table of this size.
# NOTE(review): the original run printed a warning on load, presumably this one.
clinvar = pd.read_csv(datadir + clinvar_file, sep='\t', lineterminator='\n', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Dimensions of the loaded table as a (rows, columns) tuple
clinvar.shape

(809509, 31)

# This has 809K Rows

In [5]:
# Column labels of the raw table, as a plain Python list
clinvar.columns.tolist()

['#AlleleID',
 'Type',
 'Name',
 'GeneID',
 'GeneSymbol',
 'HGNC_ID',
 'ClinicalSignificance',
 'ClinSigSimple',
 'LastEvaluated',
 'RS# (dbSNP)',
 'nsv/esv (dbVar)',
 'RCVaccession',
 'PhenotypeIDS',
 'PhenotypeList',
 'Origin',
 'OriginSimple',
 'Assembly',
 'ChromosomeAccession',
 'Chromosome',
 'Start',
 'Stop',
 'ReferenceAllele',
 'AlternateAllele',
 'Cytogenetic',
 'ReviewStatus',
 'NumberSubmitters',
 'Guidelines',
 'TestedInGTR',
 'OtherIDs',
 'SubmitterCategories',
 'VariationID']

# Let's find all the unique values in the PhenotypeIDS column
### Count the number of distinct 'Registry Types'
+ e.g. MedGen
+ e.g. OMIM
+ etc.

In [6]:
# Accumulator for every registry prefix found across the 809K rows.
# extract_types() appends to this list as a side effect.
alltypes = []


def extract_types(x):
    """Collect the registry prefixes from one PhenotypeIDS cell into ``alltypes``.

    A prefix is the run of word characters immediately before a colon, e.g.
    ``"MedGen:C3150901,OMIM:613647"`` contributes ``"MedGen"`` and ``"OMIM"``.

    Parameters
    ----------
    x : str
        One cell value. Non-string values (such as a missing value) are
        skipped instead of raising ``TypeError`` inside ``re.findall``.

    Returns
    -------
    None
        The matches are appended to the module-level ``alltypes`` list.
    """
    if not isinstance(x, str):
        return
    # The capture group keeps only the part before the colon.
    alltypes.extend(re.findall(r'(\w+):\w+', x))

In [7]:
# Start from an empty accumulator so that re-running this cell does not
# append every prefix a second time and inflate the count.
del alltypes[:]
clinvar.PhenotypeIDS.apply(extract_types)
len(alltypes)

2295724

In [8]:
# Collapse the collected prefixes to the distinct registry names
set(alltypes)

{'CT', 'EFO', 'Gene', 'MeSH', 'MedGen', 'OMIM', 'Ontology', 'Orphanet'}

## ...⇧
> **FINDING:** there are **8 basic 'types'** of phenotype registries. 

+ NOTE TO SELF: It may make sense to handle registry names that contain spaces, since the regex only captures the last word before the colon:
    + "CT" = "SNOMED CT"
    + EFO
    + "Ontology" = "Human Phenotype Ontology"


# So what should we do with as many as 8 types of values, repeated within a single cell?
## where to add columns...and where to add rows?

+ Should consider a MultiIndex (but not sure if that is easily read by ESRI)
+ Should also consider the design below:
    + with 8 basic column types
    + each phenotype is listed in the appropriate 'registry column'
    + (usually in either one or two registries)
    + and the row information is repeated.

![Splitting Phenotypes into rows](https://i.imgur.com/5XqsxYD.png)

# Now let's write a column with the ID and Description values organized as a Python List

In [9]:
# This is the working function
# to write all the groups to a LIST
# object in a new column

def write_to_cols(x):
    """Split one semicolon-delimited cell into a list of its groups.

    Parameters
    ----------
    x : str
        A cell value such as ``"MedGen:C1,OMIM:2;MedGen:C3"``.

    Returns
    -------
    list of str or object
        The non-empty groups between semicolons when ``x`` contains at least
        one. A value with no semicolon, with nothing but semicolons, or that
        is not a string (e.g. a missing value) is returned unaltered.
    """
    # Non-strings used to raise TypeError inside re.findall; pass them through.
    if not isinstance(x, str) or ";" not in x:
        return x
    # str.split replaces the former pair of regex passes, which repeated the
    # final group (with its semicolon attached) when the value ended in ";".
    # Empty groups from leading, trailing or doubled semicolons are dropped.
    groups = [group for group in x.split(";") if group]
    return groups if groups else x

In [10]:
# Split each PhenotypeIDS cell into its groups of IDs and keep the result
# on the frame as ID_LIST, then spread the groups of every row across
# separate columns (one column per group position).
clinvar["ID_LIST"] = clinvar["PhenotypeIDS"].map(write_to_cols)
uniqueIDS = clinvar["ID_LIST"].apply(pd.Series)

In [11]:
# Split each PhenotypeList cell into its groups of descriptions and keep the
# result on the frame as DESC_LIST, then spread the groups of every row
# across separate columns (one column per group position).
clinvar["DESC_LIST"] = clinvar["PhenotypeList"].map(write_to_cols)
uniqueDESC = clinvar["DESC_LIST"].apply(pd.Series)

In [12]:
# Preview the first ten rows instead of rendering the whole 809K-row frame
uniqueIDS.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511",,,,,,,,,,...,,,,,,,,,,
1,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511",,,,,,,,,,...,,,,,,,,,,
2,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511",,,,,,,,,,...,,,,,,,,,,
3,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511",,,,,,,,,,...,,,,,,,,,,
4,"MedGen:C0795949,OMIM:251300",,,,,,,,,,...,,,,,,,,,,
5,"MedGen:C0795949,OMIM:251300",,,,,,,,,,...,,,,,,,,,,
6,"MedGen:C1838979,OMIM:252010",MedGen:CN517202,,,,,,,,,...,,,,,,,,,,
7,"MedGen:C1838979,OMIM:252010",MedGen:CN517202,,,,,,,,,...,,,,,,,,,,
8,"MedGen:C1838979,OMIM:252010",,,,,,,,,,...,,,,,,,,,,
9,"MedGen:C1838979,OMIM:252010",,,,,,,,,,...,,,,,,,,,,


In [13]:
# (rows, columns): the column count is the largest number of ID groups in any one row
uniqueIDS.shape

(809509, 42)

In [14]:
# Column labels of the expanded ID frame (default integer positions at this point)
uniqueIDS.columns.tolist()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41]

### A Max of 42 groups of IDS 

In [15]:
# Preview the first ten rows instead of rendering the whole 809K-row frame
uniqueDESC.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,"Spastic paraplegia 48, autosomal recessive",,,,,,,,,,...,,,,,,,,,,
1,"Spastic paraplegia 48, autosomal recessive",,,,,,,,,,...,,,,,,,,,,
2,"Spastic paraplegia 48, autosomal recessive",,,,,,,,,,...,,,,,,,,,,
3,"Spastic paraplegia 48, autosomal recessive",,,,,,,,,,...,,,,,,,,,,
4,Galloway-Mowat syndrome,,,,,,,,,,...,,,,,,,,,,
5,Galloway-Mowat syndrome,,,,,,,,,,...,,,,,,,,,,
6,Mitochondrial complex I deficiency,not provided,,,,,,,,,...,,,,,,,,,,
7,Mitochondrial complex I deficiency,not provided,,,,,,,,,...,,,,,,,,,,
8,Mitochondrial complex I deficiency,,,,,,,,,,...,,,,,,,,,,
9,Mitochondrial complex I deficiency,,,,,,,,,,...,,,,,,,,,,


In [16]:
# (rows, columns): the column count is the largest number of description groups in any one row
uniqueDESC.shape

(809509, 42)

In [17]:
# Column labels of the expanded description frame (default integer positions at this point)
uniqueDESC.columns.tolist()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41]

### A Max of 42 groups of Descriptions

So it looks like the two frames match: both the IDs and the descriptions expand to 42 columns.
Now we need to name those columns more descriptively.

In [18]:
# Number of group columns produced when the ID and description lists were
# expanded (both frames came out 42 columns wide).
N_PHENOTYPE_GROUPS = 42


def _phenotype_column_names(kind, count=N_PHENOTYPE_GROUPS):
    """Return ``count`` labels "Phenotype Correlation <kind> 01", "... 02", and so on."""
    # Two-digit zero padding reproduces the labels that were previously typed by hand.
    return ["Phenotype Correlation {} {:02d}".format(kind, n) for n in range(1, count + 1)]


newIDcols = _phenotype_column_names("ID")
newDESCcols = _phenotype_column_names("DESC")


In [19]:
# Swap the default integer column labels for the descriptive ID names
uniqueIDS.columns = newIDcols

In [20]:
# Swap the default integer column labels for the descriptive DESC names
uniqueDESC.columns = newDESCcols

In [21]:
# Column labels of clinvar, which by now also carries ID_LIST and DESC_LIST
clinvar.columns.tolist()

['#AlleleID',
 'Type',
 'Name',
 'GeneID',
 'GeneSymbol',
 'HGNC_ID',
 'ClinicalSignificance',
 'ClinSigSimple',
 'LastEvaluated',
 'RS# (dbSNP)',
 'nsv/esv (dbVar)',
 'RCVaccession',
 'PhenotypeIDS',
 'PhenotypeList',
 'Origin',
 'OriginSimple',
 'Assembly',
 'ChromosomeAccession',
 'Chromosome',
 'Start',
 'Stop',
 'ReferenceAllele',
 'AlternateAllele',
 'Cytogenetic',
 'ReviewStatus',
 'NumberSubmitters',
 'Guidelines',
 'TestedInGTR',
 'OtherIDs',
 'SubmitterCategories',
 'VariationID',
 'ID_LIST',
 'DESC_LIST']

In [22]:
# Confirm the ID frame now has the descriptive column labels
uniqueIDS.columns.tolist()

['Phenotype Correlation ID 01',
 'Phenotype Correlation ID 02',
 'Phenotype Correlation ID 03',
 'Phenotype Correlation ID 04',
 'Phenotype Correlation ID 05',
 'Phenotype Correlation ID 06',
 'Phenotype Correlation ID 07',
 'Phenotype Correlation ID 08',
 'Phenotype Correlation ID 09',
 'Phenotype Correlation ID 10',
 'Phenotype Correlation ID 11',
 'Phenotype Correlation ID 12',
 'Phenotype Correlation ID 13',
 'Phenotype Correlation ID 14',
 'Phenotype Correlation ID 15',
 'Phenotype Correlation ID 16',
 'Phenotype Correlation ID 17',
 'Phenotype Correlation ID 18',
 'Phenotype Correlation ID 19',
 'Phenotype Correlation ID 20',
 'Phenotype Correlation ID 21',
 'Phenotype Correlation ID 22',
 'Phenotype Correlation ID 23',
 'Phenotype Correlation ID 24',
 'Phenotype Correlation ID 25',
 'Phenotype Correlation ID 26',
 'Phenotype Correlation ID 27',
 'Phenotype Correlation ID 28',
 'Phenotype Correlation ID 29',
 'Phenotype Correlation ID 30',
 'Phenotype Correlation ID 31',
 'Phenot

In [23]:
# Confirm the description frame now has the descriptive column labels
uniqueDESC.columns.tolist()

['Phenotype Correlation DESC 01',
 'Phenotype Correlation DESC 02',
 'Phenotype Correlation DESC 03',
 'Phenotype Correlation DESC 04',
 'Phenotype Correlation DESC 05',
 'Phenotype Correlation DESC 06',
 'Phenotype Correlation DESC 07',
 'Phenotype Correlation DESC 08',
 'Phenotype Correlation DESC 09',
 'Phenotype Correlation DESC 10',
 'Phenotype Correlation DESC 11',
 'Phenotype Correlation DESC 12',
 'Phenotype Correlation DESC 13',
 'Phenotype Correlation DESC 14',
 'Phenotype Correlation DESC 15',
 'Phenotype Correlation DESC 16',
 'Phenotype Correlation DESC 17',
 'Phenotype Correlation DESC 18',
 'Phenotype Correlation DESC 19',
 'Phenotype Correlation DESC 20',
 'Phenotype Correlation DESC 21',
 'Phenotype Correlation DESC 22',
 'Phenotype Correlation DESC 23',
 'Phenotype Correlation DESC 24',
 'Phenotype Correlation DESC 25',
 'Phenotype Correlation DESC 26',
 'Phenotype Correlation DESC 27',
 'Phenotype Correlation DESC 28',
 'Phenotype Correlation DESC 29',
 'Phenotype Co

In [27]:
# Append the ID group columns to the right of the original table (axis=1 joins column-wise)
withIDS = pd.concat([clinvar,uniqueIDS],axis=1)

In [28]:
# Then append the description group columns to obtain the flattened table
clinvar_new = pd.concat([withIDS,uniqueDESC],axis=1)

In [29]:
# Column labels of the flattened table: original columns, the two list columns, then the group columns
clinvar_new.columns.tolist()

['#AlleleID',
 'Type',
 'Name',
 'GeneID',
 'GeneSymbol',
 'HGNC_ID',
 'ClinicalSignificance',
 'ClinSigSimple',
 'LastEvaluated',
 'RS# (dbSNP)',
 'nsv/esv (dbVar)',
 'RCVaccession',
 'PhenotypeIDS',
 'PhenotypeList',
 'Origin',
 'OriginSimple',
 'Assembly',
 'ChromosomeAccession',
 'Chromosome',
 'Start',
 'Stop',
 'ReferenceAllele',
 'AlternateAllele',
 'Cytogenetic',
 'ReviewStatus',
 'NumberSubmitters',
 'Guidelines',
 'TestedInGTR',
 'OtherIDs',
 'SubmitterCategories',
 'VariationID',
 'ID_LIST',
 'DESC_LIST',
 'Phenotype Correlation ID 01',
 'Phenotype Correlation ID 02',
 'Phenotype Correlation ID 03',
 'Phenotype Correlation ID 04',
 'Phenotype Correlation ID 05',
 'Phenotype Correlation ID 06',
 'Phenotype Correlation ID 07',
 'Phenotype Correlation ID 08',
 'Phenotype Correlation ID 09',
 'Phenotype Correlation ID 10',
 'Phenotype Correlation ID 11',
 'Phenotype Correlation ID 12',
 'Phenotype Correlation ID 13',
 'Phenotype Correlation ID 14',
 'Phenotype Correlation ID 15'

In [30]:
# Write the full flattened table into the same data directory as the input.
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default — confirm downstream readers want it, otherwise pass index=False.
clinvar_new.to_csv(datadir + 'clinvar_flattened.csv')

In [31]:
# Also write just the first 50 rows as a small sample ("stub") file
clinvar_new.head(50).to_csv(datadir + 'clinvar_flattened_stub.csv')