In [1]:
import os
import re

import pandas as pd

In [2]:
# Setup (local) directories
#
# The data directory defaults to the original local path but can be overridden
# per machine through the CLINVAR_DATA_DIR environment variable, so the
# notebook can run elsewhere without editing code.
# os.path.join(..., '') guarantees a trailing path separator, which matters
# because later cells build file paths by plain string concatenation
# (datadir + filename).
datadir = os.path.join(os.environ.get('CLINVAR_DATA_DIR', '/home/xavier/data/'), '')
clinvar_file = 'variant_summary.txt'

In [3]:
# Load the tab-separated ClinVar summary file into a DataFrame.
# NOTE(review): the leftover output under this cell looks like the tail of
# pandas' mixed-dtype warning -- confirm. low_memory=False makes pandas infer
# each column's dtype from the whole file instead of chunk by chunk, giving one
# consistent dtype per column at the cost of more memory during parsing.
clinvar = pd.read_csv(datadir + clinvar_file, sep='\t', lineterminator='\n', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Show the table's dimensions as (rows, columns)
clinvar.shape

(809509, 31)

In [5]:
# List every column label of the loaded table
clinvar.columns.tolist()

['#AlleleID',
 'Type',
 'Name',
 'GeneID',
 'GeneSymbol',
 'HGNC_ID',
 'ClinicalSignificance',
 'ClinSigSimple',
 'LastEvaluated',
 'RS# (dbSNP)',
 'nsv/esv (dbVar)',
 'RCVaccession',
 'PhenotypeIDS',
 'PhenotypeList',
 'Origin',
 'OriginSimple',
 'Assembly',
 'ChromosomeAccession',
 'Chromosome',
 'Start',
 'Stop',
 'ReferenceAllele',
 'AlternateAllele',
 'Cytogenetic',
 'ReviewStatus',
 'NumberSubmitters',
 'Guidelines',
 'TestedInGTR',
 'OtherIDs',
 'SubmitterCategories',
 'VariationID']

# Now let's write columns holding the ID and Description values organized as Python lists

In [6]:
# This is the working function
# to write all the groups to a LIST 
# object in a new column

def write_to_cols(x):
    """Split a semicolon-delimited cell into a list of its groups.

    Parameters
    ----------
    x : str or any other object
        Raw cell value, e.g. "MedGen:1,OMIM:2;MedGen:3". Non-string values
        (such as the float NaN pandas uses for missing cells) are accepted.

    Returns
    -------
    list of str
        The non-empty groups, in their original order, when ``x`` is a string
        containing at least one semicolon and at least one non-empty group.
    object
        ``x`` itself, unchanged, when it is not a string, contains no
        semicolon, or consists of semicolons only.
    """
    # Missing cells are not strings; passing them to a string operation would
    # raise TypeError, so hand them back untouched.
    if not isinstance(x, str):
        return x
    # No delimiter: keep the original contract of returning the bare value.
    if ";" not in x:
        return x
    # Dropping empty pieces keeps leading, trailing or doubled semicolons from
    # producing blank or malformed groups.
    groups = [piece for piece in x.split(";") if piece]
    return groups if groups else x

In [9]:
# Split each row's PhenotypeIDS text into its semicolon-separated groups and
# store the result in a new ID_LIST column: a Python list when the text
# contains semicolons, otherwise the value as returned by write_to_cols.
# (Spreading the groups over separate columns happens in a later cell.)
clinvar["ID_LIST"] = clinvar["PhenotypeIDS"].apply(write_to_cols)

In [10]:
# Split each row's PhenotypeList text into its semicolon-separated groups and
# store the result in a new DESC_LIST column: a Python list when the text
# contains semicolons, otherwise the value as returned by write_to_cols.
# (Spreading the groups over separate columns happens in a later cell.)
clinvar["DESC_LIST"] = clinvar["PhenotypeList"].apply(write_to_cols)

In [11]:
# Save the table, including the ID_LIST and DESC_LIST columns added above,
# as a CSV file in the data directory
clinvar.to_csv(datadir + 'clinvar_multiple-diseases-listed.csv')

In [13]:
# Preview the first 20 rows to check the new list columns
clinvar.head(20)

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories,VariationID,ID_LIST,DESC_LIST
0,15041,indel,NM_014855.2(AP5Z1):c.80_83delGGATinsTGCTGTAAAC...,9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704705,...,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0001,1,2,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511","Spastic paraplegia 48, autosomal recessive"
1,15041,indel,NM_014855.2(AP5Z1):c.80_83delGGATinsTGCTGTAAAC...,9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704705,...,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0001,1,2,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511","Spastic paraplegia 48, autosomal recessive"
2,15042,deletion,NM_014855.2(AP5Z1):c.1413_1426delGGACCTGCCCTGC...,9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704709,...,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0002,1,3,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511","Spastic paraplegia 48, autosomal recessive"
3,15042,deletion,NM_014855.2(AP5Z1):c.1413_1426delGGACCTGCCCTGC...,9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704709,...,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0002,1,3,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511","Spastic paraplegia 48, autosomal recessive"
4,15043,single nucleotide variant,NM_014630.2(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,"Jun 29, 2015",150829393,...,15q25,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613624.0001,UniProtKB (pr...",1,4,"MedGen:C0795949,OMIM:251300",Galloway-Mowat syndrome
5,15043,single nucleotide variant,NM_014630.2(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,"Jun 29, 2015",150829393,...,15q25.3,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613624.0001,UniProtKB (pr...",1,4,"MedGen:C0795949,OMIM:251300",Galloway-Mowat syndrome
6,15044,single nucleotide variant,NM_017547.3(FOXRED1):c.694C>T (p.Gln232Ter),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Dec 07, 2017",267606829,...,11q24,"criteria provided, single submitter",2,,N,OMIM Allelic Variant:613622.0001,3,5,"[MedGen:C1838979,OMIM:252010, MedGen:CN517202]","[Mitochondrial complex I deficiency, not provi..."
7,15044,single nucleotide variant,NM_017547.3(FOXRED1):c.694C>T (p.Gln232Ter),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Dec 07, 2017",267606829,...,11q24.2,"criteria provided, single submitter",2,,N,OMIM Allelic Variant:613622.0001,3,5,"[MedGen:C1838979,OMIM:252010, MedGen:CN517202]","[Mitochondrial complex I deficiency, not provi..."
8,15045,single nucleotide variant,NM_017547.3(FOXRED1):c.1289A>G (p.Asn430Ser),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Oct 01, 2010",267606830,...,11q24,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613622.0002,UniProtKB (pr...",1,6,"MedGen:C1838979,OMIM:252010",Mitochondrial complex I deficiency
9,15045,single nucleotide variant,NM_017547.3(FOXRED1):c.1289A>G (p.Asn430Ser),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Oct 01, 2010",267606830,...,11q24.2,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613622.0002,UniProtKB (pr...",1,6,"MedGen:C1838979,OMIM:252010",Mitochondrial complex I deficiency


In [None]:
def _as_row(cell):
    """Return list cells unchanged and wrap any other value in a one-item list."""
    return cell if isinstance(cell, list) else [cell]


# Spread each row's groups over positional columns (0, 1, 2, ...), one group
# per column. Building each frame once from a list of row-lists avoids
# constructing a separate pandas Series for every one of the rows, which is
# what .apply(pd.Series) does and what makes it very slow on a large table.
# Rows with fewer groups than the widest row are padded with missing values
# (None here, where .apply(pd.Series) produced NaN).
uniqueIDS = pd.DataFrame(clinvar["ID_LIST"].map(_as_row).tolist(), index=clinvar.index)
uniqueDESC = pd.DataFrame(clinvar["DESC_LIST"].map(_as_row).tolist(), index=clinvar.index)

In [12]:
# Preview only the first rows instead of asking for the whole frame
uniqueIDS.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511",,,,,,,,,,...,,,,,,,,,,
1,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511",,,,,,,,,,...,,,,,,,,,,
2,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511",,,,,,,,,,...,,,,,,,,,,
3,"MedGen:C3150901,OMIM:613647,Orphanet:ORPHA306511",,,,,,,,,,...,,,,,,,,,,
4,"MedGen:C0795949,OMIM:251300",,,,,,,,,,...,,,,,,,,,,
5,"MedGen:C0795949,OMIM:251300",,,,,,,,,,...,,,,,,,,,,
6,"MedGen:C1838979,OMIM:252010",MedGen:CN517202,,,,,,,,,...,,,,,,,,,,
7,"MedGen:C1838979,OMIM:252010",MedGen:CN517202,,,,,,,,,...,,,,,,,,,,
8,"MedGen:C1838979,OMIM:252010",,,,,,,,,,...,,,,,,,,,,
9,"MedGen:C1838979,OMIM:252010",,,,,,,,,,...,,,,,,,,,,


In [13]:
# (rows, columns) of the expanded ID table; the column count is the largest
# number of ID groups found in any single row
uniqueIDS.shape

(809509, 42)

In [14]:
# Column labels of the expanded ID table
uniqueIDS.columns.tolist()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41]

### A maximum of 42 groups of IDs per row

In [15]:
# Preview only the first rows instead of asking for the whole frame
uniqueDESC.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,"Spastic paraplegia 48, autosomal recessive",,,,,,,,,,...,,,,,,,,,,
1,"Spastic paraplegia 48, autosomal recessive",,,,,,,,,,...,,,,,,,,,,
2,"Spastic paraplegia 48, autosomal recessive",,,,,,,,,,...,,,,,,,,,,
3,"Spastic paraplegia 48, autosomal recessive",,,,,,,,,,...,,,,,,,,,,
4,Galloway-Mowat syndrome,,,,,,,,,,...,,,,,,,,,,
5,Galloway-Mowat syndrome,,,,,,,,,,...,,,,,,,,,,
6,Mitochondrial complex I deficiency,not provided,,,,,,,,,...,,,,,,,,,,
7,Mitochondrial complex I deficiency,not provided,,,,,,,,,...,,,,,,,,,,
8,Mitochondrial complex I deficiency,,,,,,,,,,...,,,,,,,,,,
9,Mitochondrial complex I deficiency,,,,,,,,,,...,,,,,,,,,,


In [16]:
# (rows, columns) of the expanded description table; the column count is the
# largest number of description groups found in any single row
uniqueDESC.shape

(809509, 42)

In [17]:
# Column labels of the expanded description table
uniqueDESC.columns.tolist()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41]

### A maximum of 42 groups of descriptions per row

So it looks like the ID groups and the description groups match up (42 columns each).
Now we need to name those columns more descriptively.

In [18]:
# Descriptive names for the 42 expanded columns of each kind.
# range(1, 43) numbers them 01 through 42; {:02d} zero-pads to two digits.
newIDcols = ["Phenotype Correlation ID {:02d}".format(slot) for slot in range(1, 43)]
newDESCcols = ["Phenotype Correlation DESC {:02d}".format(slot) for slot in range(1, 43)]


In [19]:
# Replace the positional column labels (0-41) with the descriptive ID names;
# the list must contain exactly one name per existing column
uniqueIDS.columns = newIDcols

In [20]:
# Replace the positional column labels (0-41) with the descriptive DESC names;
# the list must contain exactly one name per existing column
uniqueDESC.columns = newDESCcols

In [21]:
# Column labels of the main table, now including ID_LIST and DESC_LIST
clinvar.columns.tolist()

['#AlleleID',
 'Type',
 'Name',
 'GeneID',
 'GeneSymbol',
 'HGNC_ID',
 'ClinicalSignificance',
 'ClinSigSimple',
 'LastEvaluated',
 'RS# (dbSNP)',
 'nsv/esv (dbVar)',
 'RCVaccession',
 'PhenotypeIDS',
 'PhenotypeList',
 'Origin',
 'OriginSimple',
 'Assembly',
 'ChromosomeAccession',
 'Chromosome',
 'Start',
 'Stop',
 'ReferenceAllele',
 'AlternateAllele',
 'Cytogenetic',
 'ReviewStatus',
 'NumberSubmitters',
 'Guidelines',
 'TestedInGTR',
 'OtherIDs',
 'SubmitterCategories',
 'VariationID',
 'ID_LIST',
 'DESC_LIST']

In [22]:
# Confirm the ID table now carries the descriptive column names
uniqueIDS.columns.tolist()

['Phenotype Correlation ID 01',
 'Phenotype Correlation ID 02',
 'Phenotype Correlation ID 03',
 'Phenotype Correlation ID 04',
 'Phenotype Correlation ID 05',
 'Phenotype Correlation ID 06',
 'Phenotype Correlation ID 07',
 'Phenotype Correlation ID 08',
 'Phenotype Correlation ID 09',
 'Phenotype Correlation ID 10',
 'Phenotype Correlation ID 11',
 'Phenotype Correlation ID 12',
 'Phenotype Correlation ID 13',
 'Phenotype Correlation ID 14',
 'Phenotype Correlation ID 15',
 'Phenotype Correlation ID 16',
 'Phenotype Correlation ID 17',
 'Phenotype Correlation ID 18',
 'Phenotype Correlation ID 19',
 'Phenotype Correlation ID 20',
 'Phenotype Correlation ID 21',
 'Phenotype Correlation ID 22',
 'Phenotype Correlation ID 23',
 'Phenotype Correlation ID 24',
 'Phenotype Correlation ID 25',
 'Phenotype Correlation ID 26',
 'Phenotype Correlation ID 27',
 'Phenotype Correlation ID 28',
 'Phenotype Correlation ID 29',
 'Phenotype Correlation ID 30',
 'Phenotype Correlation ID 31',
 'Phenot

In [23]:
# Confirm the description table now carries the descriptive column names
uniqueDESC.columns.tolist()

['Phenotype Correlation DESC 01',
 'Phenotype Correlation DESC 02',
 'Phenotype Correlation DESC 03',
 'Phenotype Correlation DESC 04',
 'Phenotype Correlation DESC 05',
 'Phenotype Correlation DESC 06',
 'Phenotype Correlation DESC 07',
 'Phenotype Correlation DESC 08',
 'Phenotype Correlation DESC 09',
 'Phenotype Correlation DESC 10',
 'Phenotype Correlation DESC 11',
 'Phenotype Correlation DESC 12',
 'Phenotype Correlation DESC 13',
 'Phenotype Correlation DESC 14',
 'Phenotype Correlation DESC 15',
 'Phenotype Correlation DESC 16',
 'Phenotype Correlation DESC 17',
 'Phenotype Correlation DESC 18',
 'Phenotype Correlation DESC 19',
 'Phenotype Correlation DESC 20',
 'Phenotype Correlation DESC 21',
 'Phenotype Correlation DESC 22',
 'Phenotype Correlation DESC 23',
 'Phenotype Correlation DESC 24',
 'Phenotype Correlation DESC 25',
 'Phenotype Correlation DESC 26',
 'Phenotype Correlation DESC 27',
 'Phenotype Correlation DESC 28',
 'Phenotype Correlation DESC 29',
 'Phenotype Co

In [27]:
# Place the named ID columns to the right of the main table's columns
withIDS = pd.concat([clinvar, uniqueIDS], axis="columns")

In [28]:
# Place the named description columns to the right of the ID columns,
# giving the fully flattened table
clinvar_new = pd.concat([withIDS, uniqueDESC], axis="columns")

In [29]:
# Column labels of the flattened table
clinvar_new.columns.tolist()

['#AlleleID',
 'Type',
 'Name',
 'GeneID',
 'GeneSymbol',
 'HGNC_ID',
 'ClinicalSignificance',
 'ClinSigSimple',
 'LastEvaluated',
 'RS# (dbSNP)',
 'nsv/esv (dbVar)',
 'RCVaccession',
 'PhenotypeIDS',
 'PhenotypeList',
 'Origin',
 'OriginSimple',
 'Assembly',
 'ChromosomeAccession',
 'Chromosome',
 'Start',
 'Stop',
 'ReferenceAllele',
 'AlternateAllele',
 'Cytogenetic',
 'ReviewStatus',
 'NumberSubmitters',
 'Guidelines',
 'TestedInGTR',
 'OtherIDs',
 'SubmitterCategories',
 'VariationID',
 'ID_LIST',
 'DESC_LIST',
 'Phenotype Correlation ID 01',
 'Phenotype Correlation ID 02',
 'Phenotype Correlation ID 03',
 'Phenotype Correlation ID 04',
 'Phenotype Correlation ID 05',
 'Phenotype Correlation ID 06',
 'Phenotype Correlation ID 07',
 'Phenotype Correlation ID 08',
 'Phenotype Correlation ID 09',
 'Phenotype Correlation ID 10',
 'Phenotype Correlation ID 11',
 'Phenotype Correlation ID 12',
 'Phenotype Correlation ID 13',
 'Phenotype Correlation ID 14',
 'Phenotype Correlation ID 15'

In [30]:
# Save the complete flattened table as a CSV file in the data directory
clinvar_new.to_csv(datadir + 'clinvar_flattened.csv')

In [31]:
# Also save just the first 50 rows as a small sample file for quick inspection
clinvar_new.iloc[:50].to_csv(datadir + 'clinvar_flattened_stub.csv')