In [10]:
import pandas as pd
import numpy as np
import tiledb
import os
tiledb.libtiledb.version()

(2, 4, 3)

# Data description

## Study

Study table columns
- study_id: unique identifier for study
- pmid: pubmed ID (GWAS Catalog studies only)
- pub_date: publication date
- pub_journal: publication journal
- pub_title: publication title
- pub_author: publication author
- trait_reported: trait reported in publication
- trait_mapped: ontology trait label
- trait_efos: EFO short_form
- ancestry_initial: ancestry of initial GWAS sample, separated by ';'
- ancestry_replication: ancestry of replication GWAS sample, separated by ';'
- n_initial: GWAS initial sample size
- n_replication: GWAS replication sample size
- n_cases: number of cases. Warning: there is currently no easy way to get this information from GWAS Catalog, therefore it is set to null
- num_assoc_loci (int): total number of associated loci for this study in the top loci table

## Credset

- study_id: unique identifier for study
- index_variantid_b37: unique variant identifier for index variant, chrom_pos_ref_alt (build 37)
- tag_variantid_b37: unique variant identifier for tagging variant, chrom_pos_ref_alt (build 37)
- log10_ABF: log10 of the approximate Bayes factor for this tagging variant
- posterior_prob: posterior probability of causality for this tagging variant compared to other variants at this locus

## Variant

variant information

# Data preprocessing

Write df to CSV files, in order to build tiledb. Tiledb can be build directly from pandas, but the datatype infer is messed up.

In [3]:
study_df = pd.read_csv('data/study_sample.csv')
variant_df = pd.read_csv('data/variant_sample.csv')

In [4]:
variant_df.head()

Unnamed: 0,chr_id,position,ref_allele,alt_allele,chr_id_b37,position_b37,rs_id,most_severe_consequence,gene_id_any_distance,gene_id_any,...,gene_id_prot_coding,raw,phred,gnomad_afr,gnomad_eas,gnomad_nfe,gnomad_nfe_est,gnomad_nfe_nwe,gnomad_nfe_onf,gnomad_oth
0,10,102953319,C,T,10,104713076,rs10883806,intron_variant,35025,ENSG00000148842,...,ENSG00000148842,-0.25872,0.482,0.045098,0.283226,0.093001,0.114085,0.082651,0.090781,0.112963
1,10,103046897,A,C,10,104806654,rs7094843,intron_variant,128603,ENSG00000148842,...,ENSG00000148842,0.263356,6.731,0.370741,0.605026,0.41474,0.459825,0.393332,0.404026,0.397978
2,10,26889827,T,C,10,27178756,rs10159883,intergenic_variant,28740,ENSG00000136754,...,ENSG00000136754,0.447466,8.879,0.599724,0.467097,0.171147,0.204536,0.152787,0.171816,0.228361
3,10,30434664,G,A,10,30723593,rs306588,intron_variant,643,ENSG00000107968,...,ENSG00000107968,0.980471,13.15,0.803243,0.536129,0.700091,0.697455,0.70326,0.693533,0.696494
4,10,35054597,G,A,10,35343525,rs2384289,intron_variant,36045,ENSG00000108094,...,ENSG00000108094,0.061417,3.62,0.432305,0.318299,0.341216,0.336534,0.340103,0.354869,0.35175


In [5]:
variant_df['pk'] = variant_df.apply(lambda x: str(x['chr_id']) + str(x['position']) + x['ref_allele'] + x['alt_allele'], axis=1)

In [6]:
credset_df = pd.read_csv('data/credset_sample.csv')
credset_df.head()

Unnamed: 0,bio_feature,is95_credset,is99_credset,lead_alt,lead_chrom,lead_pos,lead_ref,lead_variant_id,logabf,multisignal_method,...,tag_beta_cond,tag_chrom,tag_pos,tag_pval,tag_pval_cond,tag_ref,tag_se,tag_se_cond,tag_variant_id,type
0,,True,True,G,4,53111094,A,4:53111094:A:G,14.310652,conditional,...,0.39474,4,53111094,4.98e-09,4.98e-09,A,0.067503,0.067503,4:53111094:A:G,gwas
1,,True,True,G,4,53111094,A,4:53111094:A:G,12.253837,conditional,...,0.359118,4,53099535,5.34e-08,5.34e-08,C,0.066019,0.066019,4:53099535:C:T,gwas
2,,True,True,G,4,53111094,A,4:53111094:A:G,12.086437,conditional,...,0.354532,4,53075995,6.48e-08,6.48e-08,C,0.065593,0.065593,4:53075995:C:T,gwas
3,,True,True,G,4,53111094,A,4:53111094:A:G,11.969013,conditional,...,0.351341,4,53046854,7.44e-08,7.44e-08,TA,0.065303,0.065303,4:53046854:TA:T,gwas
4,,True,True,T,2,66523432,G,2:66523432:G:T,35.59497,conditional,...,0.178239,2,66523432,2.14e-18,2.14e-18,G,0.020371,0.020371,2:66523432:G:T,gwas


In [7]:
credset_df['pk'] = credset_df.apply(lambda x: str(x['tag_chrom']) + str(x['tag_pos']) + x['tag_ref'] + x['tag_alt'], axis=1)

In [18]:
joined_df = credset_df.merge(variant_df, on='pk', how='inner', suffixes=('_1', '_2'))

In [20]:
joined_df.shape

(68362, 48)

In [28]:
variant_df_new = joined_df[variant_df.columns].drop(['pk'],axis=1)
credset_df_new = joined_df[credset_df.columns].drop(['pk'],axis=1)

In [29]:
variant_df_new.to_csv('data/variant_sample.csv',index=False)
credset_df_new.to_csv('data/credset_sample.csv',index=False)


In [27]:
variant_df_new.shape

(68362, 21)

# Build TileDB Array

## Study Array

Create TileDB Array from CSV

In [61]:
# import shutil
# if tiledb.array_exists("study_tldb"):
#     shutil.rmtree('study_tldb')
    
# tiledb.from_csv("study_tldb", 'data/study.csv', 
#                 index_dims = ['study_id','trait_category'],
#                 chunksize = 10**6,
#                 sparse = True,
#                 fillna={"trait_efos":'', 'ancestry_replication':'','ancestry_initial':''})

In [11]:
with tiledb.open(os.path.join('opentarget_arrays', 'credset_tldb')) as credset_array:
    # credset_df = credset_array.df[:]
    credset_schema = credset_array.schema
    print(credset_schema)

ArraySchema(
  domain=Domain(*[
    Dim(name='tag_variant_id', domain=(None, None), tile='None', dtype='|S0', var=True),
  ]),
  attrs=[
    Attr(name='bio_feature', dtype='float64', var=False, nullable=False),
    Attr(name='is95_credset', dtype='uint8', var=False, nullable=False),
    Attr(name='is99_credset', dtype='uint8', var=False, nullable=False),
    Attr(name='lead_alt', dtype='<U0', var=True, nullable=False),
    Attr(name='lead_chrom', dtype='int64', var=False, nullable=False),
    Attr(name='lead_pos', dtype='int64', var=False, nullable=False),
    Attr(name='lead_ref', dtype='<U0', var=True, nullable=False),
    Attr(name='lead_variant_id', dtype='<U0', var=True, nullable=False),
    Attr(name='logabf', dtype='float64', var=False, nullable=False),
    Attr(name='multisignal_method', dtype='<U0', var=True, nullable=False),
    Attr(name='phenotype_id', dtype='float64', var=False, nullable=False),
    Attr(name='postprob', dtype='float64', var=False, nullable=False),
    Att

## Credset Array

In [83]:
# import shutil
# if tiledb.array_exists("credset_tldb"):
#     shutil.rmtree('credset_tldb')
    
# tiledb.from_csv("credset_tldb", 'data/credset.csv', 
#                 index_dims = ['tag_chrom','tag_pos'],
#                 sparse = True,
#                 chunksize = 10**8)

In [12]:
with tiledb.open(os.path.join('opentarget_arrays','credset_tldb')) as A:
    credset_df = A.df[:]    
    credset_schema = A.schema
print(credset_schema)
credset_df.head()

ArraySchema(
  domain=Domain(*[
    Dim(name='tag_variant_id', domain=(None, None), tile='None', dtype='|S0', var=True),
  ]),
  attrs=[
    Attr(name='bio_feature', dtype='float64', var=False, nullable=False),
    Attr(name='is95_credset', dtype='uint8', var=False, nullable=False),
    Attr(name='is99_credset', dtype='uint8', var=False, nullable=False),
    Attr(name='lead_alt', dtype='<U0', var=True, nullable=False),
    Attr(name='lead_chrom', dtype='int64', var=False, nullable=False),
    Attr(name='lead_pos', dtype='int64', var=False, nullable=False),
    Attr(name='lead_ref', dtype='<U0', var=True, nullable=False),
    Attr(name='lead_variant_id', dtype='<U0', var=True, nullable=False),
    Attr(name='logabf', dtype='float64', var=False, nullable=False),
    Attr(name='multisignal_method', dtype='<U0', var=True, nullable=False),
    Attr(name='phenotype_id', dtype='float64', var=False, nullable=False),
    Attr(name='postprob', dtype='float64', var=False, nullable=False),
    Att

Unnamed: 0_level_0,bio_feature,is95_credset,is99_credset,lead_alt,lead_chrom,lead_pos,lead_ref,lead_variant_id,logabf,multisignal_method,...,tag_beta,tag_beta_cond,tag_chrom,tag_pos,tag_pval,tag_pval_cond,tag_ref,tag_se,tag_se_cond,type
tag_variant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10:102466184:G:A,,True,True,AG,10,102469616,A,10:102469616:A:AG,16.7449,conditional,...,-0.014059,-0.013317,10,102466184,1.1e-11,1.25e-10,G,0.00207,0.00207,gwas
10:102469616:A:AG,,True,True,AG,10,102469616,A,10:102469616:A:AG,18.158018,conditional,...,-0.014447,-0.01373,10,102469616,2.6e-12,2.92e-11,A,0.002064,0.002065,gwas
10:102470255:C:T,,True,True,AG,10,102469616,A,10:102469616:A:AG,17.505033,conditional,...,-0.01424,-0.013518,10,102470255,5.16e-12,5.71e-11,C,0.002063,0.002064,gwas
10:102470399:A:G,,True,True,T,10,102504350,C,10:102504350:C:T,11.59182,conditional,...,0.055777,0.055777,10,102470399,9.63e-08,9.63e-08,A,0.010458,0.010458,gwas
10:102470399:A:G,,True,True,T,10,102504350,C,10:102504350:C:T,10.708179,conditional,...,0.066907,0.066907,10,102470399,2.9e-07,2.9e-07,A,0.013042,0.013042,gwas


In [81]:
credset_df.columns

Index(['bio_feature', 'is95_credset', 'is99_credset', 'lead_alt', 'lead_chrom',
       'lead_pos', 'lead_ref', 'lead_variant_id', 'logABF',
       'multisignal_method', 'phenotype_id', 'postprob', 'postprob_cumsum',
       'study_id', 'tag_alt', 'tag_beta', 'tag_beta_cond', 'tag_pval',
       'tag_pval_cond', 'tag_ref', 'tag_se', 'tag_se_cond', 'tag_variant_id',
       'type'],
      dtype='object')

In [86]:
cols_keep = ['lead_alt', 'lead_chrom','lead_pos', 'lead_ref', 'lead_variant_id', 'logABF', 'postprob',
       'study_id', 'tag_alt', 'tag_ref',  'tag_variant_id', 'type']

In [87]:
credset_df[cols_keep]

Unnamed: 0_level_0,Unnamed: 1_level_0,lead_alt,lead_chrom,lead_pos,lead_ref,lead_variant_id,logABF,postprob,study_id,tag_alt,tag_ref,tag_variant_id,type
tag_chrom,tag_pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2,67474,G,2,207768,A,2:207768:A:G,6.469945,0.007575,TWINSUK,T,C,2:67474:C:T,eqtl
2,92991,G,2,207768,A,2:207768:A:G,4.746941,0.001352,TWINSUK,C,T,2:92991:T:C,eqtl
2,97826,G,2,207768,A,2:207768:A:G,5.093851,0.001913,TWINSUK,G,A,2:97826:A:G,eqtl
2,97844,G,2,207768,A,2:207768:A:G,5.093851,0.001913,TWINSUK,G,C,2:97844:C:G,eqtl
2,100190,G,2,207768,A,2:207768:A:G,6.212921,0.005858,TWINSUK,C,T,2:100190:T:C,eqtl
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,246975361,T,1,246945421,G,1:246945421:G:T,2.340342,0.001043,TWINSUK,T,A,1:246975361:A:T,eqtl
1,246976539,T,1,246945421,G,1:246945421:G:T,2.850925,0.001737,TWINSUK,T,A,1:246976539:A:T,eqtl
1,246976751,T,1,246945421,G,1:246945421:G:T,3.422857,0.003078,TWINSUK,A,G,1:246976751:G:A,eqtl
1,246976758,T,1,246945421,G,1:246945421:G:T,3.008548,0.002034,TWINSUK,T,C,1:246976758:C:T,eqtl


## Variant Array

In [70]:
# import shutil
# if tiledb.array_exists("variant_tldb", isdense=False, issparse=False):
#     shutil.rmtree('variant_tldb')    
# tiledb.from_csv("variant_tldb", 'data/variant.csv', 
#                 index_dims = ['chr_id','position'],
#                 fillna={"rs_id":''},
#                 sparse = True,
#                 chunksize = 10**6)

In [13]:
with tiledb.open(os.path.join('opentarget_arrays','variant_tldb')) as A:
    variant_df = A.df[:]    
    variant_schema = A.schema
print(variant_schema)
variant_df.head()

ArraySchema(
  domain=Domain(*[
    Dim(name='variant_id', domain=(None, None), tile='None', dtype='|S0', var=True),
  ]),
  attrs=[
    Attr(name='index', dtype='int64', var=False, nullable=False),
    Attr(name='chr_id', dtype='int64', var=False, nullable=False),
    Attr(name='position', dtype='int64', var=False, nullable=False),
    Attr(name='ref_allele', dtype='<U0', var=True, nullable=False),
    Attr(name='alt_allele', dtype='<U0', var=True, nullable=False),
    Attr(name='chr_id_b37', dtype='int64', var=False, nullable=False),
    Attr(name='position_b37', dtype='int64', var=False, nullable=False),
    Attr(name='rs_id', dtype='<U0', var=True, nullable=False),
    Attr(name='most_severe_consequence', dtype='<U0', var=True, nullable=False),
    Attr(name='gene_id_any_distance', dtype='int64', var=False, nullable=False),
    Attr(name='gene_id_any', dtype='<U0', var=True, nullable=False),
    Attr(name='gene_id_prot_coding_distance', dtype='int64', var=False, nullable=False),
  

Unnamed: 0_level_0,index,chr_id,position,ref_allele,alt_allele,chr_id_b37,position_b37,rs_id,most_severe_consequence,gene_id_any_distance,...,gene_id_prot_coding,raw,phred,gnomad_afr,gnomad_eas,gnomad_nfe,gnomad_nfe_est,gnomad_nfe_nwe,gnomad_nfe_onf,gnomad_oth
variant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10:102466184:G:A,28836,10,102466184,G,A,10,104225941,rs11592299,intron_variant,4789,...,ENSG00000138111,-0.066974,1.739,0.034804,0.043115,0.174922,0.130035,0.198111,0.176195,0.168819
10:102469616:A:AG,28829,10,102469616,A,AG,10,104229373,rs71016384,5_prime_UTR_variant,8221,...,ENSG00000138111,0.160872,5.247,0.085403,0.045337,0.175841,0.118864,0.202846,0.176695,0.165362
10:102470255:C:T,28830,10,102470255,C,T,10,104230012,rs3808937,intron_variant,8860,...,ENSG00000138111,0.036976,3.212,0.086191,0.043004,0.176884,0.131255,0.20028,0.177903,0.175551
10:102470399:A:G,64167,10,102470399,A,G,10,104230156,rs2025713,intron_variant,9004,...,ENSG00000138111,0.09112,4.119,0.376468,0.351804,0.539839,0.532195,0.541589,0.548826,0.51756
10:102472959:G:A,64169,10,102472959,G,A,10,104232716,rs3740415,intron_variant,10080,...,ENSG00000138111,0.211808,6.014,0.37595,0.349741,0.538617,0.531743,0.540103,0.546948,0.519337


In [None]:
pip install tiledb-sql