# Creating Annotated Table of Global Distinct Variants

A table of distinct chrom, position, referene allele and alterneate allele will be created from all input reference and varaint files for annotation. 

### Connect to Impala

In [1]:
import ibis
import os

# connect to impala with ibis
hdfs_port = os.environ.get('glados20', 50070)
hdfs = ibis.hdfs_connect(host='glados20', port=hdfs_port, user='selasady')
con = ibis.impala.connect(host='glados19', port=21050, timeout=120)

# enable interactive mode
ibis.options.interactive = True

## Add All Variants and Annotations from Reference Tables

In [21]:
db = 'p7_ref_grch37'

# connect to variant tables
kaviar = con.table('kaviar_isb', database=db)
clinvar = con.table('clinvar', database=db)
#dbnsfp = con.table('dbnsfp_variant', database=db) TODO: Add this when bug fixed in ibis
esp_exomes = con.table('esp_exomes', database=db)
dbsnp = con.table('dbsnp', database=db)

In [79]:
# outer join clinvar and kaviar_isb
join_exprs1 = [kaviar.chrom == clinvar.chrom, kaviar.pos == clinvar.pos, kaviar.ref == clinvar.ref,\
               kaviar.alt == clinvar.alt]
clin_kav = kaviar.outer_join(clinvar, join_exprs1)[kaviar.chrom, kaviar.pos, kaviar.ref, kaviar.alt, kaviar.allele_freq.name('kav_freq') ,clinvar.clin_sig, clinvar.clin_dbn]

NameError: name 'chrom' is not defined

In [54]:
# outer join clin_kav and esp_exomes
join_exprs2 = [clin_kav.chrom == esp_exomes.chrom, clin_kav.pos == esp_exomes.pos, \
               clin_kav.ref == esp_exomes.ref, clin_kav.alt == esp_exomes.alt]
clin_kav_esp = clin_kav.outer_join(esp_exomes, join_exprs2)[clin_kav, esp_exomes.ma_fin_percent_aa.name('esp_ma_pct_aa'), esp_exomes.ma_fin_percent_ea.name('esp_ma_pct_ea')]

In [61]:
# outer join clin_kav_esp with dbsnp
join_exprs3 = [clin_kav_esp.chrom == dbsnp.chrom, clin_kav_esp.pos == dbsnp.pos, \
               clin_kav_esp.ref == dbsnp.ref, clin_kav_esp.alt == dbsnp.alt, dbsnp.dbsnpbuildid == '142']
all_vars = clin_kav_esp.outer_join(dbsnp, join_exprs3)[clin_kav_esp, dbsnp.rs_id]

In [56]:
print all_vars.limit(5)

  chrom       pos ref alt  kav_freq clin_sig clin_dbn esp_ma_pct_aa  \
0    19  38637958  GC   G  0.000038     None     None          None   
1    19  38638057   A   G  0.000038     None     None          None   
2    19  38638208   A   G  0.000423     None     None          None   
3    19  38638611   C   T  0.000038     None     None          None   
4    19  38638699   T   G  0.000038     None     None          None   

  esp_ma_pct_ea rs_id  
0          None  None  
1          None  None  
2          None  None  
3          None  None  
4          None  None  


## Add Regional Annotations

In [40]:
# connect to regional tables
cytoband = con.table('cytoband', database=db)
dann = con.table('dann', database=db)
ensembl = con.table('ensembl_genes', database=db)
go_goa = con.table('go_goa', database=db)
mirbase = con.table('mirbase', database=db)
#repeatmasker = con.table('repeatmasker', database=db)
#ucsc = con.table('ucsc_genes', database=db) # add after ibis bug fix
ucsc_kgxref = con.table('ucsc_kgxref', database=db)

In [None]:
# add cytoband regions
ensembl_exp = [all_vars.chrom == cytoband.chrom, all_vars.pos >= cytoband.start, all_vars.pos <= cytoband.stop]
all_vars = all_vars.join(cytoband, cytoband_exp)[all_vars, cytoband.name.name('cytoband'), cytoband.gie_stain]

In [82]:
# filter ensembl for distinct transcripts
ensembl_dist = ensembl.chrom.distinct()

# # add ensembl gene annotations
# ensembl_exp = [all_vars.chrom == ensembl_dist.chrom, all_vars.pos >= ensembl_dist.start, all_vars.pos <= ensembl_dist.stop]
# all_vars = all_vars.join(ensembl, ensembl_exp)[all_vars, ensembl_dist.gene_name.name('ens_gene'), 
#                                                ensembl_dist.gene_id.name('ens_geneid'),
#                                                ensembl_dist.transcript_name.name('ens_tx_name'),
#                                                ensembl_dist.transcript_id.name('ens_tx_id'),
#                                                ensembl_dist.feature,
#                                                ensembl_dist.strand
#                                               ]

In [84]:
print ensembl_dist

0                 HG186_PATCH
1               HG998_2_PATCH
2                 HG344_PATCH
3            HSCHR12_3_CTG2_1
4               HSCHR2_1_CTG1
5                  GL000191.1
6                 HG357_PATCH
7               HSCHR4_1_CTG6
8              HSCHR6_MHC_COX
9                 HG339_PATCH
10                 HG79_PATCH
11               HG1437_PATCH
12               HG1459_PATCH
13                HG730_PATCH
14     HG142_HG150_NOVEL_TEST
15             HSCHR1_1_CTG31
16               HG1146_PATCH
17                  HG7_PATCH
18                         20
19                 GL000237.1
20                HG115_PATCH
21               HG1462_PATCH
22                 GL000223.1
23             HSCHR6_MHC_MCF
24                          9
25                HG418_PATCH
26              HSCHR5_1_CTG1
27                 GL000243.1
28               HG1292_PATCH
29                HG311_PATCH
                ...          
235                GL000219.1
236      HSCHR19LRC_PGF1_CTG1
237       

In [73]:
# add dann scores
dann_case = (ibis.case()
            .when(all_vars.alt == 'A', dann.score_a)
            .when(all_vars.alt == 'T', dann.score_t)
            .when(all_vars.alt == 'C', dann.score_c)
            .when(all_vars.alt == 'G', dann.score_g)
            .else_(ibis.NA)
            .end())    

dann_exp = [all_vars.chrom == dann.chrom, all_vars.pos == dann.pos]
test = all_vars.join(dann, dann_exp)[all_vars, dann_score=dann_case]

SyntaxError: invalid syntax (<ipython-input-73-ebae4d17cf68>, line 11)

In [64]:
print test.limit(5)

KeyboardInterrupt: 

TODO: 
    - Add tables left out due to Ibis bug
    - Add normalized variants from illumina and cgi
    - Fix DANN syntax