# Database Use Cases

## Test 1: Calculate Allele Frequency

-- Allele frequency per variant: the number of rows carrying each var_id
-- divided by twice the number of distinct subjects.
WITH totals AS (
    -- Denominator: 2 x distinct subjects.
    -- NOTE(review): the "* 2" presumably counts two alleles per subject -- confirm.
    SELECT COUNT(DISTINCT subject_id) * 2 AS n_total
    FROM p7_staging.two_row_varid
),
vars AS (
    -- Numerator: how many rows carry each variant id.
    SELECT
        var_id,
        COUNT(var_id) AS var_count
    FROM p7_staging.two_row_varid
    GROUP BY var_id
)
-- totals has exactly one row, so the cross join just attaches n_total to
-- every variant. The cast keeps the division floating-point on engines
-- where integer / integer would otherwise truncate.
SELECT
    vars.var_id,
    vars.var_count,
    CAST(vars.var_count AS DOUBLE) / totals.n_total AS freq
FROM vars
CROSS JOIN totals;

Results: 

## Test 2: Extract all variants in Exon 5 of BRCA1 transcript.

-- View over p7_product.dbnsfp_vars limited to chromosome 17, position
-- block 41, and rows whose gene_name contains 'BRCA1'.
CREATE VIEW p7_product.brca1 AS
SELECT *
FROM p7_product.dbnsfp_vars
WHERE chrom = '17'
  AND pos_block = 41
  AND gene_name LIKE '%BRCA1%';

Results: 0.04 seconds

### Find all Illumina Variants in BRCA1 exon 5

In [None]:
-- Illumina WGS variants that match a row of the BRCA1 view on
-- (chrom, pos, ref, alt) and are annotated with exon_number = 5.
SELECT
    ill.subject_id,
    b.*
FROM p7_platform.wgs_illumina_variant AS ill
INNER JOIN p7_product.brca1 AS b
    ON ill.chrom = b.chrom
    AND ill.pos = b.pos
    AND ill.ref = b.ref
    AND ill.alt = b.alt
-- chrom is restricted on both sides of the join, as in the original query.
-- NOTE(review): presumably this lets each table be pruned independently -- confirm.
WHERE ill.chrom = '17'
    AND b.chrom = '17'
    AND b.exon_number = 5;

Results: 4.72 seconds
View Creation + Query = 4.76 seconds
Expected = 60 seconds
Time under expected = 55.24 seconds

## Test 3: Extract all PPC variants in BRCA2 with HGMD and ClinVar annotations

In [None]:
-- create view of all PPC BRCA2 variants with annotations


## Test 4: Find all variants in BRCA2 that are in ClinVar or HGMD.

In [None]:
import pandas as pd
from impala.dbapi import connect
import time
import csv
import subprocess
import numpy as np
from impala.util import as_pandas
import os

# Turn off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None

# Open the Impala connection and take a cursor from it.
# impala_host, impala_port and impala_user_name are not defined in this
# section and must already be in scope.
conn = connect(
    host=impala_host,
    port=impala_port,
    user=impala_user_name,
    timeout=10000,
)
cur = conn.cursor()

# Label embedded in the names of the generated VCF files.
out_name = 'admix_test'

# create vcf header
def create_header(outfile_name, sample_id='102-01122-03'):
    """Write a minimal VCF v4.0 header to ``outfile_name``.

    Any existing file at that path is overwritten. The header consists of
    the fileformat, fileDate (today's date as YYYYMMDD) and reference meta
    lines, followed by the tab-separated column line whose last column is
    the sample id.

    Args:
        outfile_name: Path of the file to create.
        sample_id: Name of the single sample column; defaults to the sample
            the original header hard-coded.
    """
    columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
               'INFO', 'FORMAT', sample_id]
    lines = [
        '##fileformat=VCFv4.0',
        # four-digit year: a two-digit year is ambiguous
        '##fileDate=' + time.strftime("%Y%m%d"),
        '##reference=grch37 v.74',
        # joined without a trailing tab, which would declare an empty
        # extra column after the sample
        '\t'.join(columns),
    ]
    # Text mode so writing str works on Python 2 and 3; the context manager
    # closes the handle even if the write fails.
    with open(outfile_name, 'w') as out:
        out.write('\n'.join(lines) + '\n')

### download a random sample of one subject's variants on a chromosome as VCF
def create_vcf(db_name, table_name, chrom_name):
    """Write up to 10000 randomly sampled variants for one chromosome to a VCF file.

    Queries p7_platform.wgs_illumina_variant for subject '102-01122-03' on
    chromosome ``chrom_name``, de-duplicates the rows, keeps a random sample
    of at most 10000 and sorts that sample by position. When rows come back,
    the result is written to 'chr<chrom_name>_<out_name>.vcf': the header via
    ``create_header``, then the rows appended tab-separated. When nothing
    comes back, no file is created.

    Args:
        db_name: Unused; the table is hard-coded in the query.
        table_name: Unused; see ``db_name``.
            NOTE(review): confirm whether these were meant to select the table.
        chrom_name: Chromosome label as a string (e.g. '17'); used in the
            query filter, the output file name and the progress messages.
    """
    print("Looking for variants in chromosome {}... \n".format(chrom_name))
    # create named file for each chromosome
    vcf_out = 'chr' + chrom_name + '_' + out_name + '.vcf'
    # De-duplicate first, then sample, then sort: the original text put a
    # second ORDER BY after LIMIT (plus a stray quote), which is not valid SQL.
    # The chromosome is passed as a bound parameter rather than spliced in.
    gene_vars = '''
    with vars as (
        select distinct chrom, pos, id, ref, alt, qual, filter,
               '.' as info, 'GT' as 'format', gt as '102-01122-03'
        from p7_platform.wgs_illumina_variant
        where subject_id = '102-01122-03'
          and chrom = %(chrom)s
    ),
    sampled as (
        select *
        from vars
        order by rand()
        limit 10000
    )
    select *
    from sampled
    order by pos
    '''
    cur.execute(gene_vars, {'chrom': chrom_name})
    variants = as_pandas(cur)
    # write variants to file
    if len(variants) > 0:
        print("Creating VCF files for chromosome {}... \n".format(chrom_name))
        create_header(vcf_out)
        # append below the header; the header already names the columns
        variants.to_csv(vcf_out, sep='\t', index=False, mode='a', header=False)
    else:
        print("No variants found for chromosome {} \n".format(chrom_name))
        

