# Run SnpEff

Genetic variant annotation and effect prediction toolbox. It annotates and predicts the effects of variants on genes (such as amino acid changes). 

### Run VCF Through SnpEff

Requires: 
- SnpEff: http://snpeff.sourceforge.net/
- Reference GRCh37.74 (downloaded automatically), chosen to match the VCF files

### Create VCF from distinct kaviar variants

The distinct variants found in ISB's Kaviar table are written to a single VCF file, which is then run through SnpEff. 

### Connect to Impala

In [1]:
import ibis
import os

# connect to impala with ibis
# NOTE(review): 'glados20' is the HDFS host name used on the next line, yet here
# it is looked up as an environment-variable name; unless a variable with that
# name exists, the port is always the 50070 fallback. Confirm the intended name.
hdfs_port = os.environ.get('glados20', 50070)
# hdfs client, used further down to upload the results file
hdfs = ibis.hdfs_connect(host='glados20', port=hdfs_port, user='selasady')
# impala client, used for every table read and write in this notebook
con = ibis.impala.connect(host='glados19', port=21050, timeout=120)

# enable interactive mode
ibis.options.interactive = True

### Download Distinct Variant Table

In [None]:
# create ibis table object for the distinct variants table
# intended source once it is available:
#distinct_vars = con.table('global_distinct', database='p7_ref_grch37') 
# temporary stand-in table used until the table above is ready
distinct_vars = con.table('distinct_test', database='users_selasady') 

In [None]:
# limit table to just the selected columns
# (the four fields that are joined into the variant ID further down)
distinct_df = distinct_vars['chrom', 'pos', 'ref', 'alt']

In [None]:
# run the query and download the result into a local object
# (the cells below use it as a pandas DataFrame)
distinct_df = distinct_df.execute()

In [None]:
# add variant ID field: the four values of each row, converted to strings
# and joined with ':' in the order chrom:pos:ref:alt
distinct_df['ID'] = distinct_df[['chrom', 'pos', 'ref', 'alt']].apply(lambda row: ':'.join(map(str, row)), axis=1)

### Output Distinct Variants as VCF File

In [None]:
import time
import pandas as pd

# disable extraneous pandas warning
# (this turns the chained-assignment check off for the whole session,
# not only for this cell)
pd.options.mode.chained_assignment = None

# file the distinct variants are written to; the SnpEff command further
# down reads the same file name
vcf_out = 'distinct_vars.vcf'

# create vcf header
def create_header(outfile_name):
    """Write the VCF meta-information header to ``outfile_name``.

    The file is created, or truncated if it already exists, so this must
    run before any variant rows are appended to the same file.

    Args:
        outfile_name: Path of the VCF file to start.
    """
    header_lines = [
        '##fileformat=VCFv4.0',
        # the VCF specification shows fileDate as YYYYMMDD (four-digit year)
        '##fileDate=' + time.strftime("%Y%m%d"),
        # the embedded newline terminates the last header line after the join
        '##reference=grch37 v.74 \n',
    ]
    # text mode so the str header can also be written under Python 3; the
    # context manager closes the handle even when the write raises
    with open(outfile_name, 'w') as out:
        out.write('\n'.join(header_lines))
               
# create vcf and append to file with header
def impala_to_vcf(input_df, outfile_name):
    """Append the variants in ``input_df`` to ``outfile_name`` as VCF rows.

    A column-name line (``#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
    GT``) followed by one tab-separated row per variant is appended, so
    the file is expected to already hold the ``##`` header lines. Rows
    are sorted by chromosome, in the fixed order 1-22, X, Y, M, MT, and
    then by position. Chromosome labels outside that list become missing
    values and are written as empty fields.

    Args:
        input_df: DataFrame with the columns ``chrom``, ``pos``, ``ID``,
            ``ref`` and ``alt``. It is not modified.
        outfile_name: Path of the file to append to.
    """
    # explicit copy so the columns added below never touch the caller's frame
    vcf = input_df[['chrom', 'pos', 'ID', 'ref', 'alt']].copy()
    # constant placeholder values for the remaining VCF columns
    vcf['QUAL'] = '30'
    vcf['FILTER'] = 'PASS'
    vcf['INFO'] = '.'
    vcf['FORMAT'] = 'GT'
    vcf['GT'] = '0/1'
    vcf.columns = [name.upper() for name in vcf.columns]
    vcf = vcf.rename(columns={'CHROM': '#CHROM'})
    # order chromosomes to match ref fastas
    chroms = [str(number) for number in range(1, 23)] + ['X', 'Y', 'M', 'MT']
    # compare as strings so integer-typed chromosome labels still match the
    # category list; the category order is what drives the sort below
    vcf['#CHROM'] = pd.Categorical(
        vcf['#CHROM'].astype(str), categories=chroms, ordered=True)
    # sort file by chrom then pos (DataFrame.sort was removed from pandas)
    vcf = vcf.sort_values(['#CHROM', 'POS'])
    # append below the header already in the file
    vcf.to_csv(outfile_name, header=True, encoding='utf-8', sep="\t",
               index=False, mode='a')
              
# write the header first (create_header starts the file from scratch),
# then append the column line and the variant rows to the same file
create_header(vcf_out)
impala_to_vcf(distinct_df, vcf_out)

In [None]:
# command to run snpeff: annotate distinct_vars.vcf with the GRCh37.74
# database (16 GB max JVM heap) and redirect the annotated VCF to
# distinct_snpeff.vcf
# NOTE(review): the jar path points into one user's home directory
!java -Xmx16g -jar /Users/summerrae/tools/snpEff/snpEff.jar -t -v -noStats GRCh37.74 distinct_vars.vcf > distinct_snpeff.vcf

### Output SnpEff effects as tsv file, one effect per line

In [None]:
# put each effect on its own line, then extract the listed fields into distinct.tsv
cat distinct_snpeff.vcf \
    | /Users/summerrae/tools/snpEff/scripts/vcfEffOnePerLine.pl \
    | java -jar /Users/summerrae/tools/snpEff/SnpSift.jar extractFields \
    - CHROM POS REF ALT "ANN[*].EFFECT" "ANN[*].IMPACT" \
    "ANN[*].FEATURE" "ANN[*].FEATUREID" "ANN[*].BIOTYPE" "ANN[*].RANK" \
    "ANN[*].HGVS_C" "ANN[*].HGVS_P" > distinct.tsv 

### Remove Header and convert to CSV

In [None]:
# drop the first line (the column header) and turn every tab into a comma
# NOTE(review): a comma inside any field would shift the columns of that
# row in the csv; confirm none of the extracted fields can contain one
!sed '1d' distinct.tsv | tr '\t' ','  > distinct_snpeff.csv

## Save Results as an Impala Table

### Upload Results to HDFS

In [5]:
# hdfs target directory and the local results file to upload into it
path = '/user/selasady/'
file_name = 'distinct_snpeff.csv'
# NOTE(review): path already ends in '/', so this evaluates to
# '/user/selasady//distinct_snpeff.csv'; the name is not used again in
# the cells shown here
admix_file = path + '/' + file_name

# upload the SnpEff results csv to hdfs
hdfs.put(path, file_name, verbose=True)



'/user/selasady/distinct_snpeff.csv'

### Convert CSV into Ibis Object

In [6]:
# define table schema for the uploaded results file: one entry per
# extracted SnpEff field, in the order the fields were extracted
schema = ibis.schema([
    ('chrom', 'string'), 
    ('pos', 'int32'),
    ('ref', 'string'),
    ('alt', 'string'),
    ('effect', 'string'),
    ('impact', 'string'),
    ('feature', 'string'),
    ('feature_id', 'string'),
    ('biotype', 'string'),
    ('rank', 'int32'),
    ('hgvs_c', 'string'),
    ('hgvs_p', 'string')
])

# create ibis object from the delimited results in hdfs
# NOTE(review): path is the directory '/user/selasady/', not the csv
# itself; presumably every file in that directory is read with this
# schema, so confirm the directory holds only the results file
snpeff = con.delimited_file(path, schema)

In [7]:
# preview the first five rows; the call form of print is valid on Python 3
# and, with a single argument, prints the same thing on Python 2
print(snpeff.limit(5))

  chrom     pos ref alt                   effect    impact            feature  \
0     1   64548   A   C    upstream_gene_variant  MODIFIER         transcript   
1     1   64548   A   C  downstream_gene_variant  MODIFIER         transcript   
2     1   64548   A   C        intergenic_region  MODIFIER  intergenic_region   
3     1  149954   G   C    upstream_gene_variant  MODIFIER         transcript   
4     1  149954   G   C    upstream_gene_variant  MODIFIER         transcript   

                        feature_id                 biotype  rank      hgvs_c  \
0                  ENST00000335137          protein_coding    -1  c.-4543A>C   
1                  ENST00000492842  unprocessed_pseudogene    -1   n.*661A>C   
2  ENSG00000240361-ENSG00000186092                            -1  n.64548A>C   
3                  ENST00000484859               antisense    -1   n.-247C>G   
4                  ENST00000490997               antisense    -1  n.-3123C>G   

  hgvs_p  
0         
1         

### Create impala table

In [8]:
# store the snpeff object as table 'coding_consequences' in the
# users_selasady database
con.create_table('coding_consequences', snpeff, database='users_selasady')