# Run SnpEff

Genetic variant annotation and effect prediction toolbox. It annotates and predicts the effects of variants on genes (such as amino acid changes). 

### Run VCF Through SnpEff

Requires: 
- SnpEff: http://snpeff.sourceforge.net/
- Reference (downloaded automatically) GRCh37.74 to match vcf files

In [1]:
# set working directory; export ANNOTATION_DIR to run from a different checkout,
# otherwise the original location is used
import os
os.chdir(os.environ.get('ANNOTATION_DIR', '/Users/selasady/impala_scripts/annotation/'))

### Create VCF from distinct kaviar variants

Every distinct variant found in ISB's Kaviar table is written to a single VCF file, which is then run through SnpEff. 

### Connect to Impala

In [4]:
import ibis
import os

# connect to impala with ibis
# NOTE(review): the HDFS port is looked up in an environment variable literally
# named 'glados20' (the same string as the HDFS host) and falls back to 50070;
# if that variable is set the port arrives as a string -- confirm the intended
# variable name and type
hdfs_port = os.environ.get('glados20', 50070)
hdfs = ibis.hdfs_connect(host='glados20', port=hdfs_port, user='selasady')
con = ibis.impala.connect(host='glados19', port=21050, timeout=120)

# enable interactive mode
ibis.options.interactive = True

### Download Distinct Variant Table

In [6]:
# create ibis object for the global_distinct table in the p7_ref_grch37 database
# (the rows themselves are downloaded later with .execute())
distinct_vars = con.table('global_distinct', database='p7_ref_grch37')

In [50]:
# preview the first five rows; the call form of print works on Python 2 and 3
print(distinct_vars.limit(5))

  chrom        pos     ref     alt  kav_freq
0     3  119260529       T       C  0.000115
1     5  143305490       A       T  0.000115
2     4   13473108       A      AG  0.000038
3     3  148798712  ATGCAT  ATGTAT  0.000038
4     5   26187494       T       C  0.000077


In [22]:
# download table from ibis table connection object; the result is handled as a
# pandas dataframe by the cells below
distinct_df = distinct_vars.execute()

In [40]:
# add variant ID field ("chrom:pos:ref:alt"); built column-wise so the whole
# table is processed at once instead of calling a Python lambda for every row
distinct_df['ID'] = (
    distinct_df['chrom'].astype(str) + ':' +
    distinct_df['pos'].astype(str) + ':' +
    distinct_df['ref'].astype(str) + ':' +
    distinct_df['alt'].astype(str)
)

### Output Distinct Variants as VCF File

In [53]:
import time
import pandas as pd

# disable pandas' chained-assignment warning; impala_to_vcf below assigns new
# columns onto a column subset of its input frame, which would otherwise warn
pd.options.mode.chained_assignment = None

# name of the vcf file built from the distinct variants (relative to the
# working directory)
vcf_out = 'distinct_vars.vcf'

# create vcf header
def create_header(outfile_name):
    """
    Write the vcf meta-information ('##') lines to a new file.

    Any existing file at outfile_name is overwritten. Only the '##' lines are
    written here; the '#CHROM ...' column row and the variant rows are
    expected to be appended to the same file afterwards.
    :param outfile_name: path of the vcf file to create
    :return: None
    """
    lines = [
        '##fileformat=VCFv4.0',
        # the vcf fileDate field is YYYYMMDD, so use the four-digit year
        '##fileDate=' + time.strftime("%Y%m%d"),
        # no trailing blank after the value
        '##reference=grch37 v.74',
    ]
    # text mode works on both Python 2 and 3, and the context manager closes
    # the handle even if the write fails
    with open(outfile_name, 'w') as out:
        out.write('\n'.join(lines) + '\n')
               
# create vcf and append to file with header
def impala_to_vcf(input_df, outfile_name):
    """
    Append the variants in input_df to outfile_name as tab-separated vcf rows.

    The column row ('#CHROM', 'POS', ...) is written first, followed by one row
    per variant, sorted by chromosome (reference order) and then position. The
    file is opened in append mode so the rows land below whatever is already in
    it. input_df itself is not modified.
    :param input_df: pandas dataframe with 'chrom', 'pos', 'ID', 'ref' and 'alt' columns
    :param outfile_name: path of the vcf file to append to
    :return: None
    """
    # these columns are output to vcf file; copy so the constant columns below
    # are added to a private frame rather than to a slice of the caller's frame
    df = input_df[['chrom', 'pos', 'ID', 'ref', 'alt']].copy()
    # add constant placeholder columns for vcf format
    df['QUAL'] = '30'
    df['FILTER'] = 'PASS'
    df['INFO'] = '.'
    df['FORMAT'] = 'GT'
    df['GT'] = '0/1'
    # format col names
    df.columns = [x.upper() for x in df.columns]
    df = df.rename(columns={'CHROM': '#CHROM'})
    # order chromosomes to match ref fastas
    chroms = [str(n) for n in range(1, 23)] + ['X', 'Y', 'M', 'MT']
    # compare as strings so an integer-typed chrom column still matches the labels
    chrom_labels = df['#CHROM'].astype(str)
    # contigs outside the canonical list are kept and sorted after the canonical
    # ones, instead of being turned into missing values and written out blank
    extra_chroms = sorted(set(chrom_labels.dropna()) - set(chroms))
    df['#CHROM'] = pd.Categorical(chrom_labels, categories=chroms + extra_chroms, ordered=True)
    # sort file by chrom then pos (DataFrame.sort no longer exists in current pandas)
    df = df.sort_values(['#CHROM', 'POS'])
    # write to file for conversion to vcf
    df.to_csv(outfile_name, header=True, encoding='utf-8', sep="\t", index=False, mode='a')
              
# write the header lines first, then append the column row and variant rows
create_header(vcf_out)
impala_to_vcf(distinct_df, vcf_out)

In [54]:
# command to run snpeff
# annotates distinct_vars.vcf against the GRCh37.74 database with a 16 GB JVM
# heap (-Xmx16g); the annotated vcf on stdout is redirected to distinct_snpeff.vcf
!java -Xmx16g -jar /Users/selasady/tools/snpEff/snpEff.jar -t -v -noStats GRCh37.74 distinct_vars.vcf > distinct_snpeff.vcf

00:00:00.000	SnpEff version SnpEff 4.1a (build 2015-01-14), by Pablo Cingolani
00:00:00.003	Command: 'ann'
00:00:00.012	Reading configuration file 'snpEff.config'. Genome: 'GRCh37.74'
00:00:00.299	done
00:00:00.300	Reading database for genome version 'GRCh37.74' from file '/Users/selasady/tools/snpEff/./data/GRCh37.74/snpEffectPredictor.bin' (this might take a while)
00:00:20.061	done
00:00:20.091	Reading NextProt database from file '/Users/selasady/tools/snpEff/./data/GRCh37.74/nextProt.bin'
00:00:21.020	NextProt database: 550248 markers loaded.
00:00:21.021	Adding transcript info to NextProt markers.
00:00:21.458	NextProt database: 742952 markers added.
00:00:21.458		Loading PWMs from : /Users/selasady/tools/snpEff/./data/GRCh37.74/pwms.bin
00:00:21.476		Loading Motifs from file '/Users/selasady/tools/snpEff/./data/GRCh37.74/motif.bin'
00:00:22.011		Motif database: 284122 markers loaded.
00:00:22.011	Building interval forest
00:00:35.881	done.
00:00:35.882	Genome stats :
#-----------

### Match SnpEff annotations with Variants

In [None]:
# this function could really use some optimization
def parse_snpeff(input_df, input_vcf):
    """
    Match snpeff annotations back to a data frame of variants.

    The snpeff vcf is read, its INFO field is exploded into one row per
    comma-separated annotation, each annotation is split on '|' into named
    columns, and only rows whose feature_type is 'transcript' are kept. The
    result is left-merged onto input_df using the 'var_id' column.
    :param input_df: dataframe of variants to match with snpeff annotations;
        must contain a 'var_id' column (merge key) and a 'sample_id' column
        (used to derive 'family_id')
    :param input_vcf: snpeff vcf file of annotations to match with dataframe
    :return: annotated_df, a pandas dataframe with snpeff annotations and an
        added 'family_id' column
    """
    # read in snpeff vcf file
    # skiprows=8 drops the first 8 lines so the next line is used as the column
    # header -- assumes the file has exactly 8 '##' meta lines before the
    # '#CHROM' row; TODO confirm for the snpeff version/options in use
    annot_vcf = pd.read_csv(input_vcf, sep='\t', skiprows=8)
    # split info field into separate rows for each consequence:
    # one Series per vcf row, indexed by each comma-separated INFO entry and
    # holding that row's ID; after reset_index the annotation text is in
    # column 'index' and the ID in column 0
    temp = pd.concat([pd.Series(row['ID'], row['INFO'].split(','))
                    for _, row in annot_vcf.iterrows()]).reset_index()
    # split each row into separate columns by the pipe
    info_df = pd.DataFrame(list(temp['index'].str.split('|')))
    # add variant id to beginning of data frame
    info_df.insert(0, 'var_id', temp[0])
    # keep only the first 12 columns (var_id + 11 annotation fields) and name them
    info_df = info_df[list(info_df.columns[0:12])]
    info_df.columns = ['var_id', 'alt', 'effect', 'impact', 'gene_name', 'gene_id', 'feature_type', 'transcript_id',
                       'tx_biotype', 'rank', 'hgvs_c', 'hgvs_p']
    # remove the annotation header 'ANN=' from the alt field
    info_df['alt'] = info_df['alt'].str.replace('ANN=', '')
    # keep only transcript level feature types
    info_df = info_df[(info_df['feature_type'] == 'transcript')]
    # drop extraneous columns: positions 1, 4 and 6 are 'alt', 'gene_name'
    # and 'feature_type' in the column list assigned above
    info_df.drop(info_df.columns[[1,4,6]], axis=1, inplace=True)
    # recombine info_df with annot_df (inner merge on the variant id)
    snp_df = pd.merge(annot_vcf, info_df, left_on=['ID'], right_on=['var_id'])
    # drop extraneous snp_df columns by position -- presumably 'ID' and 'INFO'
    # given the usual vcf column order; TODO confirm against the actual file
    snp_df.drop(snp_df.columns[[2, 7]], axis=1, inplace=True)
    # edit col names to lower case for matching, and strip the '#' (e.g. from '#CHROM')
    snp_df.columns = map(str.lower, snp_df.columns)
    snp_df.columns = snp_df.columns.str.replace('#','')
    # drop duplicates from each df before merge
    # (info_df is not used again after this point)
    snp_df.drop_duplicates(inplace=True)
    info_df.drop_duplicates(inplace=True)
    # merge annotations with variant table; every input_df row is kept (left merge)
    annot_vars = pd.merge(input_df, snp_df, on=['var_id'], how='left')
    # remove all duplicated columns from merge ending in _y
    cols = [c for c in annot_vars.columns if not c.endswith('_y')]
    annotated_df =annot_vars[cols]
    # rename columns ending in _x from merge
    # NOTE(review): this removes every '_x' occurrence in a column name, not
    # only a trailing suffix
    annotated_df.rename(columns=lambda x: x.replace('_x', ''), inplace=True)
    # drop duplicate rows
    annotated_df.drop_duplicates(inplace=True)
    # add family id: the second '-'-separated piece of sample_id
    annotated_df['family_id'] = annotated_df['sample_id'].apply(lambda x: x.split('-')[1])
    return annotated_df

# NOTE(review): parse_snpeff merges on a 'var_id' column and reads 'sample_id';
# distinct_df as built above stores the variant key in 'ID' and its preview shows
# no 'sample_id' column -- confirm the frame has both columns before running this
annotated_df = parse_snpeff(distinct_df, 'distinct_snpeff.vcf')

In [138]:
# for every .Q file: append a space plus the file's name (the part before its
# first '.') to the end of each line, editing in place and keeping the original
# as a '.q2' backup
!for f in *.Q; do sed -i.q2 "s/$/\ ${f%%.*}/" $f; done

To Do: 
    - Find out which population corresponds to which q value 
    (emailed admixture author for help, waiting for response)

### Create merged Q score file 

In [146]:
# concatenate all .Q files, turn every space into a comma and append the result
# to master_admix.csv
# NOTE(review): '>>' appends, so re-running this cell adds the same rows again
!cat *.Q | sed 's/ /\,/g' >> master_admix.csv

## Upload Admixture Table to Impala

In [147]:
# read in master_admix.csv as pandas table; the file is a plain concatenation of
# the .Q files and has no header line, so do not let the first data row be
# consumed as column names
master_admix = pd.read_csv('./master_admix.csv', header=None)

### Connect to Impala 

In [183]:
import ibis
import os

# connect to impala with ibis (same settings as the earlier 'Connect to Impala' cell)
# NOTE(review): the HDFS port is looked up in an environment variable literally
# named 'glados20' (the same string as the HDFS host) and falls back to 50070;
# if that variable is set the port arrives as a string -- confirm the intended
# variable name and type
hdfs_port = os.environ.get('glados20', 50070)
hdfs = ibis.hdfs_connect(host='glados20', port=hdfs_port, user='selasady')
con = ibis.impala.connect(host='glados19', port=21050, timeout=120)

# enable interactive mode
ibis.options.interactive = True

### Upload admix csv file to hdfs

In [199]:
# hdfs directory and local file name of the merged admixture table
path = '/user/selasady/'
file_name = 'master_admix.csv'
# path already ends with '/', so strip it before joining; otherwise the result
# is '/user/selasady//master_admix.csv'
admix_file = path.rstrip('/') + '/' + file_name

# upload admix file
hdfs.put(path, file_name, verbose=True)



'/user/selasady/master_admix.csv'

### Download csv as ibis object to add sample id

In [204]:
# table schema for the delimited admix file: fourteen float columns named
# pop1 .. pop14, followed by a string vendor_id column
schema = ibis.schema(
    list(('pop%d' % k, 'float') for k in range(1, 15))
    + [('vendor_id', 'string')]
)

# ibis object over the delimited file(s) under the hdfs path, read with the schema above
admix = con.delimited_file(path, schema)

# ibis object for the gms_metadata mapping table in the p7_itmi database
map_tbl = con.table('gms_metadata', database='p7_itmi')

In [207]:
# left join admix to the mapping table where admix.vendor_id equals
# map_tbl.genome, keeping every admix column plus the mapping table's subject_id
joined = admix.left_join(map_tbl, admix.vendor_id == map_tbl.genome)[admix,map_tbl.subject_id]

### Upload admix table + sample id's as an impala table

In [209]:
# store the joined expression as table 'admix_test' in the users_selasady database
con.create_table('admix_test', joined, database='users_selasady')