# `Purpose:` Identifying genes in various proximity windows for the top SNPs associated with Infection or Environment

## Script names:
1. ddRAD58_GfusI1.1_near_SNP_set_top1_env.windows.sh
1. ddRAD58_GfusI1.1_near_SNP_set_top5_env.windows.sh
1. ddRAD58_GfusI1.1_near_SNP_set_top1_infection.windows.sh
1. ddRAD58_GfusI1.1_near_SNP_set_top5_infection.windows.sh

## Example code:

```shell
#!/bin/zsh

# Find annotated genes lying within a fixed distance of each SNP of interest.

# Directory that receives one result table per window size.
OUT_DIR=$HOME/data/projects/ddrad58/SNPs_of_interest/genes_near_SNPs

# SNPs of interest (BED) and the genome annotation (GFF3) they are compared to.
BED=$HOME/data/projects/ddrad58/SNPs_of_interest/Top01_PopPairwiseOverlap_Environm.bed
GFF3=$HOME/data/genomes/glossina_fuscipes/annotations/Glossina-fuscipes-IAEA_BASEFEATURES_GfusI1.1.gff3

# Pieces of the output file name: <BASENAME>.<LENGTH>.<WHICH_SNPS>.tsv
BASENAME=tsetseFINAL_14Oct2014_f2_53_v_GfusI1.1.window
WHICH_SNPS=top1_env

# For each window size (passed to `bedtools window -w`), keep only the `gene`
# rows of the GFF3 and report those falling inside the window around a SNP.
for LENGTH in 500 1000 5000; do
    grep -P "\sgene\s" $GFF3 | bedtools window -a $BED -b stdin -w $LENGTH > ${OUT_DIR}/${BASENAME}.${LENGTH}.${WHICH_SNPS}.tsv
done

```

## Brief code explanation:
For a particular set of SNPs:
- Run `bedtools window` on the SNPs of interest against the `gene` features in the `GfusI1.1` gff3 file.
- Set the window around each SNP to 500, 1000, and 5000 bp.
- Write out the results to files.

# Set up imports and file path configuration stuff

In [1]:
# imports
from collections import defaultdict

from spartan.utils.files import tableFile2namedTuple
from spartan.utils.annotations.ensembl.gff3 import parse_gff3_attributes

In [2]:
# file paths

## base directory
base_dir = "/home/gus/remote_mounts/louise/data/projects/ddrad58/SNPs_of_interest/genes_near_SNPs"

## Every input and output file shares this stem; only the window size,
## the SNP set and the suffix differ.
_window_stem = base_dir + "/tsetseFINAL_14Oct2014_f2_53_v_GfusI1.1.window"

## In files (one table per window size and SNP set)
top1_env_1000 = _window_stem + ".1000.top1_env.tsv"
top1_inf_1000 = _window_stem + ".1000.top1_infection.tsv"

top5_env_1000 = _window_stem + ".1000.top5_env.tsv"
top5_inf_1000 = _window_stem + ".1000.top5_infection.tsv"

top1_env_500 = _window_stem + ".500.top1_env.tsv"
top1_inf_500 = _window_stem + ".500.top1_infection.tsv"

top5_env_500 = _window_stem + ".500.top5_env.tsv"
top5_inf_500 = _window_stem + ".500.top5_infection.tsv"

top1_env_5000 = _window_stem + ".5000.top1_env.tsv"
top1_inf_5000 = _window_stem + ".5000.top1_infection.tsv"

top5_env_5000 = _window_stem + ".5000.top5_env.tsv"
top5_inf_5000 = _window_stem + ".5000.top5_infection.tsv"

## Out files (same names as the in files, with `.gene_list` inserted before `.tsv`)
top1_env_1000_out = _window_stem + ".1000.top1_env.gene_list.tsv"
top1_inf_1000_out = _window_stem + ".1000.top1_infection.gene_list.tsv"

top5_env_1000_out = _window_stem + ".1000.top5_env.gene_list.tsv"
top5_inf_1000_out = _window_stem + ".1000.top5_infection.gene_list.tsv"

top1_env_500_out = _window_stem + ".500.top1_env.gene_list.tsv"
top1_inf_500_out = _window_stem + ".500.top1_infection.gene_list.tsv"

top5_env_500_out = _window_stem + ".500.top5_env.gene_list.tsv"
top5_inf_500_out = _window_stem + ".500.top5_infection.gene_list.tsv"

top1_env_5000_out = _window_stem + ".5000.top1_env.gene_list.tsv"
top1_inf_5000_out = _window_stem + ".5000.top1_infection.gene_list.tsv"

top5_env_5000_out = _window_stem + ".5000.top5_env.gene_list.tsv"
top5_inf_5000_out = _window_stem + ".5000.top5_infection.gene_list.tsv"

# Functions to get data from files

In [3]:
# IPython shell escape (not plain Python): capture the first line of the
# 5000 bp top1_env table so its column layout can be inspected.
fields = !head -n 1 $top1_env_5000
fields

['Scaffold191\t251841\t251842\tScaffold191\tVectorBase\tgene\t238121\t247143\t.\t+\t.\tID=GFUI013677;biotype=protein_coding']

In [4]:
# Split the captured sample line on tabs to see the individual columns.
f = fields[0].strip().split('\t')
f

['Scaffold191',
 '251841',
 '251842',
 'Scaffold191',
 'VectorBase',
 'gene',
 '238121',
 '247143',
 '.',
 '+',
 '.',
 'ID=GFUI013677;biotype=protein_coding']

In [7]:
# Leading three columns of each row: the SNP record.
_snp_columns = ["snp_contig",
                "snp_start",
                "snp_end"]

# Remaining nine columns: the matched gene record from the gff3 file.
_gene_columns = ["gene_contig",
                 "gene_source",
                 "gene_type",
                 "gene_start",
                 "gene_end",
                 "gene_score",
                 "gene_strand",
                 "gene_phase",
                 "gene_attributes"]

# Column names, in file order, for one row of a window result table.
field_headers = _snp_columns + _gene_columns

In [8]:
# Trial load of one table (5000 bp, top1_env), naming its columns with `field_headers`.
test = tableFile2namedTuple(top1_env_5000, sep='\t', headers=field_headers)

In [9]:
# Take the first parsed row and confirm its columns are reachable by name.
line1 = test[0]
line1.snp_contig

'Scaffold191'

In [10]:
# Contig column of the gene record paired with this SNP.
line1.gene_contig

'Scaffold191'

In [11]:
# Raw, still unparsed attribute column of the gene record (`key=value` pairs joined by `;`).
line1.gene_attributes

'ID=GFUI013677;biotype=protein_coding'

In [12]:
# List the column names carried by a parsed row; they should mirror `field_headers`.
line1._fields

('snp_contig',
 'snp_start',
 'snp_end',
 'gene_contig',
 'gene_source',
 'gene_type',
 'gene_start',
 'gene_end',
 'gene_score',
 'gene_strand',
 'gene_phase',
 'gene_attributes')

# Functions to collect genes to a SNP

In [13]:
def group_genes_to_SNP(loaded_lines):
    """Collect, for every SNP, the IDs of the genes reported alongside it.

    Args:
        loaded_lines: iterable of parsed rows exposing the attributes
            `snp_contig`, `snp_start`, `snp_end` and `gene_attributes`.

    Returns:
        A `defaultdict(list)` keyed by "contig:start-end" strings; each value
        lists the gene `ID` attributes in the order the rows were seen.
    """
    genes_by_snp = defaultdict(list)

    for row in loaded_lines:
        # One key per distinct SNP position.
        snp_key = "%s:%s-%s" % (row.snp_contig, row.snp_start, row.snp_end)
        # The gene name is the `ID` entry of the parsed gff3 attribute column.
        gene_id = parse_gff3_attributes(row.gene_attributes)['ID']
        genes_by_snp[snp_key].append(gene_id)

    return genes_by_snp
    
        

In [14]:
# Trial run of the grouping on the sample rows held in `test`.
collected_genes = group_genes_to_SNP(test)

In [15]:
# Preview up to three (SNP, genes) pairs. Wrapping in list() is required on
# Python 3, where `.items()` returns a view that cannot be sliced; on
# Python 2 it is a harmless copy.
list(collected_genes.items())[:3]

[('Scaffold191:251841-251842', ['GFUI013677'])]

# Functions to write data out

In [20]:
def save_snp_genes(snp_genes, out_path):
    """Write each SNP and its genes as one tab-separated line.

    Every output line is the SNP key followed by its gene names, all
    separated by tabs and terminated by a newline.

    Args:
        snp_genes: mapping of SNP key (string) to a sequence of gene-name
            strings.
        out_path: path of the file to write; an existing file is overwritten.
    """
    with open(out_path, 'w') as out:
        # `.items()` instead of the Python 2-only `.iteritems()`, so the
        # function runs unchanged on both Python 2 and Python 3.
        for snp_loc, gene_names in snp_genes.items():
            out.write("%s\t%s\n" % (snp_loc, '\t'.join(gene_names)))

# Do the work we came here to do

### Parse all the input files

In [21]:
def _load_window_table(path):
    """Parse one window result table, naming its columns with `field_headers`."""
    return tableFile2namedTuple(path, sep='\t', headers=field_headers)


# 1000 bp window
top1_env_1000_lines = _load_window_table(top1_env_1000)
top1_inf_1000_lines = _load_window_table(top1_inf_1000)

top5_env_1000_lines = _load_window_table(top5_env_1000)
top5_inf_1000_lines = _load_window_table(top5_inf_1000)

# 500 bp window
top1_env_500_lines = _load_window_table(top1_env_500)
top1_inf_500_lines = _load_window_table(top1_inf_500)

top5_env_500_lines = _load_window_table(top5_env_500)
top5_inf_500_lines = _load_window_table(top5_inf_500)

# 5000 bp window
top1_env_5000_lines = _load_window_table(top1_env_5000)
top1_inf_5000_lines = _load_window_table(top1_inf_5000)

top5_env_5000_lines = _load_window_table(top5_env_5000)
top5_inf_5000_lines = _load_window_table(top5_inf_5000)

### Collect all the genes by SNPs

In [22]:
# Group genes by SNP for every dataset, one window size at a time.
# The datasets are processed in the order they are listed.

# 1000 bp window
(top1_env_1000_genes,
 top1_inf_1000_genes,
 top5_env_1000_genes,
 top5_inf_1000_genes) = [group_genes_to_SNP(_rows) for _rows in (top1_env_1000_lines,
                                                                  top1_inf_1000_lines,
                                                                  top5_env_1000_lines,
                                                                  top5_inf_1000_lines)]

# 500 bp window
(top1_env_500_genes,
 top1_inf_500_genes,
 top5_env_500_genes,
 top5_inf_500_genes) = [group_genes_to_SNP(_rows) for _rows in (top1_env_500_lines,
                                                                 top1_inf_500_lines,
                                                                 top5_env_500_lines,
                                                                 top5_inf_500_lines)]

# 5000 bp window
(top1_env_5000_genes,
 top1_inf_5000_genes,
 top5_env_5000_genes,
 top5_inf_5000_genes) = [group_genes_to_SNP(_rows) for _rows in (top1_env_5000_lines,
                                                                  top1_inf_5000_lines,
                                                                  top5_env_5000_lines,
                                                                  top5_inf_5000_lines)]

### Write the info out to files

In [23]:
# Pair each grouped result with its destination file, then write them out in
# the listed order.
_results_and_targets = [
    # 1000 bp window
    (top1_env_1000_genes, top1_env_1000_out),
    (top1_inf_1000_genes, top1_inf_1000_out),
    (top5_env_1000_genes, top5_env_1000_out),
    (top5_inf_1000_genes, top5_inf_1000_out),
    # 500 bp window
    (top1_env_500_genes, top1_env_500_out),
    (top1_inf_500_genes, top1_inf_500_out),
    (top5_env_500_genes, top5_env_500_out),
    (top5_inf_500_genes, top5_inf_500_out),
    # 5000 bp window
    (top1_env_5000_genes, top1_env_5000_out),
    (top1_inf_5000_genes, top1_inf_5000_out),
    (top5_env_5000_genes, top5_env_5000_out),
    (top5_inf_5000_genes, top5_inf_5000_out),
]

for _genes, _target in _results_and_targets:
    save_snp_genes(_genes, _target)