In [1]:
from spartan.utils.genome_specific.GfusI1 import GfusI1_0

import pybedtools as pbt

In [2]:
# Set up file paths.
# Every input lives under one data root, so the prefix is defined once;
# change DATA_ROOT if the data is reachable somewhere else.
DATA_ROOT = "/home/gus/remote_mounts/louise/data"

# Genome assembly FASTA (its headers are used later to build the contig name map)
assembly = DATA_ROOT + "/genomes/glossina_fuscipes/assemblies/GfusI1/Glossina-fuscipes-IAEA_SCAFFOLDS_GfusI1.fa"

## SNPs of interest BED files
SNP_BED_DIR = DATA_ROOT + "/projects/ddrad58/SNPs_of_interest"
top1_env_bed = SNP_BED_DIR + "/Top01_PopPairwiseOverlap_Environm.bed"
top5_env_bed = SNP_BED_DIR + "/Top05_PopPairwiseOverlap_Environm.bed"
top1_infection_bed = SNP_BED_DIR + "/Top01_PopPairwiseOverlap_Infection.bed"
top5_infection_bed = SNP_BED_DIR + "/Top05_PopPairwiseOverlap_Infection.bed"

## Full VCF of S

# Locating the references to the MicroSats in email
- Found in Gmail with this search: "`chaz.hyseni@gmail.com micro update`"
- He included the sequence for `D101`, but for `Gmm8` only a link to a paper that lists the primers and not the sequence itself
- The `Gmm8` sequence was located via Google at http://goo.gl/W1nL8g

# Seq file created

### MicroSat data:
```
FILENAME: /home/gus/remote_mounts/louise/data/projects/ddrad58/seqs/chaz_environment_microsats.fasta

>D101 with flanking regions
GCATATATTGTGAAAAGCTCATGCGTCTGCCTTTACACTGCATACTACCAGCTATAATTGGTTGTTGCAAGAGCTGAGTCTTATTGGCTGATTGGTTGCTCATACTATATGCGCGCTCTGGTTGATTAGAGAAAACTACTGTTGCTGCTGCTGCTGCTGCTGCTGCCGCTGCTGCTATCCAACACATCATTGCTCCTCTTTTTTTTTATTGTGCATTTTTGTTTAGCAGAATATTCGCTCTTAACCTACGGTTTGTGGATAGTAATAAGCAAAAGAGAAGAAGAAAAAGAAAAAGAAGAAGAAGAAAAAGCAAAAGCAGAATATGCGATTTTTATTCTCCACATAATATGTTTATGCCTTCTCCGTTGCAATAAGATTTTCTCGTTTTTGTTGTTTTTCGAGTGTTTCATCATCTGTTGCCAATTGTCAGTTGCAACGCGTC

>Gmm8 from morsitans (http://goo.gl/W1nL8g)
CATATGACTGAACATTATATCATGCAGATGCAATGCGGAGAGAGAGAGAGAGAGAGAGAGTGAGAGAGAGTGAGATTGAA
AGCAGCAGACTGCACGTAGGGTCGACTGGGCGTCATTGTCAGTTGCGAAAGCAAACATTGAAGCGCGCCCAAGCAAATGA
ACGAGCAACTGAGCAAACAGACAAACAAATAAACAAACATTACAGCTAAACATTAAATACATAGAAAGT
```



# BLASTing Vectorbase

### Unambiguous results for both seqs

| Hit        	| Query 	| Aln Length 	| E-value 	| Score 	| Identity 	| Query Start 	| Query End 	| Hit Start 	| Hit End 	|
|------------	|-------	|------------	|---------	|-------	|----------	|-------------	|-----------	|-----------	|---------	|
| KK351934.1 	| D101  	| 440        	| 0.0     	| 812   	| 97.1%    	| 3           	| 442       	| 236461    	| 236901  	|
| KK352191.1 	| Gmm8  	| 229        	| 4e-84   	| 346   	| 90.4%    	| 1           	| 229       	| 103215    	| 103434  	|


### Creating `BED` records

In [3]:
# Build a lookup from the FASTA headers of the assembly. The cells below use it
# to translate the accessions reported by BLAST (e.g. "KK351934.1") into the
# "ScaffoldNNN" contig names that the SNP BED files use.
name_map = GfusI1_0.get_name_map_from_fasta_headers(assembly)

In [4]:
# Contig name for the accession that the D101 query hit in the BLAST table
name_map["KK351934.1"]

'Scaffold149'

In [5]:
# Contig name for the accession that the Gmm8 query hit in the BLAST table
name_map["KK352191.1"]

'Scaffold408'

In [6]:
# Tab-separated 4-column BED record: contig, start, end, name
bed_template = "{contig}\t{start}\t{end}\t{name}"

In [7]:
# BLAST hits copied from the results table above:
# (hit accession, hit start, hit end, microsatellite name)
blast_hits = [
    ("KK351934.1", 236461, 236901, "D101"),
    ("KK352191.1", 103215, 103434, "Gmm8"),
]

# BED starts are 0-based while the end is exclusive, so only the start of the
# BLAST hit coordinates is shifted by one.
bed_lines = [
    bed_template.format(contig=name_map[accession], start=hit_start - 1, end=hit_end, name=marker)
    for accession, hit_start, hit_end, marker in blast_hits
]

# print(...) with a single argument behaves the same on Python 2 and Python 3
for bed_line in bed_lines:
    print(bed_line)

Scaffold149	236460	236901	D101
Scaffold408	103214	103434	Gmm8


### MicroSat `BED` file

```
FILENAME: /home/gus/remote_mounts/louise/data/projects/ddrad58/seqs/chaz_environment_microsats.bed

Scaffold149	236460	236901	D101
Scaffold408	103214	103434	Gmm8
```

In [8]:
# Persist the microsatellite BED records to disk so they can be loaded as a BedTool.
micro_sat_bed_path = "/home/gus/remote_mounts/louise/data/projects/ddrad58/seqs/chaz_environment_microsats.bed"

with open(micro_sat_bed_path, 'w') as micro_sat_bed_file:
    # one newline-terminated record per entry in bed_lines
    micro_sat_bed_file.writelines(record + '\n' for record in bed_lines)

# Get locations of SNPs of interest near the MicroSats

In [9]:
# Load the SNP-of-interest BED files as BedTool objects
top1_env = pbt.BedTool(top1_env_bed)
top5_env = pbt.BedTool(top5_env_bed)
top1_infection = pbt.BedTool(top1_infection_bed)
top5_infection = pbt.BedTool(top5_infection_bed)

# Load the microsatellite BED file from the path it was written to, rather than
# repeating the literal (the two copies could silently drift apart).
micro_sats = pbt.BedTool(micro_sat_bed_path)

In [10]:
# Peek at the first records to check the file loaded and which contig naming it uses
top5_infection.head()

Scaffold740	14262	14263
 Scaffold462	241847	241848
 Scaffold422	60573	60574
 Scaffold73	811197	811198
 Scaffold1016	755	756
 Scaffold775	12872	12873
 Scaffold466	123761	123762
 Scaffold661	127096	127097
 Scaffold73	811201	811202
 Scaffold249	119775	119776



In [11]:
# Confirm the microsatellite records use the same "ScaffoldNNN" contig names as the SNP files
micro_sats.head()

Scaffold149	236460	236901	D101
 Scaffold408	103214	103434	Gmm8



## Environmental SNPs

In [12]:
# Pair each microsatellite with any `top1_env` SNP that falls within `w` bp of it
# (bedtools `window`); the length is the number of microsatellite/SNP pairs found.
# NOTE(review): w is 100 Mb here but 100 kb for the `top5_env` query in the next
# cell — confirm the different window sizes are intentional.
top1_env_near_micro_sats = micro_sats.window(top1_env, w=100000000)
len(top1_env_near_micro_sats)

0

In [13]:
# Same query against the `top5_env` SNPs, using a 100 kb window.
# NOTE(review): the other window queries in this notebook use 100 Mb or 1 Mb, so
# this count is not directly comparable with theirs — confirm the intended size.
top5_env_near_micro_sats = micro_sats.window(top5_env, w=100000)
len(top5_env_near_micro_sats)

1

## Infection SNPs

In [42]:
# Pair each microsatellite with any `top1_infection` SNP within `w` bp of it
# (bedtools `window`), using a 100 Mb window.
# NOTE(review): the `top5_infection` query below uses 1 Mb instead — confirm the
# different window sizes are intentional.
top1_inf_near_micro_sats = micro_sats.window(top1_infection, w=100000000)
len(top1_inf_near_micro_sats)

0

In [47]:
# Same query against the `top5_infection` SNPs, using a 1 Mb window.
# NOTE(review): the `top5_env` query above uses 100 kb, ten times smaller, so the
# two top-5 counts (4 here vs 1 there) were not obtained under the same window.
top5_inf_near_micro_sats = micro_sats.window(top5_infection, w=1000000)
len(top5_inf_near_micro_sats)

4