In [ ]:
import csv
from spartan.utils.genome_specific.GfusI1 import GfusI1_0

In [ ]:
# File Paths
# Every path below hangs off DATA_ROOT, so pointing the notebook at another
# mount or machine only requires editing that single line.
DATA_ROOT = "/home/gus/remote_mounts/louise/data"
SNP_DIR = DATA_ROOT + "/projects/ddrad58/SNPs_of_interest"

## SNPs of interest CSV [ IN FILES ]
# NOTE: despite the .csv extension these files are read as tab-delimited.

top1_env = SNP_DIR + "/Top01_PopPairwiseOverlap_Environm.csv"
top5_env = SNP_DIR + "/Top05_PopPairwiseOverlap_Environm.csv"
top1_infection = SNP_DIR + "/Top01_PopPairwiseOverlap_Infection.csv"
top5_infection = SNP_DIR + "/Top05_PopPairwiseOverlap_Infection.csv"

## SNPs of interest BED [ OUT FILES ]
# Each BED file sits next to its input and shares its base name.

top1_env_bed = SNP_DIR + "/Top01_PopPairwiseOverlap_Environm.bed"
top5_env_bed = SNP_DIR + "/Top05_PopPairwiseOverlap_Environm.bed"
top1_infection_bed = SNP_DIR + "/Top01_PopPairwiseOverlap_Infection.bed"
top5_infection_bed = SNP_DIR + "/Top05_PopPairwiseOverlap_Infection.bed"

## Fasta file for renaming contigs
fasta = DATA_ROOT + "/genomes/glossina_fuscipes/assemblies/GfusI1/Glossina-fuscipes-IAEA_SCAFFOLDS_GfusI1.fa"

# Functions to convert lines from `in_files` to BED type 
### In file format:

```
SNP #	Scaffold	Position
26123	KK352174.1	73419
49227	KK351976.1	251842
1319	KK351787.1	898092
```

- `Position` comes from a VCF, so it is 1-based. BED intervals are 0-based and half-open, so a SNP at position `p` is written as `chromStart = p - 1`, `chromEnd = p`.

In [3]:
def make_BED_line(in_line, name_map):
    """Turn one parsed input row into the three leading BED fields.

    Parameters
    ----------
    in_line : sequence of str
        A row laid out as ``(SNP #, Scaffold, Position)``. Only index 1
        (scaffold name) and index 2 (1-based position) are read.
    name_map : mapping
        Lookup from the scaffold name used in the input file to the name
        that should appear in the BED output.

    Returns
    -------
    tuple of str
        ``(chrom, chromStart, chromEnd)`` where ``chromStart`` is
        ``Position - 1`` and ``chromEnd`` is ``Position``.

    Raises
    ------
    KeyError
        If the scaffold name is not present in ``name_map``.
    ValueError
        If the position is not an integer, or is smaller than 1 (which
        would otherwise produce a negative, invalid ``chromStart``).
    """
    chrom = name_map[in_line[1]]

    # Parse once; the same value supplies both interval ends.
    position = int(in_line[2])
    if position < 1:
        raise ValueError(
            "1-based position must be >= 1, got %r for scaffold %r"
            % (in_line[2], in_line[1])
        )

    return chrom, str(position - 1), str(position)

In [4]:
def get_in_lines(path, skip_first_row=True):
    """Lazily yield the rows of a tab-delimited file as lists of strings.

    Parameters
    ----------
    path : str
        Location of the tab-delimited input file.
    skip_first_row : bool, optional
        When true (the default) the first row is treated as a header and
        discarded. An empty file simply yields nothing.

    Yields
    ------
    list of str
        One list per non-blank row, split on tab characters.

    Notes
    -----
    This is a generator: the file is opened on the first request for a row
    and stays open until the generator is exhausted or closed.
    """
    import sys  # local import so this cell does not depend on another cell

    # The csv module wants a binary handle on Python 2 but a text handle
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(path, 'rb')
    else:
        csvfile = open(path, 'r', newline='')

    with csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        if skip_first_row:
            # Builtin next() with a default works on both Python lines and
            # does not leak StopIteration out of the generator on an empty file.
            next(reader, None)
        for line in reader:
            # csv yields [] for blank lines (e.g. trailing newlines); those
            # carry no SNP and would break downstream column indexing.
            if line:
                yield line

In [15]:
def write_line(out_file, line):
    """Emit one record to ``out_file`` as a tab-separated, newline-ended row.

    ``line`` is an iterable of strings (the BED fields); ``out_file`` is any
    object exposing ``write``.
    """
    record = '\t'.join(line) + "\n"
    out_file.write(record)

In [16]:
def convert_a_file(in_path, out_path, name_map):
    """Convert one SNP table into a BED file.

    Parameters
    ----------
    in_path : str
        Tab-delimited input file; its first row is skipped as a header.
    out_path : str
        Destination BED file. It is created, or truncated if it exists.
    name_map : mapping
        Scaffold-name lookup handed to ``make_BED_line`` for every row.

    Each input row is passed through ``make_BED_line`` and the resulting
    fields are written with ``write_line``, one row per output line.
    """
    import sys  # local import so this cell does not depend on another cell

    # Rows are written as str. Python 2 accepts that on a binary handle, but
    # Python 3 needs a text handle; newline='\n' disables newline translation
    # so the bytes on disk match what the binary mode produced.
    if sys.version_info[0] < 3:
        out_file = open(out_path, 'wb')
    else:
        out_file = open(out_path, 'w', newline='\n')

    with out_file:
        for line in get_in_lines(in_path):
            bed_data = make_BED_line(line, name_map)
            write_line(out_file, bed_data)

# Get name_map to rename contigs

In [17]:
# Lookup used to rename contigs: it is indexed by the accession found in the
# input files' Scaffold column and gives the name written to the BED output.
# NOTE(review): presumably built by parsing the FASTA headers, going by the
# helper's name -- confirm against spartan's GfusI1_0 module.
name_map = GfusI1_0.get_name_map_from_fasta_headers(fasta)

In [18]:
name_map['KK352174.1']

'Scaffold391'

# Sanity checking 

In [19]:
lines = get_in_lines(top1_env)

In [20]:
# Pull the first data row; builtin next() works on Python 2.6+ and Python 3,
# whereas the .next() method exists only on Python 2 iterators.
line = next(lines)

In [21]:
line

['49227', 'KK351976.1', '251842']

In [22]:
make_BED_line(line,name_map)

('Scaffold191', '251841', '251842')

# Beginning conversion

### Top 1% Environment SNPs 

In [23]:
!head $top1_env

SNP #	Scaffold	Position
49227	KK351976.1	251842


In [24]:
convert_a_file(top1_env, top1_env_bed, name_map)

### Top 5% Environment SNPs 

In [25]:
!head $top5_env

SNP #	Scaffold	Position
26123	KK352174.1	73419
49227	KK351976.1	251842
1319	KK351787.1	898092
76179	KK351889.1	227984
86798	KK352048.1	29888
135024	KK351903.1	256390
43167	KK351851.1	856904
128541	KK351842.1	109755
75111	KK351881.1	152005


In [26]:
convert_a_file(top5_env, top5_env_bed, name_map)

In [27]:
!head $top5_env_bed

Scaffold391	73418	73419
Scaffold191	251841	251842
Scaffold2	898091	898092
Scaffold104	227983	227984
Scaffold264	29887	29888
Scaffold118	256389	256390
Scaffold66	856903	856904
Scaffold57	109754	109755
Scaffold96	152004	152005
Scaffold280	268589	268590


### Top 1% Infection SNPs 

In [30]:
!head $top1_infection

SNP #	Scaffold	Position
87549	KK352075.1	27449
57202	KK352168.1	195052
99393	KK351790.1	1269187
56393	KK352156.1	43299
75062	KK351864.1	874988
97450	KK352680.1	51363
30917	KK352381.1	116614
63964	JFJR01010968.1	263
57259	KK352168.1	307323


In [31]:
convert_a_file(top1_infection, top1_infection_bed, name_map)

In [32]:
!head $top1_infection_bed

Scaffold291	27448	27449
Scaffold385	195051	195052
Scaffold5	1269186	1269187
Scaffold373	43298	43299
Scaffold79	874987	874988
Scaffold912	51362	51363
Scaffold601	116613	116614
JFJR01010968.1	262	263
Scaffold385	307322	307323
Scaffold1030	15971	15972


### Top 5% Infection SNPs 

In [33]:
!head $top5_infection

SNP #	Scaffold	Position
123121	KK352518.1	14263
27834	KK352245.1	241848
90992	KK352205.1	60574
44182	KK351858.1	811198
34205	KK352774.1	756
62782	KK352552.1	12873
118902	KK352249.1	123762
122466	KK352440.1	127097
44184	KK351858.1	811202


In [34]:
convert_a_file(top5_infection, top5_infection_bed, name_map)

In [35]:
!head $top5_infection_bed

Scaffold740	14262	14263
Scaffold462	241847	241848
Scaffold422	60573	60574
Scaffold73	811197	811198
Scaffold1016	755	756
Scaffold775	12872	12873
Scaffold466	123761	123762
Scaffold661	127096	127097
Scaffold73	811201	811202
Scaffold249	119775	119776
