In [1]:
from spartan.utils.genome_specific.GfusI1 import GfusI1_0

In [2]:
# Input/output locations used by the rest of the notebook.
# The genome FASTA and both VCFs live under one species directory; the two
# VCFs additionally share a filename stem, so those prefixes are spelled once.
_genome_root = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes"
_snp_stem = _genome_root + "/annotations/SNPs/tsetseFINAL_14Oct2014_f2_53.recode"

# Assembly FASTA whose headers supply the contig names.
fasta = _genome_root + "/assemblies/GfusI1/Glossina-fuscipes-IAEA_SCAFFOLDS_GfusI1.fa"

# VCF to read, and the rewritten VCF to produce.
vcf_in = _snp_stem + ".renamed_scaffolds.vcf"
vcf_out = _snp_stem + ".contigs_as_integers.vcf"

# Tab-separated contig -> integer table saved for later reuse.
contig_integer_map_out = "/home/gus/Dropbox/repos/git/glossina_genome_stuff/data/GfusI1/contig_to_integer_map.tsv"

# Read the FASTA headers to get the original contig name map

In [3]:
# Build `name_map` from the headers of the assembly FASTA via the project helper.
# The recorded preview shows keys such as 'KK352610.1' and values such as
# 'Scaffold839'; presumably accession -> scaffold name (helper not shown, confirm).
name_map = GfusI1_0.get_name_map_from_fasta_headers(fasta)

In [4]:
# Preview the first four (key, value) pairs of `name_map`.
# list(...) makes the slice valid on Python 3 as well, where items() returns a
# non-sliceable view; on Python 2 the displayed result is unchanged.
list(name_map.items())[:4]

[('KK352610.1', 'Scaffold839'),
 ('KK352346.1', 'Scaffold566'),
 ('KK352241.1', 'Scaffold458'),
 ('JFJR01012964.1', 'JFJR01012964.1')]

# Use the values of the `name_map` dict as keys in a new `integer_map`

In [5]:
# Give every contig name (the values of `name_map`) an integer ID, stored as a
# string. IDs are handed out in `name_map`'s iteration order, so a dict that
# iterates in a different order yields a different mapping. If two keys share
# the same value, the later index overwrites the earlier one.
#
# enumerate() replaces the hand-maintained counter, and .values() (unlike the
# Python-2-only .itervalues()) works on both Python 2 and 3 in the same order.
integer_map = {}
for index, contig in enumerate(name_map.values()):
    integer_map[contig] = str(index)

In [6]:
# Preview the first three (contig, integer-string) pairs of `integer_map`.
# list(...) keeps the slice valid on Python 3, where items() is a view;
# the displayed result on Python 2 is unchanged.
list(integer_map.items())[:3]

[('JFJR01012964.1', '3'), ('JFJR01013273.1', '33'), ('JFJR01013055.1', '1190')]

In [7]:
# Spot-check one entry: display the integer string assigned to 'Scaffold839'.
integer_map['Scaffold839']

'0'

In [8]:
# Spot-check a second entry: display the integer string assigned to 'Scaffold566'.
integer_map['Scaffold566']

'1'

# Store the new map on disk for later

In [9]:
# Persist `integer_map` as a two-column, tab-separated text file:
# one "<contig>\t<integer>" row per entry, in the dict's iteration order.
line = "{contig}\t{integer}\n"

with open(contig_integer_map_out, 'w') as int_map:
    # .items() runs on both Python 2 and 3 (iteritems() exists only on
    # Python 2); the rows written are the same either way.
    for contig, integer in integer_map.items():
        int_map.write(line.format(contig=contig,
                                  integer=integer))

In [10]:
# Shell escape: show the first lines of the map file to eyeball its format.
!head $contig_integer_map_out

JFJR01012964.1	3
JFJR01013273.1	33
JFJR01013055.1	1190
JFJR01012966.1	9
JFJR01013005.1	11
JFJR01013308.1	14
JFJR01013269.1	15
Scaffold1098	606
Scaffold1096	1701
Scaffold1097	2100


# Change VCF chrom names to integers with `integer_map` 

In [11]:
# Run the project helper on `vcf_in`, writing to `vcf_out`, with `integer_map`
# supplied as the name map. Presumably it rewrites every contig name to its
# integer ID (the recorded output shows integer IDs in the ##contig header lines
# and in the first column of the last record) — helper not shown, confirm.
GfusI1_0.change_vcf_chrom_names(in_path=vcf_in, out_path=vcf_out, name_map=integer_map)

In [13]:
# Shell escape: inspect the first header lines of the rewritten VCF.
!head $vcf_out

##fileformat=VCFv4.1
##samtoolsVersion=0.1.19-44428cd
##reference=file:///scratch/ag674/reference/Glossina-fuscipes-IAEA_SCAFFOLDS_GfusI1.fa
##contig=<ID=441,length=3329503>
##contig=<ID=1289,length=2865261>
##contig=<ID=434,length=2711279>
##contig=<ID=1217,length=2629603>
##contig=<ID=374,length=2590142>
##contig=<ID=1937,length=2519330>
##contig=<ID=419,length=2427971>


In [14]:
# Shell escape: inspect the last record of the rewritten VCF.
!tail -1 $vcf_out

2202	104	.	G	A	999	.	.	GT:PL:DP:SP:GQ	0/0:0,69,255:23:0:76	0/0:0,48,255:16:0:55	0/0:0,63,255:21:0:70	0/0:0,96,255:32:0:99	0/0:0,253,255:84:0:99	0/0:0,90,255:30:0:97	0/0:0,90,255:30:0:97	0/0:0,66,255:22:0:73	0/0:0,30,236:10:0:37	0/0:0,66,255:22:0:73	0/0:0,60,255:20:0:67	0/0:0,48,255:16:0:55	0/0:0,255,255:95:0:99	0/0:0,138,255:46:0:99	0/0:0,166,255:55:0:99	./.:0,12,119:4:0:19	0/0:0,166,255:55:0:99	0/0:0,84,255:28:0:91	0/0:0,114,255:38:0:99	0/0:0,151,255:50:0:99	0/0:0,72,255:24:0:79	0/0:0,102,255:34:0:99	0/0:0,69,255:23:0:76	0/0:0,42,255:14:0:49	0/0:0,72,255:24:0:79	0/0:0,75,255:25:0:82	0/0:0,172,255:57:0:99	0/0:0,148,255:49:0:99	0/0:0,102,255:34:0:99	0/1:58,0,255:68:0:51	0/1:104,0,255:32:0:97	0/1:76,0,255:64:0:69	0/1:73,0,255:24:0:66	0/1:20,0,255:74:0:14	0/0:0,96,255:32:0:99	0/1:134,0,225:26:0:99	0/1:111,0,251:27:0:99	0/1:165,0,255:95:0:99	0/1:152,0,255:72:0:99	0/0:0,255,255:112:0:99	0/1:105,0,255:48:0:98	0/0:0,169,255:56:0:99	0/1:74,0,188:14:0:67	0/0:0,108,255:36:0:99	0/0:0,84,255:28:0:

# OK, it looks like we are good: the `##contig` header lines and the last record of the output VCF now use integer contig IDs.