Skip to content

Commit

Permalink
add analysis files to github
Browse files Browse the repository at this point in the history
  • Loading branch information
Timothy Ranallo-Benavidez committed Aug 22, 2019
1 parent 965b559 commit 3026e51
Show file tree
Hide file tree
Showing 8 changed files with 490 additions and 1 deletion.
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -42,7 +42,7 @@ After you have the histogram file, you can run GenomeScope within the online web
### Running GenomeScope Online

Users may prefer to use the online version, which offers all of the same functionality within an easy to use web interface:
http://genomescope.org/
http://genomescope.org/genomescope2.0

### Running GenomeScope on the Command Line

Expand Down
11 changes: 11 additions & 0 deletions analysis/COMMANDS
@@ -0,0 +1,11 @@
python make_random_genome.py 1000000000 random_genome_001Gbp.fa
head -n 100000000 random_genome_001Gbp.fa > random_genome.fa
head -n 100000000 random_genome_001Gbp.fa > random_genome_100Mbp.fa
head -n 10000000 random_genome_001Gbp.fa > random_genome_010Mbp.fa
head -n 1000000 random_genome_001Gbp.fa > random_genome_001Mbp.fa
python heterozygosity_commands.py
bash HETEROZYGOSITY_COMMANDS
python repetitiveness_comands.py
bash REPETITIVENESS_COMMANDS
python length_commands.py
bash LENGTH_COMMANDS
255 changes: 255 additions & 0 deletions analysis/chromosomeMutator.py
@@ -0,0 +1,255 @@
def chromosomeMutator(chromosome,heterozygosities,p, topology_model, top): #"Partition", p=2 to p=6
import random
mutate = {"A": "ACGTAC", "C": "CGTACG", "G": "GTACGT", "T": "TACGTA"}
mutatedchromosomes = {}
length = len(chromosome)
if topology_model == "Partition":
if p == 2:
mutatedchromosomes[1] = list(chromosome)
mutatedchromosomes[2] = list(chromosome)
r = [1-heterozygosities[0]] + heterozygosities
for i in range(length):
chance = random.random()
if chance <= r[0]:
nums = [0, 0]
else:
nums = [0, 1]
mutatedchromosomes[1][i] = mutate[mutatedchromosomes[1][i]][nums[0]]
mutatedchromosomes[2][i] = mutate[mutatedchromosomes[2][i]][nums[1]]
mutatedchromosomes[1] = "".join(mutatedchromosomes[1])
mutatedchromosomes[2] = "".join(mutatedchromosomes[2])
if p == 3:
mutatedchromosomes[1] = list(chromosome)
mutatedchromosomes[2] = list(chromosome)
mutatedchromosomes[3] = list(chromosome)
r = [1-heterozygosities[0]-heterozygosities[1]] + heterozygosities
for i in range(length):
chance = random.random()
if chance <= r[0]:
nums = [0, 0, 0] #how to mutate
elif chance <= r[0] + r[1]:
nums = [0, 0, 1]
else:
nums = [0, 1, 2]
mutatedchromosomes[1][i] = mutate[mutatedchromosomes[1][i]][nums[0]]
mutatedchromosomes[2][i] = mutate[mutatedchromosomes[2][i]][nums[1]]
mutatedchromosomes[3][i] = mutate[mutatedchromosomes[3][i]][nums[2]]
mutatedchromosomes[1] = "".join(mutatedchromosomes[1])
mutatedchromosomes[2] = "".join(mutatedchromosomes[2])
mutatedchromosomes[3] = "".join(mutatedchromosomes[3])
if p == 4:
mutatedchromosomes[1] = list(chromosome)
mutatedchromosomes[2] = list(chromosome)
mutatedchromosomes[3] = list(chromosome)
mutatedchromosomes[4] = list(chromosome)
if top != 0:
r = [1 - heterozygosities[0] - heterozygosities[1] - heterozygosities[2]] + heterozygosities
for i in range(length):
chance = random.random()
if chance <= r[0]:
nums = [0, 0, 0, 0]
elif chance <= r[0] + r[1]:
if top == 1:
nums = [0, 0, 0, 1]
if top == 2:
nums = [0, 0, 1, 1]
elif chance <= r[0] + r[1] + r[2]:
nums = [0, 0, 1, 2]
else:
nums = [0, 1, 2, 3]
mutatedchromosomes[1][i] = mutate[mutatedchromosomes[1][i]][nums[0]]
mutatedchromosomes[2][i] = mutate[mutatedchromosomes[2][i]][nums[1]]
mutatedchromosomes[3][i] = mutate[mutatedchromosomes[3][i]][nums[2]]
mutatedchromosomes[4][i] = mutate[mutatedchromosomes[4][i]][nums[3]]
else:
r = [1 - heterozygosities[0] - heterozygosities[1] - heterozygosities[2] - heterozygosities[3]] + heterozygosities
for i in range(length):
chance = random.random()
if chance <= r[0]:
nums = [0, 0, 0, 0]
elif chance <= r[0] + r[1]:
nums = [0, 0, 0, 1]
elif chance <= r[0] + r[1] + r[2]:
nums = [0, 0, 1, 1]
elif chance <= r[0] + r[1] + r[2] + r[3]:
nums = [0, 0, 1, 2]
else:
nums = [0, 1, 2, 3]
mutatedchromosomes[1][i] = mutate[mutatedchromosomes[1][i]][nums[0]]
mutatedchromosomes[2][i] = mutate[mutatedchromosomes[2][i]][nums[1]]
mutatedchromosomes[3][i] = mutate[mutatedchromosomes[3][i]][nums[2]]
mutatedchromosomes[4][i] = mutate[mutatedchromosomes[4][i]][nums[3]]
mutatedchromosomes[1] = "".join(mutatedchromosomes[1])
mutatedchromosomes[2] = "".join(mutatedchromosomes[2])
mutatedchromosomes[3] = "".join(mutatedchromosomes[3])
mutatedchromosomes[4] = "".join(mutatedchromosomes[4])
if p == 5:
mutatedchromosomes[1] = list(chromosome)
mutatedchromosomes[2] = list(chromosome)
mutatedchromosomes[3] = list(chromosome)
mutatedchromosomes[4] = list(chromosome)
mutatedchromosomes[5] = list(chromosome)
if top!=0:
r = [1 - heterozygosities[0] - heterozygosities[1] - heterozygosities[2] - heterozygosities[3]] + heterozygosities
for i in range(length):
chance = random.random()
if chance <= sum(r[:1]):
nums = [0, 0, 0, 0, 0]
elif chance <= sum(r[:2]):
if top in [1, 2]:
nums = [0, 0, 0, 0, 1]
if top in [3, 4, 5]:
nums = [0, 0, 0, 1, 1]
elif chance <= sum(r[:3]):
if top in [1,3]:
nums = [0, 0, 0, 1, 2]
if top in [2]:
nums = [0, 0, 1, 1, 2]
if top in [4, 5]:
nums = [0, 0, 1, 2, 2]
elif chance <= sum(r[:4]):
if top in [1, 2, 3, 4]:
nums = [0, 0, 1, 2, 3]
if top in [5]:
nums = [0, 1, 2, 3, 3]
else:
nums = [0, 1, 2, 3, 0]
mutatedchromosomes[1][i] = mutate[mutatedchromosomes[1][i]][nums[0]]
mutatedchromosomes[2][i] = mutate[mutatedchromosomes[2][i]][nums[1]]
mutatedchromosomes[3][i] = mutate[mutatedchromosomes[3][i]][nums[2]]
mutatedchromosomes[4][i] = mutate[mutatedchromosomes[4][i]][nums[3]]
mutatedchromosomes[5][i] = mutate[mutatedchromosomes[5][i]][nums[4]]
else:
r = [1 - sum(heterozygosities)] + heterozygosities
for i in range(length):
chance = random.random()
if chance <= sum(r[:1]):
nums = [0, 0, 0, 0, 0]
elif chance <= sum(r[:2]):
nums = [0, 0, 0, 0, 1]
elif chance <= sum(r[:3]):
nums = [0, 0, 0, 1, 1]
elif chance <= sum(r[:4]):
nums = [0, 0, 0, 1, 2]
elif chance <= sum(r[:5]):
nums = [0, 0, 1, 1, 2]
elif chance <= sum(r[:6]):
nums = [0, 0, 1, 2, 3] #nums = [0, 0, 1, 2, 2]
#elif chance <= sum(r[:7]):
# nums = [0, 0, 1, 2, 3]
#elif chance <= sum(r[:8]):
# nums = [0, 1, 2, 3, 3]
else:
nums = [0, 1, 2, 3, 4]
mutatedchromosomes[1][i] = mutate[mutatedchromosomes[1][i]][nums[0]]
mutatedchromosomes[2][i] = mutate[mutatedchromosomes[2][i]][nums[1]]
mutatedchromosomes[3][i] = mutate[mutatedchromosomes[3][i]][nums[2]]
mutatedchromosomes[4][i] = mutate[mutatedchromosomes[4][i]][nums[3]]
mutatedchromosomes[5][i] = mutate[mutatedchromosomes[5][i]][nums[4]]
mutatedchromosomes[1] = "".join(mutatedchromosomes[1])
mutatedchromosomes[2] = "".join(mutatedchromosomes[2])
mutatedchromosomes[3] = "".join(mutatedchromosomes[3])
mutatedchromosomes[4] = "".join(mutatedchromosomes[4])
mutatedchromosomes[5] = "".join(mutatedchromosomes[5])
if p == 6:
mutatedchromosomes[1] = list(chromosome)
mutatedchromosomes[2] = list(chromosome)
mutatedchromosomes[3] = list(chromosome)
mutatedchromosomes[4] = list(chromosome)
mutatedchromosomes[5] = list(chromosome)
mutatedchromosomes[6] = list(chromosome)
if top!=0:
r = [1 - heterozygosities[0] - heterozygosities[1] - heterozygosities[2] - heterozygosities[3] - heterozygosities[4]] + heterozygosities
for i in range(length):
chance = random.random()
if chance <= sum(r[:1]):
nums = [0, 0, 0, 0, 0, 0]
elif chance <= sum(r[:2]):
if top in [1, 2, 3, 4, 5]:
nums = [0, 0, 0, 0, 0, 1]
if top in [6, 7, 8, 9, 10, 11, 12, 13]:
nums = [0, 0, 0, 0, 1, 1]
if top in [14, 15, 16]:
nums = [0, 0, 0, 1, 1, 1]
elif chance <= sum(r[:3]):
if top in [1, 2, 6, 7]:
nums = [0, 0, 0, 0, 1, 2]
if top in [3, 4, 5, 14, 15, 16]:
nums = [0, 0, 0, 1, 1, 2]
if top in [8, 9, 10]:
nums = [0, 0, 0, 1, 2, 2]
if top in [11, 12, 13]:
nums = [0, 0, 1, 1, 2, 2]
elif chance <= sum(r[:4]):
if top in [1, 3, 6, 8, 14]:
nums = [0, 0, 0, 1, 2, 3]
if top in [2, 7, 11]:
nums = [0, 0, 1, 1, 2, 3]
if top in [4, 5, 15, 16]:
nums = [0, 0, 1, 2, 2, 3]
if top in [9, 10, 12, 13]:
nums = [0, 0, 1, 2, 3, 3]
elif chance <= sum(r[:5]):
if top in [1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15]:
nums = [0, 0, 1, 2, 3, 0]
if top in [5, 16]:
nums = [0, 1, 2, 3, 3, 0]
if top in [10, 13]:
nums = [0, 1, 2, 3, 0, 0]
else:
nums = [0, 1, 2, 3, 0, 1]
mutatedchromosomes[1][i] = mutate[mutatedchromosomes[1][i]][nums[0]]
mutatedchromosomes[2][i] = mutate[mutatedchromosomes[2][i]][nums[1]]
mutatedchromosomes[3][i] = mutate[mutatedchromosomes[3][i]][nums[2]]
mutatedchromosomes[4][i] = mutate[mutatedchromosomes[4][i]][nums[3]]
mutatedchromosomes[5][i] = mutate[mutatedchromosomes[5][i]][nums[4]]
mutatedchromosomes[6][i] = mutate[mutatedchromosomes[6][i]][nums[5]]
else:
r = [1 - sum(heterozygosities)] + heterozygosities
for i in range(length):
chance = random.random()
if chance <= sum(r[:1]):
nums = [0, 0, 0, 0, 0, 0]
elif chance <= sum(r[:2]):
nums = [0, 0, 0, 0, 0, 1]
elif chance <= sum(r[:3]):
nums = [0, 0, 0, 0, 1, 1]
elif chance <= sum(r[:4]):
nums = [0, 0, 0, 1, 1, 1]
elif chance <= sum(r[:5]):
nums = [0, 0, 0, 0, 1, 2]
elif chance <= sum(r[:6]):
nums = [0, 0, 0, 1, 1, 2]
elif chance <= sum(r[:7]):
nums = [0, 0, 1, 1, 2, 2] #nums = [0, 0, 0, 1, 2, 2]
elif chance <= sum(r[:8]):
nums = [0, 0, 0, 1, 2, 3] #nums = [0, 0, 1, 1, 2, 2]
elif chance <= sum(r[:9]):
nums = [0, 0, 1, 1, 2, 3] #nums = [0, 0, 0, 1, 2, 3]
elif chance <= sum(r[:10]):
nums = [0, 0, 1, 2, 3, 4] #nums = [0, 0, 1, 1, 2, 3]
#elif chance <= sum(r[:11]):
# nums = [0, 0, 1, 2, 2, 3]
#elif chance <= sum(r[:12]):
# nums = [0, 0, 1, 2, 3, 3]
#elif chance <= sum(r[:13]):
# nums = [0, 0, 1, 2, 3, 4]
#elif chance <= sum(r[:14]):
# nums = [0, 1, 2, 3, 3, 4]
#elif chance <= sum(r[:15]):
# nums = [0, 1, 2, 3, 4, 4]
else:
nums = [0, 1, 2, 3, 4, 5]
mutatedchromosomes[1][i] = mutate[mutatedchromosomes[1][i]][nums[0]]
mutatedchromosomes[2][i] = mutate[mutatedchromosomes[2][i]][nums[1]]
mutatedchromosomes[3][i] = mutate[mutatedchromosomes[3][i]][nums[2]]
mutatedchromosomes[4][i] = mutate[mutatedchromosomes[4][i]][nums[3]]
mutatedchromosomes[5][i] = mutate[mutatedchromosomes[5][i]][nums[4]]
mutatedchromosomes[6][i] = mutate[mutatedchromosomes[6][i]][nums[5]]
mutatedchromosomes[1] = "".join(mutatedchromosomes[1])
mutatedchromosomes[2] = "".join(mutatedchromosomes[2])
mutatedchromosomes[3] = "".join(mutatedchromosomes[3])
mutatedchromosomes[4] = "".join(mutatedchromosomes[4])
mutatedchromosomes[5] = "".join(mutatedchromosomes[5])
mutatedchromosomes[6] = "".join(mutatedchromosomes[6])
return mutatedchromosomes
38 changes: 38 additions & 0 deletions analysis/heterozygosity_commands.py
@@ -0,0 +1,38 @@
import itertools

num_het = {1:0, 2:1, 3:2, 4:4, 5:6, 6:10} #full model
t = 0 #topology = 0 (full model)
genome_filename = "random_genome.fa"
k = 21
coverage = 15
read_length = 100
br = 0
err = 0.01
model = "Partition"

COMMANDS_file = open("HETEROZYGOSITY_COMMANDS", 'w')
for i in range(1,8):
for p in [3,4,5,6]:
COMMANDS_file.write("parallel -j 17 < HETEROZYGOSITY_COMMANDS"+str(i)+"_p"+str(p)+"\n")

COMMANDS_file.close()

for p in [3, 4, 5, 6]:
files = [open("HETEROZYGOSITY_COMMANDS" + str(i) + "_p" + str(p),'w') for i in range(1, 8)]
lines = []
for d in [0.1]: #10% repetitiveness
rs = [[0.005*x/(num_het[p]) for _ in range(num_het[p])] for x in range(51)] #heterozygosity from 0% to 25% in 0.5% increments, each het form has equal het
for r in rs:
lines.append("python parameteranalysis.py " + str(k) + " " + str(p) + " " + str(coverage) + " " + str(read_length) + " " + str(br) + " " + ' '.join(['%.6f' % x for x in r]) + " " + genome_filename + " " + str(err) + " " + model + " " + '%.3f' % d + " " + str(t) + "\n")
readfilenameprefix = genome_filename + "_k" + str(k) + "_p" + str(p) + "_cov" + str(coverage) + "_rl" + str(read_length) + "_br" + str(br) + "_het" + '_'.join(['%.6f' % x for x in r]) + "_err" + '%.3f' % err + "_d" + '%.3f' % d + "_top" + str(float(t)) + "_" + model + "_reads"
lines.append("jellyfish count -C -m21 -s 1000000000 -t15 <(zcat " + readfilenameprefix + ".fa.gz) -o " + readfilenameprefix + ".jf" + "\n")
lines.append("jellyfish histo -t15 " + readfilenameprefix + ".jf > " + readfilenameprefix + ".histo" + "\n")
lines.append("genomescope.R -i " + readfilenameprefix + ".histo -k" + str(k) + " -p" + str(p) + " -l10 -t 0 --transform_exp 0" + " -o HETEROZYGOSITY_OUTPUT_p" + str(p) + "_transform_exp0 -n " + "het" + '_'.join(['%.6f' % x for x in r]) + "_err" + '%.3f' % err + "_d" + '%.3f' % d + "_top" + str(t) + " --testing --true_params=" + '%.3f' % d + "," + ",".join(['%.6f' % x for x in r]) + "," + str(t) + "\n")
lines.append("genomescope.R -i " + readfilenameprefix + ".histo -k" + str(k) + " -p" + str(p) + " -l10 -t 0 --transform_exp 1" + " -o HETEROZYGOSITY_OUTPUT_p" + str(p) + "_transform_exp1 -n " + "het" + '_'.join(['%.6f' % x for x in r]) + "_err" + '%.3f' % err + "_d" + '%.3f' % d + "_top" + str(t) + " --testing --true_params=" + '%.3f' % d + "," + ",".join(['%.6f' % x for x in r]) + "," + str(t) + "\n")
lines.append("genomescope.R -i " + readfilenameprefix + ".histo -k" + str(k) + " -p" + str(p) + " -l10 -t 0 --transform_exp 2" + " -o HETEROZYGOSITY_OUTPUT_p" + str(p) + "_transform_exp2 -n " + "het" + '_'.join(['%.6f' % x for x in r]) + "_err" + '%.3f' % err + "_d" + '%.3f' % d + "_top" + str(t) + " --testing --true_params=" + '%.3f' % d + "," + ",".join(['%.6f' % x for x in r]) + "," + str(t) + "\n")
lines.append("genomescope.R -i " + readfilenameprefix + ".histo -k" + str(k) + " -p" + str(p) + " -l10 -t 0 --transform_exp 3" + " -o HETEROZYGOSITY_OUTPUT_p" + str(p) + "_transform_exp3 -n " + "het" + '_'.join(['%.6f' % x for x in r]) + "_err" + '%.3f' % err + "_d" + '%.3f' % d + "_top" + str(t) + " --testing --true_params=" + '%.3f' % d + "," + ",".join(['%.6f' % x for x in r]) + "," + str(t) + "\n")
for i in range(7):
files[i].write(lines[i])

for i in range(7):
files[i].close()
36 changes: 36 additions & 0 deletions analysis/length_commands.py
@@ -0,0 +1,36 @@
import itertools

num_het = {1:0, 2:1, 3:2, 4:4, 5:6, 6:10} #full model
t = 0 #topology = 0 (full model)
k = 21
coverage = 15
read_length = 100
br = 0
err = 0.01
model = "Partition"
d = 0.10

COMMANDS_file = open("LENGTH_COMMANDS", 'w')
for i in range(1,8):
for p in [3,4,5,6]:
COMMANDS_file.write("parallel -j 4 < LENGTH_COMMANDS"+str(i)+"_p"+str(p)+"\n")

for p in [3, 4, 5, 6]:
files = [open("LENGTH_COMMANDS" + str(i) + "_p" + str(p),'w') for i in range(1, 8)]
lines = []
for genome_filename in ["random_genome_001Mbp.fa", "random_genome_010Mbp.fa", "random_genome_100Mbp.fa", "random_genome_001Gbp.fa"]:
rs = [[0.001/(num_het[p]) for _ in range(num_het[p])]] #heterozygosity 2.0%, equally divided among each het form
for r in rs:
lines.append("python parameteranalysis.py " + str(k) + " " + str(p) + " " + str(coverage) + " " + str(read_length) + " " + str(br) + " " + ' '.join(['%.6f' % x for x in r]) + " " + genome_filename + " " + str(err) + " " + model + " " + '%.3f' % d + " " + str(t) + "\n")
readfilenameprefix = genome_filename + "_k" + str(k) + "_p" + str(p) + "_cov" + str(coverage) + "_rl" + str(read_length) + "_br" + str(br) + "_het" + '_'.join(['%.6f' % x for x in r]) + "_err" + '%.3f' % err + "_d" + '%.3f' % d + "_top" + str(float(t)) + "_" + model + "_reads"
lines.append("jellyfish count -C -m21 -s 1000000000 -t15 <(zcat " + readfilenameprefix + ".fa.gz) -o " + readfilenameprefix + ".jf" + "\n")
lines.append("jellyfish histo -t15 " + readfilenameprefix + ".jf > " + readfilenameprefix + ".histo" + "\n")
lines.append("genomescope.R -i " + readfilenameprefix + ".histo -k" + str(k) + " -p" + str(p) + " -l10 -t 0 --transform_exp 0" + " -o LENGTH_OUTPUT_p" + str(p) + "_transform_exp0 -n " + "het" + '_'.join(['%.6f' % x for x in r]) + "_err" + '%.3f' % err + "_d" + '%.3f' % d + "_top" + str(t) + " --testing --true_params=" + '%.3f' % d + "," + ",".join(['%.6f' % x for x in r]) + "," + str(t) + "\n")
lines.append("genomescope.R -i " + readfilenameprefix + ".histo -k" + str(k) + " -p" + str(p) + " -l10 -t 0 --transform_exp 1" + " -o LENGTH_OUTPUT_p" + str(p) + "_transform_exp1 -n " + "het" + '_'.join(['%.6f' % x for x in r]) + "_err" + '%.3f' % err + "_d" + '%.3f' % d + "_top" + str(t) + " --testing --true_params=" + '%.3f' % d + "," + ",".join(['%.6f' % x for x in r]) + "," + str(t) + "\n")
lines.append("genomescope.R -i " + readfilenameprefix + ".histo -k" + str(k) + " -p" + str(p) + " -l10 -t 0 --transform_exp 2" + " -o LENGTH_OUTPUT_p" + str(p) + "_transform_exp2 -n " + "het" + '_'.join(['%.6f' % x for x in r]) + "_err" + '%.3f' % err + "_d" + '%.3f' % d + "_top" + str(t) + " --testing --true_params=" + '%.3f' % d + "," + ",".join(['%.6f' % x for x in r]) + "," + str(t) + "\n")
lines.append("genomescope.R -i " + readfilenameprefix + ".histo -k" + str(k) + " -p" + str(p) + " -l10 -t 0 --transform_exp 3" + " -o LENGTH_OUTPUT_p" + str(p) + "_transform_exp3 -n " + "het" + '_'.join(['%.6f' % x for x in r]) + "_err" + '%.3f' % err + "_d" + '%.3f' % d + "_top" + str(t) + " --testing --true_params=" + '%.3f' % d + "," + ",".join(['%.6f' % x for x in r]) + "," + str(t) + "\n")
for i in range(7):
files[i].write(lines[i])

for i in range(7):
files[i].close()
18 changes: 18 additions & 0 deletions analysis/make_random_genome.py
@@ -0,0 +1,18 @@
#python make_random_genome.py length output

#requires Python 3.6 or higher
import random
import sys

length = int(sys.argv[1])
output = sys.argv[2]

f = open(output, 'w')
num_lines_before_last = length//50
num_chars_last_line = length - num_lines_before_last*50

for i in range(num_lines_before_last):
f.write("".join(random.choices("actg", k=50))+"\n")

f.write("".join(random.choices("actg", k=num_chars_last_line)))
f.close()

0 comments on commit 3026e51

Please sign in to comment.