Skip to content

Commit

Permalink
Added scripts for new genomes and allele frequency queries, and modif…
Browse files Browse the repository at this point in the history
…ied *_map.py outputs.
  • Loading branch information
xwu committed May 1, 2009
1 parent 0b6a48d commit 7ce8522
Show file tree
Hide file tree
Showing 9 changed files with 522 additions and 39 deletions.
49 changes: 49 additions & 0 deletions core/cgi_to_gff.py
@@ -0,0 +1,49 @@
#!/usr/bin/python
# Filename: cgi_to_gff.py

"""
usage: %prog CGI-Variations-Compact.csv ...
"""

# Output GFF record for each entry in file(s)
# ---
# This code is part of the Trait-o-matic project and is governed by its license.

import fileinput, os, sys

def main():
# return if we don't have the correct arguments
if len(sys.argv) < 2:
raise SystemExit(__doc__.replace("%prog", sys.argv[0]))

for line in fileinput.input():
if line.startswith("#"):
continue
l = line.strip().split(",")
if len(l) < 10 or l[4] not in ["=", "snp"] or l[5] not in ["=", "snp"]:
continue

chr = l[1]
# for now, we treat PAR1 and PAR2 normally, with chrX coordinates
if chr in ["PAR1", "PAR2", "chrXnonPAR"]:
chr = "chrX"
elif chr == "chrYnonPAR":
chr = "chrY"

out_line = chr
out_line += "\tcgi\tSNP\t"
out_line += str(long(l[2]) + 1) + "\t" + l[3]
out_line += "\t.\t+\t.\t"

if l[7] == l[8]:
out_line += "alleles " + l[7]
else:
out_line += "alleles " + l[7] + "/" + l[8]

out_line += ";ref_allele " + l[6]
out_line += ";total_score " + l[9]

print out_line

if __name__ == "__main__":
main()
59 changes: 47 additions & 12 deletions core/gff_hgmd_map.py
Expand Up @@ -10,6 +10,7 @@
# This code is part of the Trait-o-matic project and is governed by its license.

import os, string, sys
from copy import copy
import MySQLdb
import simplejson as json
from utils import gff
Expand Down Expand Up @@ -90,6 +91,7 @@ def main():
for record in gff_file:
# lightly parse alleles
alleles = record.attributes["alleles"].strip("\"").split("/")
ref_allele = record.attributes["ref_allele"].strip("\"")

# determine zygosity
if len(alleles) == 1:
Expand Down Expand Up @@ -124,24 +126,57 @@ def main():
amino = d[1]
pmid = d[2]
acc_num = d[3]

# keep, for internal purposes, a list of alleles minus the reference
leftover_alleles = copy(alleles)
try:
leftover_alleles.remove(ref_allele)
except ValueError:
pass

# we see if what we designated the mutant allele is the phenotype-
# associated allele, or if what we the reference sequence allele is
# the phenotype-associated allele; if the latter, we have to make
# sure that the genome we're looking at actually has the reference
# allele
if string.lower(amino).endswith(string.lower(ref_aa)) or \
(string.lower(amino).endswith(string.lower(mut_aa)) and
record.attributes["ref_allele"] in alleles):
output = {
"gene": gene,
"variant": str(record),
"amino_acid_change": amino,
"zygosity": zygosity,
"phenotype": disease,
"reference": "pmid:" + pmid,
}
print json.dumps(output)
if string.lower(amino).endswith(string.lower(mut_aa)):
amino_acid_change_and_position = aa[0] + str(aa_pos) + aa[-1]
#TODO: this doesn't work when we have multiple alleles
if len(leftover_alleles) == 1:
trait_allele = leftover_alleles[0]
else:
trait_allele = '?'
elif (string.lower(amino).endswith(string.lower(ref_aa)) and
ref_allele in alleles):
amino_acid_change_and_position = aa[-1] + str(aa_pos) + aa[0]
trait_allele = ref_allele
else:
continue

# format for output
if record.start == record.end:
coordinates = str(record.start)
else:
coordinates = str(record.start) + "-" + str(record.end)

genotype = "/".join(leftover_alleles)
if ref_allele in alleles:
genotype = ref_allele + "/" + genotype

output = {
"chromosome": record.seqname,
"coordinates": coordinates,
"gene": gene,
"amino_acid_change": amino_acid_change_and_position,
"genotype": genotype,
"ref_allele": ref_allele,
"trait_allele": trait_allele,
"zygosity": zygosity,
"variant": str(record),
"phenotype": disease,
"reference": "pmid:" + pmid,
}
print json.dumps(output)

# close database cursor and connection
cursor.close()
Expand Down
31 changes: 25 additions & 6 deletions core/gff_morbid_map.py
Expand Up @@ -42,16 +42,21 @@ def main():

gff_file = gff.input(sys.argv[1])
for record in gff_file:
# lightly parse alleles
alleles = record.attributes["alleles"].strip("\"").split("/")
ref_allele = record.attributes["ref_allele"].strip("\"")

# examine each amino acid change (this takes care of alternative splicings)
amino_acid_changes = record.attributes["amino_acid"].strip("\"").split("/")

# make sure not to duplicate what we print because of multiple alternative
# splicings; so, initialize an empty list to hold previous output strings
# so we can compare before printing
previous_output_strings = []
# splicings; so, initialize an empty list to hold previous tuples of gene
# names, variant records, and phenotype strings, so that we can compare
previous_gene_variant_phenotype = []

# examine each alternative splicing
for a in amino_acid_changes:
gene_variant_phenotype = []
output_strings = []

amino_acid = a.split(" ")
Expand Down Expand Up @@ -82,11 +87,25 @@ def main():
continue

for d in data:
disorder = d[0]
disorder = d[0].strip()
omim = d[1]
gene_variant_phenotype.append((gene, str(record), disorder))

# format for output
if record.start == record.end:
coordinates = str(record.start)
else:
coordinates = str(record.start) + "-" + str(record.end)

genotype = "/".join(alleles)

output = {
"chromosome": record.seqname,
"coordinates": coordinates,
"gene": gene,
"amino_acid_change": aa,
"genotype": genotype,
"ref_allele": ref_allele,
"variant": str(record),
"phenotype": disorder,
"reference": "omim:" + str(omim),
Expand All @@ -95,8 +114,8 @@ def main():
output_strings.append(json.dumps(output))

# actually only output what's not duplicating previous
if output_strings != previous_output_strings:
previous_output_strings = output_strings
if gene_variant_phenotype != previous_gene_variant_phenotype:
previous_gene_variant_phenotype = gene_variant_phenotype
for o in output_strings:
print o

Expand Down
59 changes: 47 additions & 12 deletions core/gff_omim_map.py
Expand Up @@ -10,6 +10,7 @@
# This code is part of the Trait-o-matic project and is governed by its license.

import os, string, sys
from copy import copy
import MySQLdb
import simplejson as json
from utils import gff
Expand Down Expand Up @@ -64,6 +65,7 @@ def main():
for record in gff_file:
# lightly parse alleles
alleles = record.attributes["alleles"].strip("\"").split("/")
ref_allele = record.attributes["ref_allele"].strip("\"")

# determine zygosity
if len(alleles) == 1:
Expand Down Expand Up @@ -102,23 +104,56 @@ def main():
else:
allelic_variant_id = "omim:" + d[3]

# keep, for internal purposes, a list of alleles minus the reference
leftover_alleles = copy(alleles)
try:
leftover_alleles.remove(ref_allele)
except ValueError:
pass

# we see if what we designated the mutant allele is the phenotype-
# associated allele, or if what we the reference sequence allele is
# the phenotype-associated allele; if the latter, we have to make
# sure that the genome we're looking at actually has the reference
# allele
if string.lower(amino_acid_change).endswith(string.lower(mut_aa)) or \
(string.lower(amino_acid_change).endswith(string.lower(ref_aa)) and
record.attributes["ref_allele"] in alleles):
output = {
"gene": gene,
"variant": str(record),
"amino_acid_change": amino_acid_change,
"zygosity": zygosity,
"phenotype": phenotype,
"reference": allelic_variant_id,
}
print json.dumps(output)
if string.lower(amino_acid_change).endswith(string.lower(mut_aa)):
amino_acid_change_and_position = aa[0] + str(aa_pos) + aa[-1]
#TODO: this doesn't work when we have multiple alleles
if len(leftover_alleles) == 1:
trait_allele = leftover_alleles[0]
else:
trait_allele = '?'
elif (string.lower(amino_acid_change).endswith(string.lower(ref_aa)) and
ref_allele in alleles):
amino_acid_change_and_position = aa[-1] + str(aa_pos) + aa[0]
trait_allele = ref_allele
else:
continue

# format for output
if record.start == record.end:
coordinates = str(record.start)
else:
coordinates = str(record.start) + "-" + str(record.end)

genotype = "/".join(leftover_alleles)
if ref_allele in alleles:
genotype = ref_allele + "/" + genotype

output = {
"chromosome": record.seqname,
"coordinates": coordinates,
"gene": gene,
"amino_acid_change": amino_acid_change_and_position,
"genotype": genotype,
"ref_allele": ref_allele,
"trait_allele": trait_allele,
"zygosity": zygosity,
"variant": str(record),
"phenotype": phenotype,
"reference": allelic_variant_id,
}
print json.dumps(output)

# close database cursor and connection
cursor.close()
Expand Down

0 comments on commit 7ce8522

Please sign in to comment.