Added scripts for new genomes and allele frequency queries, and modif…

…ied *_map.py outputs.
xwu · May 1, 2009 · 7ce8522 · 7ce8522
1 parent 0b6a48d
commit 7ce8522
Show file tree

Hide file tree

Showing 9 changed files with 522 additions and 39 deletions.
diff --git a/core/cgi_to_gff.py b/core/cgi_to_gff.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python
+# Filename: cgi_to_gff.py
+
+"""
+usage: %prog CGI-Variations-Compact.csv ...
+"""
+
+# Output GFF record for each entry in file(s)
+# ---
+# This code is part of the Trait-o-matic project and is governed by its license.
+
+import fileinput, os, sys
+
+def main():
+	# return if we don't have the correct arguments
+	if len(sys.argv) < 2:
+		raise SystemExit(__doc__.replace("%prog", sys.argv[0]))
+
+	for line in fileinput.input():
+		if line.startswith("#"):
+			continue
+		l = line.strip().split(",")
+		if len(l) < 10 or l[4] not in ["=", "snp"] or l[5] not in ["=", "snp"]:
+			continue
+
+		chr = l[1]
+		# for now, we treat PAR1 and PAR2 normally, with chrX coordinates
+		if chr in ["PAR1", "PAR2", "chrXnonPAR"]:
+			chr = "chrX"
+		elif chr == "chrYnonPAR":
+			chr = "chrY"
+
+		out_line = chr
+		out_line += "\tcgi\tSNP\t"
+		out_line += str(long(l[2]) + 1) + "\t" + l[3]
+		out_line += "\t.\t+\t.\t"
+
+		if l[7] == l[8]:
+			out_line += "alleles " + l[7]
+		else:
+			out_line += "alleles " + l[7] + "/" + l[8]
+
+		out_line += ";ref_allele " + l[6]
+		out_line += ";total_score " + l[9]
+
+		print out_line
+
+if __name__ == "__main__":
+	main()
diff --git a/core/gff_hgmd_map.py b/core/gff_hgmd_map.py
@@ -10,6 +10,7 @@
 # This code is part of the Trait-o-matic project and is governed by its license.
 
 import os, string, sys
+from copy import copy
 import MySQLdb
 import simplejson as json
 from utils import gff
@@ -90,6 +91,7 @@ def main():
 	for record in gff_file:
 		# lightly parse alleles
 		alleles = record.attributes["alleles"].strip("\"").split("/")
+		ref_allele = record.attributes["ref_allele"].strip("\"")
 
 		# determine zygosity
 		if len(alleles) == 1:
@@ -124,24 +126,57 @@ def main():
 					amino = d[1]
 					pmid = d[2]
 					acc_num = d[3]
+
+					# keep, for internal purposes, a list of alleles minus the reference
+					leftover_alleles = copy(alleles)
+					try:
+						leftover_alleles.remove(ref_allele)
+					except ValueError:
+						pass
 
 					# we see if what we designated the mutant allele is the phenotype-
 					# associated allele, or if what we the reference sequence allele is
 					# the phenotype-associated allele; if the latter, we have to make
 					# sure that the genome we're looking at actually has the reference
 					# allele
-					if string.lower(amino).endswith(string.lower(ref_aa)) or \
-					  (string.lower(amino).endswith(string.lower(mut_aa)) and
-					    record.attributes["ref_allele"] in alleles):
-						output = {
-							"gene": gene,
-							"variant": str(record),
-							"amino_acid_change": amino,
-							"zygosity": zygosity,
-							"phenotype": disease,
-							"reference": "pmid:" + pmid,
-						}
-						print json.dumps(output)
+					if string.lower(amino).endswith(string.lower(mut_aa)):
+						amino_acid_change_and_position = aa[0] + str(aa_pos) + aa[-1]
+						#TODO: this doesn't work when we have multiple alleles
+						if len(leftover_alleles) == 1:
+							trait_allele = leftover_alleles[0]
+						else:
+							trait_allele = '?'
+					elif (string.lower(amino).endswith(string.lower(ref_aa)) and
+					  ref_allele in alleles):
+					    amino_acid_change_and_position = aa[-1] + str(aa_pos) + aa[0]
+					    trait_allele = ref_allele
+					else:
+						continue
+
+					# format for output
+					if record.start == record.end:
+						coordinates = str(record.start)
+					else:
+						coordinates = str(record.start) + "-" + str(record.end)
+
+					genotype = "/".join(leftover_alleles)
+					if ref_allele in alleles:
+						genotype = ref_allele + "/" + genotype
+
+					output = {
+						"chromosome": record.seqname,
+						"coordinates": coordinates,
+						"gene": gene,
+						"amino_acid_change": amino_acid_change_and_position,
+						"genotype": genotype,
+						"ref_allele": ref_allele,
+						"trait_allele": trait_allele,
+						"zygosity": zygosity,
+						"variant": str(record),
+						"phenotype": disease,
+						"reference": "pmid:" + pmid,
+					}
+					print json.dumps(output)
 
 	# close database cursor and connection
 	cursor.close()

diff --git a/core/gff_morbid_map.py b/core/gff_morbid_map.py
@@ -42,16 +42,21 @@ def main():
 
 	gff_file = gff.input(sys.argv[1])	
 	for record in gff_file:
+		# lightly parse alleles
+		alleles = record.attributes["alleles"].strip("\"").split("/")
+		ref_allele = record.attributes["ref_allele"].strip("\"")
+
 		# examine each amino acid change (this takes care of alternative splicings)
 		amino_acid_changes = record.attributes["amino_acid"].strip("\"").split("/")
 
 		# make sure not to duplicate what we print because of multiple alternative
-		# splicings; so, initialize an empty list to hold previous output strings
-		# so we can compare before printing
-		previous_output_strings = []
+		# splicings; so, initialize an empty list to hold previous tuples of gene
+		# names, variant records, and phenotype strings, so that we can compare
+		previous_gene_variant_phenotype = []
 
 		# examine each alternative splicing
 		for a in amino_acid_changes:
+			gene_variant_phenotype = []
 			output_strings = []
 
 			amino_acid = a.split(" ")
@@ -82,11 +87,25 @@ def main():
 					continue
 
 				for d in data:
-					disorder = d[0]
+					disorder = d[0].strip()
 					omim = d[1]
+					gene_variant_phenotype.append((gene, str(record), disorder))
+
+					# format for output
+					if record.start == record.end:
+						coordinates = str(record.start)
+					else:
+						coordinates = str(record.start) + "-" + str(record.end)
+
+					genotype = "/".join(alleles)
 
 					output = {
+						"chromosome": record.seqname,
+						"coordinates": coordinates,
 						"gene": gene,
+						"amino_acid_change": aa,
+						"genotype": genotype,
+						"ref_allele": ref_allele,
 						"variant": str(record),
 						"phenotype": disorder,
 						"reference": "omim:" + str(omim),
@@ -95,8 +114,8 @@ def main():
 					output_strings.append(json.dumps(output))
 
 			# actually only output what's not duplicating previous 
-			if output_strings != previous_output_strings:
-				previous_output_strings = output_strings
+			if gene_variant_phenotype != previous_gene_variant_phenotype:
+				previous_gene_variant_phenotype = gene_variant_phenotype
 				for o in output_strings:
 					print o
 

diff --git a/core/gff_omim_map.py b/core/gff_omim_map.py
@@ -10,6 +10,7 @@
 # This code is part of the Trait-o-matic project and is governed by its license.
 
 import os, string, sys
+from copy import copy
 import MySQLdb
 import simplejson as json
 from utils import gff
@@ -64,6 +65,7 @@ def main():
 	for record in gff_file:
 		# lightly parse alleles
 		alleles = record.attributes["alleles"].strip("\"").split("/")
+		ref_allele = record.attributes["ref_allele"].strip("\"")
 
 		# determine zygosity
 		if len(alleles) == 1:
@@ -102,23 +104,56 @@ def main():
 					else:
 						allelic_variant_id = "omim:" + d[3]
 
+					# keep, for internal purposes, a list of alleles minus the reference
+					leftover_alleles = copy(alleles)
+					try:
+						leftover_alleles.remove(ref_allele)
+					except ValueError:
+						pass
+
 					# we see if what we designated the mutant allele is the phenotype-
 					# associated allele, or if what we the reference sequence allele is
 					# the phenotype-associated allele; if the latter, we have to make
 					# sure that the genome we're looking at actually has the reference
 					# allele
-					if string.lower(amino_acid_change).endswith(string.lower(mut_aa)) or \
-					  (string.lower(amino_acid_change).endswith(string.lower(ref_aa)) and
-					    record.attributes["ref_allele"] in alleles):
-						output = {
-							"gene": gene,
-							"variant": str(record),
-							"amino_acid_change": amino_acid_change,
-							"zygosity": zygosity,
-							"phenotype": phenotype,
-							"reference": allelic_variant_id,
-						}
-						print json.dumps(output)
+					if string.lower(amino_acid_change).endswith(string.lower(mut_aa)):
+						amino_acid_change_and_position = aa[0] + str(aa_pos) + aa[-1]
+						#TODO: this doesn't work when we have multiple alleles
+						if len(leftover_alleles) == 1:
+							trait_allele = leftover_alleles[0]
+						else:
+							trait_allele = '?'
+					elif (string.lower(amino_acid_change).endswith(string.lower(ref_aa)) and
+					  ref_allele in alleles):
+					    amino_acid_change_and_position = aa[-1] + str(aa_pos) + aa[0]
+					    trait_allele = ref_allele
+					else:
+						continue
+
+					# format for output
+					if record.start == record.end:
+						coordinates = str(record.start)
+					else:
+						coordinates = str(record.start) + "-" + str(record.end)
+
+					genotype = "/".join(leftover_alleles)
+					if ref_allele in alleles:
+						genotype = ref_allele + "/" + genotype
+
+					output = {
+						"chromosome": record.seqname,
+						"coordinates": coordinates,
+						"gene": gene,
+						"amino_acid_change": amino_acid_change_and_position,
+						"genotype": genotype,
+						"ref_allele": ref_allele,
+						"trait_allele": trait_allele,
+						"zygosity": zygosity,
+						"variant": str(record),
+						"phenotype": phenotype,
+						"reference": allelic_variant_id,
+					}
+					print json.dumps(output)
 
 	# close database cursor and connection
 	cursor.close()