From 7ce8522b8a46e9baa68d884ac0ffe7b0079ad1a9 Mon Sep 17 00:00:00 2001
From: Xiaodi Wu <xiaodi.wu@gmail.com>
Date: Fri, 1 May 2009 14:45:10 -0400
Subject: [PATCH] Added scripts for new genomes and allele frequency queries,
 and modified *_map.py outputs.

---
 core/cgi_to_gff.py                  |  49 ++++++++++
 core/gff_hgmd_map.py                |  59 ++++++++---
 core/gff_morbid_map.py              |  31 ++++--
 core/gff_omim_map.py                |  59 ++++++++---
 core/gff_snpedia_map.py             | 145 ++++++++++++++++++++++++++++
 core/gff_subtract.py                |  26 +++++
 core/json_allele_frequency_query.py |  87 +++++++++++++++++
 core/server.py                      |  57 +++++++++--
 core/snpinduse_to_gff.py            |  48 +++++++++
 9 files changed, 522 insertions(+), 39 deletions(-)
 create mode 100644 core/cgi_to_gff.py
 create mode 100644 core/gff_snpedia_map.py
 create mode 100644 core/gff_subtract.py
 create mode 100644 core/json_allele_frequency_query.py
 create mode 100644 core/snpinduse_to_gff.py

diff --git a/core/cgi_to_gff.py b/core/cgi_to_gff.py
new file mode 100644
index 0000000..3fbeb71
--- /dev/null
+++ b/core/cgi_to_gff.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python
+# Filename: cgi_to_gff.py
+
+"""
+usage: %prog CGI-Variations-Compact.csv ...
+"""
+
+# Output GFF record for each entry in file(s)
+# ---
+# This code is part of the Trait-o-matic project and is governed by its license.
+
+import fileinput, os, sys
+
+def main():
+	# return if we don't have the correct arguments
+	if len(sys.argv) < 2:
+		raise SystemExit(__doc__.replace("%prog", sys.argv[0]))
+	
+	for line in fileinput.input():
+		if line.startswith("#"):
+			continue
+		l = line.strip().split(",")
+		if len(l) < 10 or l[4] not in ["=", "snp"] or l[5] not in ["=", "snp"]:
+			continue
+		
+		chr = l[1]
+		# for now, we treat PAR1 and PAR2 normally, with chrX coordinates
+		if chr in ["PAR1", "PAR2", "chrXnonPAR"]:
+			chr = "chrX"
+		elif chr == "chrYnonPAR":
+			chr = "chrY"
+		
+		out_line = chr
+		out_line += "\tcgi\tSNP\t"
+		out_line += str(long(l[2]) + 1) + "\t" + l[3]
+		out_line += "\t.\t+\t.\t"
+		
+		if l[7] == l[8]:
+			out_line += "alleles " + l[7]
+		else:
+			out_line += "alleles " + l[7] + "/" + l[8]
+		
+		out_line += ";ref_allele " + l[6]
+		out_line += ";total_score " + l[9]
+		
+		print out_line
+
+if __name__ == "__main__":
+	main()
\ No newline at end of file
diff --git a/core/gff_hgmd_map.py b/core/gff_hgmd_map.py
index f3c54b2..5fb2d47 100644
--- a/core/gff_hgmd_map.py
+++ b/core/gff_hgmd_map.py
@@ -10,6 +10,7 @@
 # This code is part of the Trait-o-matic project and is governed by its license.
 
 import os, string, sys
+from copy import copy
 import MySQLdb
 import simplejson as json
 from utils import gff
@@ -90,6 +91,7 @@ def main():
 	for record in gff_file:
 		# lightly parse alleles
 		alleles = record.attributes["alleles"].strip("\"").split("/")
+		ref_allele = record.attributes["ref_allele"].strip("\"")
 		
 		# determine zygosity
 		if len(alleles) == 1:
@@ -124,24 +126,57 @@ def main():
 					amino = d[1]
 					pmid = d[2]
 					acc_num = d[3]
+					
+					# keep, for internal purposes, a list of alleles minus the reference
+					leftover_alleles = copy(alleles)
+					try:
+						leftover_alleles.remove(ref_allele)
+					except ValueError:
+						pass
 
 					# we see if what we designated the mutant allele is the phenotype-
 					# associated allele, or if what we the reference sequence allele is
 					# the phenotype-associated allele; if the latter, we have to make
 					# sure that the genome we're looking at actually has the reference
 					# allele
-					if string.lower(amino).endswith(string.lower(ref_aa)) or \
-					  (string.lower(amino).endswith(string.lower(mut_aa)) and
-					    record.attributes["ref_allele"] in alleles):
-						output = {
-							"gene": gene,
-							"variant": str(record),
-							"amino_acid_change": amino,
-							"zygosity": zygosity,
-							"phenotype": disease,
-							"reference": "pmid:" + pmid,
-						}
-						print json.dumps(output)
+					if string.lower(amino).endswith(string.lower(mut_aa)):
+						amino_acid_change_and_position = aa[0] + str(aa_pos) + aa[-1]
+						#TODO: this doesn't work when we have multiple alleles
+						if len(leftover_alleles) == 1:
+							trait_allele = leftover_alleles[0]
+						else:
+							trait_allele = '?'
+					elif (string.lower(amino).endswith(string.lower(ref_aa)) and
+					  ref_allele in alleles):
+					    amino_acid_change_and_position = aa[-1] + str(aa_pos) + aa[0]
+					    trait_allele = ref_allele
+					else:
+						continue
+					
+					# format for output
+					if record.start == record.end:
+						coordinates = str(record.start)
+					else:
+						coordinates = str(record.start) + "-" + str(record.end)
+					
+					genotype = "/".join(leftover_alleles)
+					if ref_allele in alleles:
+						genotype = ref_allele + "/" + genotype
+					
+					output = {
+						"chromosome": record.seqname,
+						"coordinates": coordinates,
+						"gene": gene,
+						"amino_acid_change": amino_acid_change_and_position,
+						"genotype": genotype,
+						"ref_allele": ref_allele,
+						"trait_allele": trait_allele,
+						"zygosity": zygosity,
+						"variant": str(record),
+						"phenotype": disease,
+						"reference": "pmid:" + pmid,
+					}
+					print json.dumps(output)
 	
 	# close database cursor and connection
 	cursor.close()
diff --git a/core/gff_morbid_map.py b/core/gff_morbid_map.py
index ea96edf..05e73a1 100644
--- a/core/gff_morbid_map.py
+++ b/core/gff_morbid_map.py
@@ -42,16 +42,21 @@ def main():
 	
 	gff_file = gff.input(sys.argv[1])	
 	for record in gff_file:
+		# lightly parse alleles
+		alleles = record.attributes["alleles"].strip("\"").split("/")
+		ref_allele = record.attributes["ref_allele"].strip("\"")
+		
 		# examine each amino acid change (this takes care of alternative splicings)
 		amino_acid_changes = record.attributes["amino_acid"].strip("\"").split("/")
 		
 		# make sure not to duplicate what we print because of multiple alternative
-		# splicings; so, initialize an empty list to hold previous output strings
-		# so we can compare before printing
-		previous_output_strings = []
+		# splicings; so, initialize an empty list to hold previous tuples of gene
+		# names, variant records, and phenotype strings, so that we can compare
+		previous_gene_variant_phenotype = []
 		
 		# examine each alternative splicing
 		for a in amino_acid_changes:
+			gene_variant_phenotype = []
 			output_strings = []
 			
 			amino_acid = a.split(" ")
@@ -82,11 +87,25 @@ def main():
 					continue
 				
 				for d in data:
-					disorder = d[0]
+					disorder = d[0].strip()
 					omim = d[1]
+					gene_variant_phenotype.append((gene, str(record), disorder))
+					
+					# format for output
+					if record.start == record.end:
+						coordinates = str(record.start)
+					else:
+						coordinates = str(record.start) + "-" + str(record.end)
+					
+					genotype = "/".join(alleles)
 					
 					output = {
+						"chromosome": record.seqname,
+						"coordinates": coordinates,
 						"gene": gene,
+						"amino_acid_change": aa,
+						"genotype": genotype,
+						"ref_allele": ref_allele,
 						"variant": str(record),
 						"phenotype": disorder,
 						"reference": "omim:" + str(omim),
@@ -95,8 +114,8 @@ def main():
 					output_strings.append(json.dumps(output))
 			
 			# actually only output what's not duplicating previous 
-			if output_strings != previous_output_strings:
-				previous_output_strings = output_strings
+			if gene_variant_phenotype != previous_gene_variant_phenotype:
+				previous_gene_variant_phenotype = gene_variant_phenotype
 				for o in output_strings:
 					print o
 
diff --git a/core/gff_omim_map.py b/core/gff_omim_map.py
index 28a2710..1cda93a 100644
--- a/core/gff_omim_map.py
+++ b/core/gff_omim_map.py
@@ -10,6 +10,7 @@
 # This code is part of the Trait-o-matic project and is governed by its license.
 
 import os, string, sys
+from copy import copy
 import MySQLdb
 import simplejson as json
 from utils import gff
@@ -64,6 +65,7 @@ def main():
 	for record in gff_file:
 		# lightly parse alleles
 		alleles = record.attributes["alleles"].strip("\"").split("/")
+		ref_allele = record.attributes["ref_allele"].strip("\"")
 		
 		# determine zygosity
 		if len(alleles) == 1:
@@ -102,23 +104,56 @@ def main():
 					else:
 						allelic_variant_id = "omim:" + d[3]
 					
+					# keep, for internal purposes, a list of alleles minus the reference
+					leftover_alleles = copy(alleles)
+					try:
+						leftover_alleles.remove(ref_allele)
+					except ValueError:
+						pass
+					
 					# we see if what we designated the mutant allele is the phenotype-
 					# associated allele, or if what we the reference sequence allele is
 					# the phenotype-associated allele; if the latter, we have to make
 					# sure that the genome we're looking at actually has the reference
 					# allele
-					if string.lower(amino_acid_change).endswith(string.lower(mut_aa)) or \
-					  (string.lower(amino_acid_change).endswith(string.lower(ref_aa)) and
-					    record.attributes["ref_allele"] in alleles):
-						output = {
-							"gene": gene,
-							"variant": str(record),
-							"amino_acid_change": amino_acid_change,
-							"zygosity": zygosity,
-							"phenotype": phenotype,
-							"reference": allelic_variant_id,
-						}
-						print json.dumps(output)
+					if string.lower(amino_acid_change).endswith(string.lower(mut_aa)):
+						amino_acid_change_and_position = aa[0] + str(aa_pos) + aa[-1]
+						#TODO: this doesn't work when we have multiple alleles
+						if len(leftover_alleles) == 1:
+							trait_allele = leftover_alleles[0]
+						else:
+							trait_allele = '?'
+					elif (string.lower(amino_acid_change).endswith(string.lower(ref_aa)) and
+					  ref_allele in alleles):
+					    amino_acid_change_and_position = aa[-1] + str(aa_pos) + aa[0]
+					    trait_allele = ref_allele
+					else:
+						continue
+					
+					# format for output
+					if record.start == record.end:
+						coordinates = str(record.start)
+					else:
+						coordinates = str(record.start) + "-" + str(record.end)
+					
+					genotype = "/".join(leftover_alleles)
+					if ref_allele in alleles:
+						genotype = ref_allele + "/" + genotype
+					
+					output = {
+						"chromosome": record.seqname,
+						"coordinates": coordinates,
+						"gene": gene,
+						"amino_acid_change": amino_acid_change_and_position,
+						"genotype": genotype,
+						"ref_allele": ref_allele,
+						"trait_allele": trait_allele,
+						"zygosity": zygosity,
+						"variant": str(record),
+						"phenotype": phenotype,
+						"reference": allelic_variant_id,
+					}
+					print json.dumps(output)
 	
 	# close database cursor and connection
 	cursor.close()
diff --git a/core/gff_snpedia_map.py b/core/gff_snpedia_map.py
new file mode 100644
index 0000000..2a35fd8
--- /dev/null
+++ b/core/gff_snpedia_map.py
@@ -0,0 +1,145 @@
+#!/usr/bin/python
+# Filename: gff_snpedia_map.py
+
+"""
+usage: %prog gff_file
+"""
+
+# Output SNPedia information in JSON format, if available
+# ---
+# This code is part of the Trait-o-matic project and is governed by its license.
+
+import os, string, sys
+import MySQLdb, MySQLdb.cursors
+import simplejson as json
+from utils import gff
+from utils.biopython_utils import reverse_complement
+from utils.bitset import *
+from config import DB_HOST, DB_READ_USER, DB_READ_PASSWD, DB_READ_DATABASE
+
+location_query = '''
+SELECT chr, start, end FROM snpedia LIMIT %s,10000;
+'''
+
+query = '''
+SELECT phenotype, pubmed_id FROM snpedia WHERE
+rs_id=%s AND ((strand="+" AND genotype=%s) OR (strand="-" AND genotype=%s));
+'''
+
+def main():
+	# return if we don't have the correct arguments
+	if len(sys.argv) < 2:
+		raise SystemExit(__doc__.replace("%prog", sys.argv[0]))
+	
+	# first, try to connect to the databases
+	try:
+		location_connection = MySQLdb.connect(cursorclass=MySQLdb.cursors.SSCursor, host=DB_HOST, user=DB_READ_USER, passwd=DB_READ_PASSWD, db=DB_READ_DATABASE)
+		location_cursor = location_connection.cursor()
+		connection = MySQLdb.connect(host=DB_HOST, user=DB_READ_USER, passwd=DB_READ_PASSWD, db=DB_READ_DATABASE)
+		cursor = connection.cursor()
+	except MySQLdb.OperationalError, message:
+		print "Error %d while connecting to database: %s" % (message[0], message[1])
+		sys.exit()
+	
+	# build a dictionary of bitsets from the database (partly based on utility code from bx-python)
+	start_record = 0
+	last_chromosome = None
+	last_bitset = None
+	bitsets = dict()
+	# do this in 10,000 chunks
+	while True:
+		location_cursor.execute(location_query, start_record)
+		previous_start_record = start_record
+		# go through what we retrieved
+		for datum in location_cursor:
+			start_record += 1
+			chromosome = datum[0]
+			if chromosome != last_chromosome:
+				if chromosome not in bitsets:
+					bitsets[chromosome] = BinnedBitSet(MAX)
+				last_chromosome = chromosome
+				last_bitset = bitsets[chromosome]
+			start, end = datum[1], datum[2]
+			last_bitset.set_range(start, end - start)
+		# stop if we're done
+		if previous_start_record == start_record:
+			break
+	
+	# doing this intersect operation speeds up our task significantly for full genomes
+	gff_file = gff.input(sys.argv[1])	
+	for line in gff_file.intersect(bitsets):
+		# the one drawback is that intersect() was implemented to return strings, so we
+		# need to parse it
+		record = gff.input([line]).next()
+		# lightly parse to find the alleles and rs number
+		alleles = record.attributes["alleles"].strip("\"").split("/")
+		ref_allele = record.attributes["ref_allele"].strip("\"")
+		try:
+			xrefs = record.attributes["db_xref"].strip("\"").split(",")
+		except KeyError:
+			try:
+				xrefs = record.attributes["Dbxref"].strip("\"").split(",")
+			except KeyError:
+				continue
+		for x in xrefs:
+			if x.startswith("dbsnp:rs"):
+				rs = x.replace("dbsnp:", "")
+				break
+		
+		# quit if we don't have an rs number
+		if not rs:
+			continue
+		# we wouldn't know what to do with this, so pass it up for now
+		if len(alleles) > 2:
+			continue
+		
+		# create the genotype string from the given alleles
+		#TODO: do something about the Y chromosome
+		if len(alleles) == 1:
+			genotype = alleles[0] + ";" + alleles[0]
+			reverse_genotype = reverse_complement(alleles[0]) + ";" + reverse_complement(alleles[0])
+		else:
+			genotype = ';'.join(sorted(alleles))
+			reverse_genotype = ';'.join(sorted([reverse_complement(a) for a in alleles]))
+		
+		# query the database
+		cursor.execute(query, (rs, genotype, reverse_genotype))
+		data = cursor.fetchall()
+		
+		# move on if we don't have info
+		if cursor.rowcount <= 0:
+			continue
+			
+		for d in data:
+			phenotype = d[0]
+			pubmed = d[1]
+			
+			# format for output
+			if record.start == record.end:
+				coordinates = str(record.start)
+			else:
+				coordinates = str(record.start) + "-" + str(record.end)
+			
+			if pubmed != "":
+				reference = "dbsnp:" + rs + ",pmid:" + pubmed.replace(",", ",pmid:")
+			else:
+				reference = "dbsnp:" + rs
+			
+			output = {
+				"chromosome": record.seqname,
+				"coordinates": coordinates,
+				"genotype": genotype,
+				"variant": str(record),
+				"phenotype": phenotype,
+				"reference": reference
+			}
+			print json.dumps(output)
+	
+	# close database cursor and connection
+	cursor.close()
+	connection.close()
+	location_cursor.close()
+	location_connection.close()
+
+if __name__ == "__main__":
+	main()
diff --git a/core/gff_subtract.py b/core/gff_subtract.py
new file mode 100644
index 0000000..39d1251
--- /dev/null
+++ b/core/gff_subtract.py
@@ -0,0 +1,26 @@
+#!/usr/bin/python
+# Filename: gff_subtract.py
+
+"""
+usage: %prog gff_file_1 gff_file_2
+"""
+
+# Output the subtraction of two GFF files, with attributes taken from the first
+# ---
+# This code is part of the Trait-o-matic project and is governed by its license.
+
+import sys
+from utils import gff
+
+def main():
+	# return if we don't have the correct arguments
+	if len(sys.argv) < 3:
+		raise SystemExit(__doc__.replace("%prog", sys.argv[0]))
+	
+	g1 = gff.input(sys.argv[1])
+	g2 = gff.input(sys.argv[2])
+	for line in g1.subtract(g2):
+		print line
+
+if __name__ == "__main__":
+	main()
diff --git a/core/json_allele_frequency_query.py b/core/json_allele_frequency_query.py
new file mode 100644
index 0000000..bf3d1f6
--- /dev/null
+++ b/core/json_allele_frequency_query.py
@@ -0,0 +1,87 @@
+#!/usr/bin/python
+# Filename: json_allele_frequency_query.py
+
+"""
+usage: %prog json ... [options]
+  -i, --in-place: move input into a backup file and write output in its place
+"""
+
+# Output allele frequency information in JSON format, if available
+# ---
+# This code is part of the Trait-o-matic project and is governed by its license.
+
+import fileinput, os, string, sys
+import MySQLdb
+import simplejson as json
+from utils import doc_optparse
+from utils.biopython_utils import reverse_complement
+from config import DB_HOST, DB_READ_USER, DB_READ_PASSWD, DB_READ_DATABASE
+
+query = '''
+SELECT strand, pop, ref_allele, ref_allele_freq, oth_allele, oth_allele_freq FROM hapmap27 WHERE
+chr=%s AND start=%s AND end=%s;
+'''
+
+def main():
+	# parse options
+	option, args = doc_optparse.parse(__doc__)
+	
+	if len(args) < 1:
+		doc_optparse.exit()
+	
+	# first, try to connect to the databases
+	try:
+		connection = MySQLdb.connect(host=DB_HOST, user=DB_READ_USER, passwd=DB_READ_PASSWD, db=DB_READ_DATABASE)
+		cursor = connection.cursor()
+	except MySQLdb.OperationalError, message:
+		print "Error %d while connecting to database: %s" % (message[0], message[1])
+		sys.exit()
+	
+	for line in fileinput.input(args, inplace=option.in_place):
+		l = json.loads(line.strip())
+		
+		# sanity check--should always pass
+		if not ("chromosome" in l or "coordinates" in l):
+			continue
+		
+		# we can't handle variants longer than 1 bp
+		unavailable = True
+		if not "-" in "coordinates":
+			chr = l["chromosome"]
+			start = int(l["coordinates"]) - 1
+			end = int(l["coordinates"])
+			
+			# get allele frequency data based on coordinates
+			cursor.execute(query, (chr, start, end))
+			data = cursor.fetchall()
+			
+			# move on if we don't have info
+			if cursor.rowcount > 0:
+				unavailable = False
+		
+		if unavailable:
+			l["maf"] = "N/A"
+			if "trait_allele" in l:
+				l["taf"] = "N/A"
+		else:
+			# output minor allele frequency as a dictionary, with population abbrs as keys
+			l["maf"] = dict(zip([d[1] for d in data], [float(min(d[3], d[5])) for d in data]))
+			# output trait allele frequency as a dictionary; this one is a little trickier
+			if "trait_allele" in l:
+				l["taf"] = dict()
+				for d in data:
+					if d[0] == "+" and l["trait_allele"] == d[2] \
+					  or d[0] == "-" and l["trait_allele"] == reverse_complement(d[2]):
+						l["taf"][d[1]] = float(d[3])
+					elif d[0] == "+" and l["trait_allele"] == d[4] \
+					  or d[0] == "-" and l["trait_allele"] == reverse_complement(d[4]):
+						l["taf"][d[1]] = float(d[5])
+		
+		print json.dumps(l)
+	
+	# close database cursor and connection
+	cursor.close()
+	connection.close()
+
+if __name__ == "__main__":
+	main()
diff --git a/core/server.py b/core/server.py
index a243a0c..a927e66 100644
--- a/core/server.py
+++ b/core/server.py
@@ -5,22 +5,47 @@
 usage: %prog [options]
   -h, --host=STRING: the host on which to listen
   -p, --port=NUMBER: the port on which to listen
+  -t, --trackback: invoke the server's trackback function with arguments url, path, kind, request_token (does not start a new server)
 """
 
 # Start an XMLRPC server for Trait-o-matic
 # ---
 # This code is part of the Trait-o-matic project and is governed by its license.
 
-import base64, hashlib, os, shutil, subprocess, sys, urllib2
+import base64, hashlib, os, shutil, subprocess, sys, urllib, urllib2
 from SimpleXMLRPCServer import SimpleXMLRPCServer as xrs
 from tempfile import mkstemp
 from utils import doc_optparse
 from config import UPLOAD_DIR, REFERENCE_GENOME
 
+def trackback(url, params):
+	request = urllib2.Request(url, params)
+	request.add_header('User-agent', 'Trait-o-matic/20090123 Python')
+	request.add_header('Content-type', 'application/x-www-form-urlencoded;charset=utf-8')
+	try:
+		file = urllib2.urlopen(request)
+	except URLError:
+		return False
+	file.close()
+	return True
+
 def main():
 	# parse options
 	option, args = doc_optparse.parse(__doc__)
 	
+	# deal with the trackback option
+	if option.trackback:
+		if len(args) < 4:
+			doc_optparse.exit()
+		url = args[0]
+		path = args[1]
+		kind = args[2]
+		request_token = args[3]
+		params = urllib.urlencode({ 'path': path, 'kind': kind, 'request_token': request_token })            
+		trackback(url, params)
+		return
+	
+	# otherwise, figure out the host and port
 	host = option.host or "localhost"
 	port = int(option.port or 8080)
 	
@@ -28,7 +53,7 @@ def main():
 	server = xrs((host, port))
 	server.register_introspection_functions()
 	
-	def submit(genotype_file, coverage_file=None, username=None, password=None):
+	def submit(genotype_file, coverage_file='', username=None, password=None):
 		# get genotype file
 		r = urllib2.Request(genotype_file)
 		if username is not None:
@@ -56,26 +81,33 @@ def submit(genotype_file, coverage_file=None, username=None, password=None):
 		return s
 	server.register_function(submit)
 	
-	def submit_local(genotype_file, coverage_file=None):
+	def submit_local(genotype_file, coverage_file='', trackback_url='', request_token=''):
 		# execute script
 		script_dir = os.path.dirname(sys.argv[0])
 		output_dir = os.path.dirname(genotype_file)
 		# letters refer to scripts; numbers refer to outputs
-		args = { 'A': os.path.join(script_dir, "gff_query_twobit.py"),
-		         'B': os.path.join(script_dir, "gff_query_dbsnp.py"),
+		args = { 'A': os.path.join(script_dir, "gff_twobit_query.py"),
+		         'B': os.path.join(script_dir, "gff_dbsnp_query.py"),
 		         'C': os.path.join(script_dir, "gff_nonsynonymous_filter.py"),
 		         'D': os.path.join(script_dir, "gff_omim_map.py"),
 		         'E': os.path.join(script_dir, "gff_hgmd_map.py"),
 		         'F': os.path.join(script_dir, "gff_morbid_map.py"),
+		         'G': os.path.join(script_dir, "gff_snpedia_map.py"),
+		         'H': os.path.join(script_dir, "json_allele_frequency_query.py"),
+		         'Z': os.path.join(script_dir, "server.py"),
 		         'in': genotype_file,
 		         'reference': REFERENCE_GENOME,
-		         '1': os.path.join(output_dir, "genotype.1.gff"),
-		         '2': os.path.join(output_dir, "genotype.2.gff"),
+		         'url': trackback_url,
+		         'token': request_token,
+		         '1': os.path.join(output_dir, "genotype.gff"),
+		         '2': os.path.join(output_dir, "genotype.dbsnp.gff"),
 		         '3': os.path.join(output_dir, "ns.gff"),
 		         '4': os.path.join(output_dir, "omim.json"),
 		         '5': os.path.join(output_dir, "hgmd.json"),
 		         '6': os.path.join(output_dir, "morbid.json"),
-		         '7': os.path.join(output_dir, "README") }
+		         '7': os.path.join(output_dir, "snpedia.json"),
+		         '8': "",
+		         '0': os.path.join(output_dir, "README") }
 		cmd = '''(
 		python '%(A)s' '%(in)s' '%(reference)s' > '%(1)s'
 		python '%(B)s' '%(1)s' > '%(2)s'
@@ -83,7 +115,14 @@ def submit_local(genotype_file, coverage_file=None):
 		python '%(D)s' '%(3)s' > '%(4)s'
 		python '%(E)s' '%(3)s' > '%(5)s'
 		python '%(F)s' '%(3)s' > '%(6)s'
-		touch '%(7)s'
+		python '%(G)s' '%(2)s' > '%(7)s'
+		python '%(H)s' '%(4)s' '%(5)s' '%(6)s' '%(7)s' --in-place
+		touch '%(0)s'
+		python '%(Z)s' -t '%(url)s' '%(4)s' 'out/omim' '%(token)s'
+		python '%(Z)s' -t '%(url)s' '%(5)s' 'out/hgmd' '%(token)s'
+		python '%(Z)s' -t '%(url)s' '%(6)s' 'out/morbid' '%(token)s'
+		python '%(Z)s' -t '%(url)s' '%(7)s' 'out/snpedia' '%(token)s'
+		python '%(Z)s' -t '%(url)s' '%(0)s' 'out/readme' '%(token)s'
 		)&''' % args
 		subprocess.call(cmd, shell=True)
 		return output_dir
diff --git a/core/snpinduse_to_gff.py b/core/snpinduse_to_gff.py
new file mode 100644
index 0000000..dbc9514
--- /dev/null
+++ b/core/snpinduse_to_gff.py
@@ -0,0 +1,48 @@
+#!/usr/bin/python
+# Filename: snpinduse_to_gff.py
+
+"""
+usage: %prog *.snpinduse.txt ...
+"""
+
+# Output GFF record for each entry in file(s)
+# ---
+# This code is part of the Trait-o-matic project and is governed by its license.
+
+import fileinput, os, sys
+
+# based on <http://www.peterbe.com/plog/uniqifiers-benchmark>
+def unique(seq): # not order preserving, but that's OK; we can sort it later
+	return {}.fromkeys(seq).keys()
+
+def main():
+	# return if we don't have the correct arguments
+	if len(sys.argv) < 2:
+		raise SystemExit(__doc__.replace("%prog", sys.argv[0]))
+	
+	for line in fileinput.input():
+		if not line.startswith("SNP:"):
+			continue
+		l = line.strip().split("|")
+		if l[2] != "SS_STRAND_FWD":
+			print >> sys.stderr, l[1]
+			continue
+		
+		snp = l[1].split(":")
+		details = snp[0].split("_")
+		alleles = snp[1].split("/")
+		
+		out_line = details[3]
+		out_line += "\t"
+		out_line += details[0]
+		out_line += "\tSNP\t"
+		out_line += details[4] + "\t" + details[4]
+		out_line += "\t.\t+\t.\t"
+		
+		out_line += "alleles " + '/'.join(unique(alleles))
+		out_line += ";id " + details[1]
+		
+		print out_line
+
+if __name__ == "__main__":
+	main()
\ No newline at end of file