Permalink
Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
executable file 379 lines (335 sloc) 12.8 KB
#!/usr/bin/php
<?php
;
// Copyright: see COPYING
// Authors: see git-blame(1)
chdir ('public_html');
require_once 'lib/setup.php';
require_once 'lib/genomes.php';
require_once 'lib/openid.php';
require_once 'lib/bp.php';
ini_set ("output_buffering", FALSE);
if ($_SERVER["argc"] == 2)
$want_genome = $_SERVER["argv"][1];
genomes_create_tables ();
openid_login_as_robot ("Genome Importing Robot");
theDb()->query ("CREATE TEMPORARY TABLE import_genomes_tmp (
variant_id BIGINT UNSIGNED NOT NULL,
genome_id BIGINT UNSIGNED NOT NULL,
chr CHAR(6) NOT NULL,
chr_pos INT UNSIGNED NOT NULL,
trait_allele CHAR(1),
taf FLOAT,
rsid BIGINT UNSIGNED,
dataset_id VARCHAR(16) NOT NULL,
zygosity ENUM('heterozygous','homozygous') NOT NULL DEFAULT 'heterozygous',
UNIQUE(variant_id,dataset_id),
INDEX(dataset_id),
INDEX(chr,chr_pos))");
theDb()->query ("CREATE TEMPORARY TABLE imported_datasets (
dataset_id VARCHAR(16) NOT NULL,
UNIQUE(dataset_id))");
// Dump current list of variants into import_genomes_tmp table
$and = "";
$params = array ($pgp_data_user, $public_data_user);
if (isset ($want_genome)) {
if (strlen($want_genome) == 40)
$and = "AND shasum=?";
else
$and = "AND private_genome_id=?";
$params[] = $want_genome;
}
$sql = "SELECT * FROM private_genomes WHERE oid IN (?,?) $and";
$public_genomes = theDb()->getAll ($sql, $params);
foreach ($public_genomes as $g) {
$datadir = "$gBackendBaseDir/upload/{$g['shasum']}-out";
print "Importing genome {$g['private_genome_id']} sha {$g['shasum']} nick \"{$g['nickname']}\" uploaded by {$g['oid']}\n";
// FIXME: add global_human_id column to the private_genomes table
if (isset ($g['global_human_id']) && 0 < strlen($g['global_human_id']))
$global_human_id = $g['global_human_id'];
else {
print "No global_human_id. Skipping.\n";
continue;
}
$fh = fopen ("$datadir/get-evidence.json", "r");
if (!$fh) { print "open($datadir/get-evidence.json) failed.\n"; continue; }
if (file_exists ("$datadir/lock")) { print "Skipping because backend is still processing.\n"; continue; }
$genome_id = evidence_get_genome_id ($global_human_id);
$dataset_id = substr($g['shasum'], 0, 16);
print "Using dataset_id $dataset_id\n";
$sex = ''; // TODO: store this somewhere? figure it out from chrX/Y?
theDb()->query ("UPDATE genomes SET name=? WHERE genome_id=?",
array ($g['nickname'], $genome_id));
theDb()->query ("REPLACE INTO datasets SET dataset_id=?, genome_id=?, sex=?, dataset_url=?",
array ($dataset_id, $genome_id, $sex,
"/genomes?{$g['private_genome_id']}"));
$ops = 0;
$count_existing_variants = 0;
$count_new_variants = 0;
while (($line = fgets ($fh)) !== false) {
++$ops;
if ($ops % 10000 == 0)
print $ops;
else if ($ops % 1000 == 0)
print ".";
$jvariant = json_decode($line, true);
if (!$jvariant) {
print "(skipping unparseable JSON at line $ops)\n";
continue;
}
if (isset($jvariant["gene"]) &&
isset($jvariant["amino_acid_change"])) {
$variant_names = array();
foreach (split (' ', $jvariant["amino_acid_change"])
as $aa_change) {
array_push ($variant_names,
$jvariant["gene"] . " " . $aa_change);
}
}
else if (isset($jvariant["dbSNP"]))
$variant_names = explode(',', $jvariant["dbSNP"]);
else if (isset($jvariant["dbsnp"]))
$variant_names = explode(',', $jvariant["dbsnp"]);
else {
if ($jvariant["GET-Evidence"])
print "(Why is this here?) $line";
continue;
}
foreach ($variant_names as $variant_name) {
$rsid = null;
if (preg_match ('/^rs(\d+)$/', $variant_name, $regs) ||
(isset ($jvariant["dbSNP"]) && preg_match ('/^(?:rs)?(\d+)/', $jvariant["dbSNP"], $regs)) ||
(isset ($jvariant["dbsnp"]) && preg_match ('/^(?:rs)?(\d+)/', $jvariant["dbsnp"], $regs)))
$rsid = $regs[1];
$variant_id = evidence_get_variant_id ($variant_name);
if ($variant_id)
++$count_existing_variants;
else {
// Create the variant, and an "initial edit/add" row in edits table
$variant_id = evidence_get_variant_id ($variant_name,
false, false, false,
true);
if (!$variant_id) {
error_log ("failed to add variant: $variant_name");
continue;
}
++$count_new_variants;
$edit_id = evidence_get_latest_edit ($variant_id,
0, 0, 0,
true);
}
$taf = null;
if (isset($jvariant["num"]) && $jvariant["denom"]>0)
// FIXME: taf is no longer in JSON so downstream needs to be updated before it can be used
$taf = $jvariant["num"] / $jvariant["denom"];
$allele = $jvariant["genotype"];
$zygosity = preg_match ('{^[ACGT]$}', $allele) ? "homozygous" : "heterozygous";
$trait_allele = preg_replace ('{[^ACGT]|'.$jvariant["ref_allele"].'}', '', $jvariant["genotype"]);
if (strlen($trait_allele) > 1) {
$trait_allele = bp_flatten ($trait_allele); // compound het
$taf = null;
}
$ok = theDb()->query ("INSERT IGNORE INTO import_genomes_tmp SET variant_id=?, genome_id=?, chr=?, chr_pos=?, trait_allele=?, taf=?, rsid=?, dataset_id=?, zygosity=?",
array ($variant_id,
$genome_id,
$jvariant["chromosome"],
$jvariant["coordinates"],
$trait_allele,
$taf,
$rsid,
$dataset_id,
$zygosity));
if (theDb()->isError($ok)) die ($line."\n".$ok->getMessage()."\n");
}
}
print "\n$count_existing_variants existing and $count_new_variants new variants.\n";
fclose ($fh);
if (file_exists ("$datadir/ns.gff.gz"))
$fh = gzopen ($nsfile = "$datadir/ns.gff.gz", "r");
else
$fh = fopen ($nsfile = "$datadir/ns.gff", "r");
if (!$fh) { print "open($nsfile) failed.\n"; continue; }
if (file_exists ("$datadir/lock")) { print "Skipping because backend is still processing.\n"; continue; }
print "\nReading $nsfile\n";
$ops = 0;
$count_existing_variants = 0;
$count_new_variants = 0;
while (($line = fgets ($fh)) !== false) {
if ($line[0] == "#")
continue;
++$ops;
if ($ops % 100000 == 0)
print $ops;
else if ($ops % 10000 == 0)
print ".";
$gff = explode ("\t", $line);
$rsid = null;
if (preg_match ('{db_xref dbsnp(?:\.\d+)?:rs(\d+)}', $gff[8], $regs))
$rsid = $regs[1];
if (preg_match ('{amino_acid ([^;\n]+)}', $gff[8], $regs)) {
$variant_names = array();
foreach (explode ("/", $regs[1]) as $gene_aa_aa) {
$gene_aa_aa = split(' ', $gene_aa_aa);
$gene = array_shift ($gene_aa_aa);
foreach ($gene_aa_aa as $aa) {
array_push ($variant_names, "$gene $aa");
}
}
}
else if ($rsid)
// If we wanted to add all dbsnp variants, we would do...
// $variant_names = array ("rs$rsid");
// ...but we don't.
continue;
else
continue;
if (preg_match ('{alleles ([A-Z,/]+)}', $gff[8], $regs))
$allele = $regs[1];
else continue;
if (preg_match ('{ref_allele ([ACGT])}', $gff[8], $regs))
$ref_allele = $regs[1];
else continue;
$chromosome = $gff[0];
$position = $gff[3];
$zygosity = preg_match ('{^[ACGT]$}', $allele) ? "homozygous" : "heterozygous";
$trait_allele = preg_replace ('{[^ACGT]|'.$ref_allele.'}', '', $allele);
if (strlen($trait_allele) > 1) {
$trait_allele = bp_flatten ($trait_allele); // compound het
}
foreach ($variant_names as $variant_name) {
$variant_id = evidence_get_variant_id ($variant_name);
if ($variant_id)
++$count_existing_variants;
else if (!preg_match ('{^rs\d+$}', $variant_name)) {
++$count_new_variants;
// Create the variant, and an "initial edit/add" row in edits table
$variant_id = evidence_get_variant_id ($variant_name,
false, false, false,
true);
if (!$variant_id) {
error_log ("failed to add variant: $variant_name");
continue;
}
$edit_id = evidence_get_latest_edit ($variant_id,
0, 0, 0,
true);
}
$ok = theDb()->query ("INSERT IGNORE INTO import_genomes_tmp SET variant_id=?, genome_id=?, chr=?, chr_pos=?, trait_allele=?, taf=?, rsid=?, dataset_id=?, zygosity=?",
array ($variant_id,
$genome_id,
$chromosome,
$position,
$trait_allele,
$taf,
$rsid,
$dataset_id,
$zygosity));
if (theDb()->isError($ok)) die ($line."\n".$ok->getMessage());
}
}
print "\n$count_existing_variants existing and $count_new_variants new variants.\n";
fclose ($fh);
theDb()->query ("INSERT INTO imported_datasets SET dataset_id=?",
array ($dataset_id));
}
print "Adding {dataset,variant} associations to variant_occurs table...";
theDb()->query ("REPLACE INTO variant_occurs (variant_id, rsid, dataset_id, zygosity, chr, chr_pos, allele) SELECT variant_id, rsid, dataset_id, zygosity, chr, chr_pos, trait_allele FROM import_genomes_tmp");
print theDb()->affectedRows();
print "\n";
// Look for new variants in import_genomes_tmp that aren't in
// snap_latest, and add suitable edits
print "Looking for new {genome,variant} associations and adding them as edits...";
$timestamp = theDb()->getOne ("SELECT NOW()");
theDb()->query ("INSERT INTO edits
(variant_id, genome_id, article_pmid, disease_id, is_draft, edit_oid, edit_timestamp)
SELECT DISTINCT i.variant_id, i.genome_id, 0, 0, 0, ?, ?
FROM import_genomes_tmp i
LEFT JOIN snap_latest s ON s.variant_id = i.variant_id AND s.article_pmid = '0' AND s.disease_id = '0' AND s.genome_id = i.genome_id
WHERE s.variant_id IS NULL",
array (getCurrentUser("oid"), $timestamp));
print theDb()->affectedRows();
print "\n";
print "Pushing new associations to snap_latest...";
theDb()->query ("INSERT IGNORE INTO snap_latest SELECT * FROM edits WHERE edit_oid=? and edit_timestamp=?",
array (getCurrentUser("oid"), $timestamp));
print theDb()->affectedRows();
print "\n";
// Look for variant+dataset associations in variant_occurs that are no
// longer supported by the latest imported data in import_genomes_tmp
print "Finding variant_occurs rows disputed by this import...";
theDb()->query ("
CREATE TEMPORARY TABLE variant_occurs_not
SELECT v.variant_id, v.rsid, v.dataset_id FROM variant_occurs v
LEFT JOIN imported_datasets
ON v.dataset_id=imported_datasets.dataset_id
LEFT JOIN datasets d
ON v.dataset_id=d.dataset_id
LEFT JOIN import_genomes_tmp i
ON v.variant_id=i.variant_id
AND d.dataset_id=i.dataset_id
WHERE imported_datasets.dataset_id IS NOT NULL
AND i.variant_id IS NULL
");
print theDb()->affectedRows();
print "\n";
// Delete the no-longer-occurring variants from variant_occurs
print "Deleting disputed rows from variant_occurs...";
theDb()->query ("DELETE v.*
FROM variant_occurs_not n, variant_occurs v
WHERE n.variant_id=v.variant_id
AND n.dataset_id=v.dataset_id");
print theDb()->affectedRows();
print "\n";
// For each deleted variant+genome_id assoc, if the latest edit was
// made by this program (i.e., nobody has written any comments about
// this variant+genome_id association), and there is no longer any
// evidence in variant_occurs supporting it, add a "delete" edit and
// remove the entry from snap_latest.
print "Making list of {genome,variant} pairs to check...";
$q = theDb()->query ("
CREATE TEMPORARY TABLE check_genome_variant
SELECT DISTINCT genome_id, variant_id
FROM variant_occurs_not v
LEFT JOIN datasets d
ON d.dataset_id=v.dataset_id");
if (theDb()->isError($q)) print $q->getMessage();
print ($count_check_pairs = theDb()->affectedRows());
print "\n";
print "Entering \"delete\" edits for \"genome\" comment which have no supporting evidence after deleting the disputed rows and have not been edited by users...";
$q = theDb()->query ("
INSERT IGNORE INTO edits
(variant_id, genome_id, article_pmid, previous_edit_id, is_draft, is_delete,
edit_oid, edit_timestamp)
-- find all variants deleted from these datasets...
SELECT old.variant_id, old.genome_id, 0, old.edit_id, 0, 1, ?, ?
FROM check_genome_variant del
-- ...find 'genome X added by robot' edits in snap_latest...
LEFT JOIN snap_latest old
ON old.variant_id=del.variant_id
AND old.article_pmid=0
AND old.genome_id=del.genome_id
AND old.disease_id=0
AND old.edit_oid=?
-- ...find any remaining rows linking this variant to this genome...
LEFT JOIN variant_occurs v
ON v.variant_id=old.variant_id
AND v.dataset_id IN (SELECT dataset_id FROM datasets WHERE genome_id = del.genome_id)
-- Submit a 'delete' entry if the entry hasn't been touched since it was added by the robot...
WHERE old.edit_id IS NOT NULL
-- ...and none of the remaining datasets mention the variant
AND v.variant_id IS NULL
", array (getCurrentUser("oid"), $timestamp,
getCurrentUser("oid")));
if (theDb()->isError($q)) print $q->getMessage();
print ($count_removals = theDb()->affectedRows());
print "\n";
if ($count_removals > 0)
{
print "Really removing them from snap_latest...";
theDb()->query ("
DELETE FROM snap_latest
WHERE edit_id IN (SELECT previous_edit_id FROM edits WHERE edit_oid=? AND edit_timestamp=? AND is_delete=1)
", array (getCurrentUser("oid"), $timestamp));
print theDb()->affectedRows();
print "\n";
}