Skip to content

Commit

Permalink
Merge pull request #1152 from tripal/1151-tv3-sequence_deflines
Browse files Browse the repository at this point in the history
Missing sequence definition line
  • Loading branch information
laceysanderson committed Jan 27, 2021
2 parents ee38a9c + 8f131f1 commit a453a45
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 53 deletions.
16 changes: 8 additions & 8 deletions tripal_chado/api/modules/tripal_chado.feature.api.inc
Original file line number Diff line number Diff line change
Expand Up @@ -785,21 +785,21 @@ function chado_get_fasta_defline($feature, $notes = '', $featureloc = NULL, $typ

// Construct the definition line.
$defline = $feature->uniquename . " " .
'ID=' . $feature->uniquename . "|" .
'Name=' . $feature->name . "|" .
'organism=' . $feature->organism_id->genus . " " . $feature->organism_id->species . "|" .
'type=' . $type . '|';
'ID=' . $feature->uniquename . "; " .
'Name=' . $feature->name . "; " .
'organism=' . $feature->organism_id->genus . " " . $feature->organism_id->species . "; " .
'type=' . $type . '; ';
if ($length > 0) {
$defline .= "length=" . $length . "bp|";
$defline .= "length=" . $length . "bp; ";
}
if ($featureloc) {
$defline .= "location=Sequence derived from alignment at " . chado_get_location_string($featureloc);
$defline .= "location=Sequence derived from: " . chado_get_location_string($featureloc);
$defline .= " (" . $featureloc->srcfeature_id->organism_id->genus . " " . $featureloc->srcfeature_id->organism_id->species . ")|";
}
if ($notes) {
$defline .= "Notes=$notes|";
$defline .= "Notes=$notes; ";
}
$defline = substr($defline, 0, -1); // remove the trailing |
$defline = substr($defline, 0, -2); // remove the trailing "; "
return $defline;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ class data__sequence_record extends ChadoField {
$field_name = $this->field['field_name'];
$feature = $entity->chado_record;

// Initialize the field items array.
$entity->{$field_name}['und'] = [];

// Add the primary sequence from the Chado feature table, residues column.
$feature = chado_expand_var($feature, 'field', 'feature.residues');

Expand All @@ -84,8 +87,8 @@ class data__sequence_record extends ChadoField {
// If this is an mRNA feature then add the gene parent, full length
// mRNA, CDS and protein.
if ($feature->type_id->name == 'mRNA') {
$this->addGeneParent($entity, $feature, $field_name);
$featurelocs = $this->addFLmRNA($entity, $feature, $field_name);
$this->addGeneParent($entity, $feature, $field_name);
if (count($featurelocs) > 0) {
$this->addCDS($entity, $feature, $field_name, $featurelocs);
$this->addProtein($entity, $feature, $field_name);
Expand All @@ -111,6 +114,7 @@ class data__sequence_record extends ChadoField {
$seq_coords_term = 'data:2012';
$seq_length_term = chado_get_semweb_term('feature', 'seqlen');
$seq_md5sum_term = chado_get_semweb_term('feature', 'md5checksum');
$fasta_defline = 'local:fasta_definition';

$options = [
'derive_from_parent' => 1,
Expand All @@ -123,12 +127,13 @@ class data__sequence_record extends ChadoField {

$entity->{$field_name}['und'][]['value'] = [
$sequence_term => $seq['residues'],
$label_term => ucfirst(preg_replace('/_/', ' ', $feature->type_id->name)) . ' Sequence (' . number_format(strlen($seq['residues'])) . 'bp)',
$description_term => 'This sequence was extracted from the reference sequence location at ' . $coords['schema:description'] . '.',
$label_term => 'Derived ' . ucfirst(preg_replace('/_/', ' ', $feature->type_id->name)) . ' Sequence (' . number_format(strlen($seq['residues'])) . 'bp)',
$description_term => 'This sequence was derived by extracting bases from the reference sequence location at ' . $coords['schema:description'] . '.',
$seq_coords_term => $coords,
$seq_length_term => strlen($seq['residues']),
$seq_md5sum_term => md5($seq['residues']),
$type_term => $feature->type_id->name
$type_term => $feature->type_id->name,
$fasta_defline => $seq['defline'],
];

$featurelocs[] = $featureloc;
Expand All @@ -137,10 +142,7 @@ class data__sequence_record extends ChadoField {
}

/**
*
* @param unknown $entity
* @param unknown $feature
* @param unknown $field_name
* Adds the primary sequence from the feature.residues column.
*/
private function addPrimary(&$entity, $feature, $field_name) {

Expand All @@ -150,25 +152,24 @@ class data__sequence_record extends ChadoField {
$sequence_term = chado_get_semweb_term('feature', 'residues');
$seq_length_term = chado_get_semweb_term('feature', 'seqlen');
$seq_md5sum_term = chado_get_semweb_term('feature', 'md5checksum');
$fasta_defline = 'local:fasta_definition';

if ($feature->residues) {
$entity->{$field_name}['und'][]['value'] = [
$label_term => 'Primary ' . preg_replace('/_/', ' ', $feature->type_id->name) . 'Sequence (' . number_format($feature->seqlen) . 'bp)',
$label_term => 'Primary ' . preg_replace('/_/', ' ', $feature->type_id->name) . ' Sequence (' . number_format($feature->seqlen) . 'bp)',
$description_term => 'This is the primary representative sequence for this feature.',
$sequence_term => $feature->residues,
$seq_length_term => $feature->seqlen,
$seq_md5sum_term => $feature->md5checksum,
$type_term => $feature->type_id->name
$type_term => $feature->type_id->name,
$fasta_defline => chado_get_fasta_defline($feature)
];
}
}


/**
*
* @param unknown $entity
* @param unknown $feature
* @param unknown $field_name
* Adds the full length mRNA sequence (only for gene features).
*/
private function addFLmRNA(&$entity, $feature, $field_name) {
$label_term = 'rdfs:label';
Expand All @@ -178,6 +179,7 @@ class data__sequence_record extends ChadoField {
$seq_coords_term = 'data:2012';
$seq_length_term = chado_get_semweb_term('feature', 'seqlen');
$seq_md5sum_term = chado_get_semweb_term('feature', 'md5checksum');
$fasta_defline = 'local:fasta_definition';

// Sometimes an mRNA may have only exons, only CDS or both exons and
// CDS. We need to know which.
Expand Down Expand Up @@ -212,12 +214,13 @@ class data__sequence_record extends ChadoField {

$entity->{$field_name}['und'][]['value'] = [
$sequence_term => $seq['residues'],
$label_term => 'Full Length mRNA Sequence (' . number_format(strlen($seq['residues'])) . 'bp)',
$description_term => 'This full length mRNA sequence was extracted from the reference sequence location at ' . $coords['schema:description'] . ' and contains: ' . implode(', ', $types) ,
$label_term => 'Derived mRNA Sequence (' . number_format(strlen($seq['residues'])) . 'bp)',
$description_term => 'This full length mRNA sequence was derived by extracting bases from the reference sequence location at ' . $coords['schema:description'] . ' and contains: ' . implode(', ', $types) ,
$seq_coords_term => $coords,
$seq_length_term => strlen($seq['residues']),
$seq_md5sum_term => md5($seq['residues']),
$type_term => 'mRNA'
$type_term => 'mRNA',
$fasta_defline => $seq['defline']
];

$featurelocs[] = $featureloc;
Expand All @@ -226,9 +229,7 @@ class data__sequence_record extends ChadoField {
}

/**
*
* @param unknown $featureloc_id
* @return unknown
* Retrieves the feature location information.
*/
private function getFeatureLoc($featureloc_id) {
$featurelocs_sql = "
Expand All @@ -241,9 +242,7 @@ class data__sequence_record extends ChadoField {
}

/**
*
* @param unknown $featureloc
* @return string[]|number[]|NULL[]|unknown[]
* Gets the sequence location string for a featureloc record.
*/
private function getSequenceCoords($featureloc) {
$description_term = 'schema:description';
Expand Down Expand Up @@ -277,10 +276,7 @@ class data__sequence_record extends ChadoField {
}

/**
*
* @param unknown $entity
* @param unknown $feature
* @param unknown $field_name
* Adds the CDS sequence (only for an mRNA feature)
*/
private function addCDS(&$entity, $feature, $field_name, $featurelocs) {
$label_term = 'rdfs:label';
Expand All @@ -289,6 +285,7 @@ class data__sequence_record extends ChadoField {
$sequence_term = chado_get_semweb_term('feature', 'residues');
$seq_length_term = chado_get_semweb_term('feature', 'seqlen');
$seq_md5sum_term = chado_get_semweb_term('feature', 'md5checksum');
$fasta_defline = 'local:fasta_definition';

foreach ($featurelocs as $featureloc) {
$cds_feature = [
Expand All @@ -311,20 +308,18 @@ class data__sequence_record extends ChadoField {
$entity->{$field_name}['und'][]['value'] = [
$label_term => 'Coding Sequence (' . number_format($cds_sequence[0]['length']) . 'bp)',
$sequence_term => $cds_sequence[0]['residues'],
$description_term => 'This CDS was extracted from the reference sequence location at ' . $coords['schema:description'] . '.' ,
$description_term => 'This CDS was derived by extracting bases from the reference sequence location at ' . $coords['schema:description'] . '.' ,
$seq_length_term => $cds_sequence[0]['length'],
$seq_md5sum_term => md5($cds_sequence[0]['residues']),
$type_term => 'CDS'
$type_term => 'CDS',
$fasta_defline => $cds_sequence[0]['defline']
];
}
}
}

/**
*
* @param unknown $entity
* @param unknown $feature
* @param unknown $field_name
* Adds the sequence for a gene parent (only for gene children).
*/
private function addGeneParent(&$entity, $feature, $field_name) {
$label_term = 'rdfs:label';
Expand All @@ -334,6 +329,7 @@ class data__sequence_record extends ChadoField {
$sequence_term = chado_get_semweb_term('feature', 'residues');
$seq_length_term = chado_get_semweb_term('feature', 'seqlen');
$seq_md5sum_term = chado_get_semweb_term('feature', 'md5checksum');
$fasta_defline = 'local:fasta_definition';

$sql = "
SELECT FO.*
Expand All @@ -355,7 +351,8 @@ class data__sequence_record extends ChadoField {
$label_term => 'The gene sequence.',
$seq_length_term => strlen($gene->residues),
$seq_md5sum_term => md5($gene->residues),
$type_term => 'polypeptide'
$type_term => 'gene',
$fasta_defline => chado_get_fasta_defline($gene),
];
}
else {
Expand All @@ -364,24 +361,22 @@ class data__sequence_record extends ChadoField {
$featureloc = $this->getFeatureLoc($seq['featureloc_id']);
$coords = $this->getSequenceCoords($featureloc);
$entity->{$field_name}['und'][]['value'] = [
$label_term => 'Gene Sequence (' . number_format(strlen($seq['residues'])) . 'bp)',
$label_term => 'Derived Gene Sequence (' . number_format(strlen($seq['residues'])) . 'bp)',
$sequence_term => $seq['residues'],
$description_term => 'This gene sequence was extracted from the reference sequence location at ' . $coords['schema:description'] . '.' ,
$description_term => 'This gene sequence was derived by extracting bases from the reference sequence location at ' . $coords['schema:description'] . '.' ,
$seq_coords_term => $coords,
$seq_length_term => strlen($seq['residues']),
$seq_md5sum_term => md5($seq['residues']),
$type_term => 'gene'
$type_term => 'gene',
$fasta_defline => $seq['defline'],
];
}
}
}
}

/**
*
* @param unknown $entity
* @param unknown $feature
* @param unknown $field_name
* Adds the protein sequence (only for mRNA features).
*/
private function addProtein(&$entity, $feature, $field_name) {
$label_term = 'rdfs:label';
Expand All @@ -390,6 +385,7 @@ class data__sequence_record extends ChadoField {
$sequence_term = chado_get_semweb_term('feature', 'residues');
$seq_length_term = chado_get_semweb_term('feature', 'seqlen');
$seq_md5sum_term = chado_get_semweb_term('feature', 'md5checksum');
$fasta_defline = 'local:fasta_definition';

$sql = "
SELECT F.*
Expand All @@ -412,7 +408,8 @@ class data__sequence_record extends ChadoField {
$description_term => 'The protein sequence.',
$seq_length_term => strlen($protein->residues),
$seq_md5sum_term => md5($protein->residues),
$type_term => 'polypeptide'
$type_term => 'polypeptide',
$fasta_defline => chado_get_fasta_defline($protein),
];
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class data__sequence_record_formatter extends ChadoFieldFormatter {
$fmax_term = chado_get_semweb_term('featureloc', 'fmax');
$strand_term = chado_get_semweb_term('featureloc', 'strand');
$phase_term = chado_get_semweb_term('featureloc', 'phase');
$fasta_defline = 'local:fasta_definition';

$content = [];

Expand All @@ -36,6 +37,7 @@ class data__sequence_record_formatter extends ChadoFieldFormatter {

$num_bases = 50;
$residues = '<pre class="residues-formatter">';
$residues .= ">" . $item['value'][$fasta_defline] . "<br>";
$residues .= wordwrap($item['value'][$sequence_term], $num_bases, "<br>", TRUE);
$residues .= '</pre>';

Expand All @@ -49,7 +51,6 @@ class data__sequence_record_formatter extends ChadoFieldFormatter {

}


if (empty($content)) {
$element[0] = [
'#type' => 'markup',
Expand Down
11 changes: 11 additions & 0 deletions tripal_chado/includes/tripal_chado.semweb.inc
Original file line number Diff line number Diff line change
Expand Up @@ -1457,6 +1457,17 @@ function tripal_chado_populate_vocab_LOCAL() {
'db_name' => 'local',
]);

//--------
// Feature
//--------
$term = chado_insert_cvterm([
'name' => 'fasta_definition',
'definition' => 'The definition line for a FASTA formatted sequence',
'cv_name' => 'local',
'is_relationship' => 0,
'db_name' => 'local',
]);

//--------------
// Feature Map
//--------------
Expand Down
23 changes: 21 additions & 2 deletions tripal_chado/tripal_chado.install
Original file line number Diff line number Diff line change
Expand Up @@ -2081,7 +2081,7 @@ function tripal_chado_update_7338() {
*/
function tripal_chado_update_7339() {
try {
$term = chado_insert_cvterm([
chado_insert_cvterm([
'id' => 'data:0849',
'name' => 'Sequence record',
'cv_name' => 'EDAM',
Expand All @@ -2092,4 +2092,23 @@ function tripal_chado_update_7339() {
$error = $e->getMessage();
throw new DrupalUpdateException('Could not perform update: '. $error);
}
}
}


/**
* Adds the "FASTA definition" cvterm for the data__sequence_record field.
*/
function tripal_chado_update_7340() {
  // Describe the controlled vocabulary term up front so the insert call
  // below stays short. The term lives in the 'local' cv and db.
  $fasta_definition_term = [
    'name' => 'fasta_definition',
    'definition' => 'The definition line for a FASTA formatted sequence',
    'cv_name' => 'local',
    'is_relationship' => 0,
    'db_name' => 'local',
  ];

  try {
    chado_insert_cvterm($fasta_definition_term);
  }
  catch (\PDOException $exception) {
    // Re-throw database failures as an update exception carrying the
    // original database error message.
    throw new DrupalUpdateException('Could not perform update: ' . $exception->getMessage());
  }
}

0 comments on commit a453a45

Please sign in to comment.