Skip to content

Commit

Permalink
Make EXIF Location optional (defaults to enabled).
Browse files Browse the repository at this point in the history
  • Loading branch information
anjackson committed Oct 19, 2018
1 parent dd4d77d commit c2dbff8
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 2 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@

**NOTE** Generally, we only add terms to the Solr schema, so it should usually be compatible with previous versions (i.e. clients should be able to query across both without modification). However, there have been a small number of fixes which unfortunately required breaking changes that you may need to be aware of or work around. e.g. [hash becomes single-valued](https://github.com/ukwa/webarchive-discovery/issues/95)... TBA...


3.1.0
-----

Expand All @@ -16,6 +20,7 @@
-----

**NOTE** The changes to the schema mean this version is not compatible with 2.1.0 indexes. We've also moved to Java 7.

* Validation/statistics for WARC file name matching rules, given a list of WARC file names
* Added some experimental face detection code with tests.
* Fixed licence headers #182
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ public class TikaPayloadAnalyser extends AbstractPayloadAnalyser {
/** Extract all metadata? */
private boolean extractAllMetadata;

private boolean extractExifLocation;

private boolean passUriToFormatTools = false;

/* --- --- --- --- */
Expand Down Expand Up @@ -134,6 +136,10 @@ public void configure(Config conf) {
.getBoolean("warc.index.tika.extract_all_metadata");
log.info("Config: extractAllMetadata "+this.extractAllMetadata);

this.extractExifLocation = conf
.getBoolean("warc.index.tika.extract_exif_location");
log.info("Config: extractExifLocation " + this.extractExifLocation);

this.useBoilerpipe = conf.getBoolean("warc.index.tika.use_boilerpipe");
log.info("Config: useBoilerpipe " + this.useBoilerpipe);

Expand Down Expand Up @@ -408,7 +414,9 @@ public SolrRecord extract( String source, SolrRecord solr, InputStream is, Strin
// This potentially results in multiple author, which is valid
solr.addField(SolrFields.SOLR_AUTHOR, exif_artist);
}


if (this.extractExifLocation) {

String exif_latitude = metadata.get("GPS Latitude");
String exif_longitude = metadata.get("GPS Longitude");

Expand All @@ -430,6 +438,7 @@ public SolrRecord extract( String source, SolrRecord solr, InputStream is, Strin
log.warn("error parsing exif gps data. latitude:"+exif_latitude +" longitude:"+exif_longitude);
}
}
}
}
//End image exif metadata

Expand Down
4 changes: 3 additions & 1 deletion warc-indexer/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
# The random sampling rate:
# (where '1' means 'extract from all images',
# and '100' would mean 'extract from 1 out of every 100 images')
"analysisSamplingRate": 1
"analysisSamplingRate": 50
}

# Language profiles to load for langdetect
Expand Down Expand Up @@ -133,6 +133,8 @@
"css",
"octet-stream"
],
# Should we extract EXIF location data:
"extract_exif_location": true,
# Should we extract all the available metadata:
"extract_all_metadata": false
},
Expand Down

0 comments on commit c2dbff8

Please sign in to comment.