added enrichment of synonyms and vocabularies for imported documents
During surrogate reading, the synonym and vocabulary attributes from the dump
are removed during the import process and replaced by newly detected attributes
according to the settings of the importing YaCy peer.
This may cause all such attributes to be removed if the importing peer
has no synonyms and/or no vocabularies defined.
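
In essence, the importer now re-runs the Tokenizer on each imported document and overwrites the dump's synonym and vocabulary fields with locally computed values. A condensed sketch of that flow, taken from the Switchboard change below (surrogate, scraper and the index reference are assumed to be set up as in the surrounding processSurrogate code):

    final DigestURL root = new DigestURL(
            (String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()),
            ASCII.getBytes((String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName())));
    final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
    if (text != null && text.length() > 0) {
        // tokenize the text with the local dictionaries to detect vocabularies and synonyms
        final Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper);
        final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
        // on a peer without synonyms/vocabularies both collections are empty,
        // so the attributes from the dump are simply dropped
        index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets);
    }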
Orbiter committed Jul 1, 2015
1 parent 7829480 commit 90f75c8
Showing 6 changed files with 67 additions and 40 deletions.
17 changes: 8 additions & 9 deletions source/net/yacy/document/Document.java
@@ -253,24 +253,23 @@ public void addTags(Set<String> tags) {
* @param tags
*/
protected void addMetatags(Map<String, Set<Tagging.Metatag>> tags) {
//String subject = YaCyMetadata.hashURI(this.source.hash());
//for (String s: this.keywords) {
// tags.remove(s);
//}
this.generic_facets.putAll(computeGenericFacets(tags));
}

public static Map<String, Set<String>> computeGenericFacets(Map<String, Set<Tagging.Metatag>> tags) {
Map<String, Set<String>> gf = new HashMap<String, Set<String>>();
for (Map.Entry<String, Set<Tagging.Metatag>> e: tags.entrySet()) {
Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(e.getKey());
if (vocabulary == null) continue;
//String objectspace = vocabulary.getObjectspace();
//StringBuilder sb = new StringBuilder(e.getValue().size() * 20);
Set<String> objects = new HashSet<String>();
for (Tagging.Metatag s: e.getValue()) {
objects.add(s.getObject());
//sb.append(',').append(s.getObject());
}
this.generic_facets.put(vocabulary.getName(), objects);
gf.put(vocabulary.getName(), objects);
}
return gf;
}

public String[] dc_subject() {
// sort out doubles and empty words
final TreeSet<String> hs = new TreeSet<String>();
4 changes: 4 additions & 0 deletions source/net/yacy/document/Tokenizer.java
@@ -237,5 +237,9 @@ public List<String> synonyms() {
for (String s: this.synonyms) l.add(s);
return l;
}

public Map<String, Set<Tagging.Metatag>> tags() {
return this.tags;
}

}
4 changes: 3 additions & 1 deletion source/net/yacy/document/VocabularyScraper.java
@@ -40,8 +40,10 @@ public VocabularyScraper() {
this.vocMap = new ConcurrentHashMap<>();
}

/**
* @param init must be a property list of property lists: the key of the top property list is the name of the vocabulary, the name of the embedded property list is the entity class and the value of the embedded property is the entity name
*/
public VocabularyScraper(JSONObject init) {
// init must be a property list of property lists: the key of the top property list is the name of the vocabulary, the name of the embedded property list is the entity class and the value of the embedded property is the entity name
this.scraperDefinition = init == null ? new JSONObject() : init;
this.vocMap = new ConcurrentHashMap<>();
if (this.scraperDefinition.length() == 0) {
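
For illustration, the init parameter described in the new javadoc could be built like this (a minimal sketch; the vocabulary name "Products", the entity class "product-name" and the entity name "Notebook" are made-up values, and org.json's JSONObject API is assumed):

    import org.json.JSONObject;

    // hypothetical scraper definition: one vocabulary ("Products") whose embedded
    // property list maps the entity class "product-name" to the entity name "Notebook"
    JSONObject init = new JSONObject()
            .put("Products", new JSONObject().put("product-name", "Notebook"));
    VocabularyScraper scraper = new VocabularyScraper(init);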
2 changes: 1 addition & 1 deletion source/net/yacy/document/WordTokenizer.java
@@ -72,7 +72,7 @@ public StringBuilder nextElement() {
final StringBuilder r = (this.buffer == null) ? null : this.buffer;
this.buffer = nextElement0();
// put word to words statistics cache
if (this.meaningLib != null) WordCache.learn(r);
if (this.meaningLib != null && r != null) WordCache.learn(r);
return r;
}

19 changes: 18 additions & 1 deletion source/net/yacy/search/Switchboard.java
@@ -154,7 +154,9 @@
import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.Parser.Failure;
import net.yacy.document.Tokenizer;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.parser.audioTagParser;
@@ -1996,10 +1998,25 @@ public void processSurrogate(final InputStream is, final String name) throws IOE
indexer[t] = new Thread() {
@Override
public void run() {
VocabularyScraper scraper = new VocabularyScraper();
SolrInputDocument surrogate;
while ((surrogate = reader.take()) != SurrogateReader.POISON_DOCUMENT ) {
// check if url is in accepted domain
assert surrogate != null;
try {
// enrich the surrogate
final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName())));
final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
if (text != null && text.length() > 0) {
// run the tokenizer on the text to get vocabularies and synonyms
final Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper);
final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
// overwrite the given vocabularies and synonyms with new computed ones
Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets);
}
} catch (MalformedURLException e) {
ConcurrentLog.logException(e);
}
// write the surrogate into the index
Switchboard.this.index.putDocument(surrogate);
if (shallTerminate()) break;
}
61 changes: 33 additions & 28 deletions source/net/yacy/search/schema/CollectionConfiguration.java
@@ -82,6 +82,7 @@
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.SentenceReader;
import net.yacy.document.Tokenizer;
import net.yacy.document.content.DCEntry;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
@@ -301,7 +302,7 @@ public SolrInputDocument metadata2solr(final URIMetadataNode md) {

String keywords = md.dc_subject();
Bitfield flags = md.flags();
if (flags.get(Condenser.flag_cat_indexof)) {
if (flags.get(Tokenizer.flag_cat_indexof)) {
if (keywords == null || keywords.isEmpty()) keywords = "indexof"; else {
if (keywords.indexOf(',') > 0) keywords += ", indexof"; else keywords += " indexof";
}
@@ -511,10 +512,6 @@ public SolrVector yacy2solr(
}
add(doc, CollectionSchema.keywords, keywords);
}
if (allAttr || contains(CollectionSchema.synonyms_sxt)) {
List<String> synonyms = condenser.synonyms();
add(doc, CollectionSchema.synonyms_sxt, synonyms);
}

// unique-fields; these values must be corrected during postprocessing. (the following logic is !^ (not-xor) but I prefer to write it that way as it is)
add(doc, CollectionSchema.http_unique_b, setUnique || UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL.isHTTPS() : digestURL.isHTTP()); // this must be corrected afterwards during storage!
@@ -993,29 +990,7 @@ public SolrVector yacy2solr(
if (allAttr || contains(CollectionSchema.videolinkscount_i)) add(doc, CollectionSchema.videolinkscount_i, document.getVideolinks().size());
if (allAttr || contains(CollectionSchema.applinkscount_i)) add(doc, CollectionSchema.applinkscount_i, document.getApplinks().size());

// write generic navigation
// there are no pre-defined solr fields for navigation because the vocabulary is generic
// we use dynamically allocated solr fields for this.
// It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
List<String> vocabularies = new ArrayList<>();
for (Map.Entry<String, Set<String>> facet: document.getGenericFacets().entrySet()) {
String facetName = facet.getKey();
Set<String> facetValues = facet.getValue();
int count = facetValues.size();
if (count == 0) continue;
int logcount = (int) (Math.log(count) / Math.log(2));
Integer[] counts = new Integer[logcount + 1]; for (int i = 0; i <= logcount; i++) counts[i] = i;
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_TERMS_SUFFIX, facetValues.toArray(new String[count]));
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_COUNT_SUFFIX, facetValues.size());
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX, logcount);
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX, counts);
vocabularies.add(facetName);
}
if ((allAttr || contains(CollectionSchema.vocabularies_sxt)) && vocabularies.size() > 0) {
add(doc, CollectionSchema.vocabularies_sxt, vocabularies);
}


// document post-processing
if ((allAttr || contains(CollectionSchema.process_sxt)) && processTypes.size() > 0) {
List<String> p = new ArrayList<String>();
for (ProcessType t: processTypes) p.add(t.name());
@@ -1024,8 +999,38 @@ public SolrVector yacy2solr(
add(doc, CollectionSchema.harvestkey_s, sourceName);
}
}

// document enrichments (synonyms, facets)
enrich(doc, condenser.synonyms(), document.getGenericFacets());
return doc;
}

public void enrich(SolrInputDocument doc, List<String> synonyms, Map<String, Set<String>> genericFacets) {
if (this.isEmpty() || contains(CollectionSchema.vocabularies_sxt)) {
// write generic navigation
// there are no pre-defined solr fields for navigation because the vocabulary is generic
// we use dynamically allocated solr fields for this.
// It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
List<String> vocabularies = new ArrayList<>();
for (Map.Entry<String, Set<String>> facet: genericFacets.entrySet()) {
String facetName = facet.getKey();
Set<String> facetValues = facet.getValue();
int count = facetValues.size();
if (count == 0) continue;
int logcount = (int) (Math.log(count) / Math.log(2));
Integer[] counts = new Integer[logcount + 1]; for (int i = 0; i <= logcount; i++) counts[i] = i;
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_TERMS_SUFFIX, facetValues.toArray(new String[count]));
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_COUNT_SUFFIX, facetValues.size());
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX, logcount);
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX, counts);
vocabularies.add(facetName);
}
if (vocabularies.size() > 0) add(doc, CollectionSchema.vocabularies_sxt, vocabularies);
}
if (this.isEmpty() || contains(CollectionSchema.synonyms_sxt)) {
if (synonyms.size() > 0) add(doc, CollectionSchema.synonyms_sxt, synonyms);
}
}

public static boolean postprocessingRunning = false;
public static String postprocessingActivity = "";
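
As a small worked example of the logcount fields written by enrich(): a facet with 5 terms yields logcount = floor(log2 5) = 2 and a counts array of [0, 1, 2] (illustrative values only, using the same computation as in the code above):

    int count = 5;                                          // facetValues.size()
    int logcount = (int) (Math.log(count) / Math.log(2));   // log2(5) ≈ 2.32 -> 2
    Integer[] counts = new Integer[logcount + 1];
    for (int i = 0; i <= logcount; i++) counts[i] = i;      // [0, 1, 2]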
