From e965cf87b4a29d9879f2775dc6bc0dd5a94ac57c Mon Sep 17 00:00:00 2001
From: addshore
Date: Tue, 26 Jan 2016 09:02:04 +0000
Subject: [PATCH 1/2] Add monotext snak language counter

---
 .../Processor/MonolingualTextProcessor.java   | 90 +++++++++++++++++++
 .../wikidata/analyzer/WikidataAnalyzer.java   | 15 ++++
 2 files changed, 105 insertions(+)
 create mode 100644 java/analyzer/src/main/java/org/wikidata/analyzer/Processor/MonolingualTextProcessor.java

diff --git a/java/analyzer/src/main/java/org/wikidata/analyzer/Processor/MonolingualTextProcessor.java b/java/analyzer/src/main/java/org/wikidata/analyzer/Processor/MonolingualTextProcessor.java
new file mode 100644
index 0000000..2dcecb8
--- /dev/null
+++ b/java/analyzer/src/main/java/org/wikidata/analyzer/Processor/MonolingualTextProcessor.java
@@ -0,0 +1,90 @@
+package main.java.org.wikidata.analyzer.Processor;
+
+import org.wikidata.wdtk.datamodel.interfaces.*;
+
+import java.util.*;
+
+/**
+ * MonolingualTextProcessor for wikidata-analysis
+ *
+ * Counts the languages used in monolingual text value snaks. The main snak,
+ * the qualifier snaks and the reference snaks of every statement are counted.
+ *
+ * Counts are accumulated in the map passed to the constructor, keyed by the
+ * language code of each monolingual text value.
+ *
+ * @author Addshore
+ */
+public class MonolingualTextProcessor implements EntityDocumentProcessor {
+
+    private final Map<String, Long> counters;
+
+    /**
+     * @param counters map that the per-language counts are written into
+     */
+    public MonolingualTextProcessor(Map<String, Long> counters) {
+        this.counters = counters;
+    }
+
+    /**
+     * Adds one to the named counter; a counter that is not in the map yet starts at zero.
+     */
+    private void increment(String counter) {
+        Long current = this.counters.get(counter);
+        this.counters.put(counter, current == null ? 1L : current + 1L);
+    }
+
+    @Override
+    public void processItemDocument(ItemDocument item) {
+        this.processStatements(item.getAllStatements());
+    }
+
+    @Override
+    public void processPropertyDocument(PropertyDocument property) {
+        this.processStatements(property.getAllStatements());
+    }
+
+    /**
+     * Processes every statement the iterator yields.
+     */
+    private void processStatements(Iterator<? extends Statement> statements) {
+        while (statements.hasNext()) {
+            this.processStatement(statements.next());
+        }
+    }
+
+    /**
+     * Processes the main snak, the qualifier snaks and the reference snaks of a statement.
+     */
+    private void processStatement(Statement statement) {
+        this.processSnak(statement.getClaim().getMainSnak());
+        this.processSnaks(statement.getClaim().getAllQualifiers());
+        for (Reference reference : statement.getReferences()) {
+            this.processSnaks(reference.getAllSnaks());
+        }
+    }
+
+    /**
+     * Processes every snak the iterator yields.
+     */
+    private void processSnaks(Iterator<? extends Snak> snaks) {
+        while (snaks.hasNext()) {
+            this.processSnak(snaks.next());
+        }
+    }
+
+    /**
+     * Increments the counter for the snak's language code when the snak holds a
+     * monolingual text value; any other snak is ignored.
+     */
+    private void processSnak(Snak snak) {
+        if (!(snak instanceof ValueSnak)) {
+            return;
+        }
+        Value value = ((ValueSnak) snak).getValue();
+        if (value instanceof MonolingualTextValue) {
+            this.increment(((MonolingualTextValue) value).getLanguageCode());
+        }
+    }
+
+}
diff --git a/java/analyzer/src/main/java/org/wikidata/analyzer/WikidataAnalyzer.java b/java/analyzer/src/main/java/org/wikidata/analyzer/WikidataAnalyzer.java
index fbb770b..03310dd 100644
--- a/java/analyzer/src/main/java/org/wikidata/analyzer/WikidataAnalyzer.java
+++ b/java/analyzer/src/main/java/org/wikidata/analyzer/WikidataAnalyzer.java
@@ -130,6 +130,12 @@ public void run() throws IOException {
             controller.registerEntityDocumentProcessor(new MetricProcessor(metricsCounters), null, true);
         }
 
+        // MonolingualText
+        Map<String, Long> monotextCounters = new HashMap<>();
+        if (processors.contains("MonolingualText")) {
+            controller.registerEntityDocumentProcessor(new MonolingualTextProcessor(monotextCounters), null, true);
+        }
+
         // Map
         JSONObject mapGeoData = new JSONObject();
         JSONObject mapGraphData = new JSONObject();
@@ -175,6 +181,15 @@ public void run() throws IOException {
             metricsJsonWriter.close();
         }
 
+        // MonolingualText
+        if (processors.contains("MonolingualText")) {
+            File monotextJsonFile = new File(outputDir.getAbsolutePath() + File.separator + "monotext.json");
+            // try-with-resources closes the writer even if writing the JSON fails
+            try (BufferedWriter monotextJsonWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(monotextJsonFile)))) {
+                new JSONObject(monotextCounters).writeJSONString(monotextJsonWriter);
+            }
+        }
+
         // Map
         if (processors.contains("Map")) {
             System.out.println("Writing map wdlabel.json");
From ee46b66a11c95f9d2024002711bb848884c63bcb Mon Sep 17 00:00:00 2001
From: addshore
Date: Tue, 26 Jan 2016 12:37:55 +0100
Subject: [PATCH 2/2] Add MonolingualTextProcessorTest

---
 .../MonolingualTextProcessorTest.java         | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 java/analyzer/src/test/java/org/wikidata/analyzer/Processor/MonolingualTextProcessorTest.java

diff --git a/java/analyzer/src/test/java/org/wikidata/analyzer/Processor/MonolingualTextProcessorTest.java b/java/analyzer/src/test/java/org/wikidata/analyzer/Processor/MonolingualTextProcessorTest.java
new file mode 100644
index 0000000..aa7cf98
--- /dev/null
+++ b/java/analyzer/src/test/java/org/wikidata/analyzer/Processor/MonolingualTextProcessorTest.java
@@ -0,0 +1,72 @@
+package test.java.org.wikidata.analyzer.Processor;
+
+import junit.framework.TestCase;
+import main.java.org.wikidata.analyzer.Processor.MonolingualTextProcessor;
+import org.wikidata.wdtk.datamodel.helpers.*;
+import org.wikidata.wdtk.datamodel.implementation.ItemIdValueImpl;
+import org.wikidata.wdtk.datamodel.implementation.PropertyIdValueImpl;
+import org.wikidata.wdtk.datamodel.interfaces.ItemDocument;
+import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
+import org.wikidata.wdtk.datamodel.interfaces.PropertyDocument;
+import org.wikidata.wdtk.datamodel.interfaces.PropertyIdValue;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Checks that MonolingualTextProcessor counts the language codes of main,
+ * qualifier and reference snaks across an item and a property document.
+ *
+ * @author Addshore
+ */
+public class MonolingualTextProcessorTest extends TestCase {
+
+    private void assertCounter( Map<String, Long> counters, String counter, int expected ) {
+        assertTrue( "Assert counter name exists '" + counter + "'", counters.containsKey( counter ) );
+        assertEquals( "Assert counter '" + counter + "' value correct", (long)expected, (long)counters.get( counter ) );
+    }
+
+    public void testProcessItemDocument() throws Exception {
+        Map<String, Long> counters = new HashMap<>();
+        MonolingualTextProcessor processor = new MonolingualTextProcessor(counters);
+
+        ItemIdValue itemId = ItemIdValueImpl.create("Q42", "foo");
+        ItemDocument itemDocument = ItemDocumentBuilder.forItemId(itemId)
+                .withStatement(
+                        StatementBuilder
+                                .forSubjectAndProperty(itemId, PropertyIdValueImpl.create("P1", "bar"))
+                                .withValue(Datamodel.makeMonolingualTextValue("text", "en"))
+                                .withQualifier(Datamodel.makeValueSnak(PropertyIdValueImpl.create("P1", "bar"), Datamodel.makeMonolingualTextValue("text", "de")))
+                                .withQualifier(Datamodel.makeValueSnak(PropertyIdValueImpl.create("P1", "bar"), Datamodel.makeMonolingualTextValue("text", "fr")))
+                                .withReference(ReferenceBuilder.newInstance().withPropertyValue(
+                                        PropertyIdValueImpl.create("P2", "Foo"),
+                                        Datamodel.makeMonolingualTextValue("text", "fr")
+                                ).build())
+                                .withReference(ReferenceBuilder.newInstance().withPropertyValue(
+                                        PropertyIdValueImpl.create("P2", "Foo"),
+                                        Datamodel.makeMonolingualTextValue("text", "pt")
+                                ).build())
+                                .build()
+                )
+                .build();
+
+        PropertyIdValue propertyId = PropertyIdValueImpl.create("P42", "foo");
+        PropertyDocument propertyDocument = PropertyDocumentBuilder.forPropertyIdAndDatatype(propertyId, "foo")
+                .withStatement(
+                        StatementBuilder
+                                .forSubjectAndProperty(propertyId, PropertyIdValueImpl.create("P1", "bar"))
+                                .withValue(Datamodel.makeMonolingualTextValue("text", "pt"))
+                                .build()
+                )
+                .build();
+
+        processor.processItemDocument( itemDocument );
+        processor.processPropertyDocument( propertyDocument );
+
+        this.assertCounter(counters, "en", 1);
+        this.assertCounter(counters, "de", 1);
+        this.assertCounter(counters, "fr", 2 );
+        this.assertCounter(counters, "pt", 2 );
+    }
+
+}