Permalink
Browse files

This kba.thrift file was in trec-kba.org private repo, and it should …

…have been here instead.
  • Loading branch information...
1 parent c7a22f1 commit 39fbfeaf723b7acd96076bf071499e50c3200161 John R. Frank committed Oct 4, 2012
Showing with 105 additions and 0 deletions.
  1. +105 −0 kba.thrift
View
@@ -0,0 +1,105 @@
+
+/**
+ * This set of thrift structures is analogous to the JSON schemas
+ * defined in http://trec-kba.org/schemas/v1.0/
+ *
+ * The comments below should be enough to interact with the text of
+ * the corpus. The JSON schemas contain additional details,
+ * especially for the SourceMetadata, which is stored in the thrift as
+ * a JSON string using the schemas linked below.
+ */
+namespace java kba
+namespace py kba
+
+/**
+ * ContentItem is the thrift analog of
+ * http://trec-kba.org/schemas/v1.0/content-item.json
+ *
+ * The JSON version has a 'stages' property that contains descriptions
+ * **and also names** of additional properties on the ContentItem.
+ * That was overly flexible. Each content-item in the KBA corpus can
+ * have a 'cleansed' and 'ner' property. 'cleansed' is generated from
+ * 'raw', and 'ner' is generated from 'cleansed.' Generally,
+ * 'cleansed' is a tag-stripped version of 'raw', and 'ner' is the
+ * output of a named entity recognizer that generates
+ * one-word-per-line output.
+ *
+ * For the kba-stream-corpus-2012, the specific tag-stripping and NER
+ * configurations were:
+ * 'raw' --> boilerpipe 1.2.0 ArticleExtractor --> 'cleansed'
+ *
+ * 'cleansed' --> Stanford CoreNLP ver 1.2.0 with annotators
+ * {tokenize, cleanxml, ssplit, pos, lemma, ner}, property
+ * pos.maxlen=100 --> 'ner'
+ */
+struct ContentItem {
+ 1: binary raw, // original content; 'cleansed' is generated from this
+ 2: string encoding, // NOTE(review): presumably the character encoding of 'raw' -- confirm against content-item.json
+ 3: optional binary cleansed, // generally a tag-stripped version of 'raw'; may be absent
+ 4: optional binary ner, // one-word-per-line named entity recognizer output generated from 'cleansed'; may be absent
+}
+
+/**
+ * SourceMetadata is a JSON string with one of these schemas
+ *
+ * - http://trec-kba.org/schemas/v1.0/news-metadata.json
+ * - http://trec-kba.org/schemas/v1.0/linking-metadata.json
+ * - http://trec-kba.org/schemas/v1.0/social-metadata.json
+ *
+ * where 'news', 'linking', or 'social' is the string found in
+ * CorpusItem.source
+ *
+ */
+typedef binary SourceMetadata // declared as binary, although documented as carrying a JSON string
+
+/**
+ * CorpusItem is the thrift equivalent of
+ * http://trec-kba.org/schemas/v1.0/corpus-item.json
+ */
+struct CorpusItem {
+ 1: string doc_id, // document identifier; also forms the suffix of StreamItem.stream_id
+ 2: binary abs_url, // NOTE(review): presumably the absolute URL of the document -- confirm against corpus-item.json
+ 3: string schost, // NOTE(review): meaning not documented in this file -- see corpus-item.json
+ 4: binary original_url, // NOTE(review): presumably the URL as originally found -- confirm against corpus-item.json
+ 5: string source, // one of 'news', 'linking', 'social'; selects the SourceMetadata schema
+ 6: ContentItem title, // title text as a ContentItem (raw / cleansed / ner)
+ 7: ContentItem body, // body text as a ContentItem (raw / cleansed / ner)
+ 8: ContentItem anchor, // anchor text as a ContentItem (raw / cleansed / ner)
+ 9: SourceMetadata source_metadata, // JSON document whose schema depends on 'source'
+}
+
+/**
+ * StreamTime is a timestamp measured in seconds since the 1970 epoch.
+ * 'news', 'linking', and 'social' each have slightly different ways
+ * of generating these timestamps. See details:
+ * http://trec-kba.org/kba-stream-corpus-2012.shtml
+ */
+struct StreamTime {
+ 1: double epoch_ticks, // seconds since the 1970 epoch
+ 2: string zulu_timestamp, // NOTE(review): presumably the same instant as a UTC ("Zulu") string -- confirm exact format
+}
+
+/**
+ * This is the primary interface to the data. StreamItem is the
+ * thrift equivalent of
+ * http://trec-kba.org/schemas/v1.0/stream-item.json
+ *
+ * which extends corpus-item.json. For better or worse, thrift does
+ * not support inheritance for structs, so this copies the first nine
+ * fields of CorpusItem and then adds two more fields.
+ */
+struct StreamItem {
+ 1: string doc_id, // fields 1-9 duplicate CorpusItem, same ids and types
+ 2: binary abs_url, // as CorpusItem.abs_url
+ 3: string schost, // as CorpusItem.schost
+ 4: binary original_url, // as CorpusItem.original_url
+ 5: string source, // as CorpusItem.source
+ 6: ContentItem title, // as CorpusItem.title
+ 7: ContentItem body, // as CorpusItem.body
+ 8: ContentItem anchor, // as CorpusItem.anchor
+ 9: SourceMetadata source_metadata, // as CorpusItem.source_metadata
+ // stream_id is the actual unique identifier for the stream corpus:
+ // stream_id = '%d-%s' % (int(stream_time.epoch_ticks), doc_id)
+ 10: string stream_id, // first field added beyond CorpusItem
+ 11: StreamTime stream_time, // timestamp whose epoch_ticks forms the stream_id prefix
+}

0 comments on commit 39fbfea

Please sign in to comment.