-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This kba.thrift file was in trec-kba.org private repo, and it should …
…have been here instead.
- Loading branch information
John R. Frank
committed
Oct 4, 2012
1 parent
c7a22f1
commit 39fbfea
Showing
1 changed file
with
105 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
|
||
/** | ||
* This set of thrift structures is analogous to the JSON schemas | ||
* defined in http://trec-kba.org/schemas/v1.0/ | ||
* | ||
* The comments below should be enough to interact with the text of | ||
* the corpus. The JSON schemas contain additional details, | ||
* especially for the SourceMetadata, which is stored in the thrift as | ||
* a JSON string using the schemas linked below. | ||
*/ | ||
namespace java kba | ||
namespace py kba | ||
|
||
/** | ||
* ContentItem is the thrift analog of | ||
* http://trec-kba.org/schemas/v1.0/content-item.json | ||
* | ||
* The JSON version has a 'stages' property that contains descriptions | ||
* **and also names** of additional properties on the ContentItem. | ||
* That was overly flexible. Each content-item in the KBA corpus can | ||
* have a 'cleansed' and 'ner' property. 'cleansed' is generated from | ||
* 'raw', and 'ner' is generated from 'cleansed.' Generally, | ||
* 'cleansed' is a tag-stripped version of 'raw', and 'ner' is the | ||
* output of a named entity recognizer that generates | ||
* one-word-per-line output. | ||
* | ||
* For the kba-stream-corpus-2012, the specific tag-stripping and NER | ||
* configurations were: | ||
* 'raw' --> boilerpipe 1.2.0 ArticleExtractor --> 'cleansed' | ||
* | ||
* 'cleansed' --> Stanford CoreNLP ver 1.2.0 with annotators | ||
* {tokenize, cleanxml, ssplit, pos, lemma, ner}, property | ||
* pos.maxlen=100 --> 'ner' | ||
*/ | ||
struct ContentItem { | ||
// Original content bytes; 'cleansed' is generated from this field. | ||
1: binary raw, | ||
// NOTE(review): presumably the character encoding of 'raw' -- confirm. | ||
2: string encoding, | ||
// Optional. Generally a tag-stripped version of 'raw'. | ||
3: optional binary cleansed, | ||
// Optional. Named entity recognizer output (one word per line), | ||
// generated from 'cleansed'. | ||
4: optional binary ner, | ||
} | ||
|
||
/** | ||
* SourceMetadata is a JSON string with one of these schemas | ||
* | ||
* - http://trec-kba.org/schemas/v1.0/news-metadata.json | ||
* - http://trec-kba.org/schemas/v1.0/linking-metadata.json | ||
* - http://trec-kba.org/schemas/v1.0/social-metadata.json | ||
* | ||
* where 'news', 'linking', or 'social' is the string found in | ||
* CorpusItem.source | ||
* | ||
*/ | ||
// Declared as thrift 'binary' although the value is described as a JSON string. | ||
// NOTE(review): presumably the bytes are the encoded JSON text -- confirm the encoding. | ||
typedef binary SourceMetadata | ||
|
||
/** | ||
* CorpusItem is the thrift equivalent of | ||
* http://trec-kba.org/schemas/v1.0/corpus-item.json | ||
*/ | ||
struct CorpusItem { | ||
// NOTE(review): presumably an identifier for the document; its exact | ||
// semantics are not stated in this file -- see corpus-item.json. | ||
1: string doc_id, | ||
// NOTE(review): presumably the absolute URL of the document, kept as raw bytes -- confirm. | ||
2: binary abs_url, | ||
// NOTE(review): meaning not documented in this file -- see corpus-item.json. | ||
3: string schost, | ||
// NOTE(review): presumably the URL as originally encountered, kept as raw bytes -- confirm. | ||
4: binary original_url, | ||
// One of 'news', 'linking', 'social'; indicates which metadata schema | ||
// source_metadata follows. | ||
5: string source, | ||
// title, body and anchor are each a ContentItem, so each carries its own | ||
// raw / encoding / cleansed / ner values. | ||
6: ContentItem title, | ||
7: ContentItem body, | ||
8: ContentItem anchor, | ||
// JSON string using the news-, linking- or social-metadata schema. | ||
9: SourceMetadata source_metadata, | ||
} | ||
|
||
/** | ||
* StreamTime is a timestamp measured in seconds since the 1970 epoch. | ||
* 'news', 'linking', and 'social' each have slightly different ways | ||
* of generating these timestamps. See details: | ||
* http://trec-kba.org/kba-stream-corpus-2012.shtml | ||
*/ | ||
struct StreamTime { | ||
// Timestamp in seconds since the 1970 epoch. | ||
1: double epoch_ticks, | ||
// NOTE(review): presumably the same instant rendered as a UTC ("Zulu") | ||
// timestamp string; the exact format is not stated in this file. | ||
2: string zulu_timestamp, | ||
} | ||
|
||
/** | ||
* This is the primary interface to the data. StreamItem is the | ||
* thrift equivalent of | ||
* http://trec-kba.org/schemas/v1.0/stream-item.json | ||
* | ||
* which extends corpus-item.json. For better or worse, thrift does | ||
* not support inheritance on structs, so this copies all nine | ||
* fields of CorpusItem and then adds two more fields. | ||
*/ | ||
struct StreamItem { | ||
// Fields 1-9 repeat the CorpusItem fields with the same ids, types and names. | ||
1: string doc_id, | ||
2: binary abs_url, | ||
3: string schost, | ||
4: binary original_url, | ||
5: string source, | ||
6: ContentItem title, | ||
7: ContentItem body, | ||
8: ContentItem anchor, | ||
9: SourceMetadata source_metadata, | ||
// stream_id is the actual unique identifier for the stream corpus, | ||
// stream_id = '%d-%s' % (int(stream_time.epoch_ticks), doc_id) | ||
10: string stream_id, | ||
// Timestamp of this item; its epoch_ticks value (truncated to an int) | ||
// forms the prefix of stream_id. | ||
11: StreamTime stream_time, | ||
} | ||