Permalink
Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
106 lines (99 sloc) 3.19 KB
/**
* This set of thrift structures is analogous to the JSON schemas
* defined in http://trec-kba.org/schemas/v1.0/
*
* The comments below should be enough to interact with the text of
* the corpus. The JSON schemas contain additional details,
* especially for the SourceMetadata, which is stored in the thrift as
* a JSON string using the schemas linked below.
*/
namespace java kba
namespace py kba
/**
* ContentItem is the thrift analog of
* http://trec-kba.org/schemas/v1.0/content-item.json
*
* The JSON version has a 'stages' property that contains descriptions
* **and also names** of additional properties on the ContentItem.
* That was overly flexible. Each content-item in the KBA corpus can
* have a 'cleansed' and 'ner' property. 'cleansed' is generated from
* 'raw', and 'ner' is generated from 'cleansed.' Generally,
* 'cleansed' is a tag-stripped version of 'raw', and 'ner' is the
* output of a named entity recognizer that generates
* one-word-per-line output.
*
* For the kba-stream-corpus-2012, the specific tag-stripping and NER
* configurations were:
* 'raw' --> boilerpipe 1.2.0 ArticleExtractor --> 'cleansed'
*
* 'cleansed' --> Stanford CoreNLP ver 1.2.0 with annotators
* {tokenize, cleanxml, ssplit, pos, lemma, ner}, property
* pos.maxlen=100 --> 'ner'
*/
struct ContentItem {
// Original content bytes; 'cleansed' is generated from this field.
1: binary raw,
// NOTE(review): presumably the character encoding of 'raw' -- confirm
// against content-item.json.
2: string encoding,
// Generally a tag-stripped version of 'raw'. Declared optional, so a
// ContentItem may not carry it.
3: optional binary cleansed,
// Output of a named entity recognizer run over 'cleansed', in
// one-word-per-line form. Declared optional, so a ContentItem may not
// carry it.
4: optional binary ner,
}
/**
* SourceMetadata is a JSON string with one of these schemas
*
* - http://trec-kba.org/schemas/v1.0/news-metadata.json
* - http://trec-kba.org/schemas/v1.0/linking-metadata.json
* - http://trec-kba.org/schemas/v1.0/social-metadata.json
*
* where 'news', 'linking', or 'social' is the string found in
* CorpusItem.source
*
*/
// The JSON text is carried as thrift 'binary' (raw bytes), not 'string'.
// NOTE(review): the byte encoding of the JSON is not stated here
// (presumably UTF-8) -- confirm before decoding.
typedef binary SourceMetadata
/**
* CorpusItem is the thrift equivalent of
* http://trec-kba.org/schemas/v1.0/corpus-item.json
*/
struct CorpusItem {
// Identifier of the document. NOTE(review): how it is derived and its
// uniqueness scope are not shown here -- see corpus-item.json.
1: string doc_id,
// URL fields are declared 'binary' (raw bytes) rather than 'string'.
// NOTE(review): presumably the absolute/normalized form of the URL --
// confirm against corpus-item.json.
2: binary abs_url,
// NOTE(review): presumably the scheme and host portion of the URL --
// confirm against corpus-item.json.
3: string schost,
// NOTE(review): presumably the URL as originally encountered, before
// any normalization -- confirm against corpus-item.json.
4: binary original_url,
// One of 'news', 'linking', or 'social'; source_metadata below follows
// the JSON schema named after this value.
5: string source,
// Three ContentItem sections of the document. NOTE(review): 'anchor'
// is presumably link anchor text -- confirm against corpus-item.json.
6: ContentItem title,
7: ContentItem body,
8: ContentItem anchor,
// JSON string conforming to the news-, linking-, or social-metadata
// schema, stored as bytes.
9: SourceMetadata source_metadata,
}
/**
* StreamTime is a timestamp measured in seconds since the 1970 epoch.
* 'news', 'linking', and 'social' each have slightly different ways
* of generating these timestamps. See details:
* http://trec-kba.org/kba-stream-corpus-2012.shtml
*/
struct StreamTime {
// Timestamp in seconds since the 1970 epoch, held as a double.
1: double epoch_ticks,
// NOTE(review): presumably the same instant rendered as a UTC ("Zulu")
// timestamp string; the exact format is not specified here -- confirm.
2: string zulu_timestamp,
}
/**
* This is the primary interface to the data. StreamItem is the
* thrift equivalent of
* http://trec-kba.org/schemas/v1.0/stream-item.json
*
* which extends corpus-item.json. For better or worse, thrift does
* not support inheritance for structs, so this copies all nine
* fields of CorpusItem and then adds two more fields.
*/
struct StreamItem {
// Fields 1-9 repeat CorpusItem's fields with the same ids, types, and
// names; keep the two structs in sync when either changes.
1: string doc_id,
2: binary abs_url,
3: string schost,
4: binary original_url,
// One of 'news', 'linking', or 'social'; source_metadata below follows
// the JSON schema named after this value.
5: string source,
6: ContentItem title,
7: ContentItem body,
8: ContentItem anchor,
// JSON string conforming to the news-, linking-, or social-metadata
// schema, stored as bytes.
9: SourceMetadata source_metadata,
// Fields 10-11 are the additions specific to the stream corpus.
// stream_id is the actual unique identifier for the stream corpus,
// stream_id = '%d-%s' % (int(stream_time.epoch_ticks), doc_id)
// i.e. epoch_ticks converted to an integer, a hyphen, then doc_id.
10: string stream_id,
// Timestamp of this item in the stream; its epoch_ticks value is the
// one used to build stream_id above.
11: StreamTime stream_time,
}