# Assignment 1 - Part 1

In [9]:
import re
import gzip
import os
import pprint
from fnmatch import fnmatch
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
from elasticsearch import helpers

In [2]:
# Name of the Elasticsearch index that holds the collection.
INDEX_NAME = "aquaint"
# Value placed in the "_type" field of every bulk action.
DOC_TYPE = "doc"
# Index-level settings: a single shard with a single replica.
INDEX_SETTINGS = {
    "settings": {
        "index": {"number_of_shards": 1, "number_of_replicas": 1},
    },
}

Add a set of documents to the index in bulk.

In [3]:
def add_docs_bulk(es, docs):
    """Index a batch of documents through the Elasticsearch bulk helper.

    Args:
        es: Client object handed to `helpers.bulk`.
        docs: Mapping from document ID to the document body to store.

    Each entry becomes one bulk action targeting `INDEX_NAME` / `DOC_TYPE`.
    The helper is not called at all when `docs` is empty.
    """
    actions = [
        {
            "_index": INDEX_NAME,
            "_type": DOC_TYPE,
            "_id": doc_id,
            "_source": doc,
        }
        for doc_id, doc in docs.items()
    ]

    if actions:
        helpers.bulk(es, actions)

Parse a given file and index all documents found in it.

In [4]:
def index(es, file_name):
    """Parse one gzipped collection file and bulk-index its documents.

    The file is read line by line in text mode. A line starting with
    ``<DOCNO>`` supplies the ID of the current document; the lines between a
    ``<BODY>`` line and the following ``</BODY>`` line are collected and
    parsed with BeautifulSoup. The ``<HEADLINE>`` element becomes the
    document's "title" and the ``<TEXT>`` element its "content"; either is
    stored as an empty string when the element is missing.

    Args:
        es: Client object forwarded to `add_docs_bulk`.
        file_name: Path of the gzip-compressed input file.
    """
    print("Processing", file_name)
    with gzip.open(file_name, "rt") as fin:
        docs = {}
        doc_id = None
        body = None
        is_body = False

        for raw_line in fin:
            line = raw_line.strip()

            if line.startswith("<DOCNO>"):
                # Strip the surrounding tags to keep only the identifier.
                doc_id = re.sub("<DOCNO> | </DOCNO>", "", line)
                continue

            if line.startswith("<BODY>"):
                # Begin collecting the lines of a new body.
                is_body = True
                body = []
                continue

            if line.startswith("</BODY>"):
                # Body complete: parse what was collected and record the document.
                soup = BeautifulSoup("\n".join(body), "lxml")
                headline = soup.find("headline")
                text = soup.find("text")
                title = "" if headline is None else headline.text
                content = "" if text is None else text.text
                docs[doc_id] = {"title": title, "content": content}
                # Reset the per-document state before the next document.
                doc_id = None
                is_body = False
                continue

            if is_body:
                body.append(line)

        # All documents of this file are sent together.
        print("Bulk indexing", len(docs), "documents")
        add_docs_bulk(es, docs)

### Indexing

In [7]:
# Create the client with its default connection settings (no hosts are passed).
es = Elasticsearch()
# Create the index only when it is missing, so re-running this cell does not
# try to create it a second time. The index name is passed by keyword in both
# calls; newer elasticsearch-py releases no longer accept it positionally.
if not es.indices.exists(index=INDEX_NAME):
    es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)

Index a single input file

In [6]:
# Index one file of the collection; the path is relative to the working directory.
index(es, "data/aquaint/nyt/2000/20000101_NYT.gz")

Processing data/aquaint/nyt/2000/20000101_NYT.gz
Bulk indexing 243 documents


**TODO** Call the `index()` function on all data files in the collection.

In [8]:
# Walk the collection directory tree and index every gzip archive found in it.
root = "./data/aquaint"
suffix = "*.gz"
for path, subdirs, files in os.walk(root):
    for name in files:
        if not fnmatch(name, suffix):
            continue  # skip anything that is not a .gz file
        index(es, os.path.join(path, name))

Processing ./data/aquaint\apw\1998\19980601_APW_ENG.gz
Bulk indexing 856 documents
Processing ./data/aquaint\apw\1998\19980602_APW_ENG.gz
Bulk indexing 823 documents
Processing ./data/aquaint\apw\1998\19980603_APW_ENG.gz
Bulk indexing 964 documents
Processing ./data/aquaint\apw\1998\19980604_APW_ENG.gz
Bulk indexing 1006 documents
Processing ./data/aquaint\apw\1998\19980605_APW_ENG.gz
Bulk indexing 973 documents
Processing ./data/aquaint\apw\1998\19980606_APW_ENG.gz
Bulk indexing 362 documents
Processing ./data/aquaint\apw\1998\19980607_APW_ENG.gz
Bulk indexing 407 documents
Processing ./data/aquaint\apw\1998\19980608_APW_ENG.gz
Bulk indexing 864 documents
Processing ./data/aquaint\apw\1998\19980609_APW_ENG.gz
Bulk indexing 902 documents
Processing ./data/aquaint\apw\1998\19980610_APW_ENG.gz
Bulk indexing 948 documents
Processing ./data/aquaint\apw\1998\19980611_APW_ENG.gz
Bulk indexing 951 documents
Processing ./data/aquaint\apw\1998\19980612_APW_ENG.gz
Bulk indexing 925 documents
Pro

Bulk indexing 493 documents
Processing ./data/aquaint\apw\1998\19980909_APW_ENG.gz
Bulk indexing 590 documents
Processing ./data/aquaint\apw\1998\19980910_APW_ENG.gz
Bulk indexing 563 documents
Processing ./data/aquaint\apw\1998\19980911_APW_ENG.gz
Bulk indexing 533 documents
Processing ./data/aquaint\apw\1998\19980912_APW_ENG.gz
Bulk indexing 204 documents
Processing ./data/aquaint\apw\1998\19980913_APW_ENG.gz
Bulk indexing 424 documents
Processing ./data/aquaint\apw\1998\19980914_APW_ENG.gz
Bulk indexing 511 documents
Processing ./data/aquaint\apw\1998\19980915_APW_ENG.gz
Bulk indexing 566 documents
Processing ./data/aquaint\apw\1998\19980916_APW_ENG.gz
Bulk indexing 492 documents
Processing ./data/aquaint\apw\1998\19980917_APW_ENG.gz
Bulk indexing 499 documents
Processing ./data/aquaint\apw\1998\19980918_APW_ENG.gz
Bulk indexing 568 documents
Processing ./data/aquaint\apw\1998\19980919_APW_ENG.gz
Bulk indexing 398 documents
Processing ./data/aquaint\apw\1998\19980920_APW_ENG.gz
Bulk

Bulk indexing 802 documents
Processing ./data/aquaint\apw\1998\19981217_APW_ENG.gz
Bulk indexing 759 documents
Processing ./data/aquaint\apw\1998\19981218_APW_ENG.gz
Bulk indexing 576 documents
Processing ./data/aquaint\apw\1998\19981219_APW_ENG.gz
Bulk indexing 428 documents
Processing ./data/aquaint\apw\1998\19981220_APW_ENG.gz
Bulk indexing 383 documents
Processing ./data/aquaint\apw\1998\19981221_APW_ENG.gz
Bulk indexing 480 documents
Processing ./data/aquaint\apw\1998\19981222_APW_ENG.gz
Bulk indexing 465 documents
Processing ./data/aquaint\apw\1998\19981223_APW_ENG.gz
Bulk indexing 454 documents
Processing ./data/aquaint\apw\1998\19981224_APW_ENG.gz
Bulk indexing 323 documents
Processing ./data/aquaint\apw\1998\19981225_APW_ENG.gz
Bulk indexing 261 documents
Processing ./data/aquaint\apw\1998\19981226_APW_ENG.gz
Bulk indexing 269 documents
Processing ./data/aquaint\apw\1998\19981227_APW_ENG.gz
Bulk indexing 280 documents
Processing ./data/aquaint\apw\1998\19981228_APW_ENG.gz
Bulk

Bulk indexing 353 documents
Processing ./data/aquaint\apw\1999\19990326_APW_ENG.gz
Bulk indexing 305 documents
Processing ./data/aquaint\apw\1999\19990327_APW_ENG.gz
Bulk indexing 141 documents
Processing ./data/aquaint\apw\1999\19990328_APW_ENG.gz
Bulk indexing 122 documents
Processing ./data/aquaint\apw\1999\19990329_APW_ENG.gz
Bulk indexing 259 documents
Processing ./data/aquaint\apw\1999\19990330_APW_ENG.gz
Bulk indexing 271 documents
Processing ./data/aquaint\apw\1999\19990331_APW_ENG.gz
Bulk indexing 308 documents
Processing ./data/aquaint\apw\1999\19990401_APW_ENG.gz
Bulk indexing 280 documents
Processing ./data/aquaint\apw\1999\19990402_APW_ENG.gz
Bulk indexing 250 documents
Processing ./data/aquaint\apw\1999\19990403_APW_ENG.gz
Bulk indexing 110 documents
Processing ./data/aquaint\apw\1999\19990404_APW_ENG.gz
Bulk indexing 76 documents
Processing ./data/aquaint\apw\1999\19990405_APW_ENG.gz
Bulk indexing 232 documents
Processing ./data/aquaint\apw\1999\19990406_APW_ENG.gz
Bulk 

Bulk indexing 253 documents
Processing ./data/aquaint\apw\1999\19990703_APW_ENG.gz
Bulk indexing 147 documents
Processing ./data/aquaint\apw\1999\19990704_APW_ENG.gz
Bulk indexing 150 documents
Processing ./data/aquaint\apw\1999\19990705_APW_ENG.gz
Bulk indexing 188 documents
Processing ./data/aquaint\apw\1999\19990706_APW_ENG.gz
Bulk indexing 246 documents
Processing ./data/aquaint\apw\1999\19990707_APW_ENG.gz
Bulk indexing 328 documents
Processing ./data/aquaint\apw\1999\19990708_APW_ENG.gz
Bulk indexing 314 documents
Processing ./data/aquaint\apw\1999\19990709_APW_ENG.gz
Bulk indexing 302 documents
Processing ./data/aquaint\apw\1999\19990710_APW_ENG.gz
Bulk indexing 198 documents
Processing ./data/aquaint\apw\1999\19990711_APW_ENG.gz
Bulk indexing 111 documents
Processing ./data/aquaint\apw\1999\19990712_APW_ENG.gz
Bulk indexing 287 documents
Processing ./data/aquaint\apw\1999\19990713_APW_ENG.gz
Bulk indexing 286 documents
Processing ./data/aquaint\apw\1999\19990714_APW_ENG.gz
Bulk

Bulk indexing 169 documents
Processing ./data/aquaint\apw\1999\19991011_APW_ENG.gz
Bulk indexing 338 documents
Processing ./data/aquaint\apw\1999\19991012_APW_ENG.gz
Bulk indexing 287 documents
Processing ./data/aquaint\apw\1999\19991013_APW_ENG.gz
Bulk indexing 220 documents
Processing ./data/aquaint\apw\1999\19991014_APW_ENG.gz
Bulk indexing 325 documents
Processing ./data/aquaint\apw\1999\19991015_APW_ENG.gz
Bulk indexing 331 documents
Processing ./data/aquaint\apw\1999\19991016_APW_ENG.gz
Bulk indexing 214 documents
Processing ./data/aquaint\apw\1999\19991017_APW_ENG.gz
Bulk indexing 191 documents
Processing ./data/aquaint\apw\1999\19991018_APW_ENG.gz
Bulk indexing 346 documents
Processing ./data/aquaint\apw\1999\19991019_APW_ENG.gz
Bulk indexing 349 documents
Processing ./data/aquaint\apw\1999\19991020_APW_ENG.gz
Bulk indexing 361 documents
Processing ./data/aquaint\apw\1999\19991021_APW_ENG.gz
Bulk indexing 365 documents
Processing ./data/aquaint\apw\1999\19991022_APW_ENG.gz
Bulk

Bulk indexing 224 documents
Processing ./data/aquaint\apw\2000\20000318_APW_ENG.gz
Bulk indexing 183 documents
Processing ./data/aquaint\apw\2000\20000319_APW_ENG.gz
Bulk indexing 120 documents
Processing ./data/aquaint\apw\2000\20000320_APW_ENG.gz
Bulk indexing 223 documents
Processing ./data/aquaint\apw\2000\20000321_APW_ENG.gz
Bulk indexing 187 documents
Processing ./data/aquaint\apw\2000\20000322_APW_ENG.gz
Bulk indexing 261 documents
Processing ./data/aquaint\apw\2000\20000323_APW_ENG.gz
Bulk indexing 269 documents
Processing ./data/aquaint\apw\2000\20000324_APW_ENG.gz
Bulk indexing 212 documents
Processing ./data/aquaint\apw\2000\20000325_APW_ENG.gz
Bulk indexing 184 documents
Processing ./data/aquaint\apw\2000\20000326_APW_ENG.gz
Bulk indexing 135 documents
Processing ./data/aquaint\apw\2000\20000327_APW_ENG.gz
Bulk indexing 213 documents
Processing ./data/aquaint\apw\2000\20000328_APW_ENG.gz
Bulk indexing 225 documents
Processing ./data/aquaint\apw\2000\20000329_APW_ENG.gz
Bulk

Bulk indexing 145 documents
Processing ./data/aquaint\apw\2000\20000625_APW_ENG.gz
Bulk indexing 138 documents
Processing ./data/aquaint\apw\2000\20000626_APW_ENG.gz
Bulk indexing 145 documents
Processing ./data/aquaint\apw\2000\20000627_APW_ENG.gz
Bulk indexing 200 documents
Processing ./data/aquaint\apw\2000\20000628_APW_ENG.gz
Bulk indexing 241 documents
Processing ./data/aquaint\apw\2000\20000629_APW_ENG.gz
Bulk indexing 179 documents
Processing ./data/aquaint\apw\2000\20000630_APW_ENG.gz
Bulk indexing 189 documents
Processing ./data/aquaint\apw\2000\20000701_APW_ENG.gz
Bulk indexing 165 documents
Processing ./data/aquaint\apw\2000\20000702_APW_ENG.gz
Bulk indexing 118 documents
Processing ./data/aquaint\apw\2000\20000703_APW_ENG.gz
Bulk indexing 162 documents
Processing ./data/aquaint\apw\2000\20000704_APW_ENG.gz
Bulk indexing 149 documents
Processing ./data/aquaint\apw\2000\20000705_APW_ENG.gz
Bulk indexing 178 documents
Processing ./data/aquaint\apw\2000\20000706_APW_ENG.gz
Bulk

Bulk indexing 573 documents
Processing ./data/aquaint\nyt\1998\19980602_NYT.gz
Bulk indexing 465 documents
Processing ./data/aquaint\nyt\1998\19980603_NYT.gz
Bulk indexing 538 documents
Processing ./data/aquaint\nyt\1998\19980604_NYT.gz
Bulk indexing 489 documents
Processing ./data/aquaint\nyt\1998\19980605_NYT.gz
Bulk indexing 465 documents
Processing ./data/aquaint\nyt\1998\19980606_NYT.gz
Bulk indexing 258 documents
Processing ./data/aquaint\nyt\1998\19980607_NYT.gz
Bulk indexing 166 documents
Processing ./data/aquaint\nyt\1998\19980608_NYT.gz
Bulk indexing 462 documents
Processing ./data/aquaint\nyt\1998\19980609_NYT.gz
Bulk indexing 458 documents
Processing ./data/aquaint\nyt\1998\19980610_NYT.gz
Bulk indexing 578 documents
Processing ./data/aquaint\nyt\1998\19980611_NYT.gz
Bulk indexing 430 documents
Processing ./data/aquaint\nyt\1998\19980612_NYT.gz
Bulk indexing 436 documents
Processing ./data/aquaint\nyt\1998\19980613_NYT.gz
Bulk indexing 229 documents
Processing ./data/aquain

Bulk indexing 494 documents
Processing ./data/aquaint\nyt\1998\19980915_NYT.gz
Bulk indexing 491 documents
Processing ./data/aquaint\nyt\1998\19980916_NYT.gz
Bulk indexing 423 documents
Processing ./data/aquaint\nyt\1998\19980917_NYT.gz
Bulk indexing 498 documents
Processing ./data/aquaint\nyt\1998\19980918_NYT.gz
Bulk indexing 529 documents
Processing ./data/aquaint\nyt\1998\19980919_NYT.gz
Bulk indexing 234 documents
Processing ./data/aquaint\nyt\1998\19980920_NYT.gz
Bulk indexing 243 documents
Processing ./data/aquaint\nyt\1998\19980921_NYT.gz
Bulk indexing 527 documents
Processing ./data/aquaint\nyt\1998\19980922_NYT.gz
Bulk indexing 432 documents
Processing ./data/aquaint\nyt\1998\19980923_NYT.gz
Bulk indexing 545 documents
Processing ./data/aquaint\nyt\1998\19980924_NYT.gz
Bulk indexing 534 documents
Processing ./data/aquaint\nyt\1998\19980925_NYT.gz
Bulk indexing 487 documents
Processing ./data/aquaint\nyt\1998\19980926_NYT.gz
Bulk indexing 238 documents
Processing ./data/aquain

Bulk indexing 158 documents
Processing ./data/aquaint\nyt\1998\19981228_NYT.gz
Bulk indexing 463 documents
Processing ./data/aquaint\nyt\1998\19981229_NYT.gz
Bulk indexing 427 documents
Processing ./data/aquaint\nyt\1998\19981230_NYT.gz
Bulk indexing 458 documents
Processing ./data/aquaint\nyt\1998\19981231_NYT.gz
Bulk indexing 436 documents
Processing ./data/aquaint\nyt\1999\19990101_NYT.gz
Bulk indexing 268 documents
Processing ./data/aquaint\nyt\1999\19990102_NYT.gz
Bulk indexing 260 documents
Processing ./data/aquaint\nyt\1999\19990103_NYT.gz
Bulk indexing 250 documents
Processing ./data/aquaint\nyt\1999\19990104_NYT.gz
Bulk indexing 523 documents
Processing ./data/aquaint\nyt\1999\19990105_NYT.gz
Bulk indexing 479 documents
Processing ./data/aquaint\nyt\1999\19990106_NYT.gz
Bulk indexing 433 documents
Processing ./data/aquaint\nyt\1999\19990107_NYT.gz
Bulk indexing 476 documents
Processing ./data/aquaint\nyt\1999\19990108_NYT.gz
Bulk indexing 489 documents
Processing ./data/aquain

Bulk indexing 244 documents
Processing ./data/aquaint\nyt\1999\19990411_NYT.gz
Bulk indexing 204 documents
Processing ./data/aquaint\nyt\1999\19990412_NYT.gz
Bulk indexing 479 documents
Processing ./data/aquaint\nyt\1999\19990413_NYT.gz
Bulk indexing 430 documents
Processing ./data/aquaint\nyt\1999\19990414_NYT.gz
Bulk indexing 491 documents
Processing ./data/aquaint\nyt\1999\19990415_NYT.gz
Bulk indexing 513 documents
Processing ./data/aquaint\nyt\1999\19990416_NYT.gz
Bulk indexing 434 documents
Processing ./data/aquaint\nyt\1999\19990417_NYT.gz
Bulk indexing 252 documents
Processing ./data/aquaint\nyt\1999\19990418_NYT.gz
Bulk indexing 235 documents
Processing ./data/aquaint\nyt\1999\19990419_NYT.gz
Bulk indexing 512 documents
Processing ./data/aquaint\nyt\1999\19990420_NYT.gz
Bulk indexing 450 documents
Processing ./data/aquaint\nyt\1999\19990421_NYT.gz
Bulk indexing 515 documents
Processing ./data/aquaint\nyt\1999\19990422_NYT.gz
Bulk indexing 545 documents
Processing ./data/aquain

Bulk indexing 379 documents
Processing ./data/aquaint\nyt\1999\19990724_NYT.gz
Bulk indexing 224 documents
Processing ./data/aquaint\nyt\1999\19990725_NYT.gz
Bulk indexing 44 documents
Processing ./data/aquaint\nyt\1999\19990726_NYT.gz
Bulk indexing 481 documents
Processing ./data/aquaint\nyt\1999\19990727_NYT.gz
Bulk indexing 397 documents
Processing ./data/aquaint\nyt\1999\19990728_NYT.gz
Bulk indexing 565 documents
Processing ./data/aquaint\nyt\1999\19990729_NYT.gz
Bulk indexing 436 documents
Processing ./data/aquaint\nyt\1999\19990730_NYT.gz
Bulk indexing 382 documents
Processing ./data/aquaint\nyt\1999\19990731_NYT.gz
Bulk indexing 223 documents
Processing ./data/aquaint\nyt\1999\19990801_NYT.gz
Bulk indexing 211 documents
Processing ./data/aquaint\nyt\1999\19990802_NYT.gz
Bulk indexing 394 documents
Processing ./data/aquaint\nyt\1999\19990803_NYT.gz
Bulk indexing 413 documents
Processing ./data/aquaint\nyt\1999\19990804_NYT.gz
Bulk indexing 498 documents
Processing ./data/aquaint

Bulk indexing 557 documents
Processing ./data/aquaint\nyt\1999\19991109_NYT.gz
Bulk indexing 477 documents
Processing ./data/aquaint\nyt\1999\19991110_NYT.gz
Bulk indexing 499 documents
Processing ./data/aquaint\nyt\1999\19991111_NYT.gz
Bulk indexing 425 documents
Processing ./data/aquaint\nyt\1999\19991112_NYT.gz
Bulk indexing 388 documents
Processing ./data/aquaint\nyt\1999\19991113_NYT.gz
Bulk indexing 80 documents
Processing ./data/aquaint\nyt\1999\19991115_NYT.gz
Bulk indexing 410 documents
Processing ./data/aquaint\nyt\1999\19991116_NYT.gz
Bulk indexing 427 documents
Processing ./data/aquaint\nyt\1999\19991117_NYT.gz
Bulk indexing 451 documents
Processing ./data/aquaint\nyt\1999\19991118_NYT.gz
Bulk indexing 537 documents
Processing ./data/aquaint\nyt\1999\19991119_NYT.gz
Bulk indexing 425 documents
Processing ./data/aquaint\nyt\1999\19991120_NYT.gz
Bulk indexing 257 documents
Processing ./data/aquaint\nyt\1999\19991121_NYT.gz
Bulk indexing 174 documents
Processing ./data/aquaint

Bulk indexing 214 documents
Processing ./data/aquaint\nyt\2000\20000227_NYT.gz
Bulk indexing 178 documents
Processing ./data/aquaint\nyt\2000\20000228_NYT.gz
Bulk indexing 526 documents
Processing ./data/aquaint\nyt\2000\20000229_NYT.gz
Bulk indexing 505 documents
Processing ./data/aquaint\nyt\2000\20000301_NYT.gz
Bulk indexing 557 documents
Processing ./data/aquaint\nyt\2000\20000302_NYT.gz
Bulk indexing 469 documents
Processing ./data/aquaint\nyt\2000\20000303_NYT.gz
Bulk indexing 390 documents
Processing ./data/aquaint\nyt\2000\20000304_NYT.gz
Bulk indexing 255 documents
Processing ./data/aquaint\nyt\2000\20000305_NYT.gz
Bulk indexing 189 documents
Processing ./data/aquaint\nyt\2000\20000306_NYT.gz
Bulk indexing 527 documents
Processing ./data/aquaint\nyt\2000\20000307_NYT.gz
Bulk indexing 469 documents
Processing ./data/aquaint\nyt\2000\20000308_NYT.gz
Bulk indexing 478 documents
Processing ./data/aquaint\nyt\2000\20000309_NYT.gz
Bulk indexing 458 documents
Processing ./data/aquain

Bulk indexing 456 documents
Processing ./data/aquaint\nyt\2000\20000630_NYT.gz
Bulk indexing 445 documents
Processing ./data/aquaint\nyt\2000\20000701_NYT.gz
Bulk indexing 183 documents
Processing ./data/aquaint\nyt\2000\20000702_NYT.gz
Bulk indexing 158 documents
Processing ./data/aquaint\nyt\2000\20000703_NYT.gz
Bulk indexing 445 documents
Processing ./data/aquaint\nyt\2000\20000704_NYT.gz
Bulk indexing 182 documents
Processing ./data/aquaint\nyt\2000\20000705_NYT.gz
Bulk indexing 348 documents
Processing ./data/aquaint\nyt\2000\20000706_NYT.gz
Bulk indexing 382 documents
Processing ./data/aquaint\nyt\2000\20000707_NYT.gz
Bulk indexing 396 documents
Processing ./data/aquaint\nyt\2000\20000708_NYT.gz
Bulk indexing 230 documents
Processing ./data/aquaint\nyt\2000\20000709_NYT.gz
Bulk indexing 175 documents
Processing ./data/aquaint\nyt\2000\20000710_NYT.gz
Bulk indexing 491 documents
Processing ./data/aquaint\nyt\2000\20000711_NYT.gz
Bulk indexing 418 documents
Processing ./data/aquain

Bulk indexing 256 documents
Processing ./data/aquaint\xie\1996\19960116_XIN_ENG.gz
Bulk indexing 282 documents
Processing ./data/aquaint\xie\1996\19960117_XIN_ENG.gz
Bulk indexing 311 documents
Processing ./data/aquaint\xie\1996\19960118_XIN_ENG.gz
Bulk indexing 316 documents
Processing ./data/aquaint\xie\1996\19960119_XIN_ENG.gz
Bulk indexing 290 documents
Processing ./data/aquaint\xie\1996\19960120_XIN_ENG.gz
Bulk indexing 203 documents
Processing ./data/aquaint\xie\1996\19960121_XIN_ENG.gz
Bulk indexing 162 documents
Processing ./data/aquaint\xie\1996\19960122_XIN_ENG.gz
Bulk indexing 237 documents
Processing ./data/aquaint\xie\1996\19960123_XIN_ENG.gz
Bulk indexing 281 documents
Processing ./data/aquaint\xie\1996\19960124_XIN_ENG.gz
Bulk indexing 274 documents
Processing ./data/aquaint\xie\1996\19960125_XIN_ENG.gz
Bulk indexing 283 documents
Processing ./data/aquaint\xie\1996\19960126_XIN_ENG.gz
Bulk indexing 246 documents
Processing ./data/aquaint\xie\1996\19960127_XIN_ENG.gz
Bulk

Bulk indexing 282 documents
Processing ./data/aquaint\xie\1996\19960424_XIN_ENG.gz
Bulk indexing 303 documents
Processing ./data/aquaint\xie\1996\19960425_XIN_ENG.gz
Bulk indexing 319 documents
Processing ./data/aquaint\xie\1996\19960426_XIN_ENG.gz
Bulk indexing 297 documents
Processing ./data/aquaint\xie\1996\19960427_XIN_ENG.gz
Bulk indexing 281 documents
Processing ./data/aquaint\xie\1996\19960428_XIN_ENG.gz
Bulk indexing 193 documents
Processing ./data/aquaint\xie\1996\19960429_XIN_ENG.gz
Bulk indexing 237 documents
Processing ./data/aquaint\xie\1996\19960430_XIN_ENG.gz
Bulk indexing 269 documents
Processing ./data/aquaint\xie\1996\19960501_XIN_ENG.gz
Bulk indexing 195 documents
Processing ./data/aquaint\xie\1996\19960502_XIN_ENG.gz
Bulk indexing 244 documents
Processing ./data/aquaint\xie\1996\19960503_XIN_ENG.gz
Bulk indexing 286 documents
Processing ./data/aquaint\xie\1996\19960504_XIN_ENG.gz
Bulk indexing 222 documents
Processing ./data/aquaint\xie\1996\19960505_XIN_ENG.gz
Bulk

Bulk indexing 293 documents
Processing ./data/aquaint\xie\1996\19960801_XIN_ENG.gz
Bulk indexing 320 documents
Processing ./data/aquaint\xie\1996\19960802_XIN_ENG.gz
Bulk indexing 320 documents
Processing ./data/aquaint\xie\1996\19960803_XIN_ENG.gz
Bulk indexing 236 documents
Processing ./data/aquaint\xie\1996\19960804_XIN_ENG.gz
Bulk indexing 188 documents
Processing ./data/aquaint\xie\1996\19960805_XIN_ENG.gz
Bulk indexing 223 documents
Processing ./data/aquaint\xie\1996\19960806_XIN_ENG.gz
Bulk indexing 264 documents
Processing ./data/aquaint\xie\1996\19960807_XIN_ENG.gz
Bulk indexing 305 documents
Processing ./data/aquaint\xie\1996\19960808_XIN_ENG.gz
Bulk indexing 293 documents
Processing ./data/aquaint\xie\1996\19960809_XIN_ENG.gz
Bulk indexing 278 documents
Processing ./data/aquaint\xie\1996\19960810_XIN_ENG.gz
Bulk indexing 238 documents
Processing ./data/aquaint\xie\1996\19960811_XIN_ENG.gz
Bulk indexing 169 documents
Processing ./data/aquaint\xie\1996\19960812_XIN_ENG.gz
Bulk

Bulk indexing 326 documents
Processing ./data/aquaint\xie\1996\19961108_XIN_ENG.gz
Bulk indexing 289 documents
Processing ./data/aquaint\xie\1996\19961109_XIN_ENG.gz
Bulk indexing 236 documents
Processing ./data/aquaint\xie\1996\19961110_XIN_ENG.gz
Bulk indexing 186 documents
Processing ./data/aquaint\xie\1996\19961111_XIN_ENG.gz
Bulk indexing 306 documents
Processing ./data/aquaint\xie\1996\19961112_XIN_ENG.gz
Bulk indexing 292 documents
Processing ./data/aquaint\xie\1996\19961113_XIN_ENG.gz
Bulk indexing 312 documents
Processing ./data/aquaint\xie\1996\19961114_XIN_ENG.gz
Bulk indexing 312 documents
Processing ./data/aquaint\xie\1996\19961115_XIN_ENG.gz
Bulk indexing 314 documents
Processing ./data/aquaint\xie\1996\19961116_XIN_ENG.gz
Bulk indexing 229 documents
Processing ./data/aquaint\xie\1996\19961117_XIN_ENG.gz
Bulk indexing 203 documents
Processing ./data/aquaint\xie\1996\19961118_XIN_ENG.gz
Bulk indexing 245 documents
Processing ./data/aquaint\xie\1996\19961119_XIN_ENG.gz
Bulk

Bulk indexing 238 documents
Processing ./data/aquaint\xie\1997\19970216_XIN_ENG.gz
Bulk indexing 193 documents
Processing ./data/aquaint\xie\1997\19970217_XIN_ENG.gz
Bulk indexing 214 documents
Processing ./data/aquaint\xie\1997\19970218_XIN_ENG.gz
Bulk indexing 292 documents
Processing ./data/aquaint\xie\1997\19970219_XIN_ENG.gz
Bulk indexing 321 documents
Processing ./data/aquaint\xie\1997\19970220_XIN_ENG.gz
Bulk indexing 285 documents
Processing ./data/aquaint\xie\1997\19970221_XIN_ENG.gz
Bulk indexing 302 documents
Processing ./data/aquaint\xie\1997\19970222_XIN_ENG.gz
Bulk indexing 254 documents
Processing ./data/aquaint\xie\1997\19970223_XIN_ENG.gz
Bulk indexing 227 documents
Processing ./data/aquaint\xie\1997\19970224_XIN_ENG.gz
Bulk indexing 256 documents
Processing ./data/aquaint\xie\1997\19970225_XIN_ENG.gz
Bulk indexing 342 documents
Processing ./data/aquaint\xie\1997\19970226_XIN_ENG.gz
Bulk indexing 307 documents
Processing ./data/aquaint\xie\1997\19970227_XIN_ENG.gz
Bulk

Bulk indexing 179 documents
Processing ./data/aquaint\xie\1997\19970526_XIN_ENG.gz
Bulk indexing 240 documents
Processing ./data/aquaint\xie\1997\19970527_XIN_ENG.gz
Bulk indexing 292 documents
Processing ./data/aquaint\xie\1997\19970528_XIN_ENG.gz
Bulk indexing 296 documents
Processing ./data/aquaint\xie\1997\19970529_XIN_ENG.gz
Bulk indexing 315 documents
Processing ./data/aquaint\xie\1997\19970530_XIN_ENG.gz
Bulk indexing 261 documents
Processing ./data/aquaint\xie\1997\19970531_XIN_ENG.gz
Bulk indexing 195 documents
Processing ./data/aquaint\xie\1997\19970601_XIN_ENG.gz
Bulk indexing 181 documents
Processing ./data/aquaint\xie\1997\19970602_XIN_ENG.gz
Bulk indexing 254 documents
Processing ./data/aquaint\xie\1997\19970603_XIN_ENG.gz
Bulk indexing 297 documents
Processing ./data/aquaint\xie\1997\19970604_XIN_ENG.gz
Bulk indexing 285 documents
Processing ./data/aquaint\xie\1997\19970605_XIN_ENG.gz
Bulk indexing 309 documents
Processing ./data/aquaint\xie\1997\19970606_XIN_ENG.gz
Bulk

Bulk indexing 177 documents
Processing ./data/aquaint\xie\1997\19970902_XIN_ENG.gz
Bulk indexing 267 documents
Processing ./data/aquaint\xie\1997\19970903_XIN_ENG.gz
Bulk indexing 279 documents
Processing ./data/aquaint\xie\1997\19970904_XIN_ENG.gz
Bulk indexing 322 documents
Processing ./data/aquaint\xie\1997\19970905_XIN_ENG.gz
Bulk indexing 290 documents
Processing ./data/aquaint\xie\1997\19970906_XIN_ENG.gz
Bulk indexing 239 documents
Processing ./data/aquaint\xie\1997\19970907_XIN_ENG.gz
Bulk indexing 183 documents
Processing ./data/aquaint\xie\1997\19970908_XIN_ENG.gz
Bulk indexing 267 documents
Processing ./data/aquaint\xie\1997\19970909_XIN_ENG.gz
Bulk indexing 309 documents
Processing ./data/aquaint\xie\1997\19970910_XIN_ENG.gz
Bulk indexing 304 documents
Processing ./data/aquaint\xie\1997\19970911_XIN_ENG.gz
Bulk indexing 337 documents
Processing ./data/aquaint\xie\1997\19970912_XIN_ENG.gz
Bulk indexing 328 documents
Processing ./data/aquaint\xie\1997\19970913_XIN_ENG.gz
Bulk

Bulk indexing 292 documents
Processing ./data/aquaint\xie\1997\19971210_XIN_ENG.gz
Bulk indexing 359 documents
Processing ./data/aquaint\xie\1997\19971211_XIN_ENG.gz
Bulk indexing 333 documents
Processing ./data/aquaint\xie\1997\19971212_XIN_ENG.gz
Bulk indexing 326 documents
Processing ./data/aquaint\xie\1997\19971213_XIN_ENG.gz
Bulk indexing 190 documents
Processing ./data/aquaint\xie\1997\19971214_XIN_ENG.gz
Bulk indexing 224 documents
Processing ./data/aquaint\xie\1997\19971215_XIN_ENG.gz
Bulk indexing 258 documents
Processing ./data/aquaint\xie\1997\19971216_XIN_ENG.gz
Bulk indexing 334 documents
Processing ./data/aquaint\xie\1997\19971217_XIN_ENG.gz
Bulk indexing 339 documents
Processing ./data/aquaint\xie\1997\19971218_XIN_ENG.gz
Bulk indexing 363 documents
Processing ./data/aquaint\xie\1997\19971219_XIN_ENG.gz
Bulk indexing 345 documents
Processing ./data/aquaint\xie\1997\19971220_XIN_ENG.gz
Bulk indexing 273 documents
Processing ./data/aquaint\xie\1997\19971221_XIN_ENG.gz
Bulk

Bulk indexing 304 documents
Processing ./data/aquaint\xie\1998\19980319_XIN_ENG.gz
Bulk indexing 347 documents
Processing ./data/aquaint\xie\1998\19980320_XIN_ENG.gz
Bulk indexing 296 documents
Processing ./data/aquaint\xie\1998\19980321_XIN_ENG.gz
Bulk indexing 232 documents
Processing ./data/aquaint\xie\1998\19980322_XIN_ENG.gz
Bulk indexing 178 documents
Processing ./data/aquaint\xie\1998\19980323_XIN_ENG.gz
Bulk indexing 251 documents
Processing ./data/aquaint\xie\1998\19980324_XIN_ENG.gz
Bulk indexing 322 documents
Processing ./data/aquaint\xie\1998\19980325_XIN_ENG.gz
Bulk indexing 371 documents
Processing ./data/aquaint\xie\1998\19980326_XIN_ENG.gz
Bulk indexing 321 documents
Processing ./data/aquaint\xie\1998\19980327_XIN_ENG.gz
Bulk indexing 271 documents
Processing ./data/aquaint\xie\1998\19980328_XIN_ENG.gz
Bulk indexing 228 documents
Processing ./data/aquaint\xie\1998\19980329_XIN_ENG.gz
Bulk indexing 196 documents
Processing ./data/aquaint\xie\1998\19980330_XIN_ENG.gz
Bulk

Bulk indexing 374 documents
Processing ./data/aquaint\xie\1998\19980626_XIN_ENG.gz
Bulk indexing 348 documents
Processing ./data/aquaint\xie\1998\19980627_XIN_ENG.gz
Bulk indexing 296 documents
Processing ./data/aquaint\xie\1998\19980628_XIN_ENG.gz
Bulk indexing 194 documents
Processing ./data/aquaint\xie\1998\19980629_XIN_ENG.gz
Bulk indexing 263 documents
Processing ./data/aquaint\xie\1998\19980630_XIN_ENG.gz
Bulk indexing 311 documents
Processing ./data/aquaint\xie\1998\19980701_XIN_ENG.gz
Bulk indexing 357 documents
Processing ./data/aquaint\xie\1998\19980702_XIN_ENG.gz
Bulk indexing 331 documents
Processing ./data/aquaint\xie\1998\19980703_XIN_ENG.gz
Bulk indexing 304 documents
Processing ./data/aquaint\xie\1998\19980704_XIN_ENG.gz
Bulk indexing 241 documents
Processing ./data/aquaint\xie\1998\19980705_XIN_ENG.gz
Bulk indexing 207 documents
Processing ./data/aquaint\xie\1998\19980706_XIN_ENG.gz
Bulk indexing 268 documents
Processing ./data/aquaint\xie\1998\19980707_XIN_ENG.gz
Bulk

Bulk indexing 244 documents
Processing ./data/aquaint\xie\1998\19981003_XIN_ENG.gz
Bulk indexing 210 documents
Processing ./data/aquaint\xie\1998\19981004_XIN_ENG.gz
Bulk indexing 173 documents
Processing ./data/aquaint\xie\1998\19981005_XIN_ENG.gz
Bulk indexing 240 documents
Processing ./data/aquaint\xie\1998\19981006_XIN_ENG.gz
Bulk indexing 295 documents
Processing ./data/aquaint\xie\1998\19981007_XIN_ENG.gz
Bulk indexing 331 documents
Processing ./data/aquaint\xie\1998\19981008_XIN_ENG.gz
Bulk indexing 295 documents
Processing ./data/aquaint\xie\1998\19981009_XIN_ENG.gz
Bulk indexing 330 documents
Processing ./data/aquaint\xie\1998\19981010_XIN_ENG.gz
Bulk indexing 216 documents
Processing ./data/aquaint\xie\1998\19981011_XIN_ENG.gz
Bulk indexing 180 documents
Processing ./data/aquaint\xie\1998\19981012_XIN_ENG.gz
Bulk indexing 266 documents
Processing ./data/aquaint\xie\1998\19981013_XIN_ENG.gz
Bulk indexing 339 documents
Processing ./data/aquaint\xie\1998\19981014_XIN_ENG.gz
Bulk

Bulk indexing 269 documents
Processing ./data/aquaint\xie\1999\19990110_XIN_ENG.gz
Bulk indexing 185 documents
Processing ./data/aquaint\xie\1999\19990111_XIN_ENG.gz
Bulk indexing 231 documents
Processing ./data/aquaint\xie\1999\19990112_XIN_ENG.gz
Bulk indexing 339 documents
Processing ./data/aquaint\xie\1999\19990113_XIN_ENG.gz
Bulk indexing 347 documents
Processing ./data/aquaint\xie\1999\19990114_XIN_ENG.gz
Bulk indexing 332 documents
Processing ./data/aquaint\xie\1999\19990115_XIN_ENG.gz
Bulk indexing 322 documents
Processing ./data/aquaint\xie\1999\19990116_XIN_ENG.gz
Bulk indexing 238 documents
Processing ./data/aquaint\xie\1999\19990117_XIN_ENG.gz
Bulk indexing 184 documents
Processing ./data/aquaint\xie\1999\19990118_XIN_ENG.gz
Bulk indexing 270 documents
Processing ./data/aquaint\xie\1999\19990119_XIN_ENG.gz
Bulk indexing 312 documents
Processing ./data/aquaint\xie\1999\19990120_XIN_ENG.gz
Bulk indexing 340 documents
Processing ./data/aquaint\xie\1999\19990121_XIN_ENG.gz
Bulk

Bulk indexing 177 documents
Processing ./data/aquaint\xie\1999\19990419_XIN_ENG.gz
Bulk indexing 284 documents
Processing ./data/aquaint\xie\1999\19990420_XIN_ENG.gz
Bulk indexing 353 documents
Processing ./data/aquaint\xie\1999\19990421_XIN_ENG.gz
Bulk indexing 354 documents
Processing ./data/aquaint\xie\1999\19990422_XIN_ENG.gz
Bulk indexing 409 documents
Processing ./data/aquaint\xie\1999\19990423_XIN_ENG.gz
Bulk indexing 345 documents
Processing ./data/aquaint\xie\1999\19990424_XIN_ENG.gz
Bulk indexing 253 documents
Processing ./data/aquaint\xie\1999\19990425_XIN_ENG.gz
Bulk indexing 197 documents
Processing ./data/aquaint\xie\1999\19990426_XIN_ENG.gz
Bulk indexing 312 documents
Processing ./data/aquaint\xie\1999\19990427_XIN_ENG.gz
Bulk indexing 380 documents
Processing ./data/aquaint\xie\1999\19990428_XIN_ENG.gz
Bulk indexing 339 documents
Processing ./data/aquaint\xie\1999\19990429_XIN_ENG.gz
Bulk indexing 364 documents
Processing ./data/aquaint\xie\1999\19990430_XIN_ENG.gz
Bulk

Bulk indexing 242 documents
Processing ./data/aquaint\xie\1999\19990727_XIN_ENG.gz
Bulk indexing 333 documents
Processing ./data/aquaint\xie\1999\19990728_XIN_ENG.gz
Bulk indexing 299 documents
Processing ./data/aquaint\xie\1999\19990729_XIN_ENG.gz
Bulk indexing 316 documents
Processing ./data/aquaint\xie\1999\19990730_XIN_ENG.gz
Bulk indexing 278 documents
Processing ./data/aquaint\xie\1999\19990731_XIN_ENG.gz
Bulk indexing 232 documents
Processing ./data/aquaint\xie\1999\19990801_XIN_ENG.gz
Bulk indexing 172 documents
Processing ./data/aquaint\xie\1999\19990802_XIN_ENG.gz
Bulk indexing 252 documents
Processing ./data/aquaint\xie\1999\19990803_XIN_ENG.gz
Bulk indexing 315 documents
Processing ./data/aquaint\xie\1999\19990804_XIN_ENG.gz
Bulk indexing 297 documents
Processing ./data/aquaint\xie\1999\19990805_XIN_ENG.gz
Bulk indexing 363 documents
Processing ./data/aquaint\xie\1999\19990806_XIN_ENG.gz
Bulk indexing 325 documents
Processing ./data/aquaint\xie\1999\19990807_XIN_ENG.gz
Bulk

Bulk indexing 358 documents
Processing ./data/aquaint\xie\1999\19991103_XIN_ENG.gz
Bulk indexing 336 documents
Processing ./data/aquaint\xie\1999\19991104_XIN_ENG.gz
Bulk indexing 375 documents
Processing ./data/aquaint\xie\1999\19991105_XIN_ENG.gz
Bulk indexing 330 documents
Processing ./data/aquaint\xie\1999\19991106_XIN_ENG.gz
Bulk indexing 245 documents
Processing ./data/aquaint\xie\1999\19991107_XIN_ENG.gz
Bulk indexing 198 documents
Processing ./data/aquaint\xie\1999\19991108_XIN_ENG.gz
Bulk indexing 270 documents
Processing ./data/aquaint\xie\1999\19991109_XIN_ENG.gz
Bulk indexing 328 documents
Processing ./data/aquaint\xie\1999\19991110_XIN_ENG.gz
Bulk indexing 361 documents
Processing ./data/aquaint\xie\1999\19991111_XIN_ENG.gz
Bulk indexing 351 documents
Processing ./data/aquaint\xie\1999\19991112_XIN_ENG.gz
Bulk indexing 340 documents
Processing ./data/aquaint\xie\1999\19991113_XIN_ENG.gz
Bulk indexing 237 documents
Processing ./data/aquaint\xie\1999\19991114_XIN_ENG.gz
Bulk

Bulk indexing 269 documents
Processing ./data/aquaint\xie\2000\20000210_XIN_ENG.gz
Bulk indexing 284 documents
Processing ./data/aquaint\xie\2000\20000211_XIN_ENG.gz
Bulk indexing 270 documents
Processing ./data/aquaint\xie\2000\20000212_XIN_ENG.gz
Bulk indexing 193 documents
Processing ./data/aquaint\xie\2000\20000213_XIN_ENG.gz
Bulk indexing 223 documents
Processing ./data/aquaint\xie\2000\20000214_XIN_ENG.gz
Bulk indexing 297 documents
Processing ./data/aquaint\xie\2000\20000215_XIN_ENG.gz
Bulk indexing 302 documents
Processing ./data/aquaint\xie\2000\20000216_XIN_ENG.gz
Bulk indexing 323 documents
Processing ./data/aquaint\xie\2000\20000217_XIN_ENG.gz
Bulk indexing 328 documents
Processing ./data/aquaint\xie\2000\20000218_XIN_ENG.gz
Bulk indexing 295 documents
Processing ./data/aquaint\xie\2000\20000219_XIN_ENG.gz
Bulk indexing 193 documents
Processing ./data/aquaint\xie\2000\20000220_XIN_ENG.gz
Bulk indexing 197 documents
Processing ./data/aquaint\xie\2000\20000221_XIN_ENG.gz
Bulk

Bulk indexing 363 documents
Processing ./data/aquaint\xie\2000\20000519_XIN_ENG.gz
Bulk indexing 377 documents
Processing ./data/aquaint\xie\2000\20000520_XIN_ENG.gz
Bulk indexing 212 documents
Processing ./data/aquaint\xie\2000\20000521_XIN_ENG.gz
Bulk indexing 189 documents
Processing ./data/aquaint\xie\2000\20000522_XIN_ENG.gz
Bulk indexing 356 documents
Processing ./data/aquaint\xie\2000\20000523_XIN_ENG.gz
Bulk indexing 359 documents
Processing ./data/aquaint\xie\2000\20000524_XIN_ENG.gz
Bulk indexing 373 documents
Processing ./data/aquaint\xie\2000\20000525_XIN_ENG.gz
Bulk indexing 347 documents
Processing ./data/aquaint\xie\2000\20000526_XIN_ENG.gz
Bulk indexing 325 documents
Processing ./data/aquaint\xie\2000\20000527_XIN_ENG.gz
Bulk indexing 198 documents
Processing ./data/aquaint\xie\2000\20000528_XIN_ENG.gz
Bulk indexing 190 documents
Processing ./data/aquaint\xie\2000\20000529_XIN_ENG.gz
Bulk indexing 313 documents
Processing ./data/aquaint\xie\2000\20000530_XIN_ENG.gz
Bulk

Bulk indexing 512 documents
Processing ./data/aquaint\xie\2000\20000826_XIN_ENG.gz
Bulk indexing 364 documents
Processing ./data/aquaint\xie\2000\20000827_XIN_ENG.gz
Bulk indexing 368 documents
Processing ./data/aquaint\xie\2000\20000828_XIN_ENG.gz
Bulk indexing 303 documents
Processing ./data/aquaint\xie\2000\20000829_XIN_ENG.gz
Bulk indexing 307 documents
Processing ./data/aquaint\xie\2000\20000830_XIN_ENG.gz
Bulk indexing 302 documents
Processing ./data/aquaint\xie\2000\20000831_XIN_ENG.gz
Bulk indexing 293 documents
Processing ./data/aquaint\xie\2000\20000901_XIN_ENG.gz
Bulk indexing 296 documents
Processing ./data/aquaint\xie\2000\20000902_XIN_ENG.gz
Bulk indexing 156 documents
Processing ./data/aquaint\xie\2000\20000903_XIN_ENG.gz
Bulk indexing 145 documents
Processing ./data/aquaint\xie\2000\20000904_XIN_ENG.gz
Bulk indexing 284 documents
Processing ./data/aquaint\xie\2000\20000905_XIN_ENG.gz
Bulk indexing 301 documents
Processing ./data/aquaint\xie\2000\20000906_XIN_ENG.gz
Bulk

In [11]:
# Fetch a single document from the index by its document id.
test = es.get(
    index=INDEX_NAME,
    doc_type=DOC_TYPE,
    id="APW19980602.0001",
)

In [12]:
# Pretty-print the fetched document so its nested fields are readable.
print(pprint.pformat(test))

{'_id': 'APW19980602.0001',
 '_index': 'aquaint',
 '_source': {'content': '\n'
                        'KIGALI, Rwanda (AP) _ A Rwandan who was convicted and '
                        'sentenced\n'
                        "to death for his role in thecountry's 1994 genocide "
                        'has attempted\n'
                        'suicide by seizing the steering wheel in a police '
                        'car, a news\n'
                        'agency reported.\n'
                        'In the Monday incident, Geoffrey Gatera, a former '
                        'surgeon,\n'
                        'seriously injured three pedestrians, but escaped '
                        'injury, the\n'
                        'private Rwandan News Agency reported.\n'
                        'Gatera was convicted Monday on charges of genocide '
                        'and crimes\n'
                        'against humanity. On the way to prison, he seizing '
                        'the