In [1]:
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

In [2]:
# Document fileids look like "<category>/<name>.txt"; the negative lookahead
# rejects any path that begins with a dot.
DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
# The capture group is the directory name that precedes the slash; it is
# handed to the reader as the category pattern.
CAT_PATTERN = r'([\w_\s]+)/.*'

# Build a categorized plain-text reader over the local corpus directory.
corpus = CategorizedPlaintextCorpusReader(
    '/Users/willard.hong/Documents/learn/learn_fox/atap/corpus',
    DOC_PATTERN,
    cat_pattern=CAT_PATTERN,
)

In [3]:
corpus.categories()

['Star Trek', 'Star Wars']

In [4]:
corpus.fileids()

['Star Trek/Star Trek - Balance of Terror.txt',
 'Star Trek/Star Trek - First Contact.txt',
 'Star Wars/Star Wars Episode 1.txt',
 'Star Wars/Star Wars Episode 2.txt']

In [9]:
#!/usr/bin/env python3

import bs4
import time
import nltk
import pickle
import logging

from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader

from readability.readability import Unparseable
from readability.readability import Document as Paper
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize

# Raise the threshold of the "readability.readability" logger so that only
# WARNING and above are emitted.
log = logging.getLogger("readability.readability")
log.setLevel('WARNING')

# Fileid patterns: a path that does not start with a dot, made of a directory
# of lowercase letters/underscores/whitespace, a slash, and a lowercase-hex
# file name with a .json (raw) or .pickle (preprocessed) extension.
DOC_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.json'
PKL_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.pickle'
# Category pattern: captures the directory name that precedes the slash.
CAT_PATTERN = r'([a-z_\s]+)/.*'

# HTML heading, paragraph and list-item tag names.
# NOTE(review): TAGS and DOC_PATTERN are not referenced by the cells visible
# here — presumably used by an HTML corpus reader elsewhere; confirm.
TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li']


class PickledCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Categorized corpus reader for documents stored as one pickle file each.

    Judging by the accessor methods below, every unpickled document is
    expected to be a list of paragraphs, where a paragraph is a list of
    sentences and a sentence is a list of (token, tag) tuples.
    """

    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        (e.g. ``encoding``) are passed to the ``CorpusReader`` constructor.

        :param root: root directory of the corpus.
        :param fileids: list or regular expression selecting the corpus
            files; defaults to ``PKL_PATTERN``.
        :param kwargs: categorization and ``CorpusReader`` keyword arguments.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        # CategorizedCorpusReader takes the kwargs dict itself (not **kwargs)
        # and removes the cat_* entries it consumes, so whatever is left can
        # be forwarded to CorpusReader as the docstring promises.
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, **kwargs)

    def resolve(self, fileids, categories):
        """
        Returns a list of fileids or categories depending on what is passed
        to each internal corpus reader function. This primarily bubbles up to
        the high level ``docs`` method, but is implemented here similar to
        the nltk ``CategorizedPlaintextCorpusReader``.

        :param fileids: fileids to read, or None.
        :param categories: categories to read, or None.
        :returns: the fileids belonging to ``categories`` when categories are
            given, otherwise ``fileids`` unchanged (possibly None).
        :raises ValueError: if both ``fileids`` and ``categories`` are given.
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids

    def docs(self, fileids=None, categories=None):
        """
        Returns the document loaded from a pickled object for every file in
        the corpus. Similar to the BaleenCorpusReader, this uses a generator
        to achieve memory safe iteration.

        SECURITY: ``pickle.load`` can execute arbitrary code, so only point
        this reader at corpus directories whose contents you trust.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        # Only the path is needed; the encoding and fileid are ignored.
        for path, _enc, _fileid in self.abspaths(fileids, True, True):
            with open(path, 'rb') as f:
                yield pickle.load(f)

    def paras(self, fileids=None, categories=None):
        """
        Returns a generator of paragraphs where each paragraph is a list of
        sentences, which is in turn a list of (token, tag) tuples.
        """
        for doc in self.docs(fileids, categories):
            for paragraph in doc:
                yield paragraph

    def sents(self, fileids=None, categories=None):
        """
        Returns a generator of sentences where each sentence is a list of
        (token, tag) tuples.
        """
        for paragraph in self.paras(fileids, categories):
            for sentence in paragraph:
                yield sentence

    def tagged(self, fileids=None, categories=None):
        """
        Returns a generator of (token, tag) tuples, flattened across all
        sentences.
        """
        for sent in self.sents(fileids, categories):
            for token in sent:
                yield token

    def words(self, fileids=None, categories=None):
        """
        Returns a generator of tokens only, i.e. the first element of each
        (token, tag) tuple.
        """
        for token in self.tagged(fileids, categories):
            yield token[0]


if __name__ == '__main__':
    from collections import Counter

    # Smoke test: tally every token in the sample corpus, then report the
    # number of distinct tokens and the total token count.
    corpus = PickledCorpusReader('/Users/willard.hong/Documents/learn/learn_fox/sample')
    words = Counter(corpus.words())

    print(
        "{:,} vocabulary {:,} word count".format(
            len(words),
            sum(words.values()),
        )
    )

58,748 vocabulary 1,624,862 word count


In [97]:
import os
os.path.getsize('/Users/willard.hong/Documents/learn/learn_fox/sample')

576

In [13]:
corpus = PickledCorpusReader('/Users/willard.hong/Documents/learn/learn_fox/sample', PKL_PATTERN)

In [87]:
corpus.readme()



In [61]:
corpus.categories()

['books',
 'business',
 'cinema',
 'cooking',
 'data_science',
 'design',
 'do_it_yourself',
 'gaming',
 'news',
 'politics',
 'sports',
 'tech']

In [88]:
corpus.fileids()

['books/56d86f51c18081104b39adaa.pickle',
 'books/56d8962bc18081120d8227ed.pickle',
 'books/56d939bbc1808111c985105f.pickle',
 'books/56ddce05c1808111c98a2799.pickle',
 'books/56e1fe20c1808111c98f1dc9.pickle',
 'books/56e3481ac1808111c990ab50.pickle',
 'books/56e461b8c1808111c991d1a4.pickle',
 'books/56e704e7c1808111c994a284.pickle',
 'books/56e715a7c1808111c994b89a.pickle',
 'books/56e8466dc1808103936f748d.pickle',
 'books/56e846dcc1808103936f7526.pickle',
 'books/56e84718c1808103936f757e.pickle',
 'books/56e84813c1808103936f7686.pickle',
 'books/56e84833c1808103936f769f.pickle',
 'books/56e963e9c1808111e05f5b59.pickle',
 'books/56ec4421c180814c03c8cef9.pickle',
 'books/56ec65d7c180814c03c907fa.pickle',
 'books/56ef7bd4c180814c03cd646e.pickle',
 'books/56f016f4c1808103e167a7ec.pickle',
 'books/56f04a0ec1808103e167fd68.pickle',
 'books/56f53cefc18081352e6553b6.pickle',
 'books/56fc3a69c18081767c938aad.pickle',
 'books/56fc3a84c18081767c938b32.pickle',
 'books/56fc3a89c18081767c938bc4.p

In [78]:
x = corpus.docs(categories='books')

In [None]:
# Prints every document in the 'books' category in full, one at a time.
# NOTE(review): this can produce a very large output; consider limiting the
# number of documents shown before sharing the notebook.
for y in corpus.docs(categories='books'):
    print(y)

In [5]:
import json
# Load one raw JSON document from the downloaded sample. The encoding is
# pinned to UTF-8 so decoding does not depend on the platform's locale default.
with open('/Users/willard.hong/Downloads/baleen-sample-20170817/essays/5994c836df23b703bdec9c2c.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [7]:
print(data)

{'_id': {'$oid': '5994c836df23b703bdec9c2c'}, 'feed': {'$oid': '56e71c0bc180817118d3c9c6'}, 'title': 'Amina Yaqin', 'url': 'http://www.bbc.co.uk/programmes/b090vc1p', 'pubdate': {'$date': 1502920800000}, 'content': '<!DOCTYPE html> <html class="" lang="en-GB" > <head> <!-- Barlesque 3.21.26 --> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> <meta name="description" content="  Amina Yaqin reflects on the 70-year legacy of partition as reflected in Pakistani culture.  " /> <meta name="keywords" content="  Amina Yaqin  " />   <title> BBC Radio 3 - The Essay, The Culture of Partition, Amina Yaqin</title>        <meta name="viewport" content="width=device-width, initial-scale=1.0" />  <meta property="fb:admins" content="100004154058350" />    \n<!--[if (gt IE 8) | (IEMobile)]><!-->\n<link rel="stylesheet" href="http://static.bbci.co.uk/frameworks/barlesque/3.21.26/orb/4/style/orb.min.css">\n<!--<![endif]-->\n\n<!--[if (lt IE 9) & (!IEMobile)]>\n<link rel="stylesheet" 

In [3]:
import json

In [100]:
type(data)

In [1]:
1 + 1

2