# example of PlainTextCorpusReader
https://python.gotrained.com/nltk-corupus/

In [1]:
from nltk.corpus import PlaintextCorpusReader
import os

In [2]:
corpus_root = "/Users/willard.hong/Downloads/aclImdb/test_samp"
file_ids = r'([\w_\s]+).*'
corpus = PlaintextCorpusReader(corpus_root, file_ids)

In [6]:
from collections import Counter

#print(corpus.fileids())
#print(corpus.raw('10000_7.txt'))
#print(corpus.words('10000_7.txt'))
#ords = Counter(corpus.words('10000_7.txt'))
words = Counter(corpus.words())
print("{:,} vocabulary {:,} word count".format(len(words.keys()), sum(words.values())))
#print(corpus.paras('10000_7.txt'))

2,796 vocabulary 11,450 word count


# build VerintCorpusReader

In [8]:
#!/usr/bin/env python3
import time
import nltk

import pickle
import logging

from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader

from nltk import pos_tag, sent_tokenize, wordpunct_tokenize

In [9]:
DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'

In [10]:
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

import spacy
import string
from six import string_types
import os

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")

class VerintCorpusReader(CategorizedPlaintextCorpusReader):
    """
    A corpus reader for raw Verint documents to enable preprocessing.
    """
    def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8', **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining
        arguments are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        # Initialize the NLTK corpus reader objects
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        # Save the tags that we specifically want to extract.
        #self.tags = tags
     
    def raw(self, fileids=None):
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        for f in fileids:
            yield self.open(f).read(), f
        
    def resolve(self, fileids, categories):
        """
        Returns a list of fileids or categories depending on what is passed
        to each internal corpus reader function. Implemented similarly to
        the NLTK ``CategorizedPlaintextCorpusReader``.
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids
    
    def docs(self, fileids=None, categories=None):
        """
        Returns the complete text of an HTML document, closing the document
        after we are done reading it and yielding it in a memory safe fashion.
        """
        fileids = self.resolve(fileids, categories)
        myGenText = self.raw(fileids)
        # Create a generator, loading one document into memory at a time.
        for rawText in myGenText:
            #print(type(rawText[1]))     
            doc = nlp(rawText[0].lower())
            text = [token.lemma_ for token in doc]
            text = [token for token in text if token not in string.punctuation]
            self.savefile(rawText[1], text)
            yield text
            
    def savefile(self, fileids, text):
        print(fileids.split('/')[1][0:-4])
        target = os.path.join('/Users/willard.hong/Downloads/aclImdb/pickle', fileids.split('/')[1][0:-4] + '.pkl')
        open_pickle = open(target, 'wb')
        pickle.dump(text, open_pickle)
        open_pickle.close()
        

In [11]:
reader = VerintCorpusReader('/Users/willard.hong/Downloads/aclImdb/test_samp', 
         DOC_PATTERN, cat_pattern=CAT_PATTERN)

In [12]:
reader.categories()

['neg', 'pos']

In [13]:
reader.fileids()

['neg/10020_8.txt',
 'neg/10021_9.txt',
 'neg/10022_10.txt',
 'neg/10023_8.txt',
 'neg/10024_9.txt',
 'neg/10025_8.txt',
 'neg/10026_10.txt',
 'neg/10027_8.txt',
 'neg/10028_10.txt',
 'neg/10029_10.txt',
 'neg/1002_9.txt',
 'pos/10010_9.txt',
 'pos/10011_9.txt',
 'pos/10012_9.txt',
 'pos/10013_9.txt',
 'pos/10014_7.txt',
 'pos/10015_8.txt',
 'pos/10016_8.txt',
 'pos/10017_8.txt',
 'pos/10018_8.txt',
 'pos/10019_8.txt',
 'pos/1001_10.txt']

In [14]:
reader.abspath('neg/10020_8.txt')

FileSystemPathPointer('/Users/willard.hong/Downloads/aclImdb/test_samp/neg/10020_8.txt')

In [15]:
x = reader.raw(['neg/10020_8.txt', 'neg/10021_9.txt'])
type(x)

generator

In [16]:
x = reader.raw(['neg/10020_8.txt', 'neg/10021_9.txt'])

for myDoc in x:
    print(myDoc)

("Felix in Hollywood is a great film. The version I viewed was very well restored, which is sometimes a problem with these silent era animated films. It has some of Hollywood's most famous stars making cameo animated appearances. A must for any silent film or animation enthusiast.", 'neg/10020_8.txt')
("A gem of a cartoon from the silent era---it was re-discovered by CARTOON NETWORK, and was broadcast for likely the first time in decades, if ever.<br /><br />What makes this so enjoyable are the varied cameos...Douglas Fairbanks is attacked by giant mosquitos; Will Hays pays a visit as 'boss' of Static Studios; as well as appearances by Chaplin, Keaton, and William S. Hart. The image of chewing gum decimating the shoes of the populace (a money-making idea for Felix's near-bankrupt shoe-=salesman boss) cannot be described--it must be viewed. A terrific cultural gem.", 'neg/10021_9.txt')


In [17]:
x = reader.docs(reader.fileids('neg'))
for myDoc in x:
    pass

10020_8
10021_9
10022_10
10023_8
10024_9
10025_8
10026_10
10027_8
10028_10
10029_10
1002_9


In [18]:
with open('/Users/willard.hong/Downloads/aclImdb/pickle/10020_8.pkl', 'rb') as f:
    print(pickle.load(f))

['felix', 'in', 'hollywood', 'be', 'a', 'great', 'film', 'the', 'version', 'i', 'view', 'be', 'very', 'well', 'restore', 'which', 'be', 'sometimes', 'a', 'problem', 'with', 'these', 'silent', 'era', 'animate', 'film', '-PRON-', 'have', 'some', 'of', 'hollywood', "'s", 'most', 'famous', 'star', 'make', 'cameo', 'animate', 'appearance', 'a', 'must', 'for', 'any', 'silent', 'film', 'or', 'animation', 'enthusiast']


In [132]:
x = reader.raw(reader.fileids())
for myDoc in x:
    print(myDoc)

("Felix in Hollywood is a great film. The version I viewed was very well restored, which is sometimes a problem with these silent era animated films. It has some of Hollywood's most famous stars making cameo animated appearances. A must for any silent film or animation enthusiast.", 'neg/10020_8.txt')
("A gem of a cartoon from the silent era---it was re-discovered by CARTOON NETWORK, and was broadcast for likely the first time in decades, if ever.<br /><br />What makes this so enjoyable are the varied cameos...Douglas Fairbanks is attacked by giant mosquitos; Will Hays pays a visit as 'boss' of Static Studios; as well as appearances by Chaplin, Keaton, and William S. Hart. The image of chewing gum decimating the shoes of the populace (a money-making idea for Felix's near-bankrupt shoe-=salesman boss) cannot be described--it must be viewed. A terrific cultural gem.", 'neg/10021_9.txt')
('This short is one of the best of all time and is proof (just like most of Charlie Chaplin\'s work) t


# build pickledCorpusReader

In [19]:
PKL_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.txt.pkl'

class PickledCorpusReader(CategorizedCorpusReader, CorpusReader):

    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

    def resolve(self, fileids, categories):
        """
        Returns a list of fileids or categories depending on what is passed
        to each internal corpus reader function. This primarily bubbles up to
        the high level ``docs`` method, but is implemented here similar to
        the nltk ``CategorizedPlaintextCorpusReader``.
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids

    def docs(self, fileids=None, categories=None):
        """
        Returns the document loaded from a pickled object for every file in
        the corpus. Similar to the BaleenCorpusReader, this uses a generator
        to acheive memory safe iteration.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)
        #yield print('ok')
        ## Create a generator, loading one document into memory at a time.
        for path, enc, fileid in self.abspaths(fileids, True, True):
            with open(path, 'rb') as f:
                return pickle.load(f)

    

In [20]:
corpus = PickledCorpusReader('/Users/willard.hong/Downloads/aclImdb/pickle')

In [21]:
print(corpus.docs(['10020_8.pkl', '10021_9.pkl']))

['felix', 'in', 'hollywood', 'be', 'a', 'great', 'film', 'the', 'version', 'i', 'view', 'be', 'very', 'well', 'restore', 'which', 'be', 'sometimes', 'a', 'problem', 'with', 'these', 'silent', 'era', 'animate', 'film', '-PRON-', 'have', 'some', 'of', 'hollywood', "'s", 'most', 'famous', 'star', 'make', 'cameo', 'animate', 'appearance', 'a', 'must', 'for', 'any', 'silent', 'film', 'or', 'animation', 'enthusiast']
