# Working with corpus data

Here, we prepare the corpus in different ways (POS-tagged, chunked, lemmatized, etc.). See also the `TMN.py` library (in `../libraries`), where the real work happens.

In [1]:
%%time

import os
import sys

# Locations of the raw corpus texts and of the pickled output, relative to
# this notebook. Adjust these if your data lives elsewhere.
text_dir = os.path.join('..', 'data', 'texts')
pickle_dir = os.path.join('..', 'data', 'pickled_normalized')

# Make the project's local libraries importable. The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate entry to
# sys.path each time.
lib_dir = os.path.join('..', 'libraries')
if lib_dir not in sys.path:
    sys.path.append(lib_dir)
import TMN

# Corpus reader over text_dir; the regex selects which files are read
# (presumably it is matched against file names -- confirm in TMN.py).
corpus = TMN.TMNCorpusReader(text_dir, r'.+\.txt')

# Preprocessor that reads from the corpus and targets pickle_dir
preproc = TMN.Preprocessor(corpus, pickle_dir)

# Perform preprocessing and save output to disk.
# NOTE: this is the slow step; the recorded run of this cell took roughly
# nine and a half minutes of wall time.
# NOTE(review): the meaning of chunksize=0 and norm=True is defined by
# TMN.Preprocessor.transform -- see TMN.py.
processed = preproc.transform(chunksize=0, norm=True)

CPU times: user 8min 37s, sys: 12.4 s, total: 8min 49s
Wall time: 9min 32s


In [2]:
# Sanity check: read the preprocessed corpus back from the pickle directory
pcorpus = TMN.PickledCorpusReader(pickle_dir)

category_labels = pcorpus.categories()
print("Categories in the corpus:\n", category_labels)

pickled_ids = pcorpus.fileids()
print("\nNumber of files:\n", len(pickled_ids))

# Peek at one document: print only the element at index 1 of the first
# document yielded for the sample file, then stop iterating.
print("\nA bit of one pickled text")
sample_ids = ['A-Stowe-Uncle_Tom-1852-F.pickle']
for doc in pcorpus.docs(fileids=sample_ids):
    print(doc[1])
    break

Categories in the corpus:
 ['A', 'AF', 'AM', 'B', 'BF', 'BM', 'F', 'M']

Number of files:
 40

A bit of one pickled text
[['chapter', 'i']]
