# 03. Machine learning

This session will ...

### Recreate the corpus

In [252]:
# Suppress compatibility warnings.
# NOTE: called with no category or module filter, this silences every
# warning raised after this point, not only compatibility ones.
import warnings
warnings.filterwarnings('ignore')

In [253]:
import pandas as pd
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Directory that holds the text files; every file whose name matches the
# pattern below becomes part of the corpus.
text_dir = '../Data/Texts/'
# Raw string so that '\.' reaches the regex engine as an escaped dot instead
# of being an invalid string escape sequence (newer Pythons warn about it).
corpus = PlaintextCorpusReader(text_dir, r'.*\.txt')

# A function to turn fileids into a table of metadata
def parse_fileids(fileids):
    '''Takes a list of file names formatted like A-Cather-Antonia-1918-F.txt.
       Returns a pandas dataframe of derived metadata.

       The dataframe is indexed by file name without the '.txt' suffix and
       sorted on that index. Its columns are nation, author, title,
       pubdate (converted to int) and gender. Underscores in the title
       field are replaced by spaces. An empty list yields an empty
       dataframe that still carries the five columns.

       Raises IndexError if a name has fewer than four dash-separated
       fields and ValueError if the fourth field is not an integer.'''
    import pandas as pd
    suffix = '.txt'
    columns = ['nation', 'author', 'title', 'pubdate', 'gender'] # Col names
    meta = {}
    for fileid in fileids:
        # Remove only a trailing '.txt'. str.strip('.txt') treats its argument
        # as a set of characters and would also eat any leading or trailing
        # '.', 't' or 'x' that belongs to the name itself.
        file = fileid[:-len(suffix)] if fileid.endswith(suffix) else fileid
        fields = file.split('-') # Split on dashes
        fields[2] = fields[2].replace('_', ' ') # Remove underscore from titles
        fields[3] = int(fields[3]) # Publication year as a number
        meta[file] = fields
    # Passing the column names here (rather than assigning them afterwards)
    # keeps an empty input from failing on a length mismatch.
    metadata = pd.DataFrame.from_dict(meta, orient='index', columns=columns)
    return metadata.sort_index() # Sort b/c the dataframe is built from a dictionary

def collect_stats(corpus):
    '''Takes an NLTK corpus as input.
       Returns a pandas dataframe of stats indexed to fileid.

       The corpus only needs to provide fileids() and words(fileid).
       The index is the file name without its '.txt' suffix, sorted;
       the single column 'wordcount' holds len(corpus.words(fileid)).'''
    import pandas as pd
    suffix = '.txt'
    stats = {}
    for fileid in corpus.fileids():
        word_count = len(corpus.words(fileid))
        # Remove only a trailing '.txt'. str.strip('.txt') treats its argument
        # as a set of characters and would also eat any leading or trailing
        # '.', 't' or 'x' that belongs to the name itself.
        key = fileid[:-len(suffix)] if fileid.endswith(suffix) else fileid
        stats[key] = {'wordcount': word_count}
    statistics = pd.DataFrame.from_dict(stats, orient='index')
    return statistics.sort_index()

# Assemble the per-book table: word counts from the corpus joined onto the
# metadata parsed out of the file names (both are indexed by file name).
stats = collect_stats(corpus)
books = parse_fileids(corpus.fileids()).join(stats)
books.index.name = 'file' # Label the index
books.head()

Unnamed: 0_level_0,nation,author,title,pubdate,gender,wordcount
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A-Cather-Antonia-1918-F,A,Cather,Antonia,1918,F,97574
A-Chesnutt-Marrow-1901-M,A,Chesnutt,Marrow,1901,M,110288
A-Crane-Maggie-1893-M,A,Crane,Maggie,1893,M,28628
A-Davis-Life_Iron_mills-1861-F,A,Davis,Life Iron mills,1861,F,18789
A-Dreiser-Sister_Carrie-1900-M,A,Dreiser,Sister Carrie,1900,M,194062
