In [19]:
#
#             *
#            * *
#           *   *
#          *     *
#           *   *
#            * *
#             *
#
# March Twenty-Fifth, Two Thousand Twenty-One
# Red Queen
# tarah@redqueentech.com
#
# The digital humanities only looks like it's computational. Actually, it's data cleaning. Forever.
#
# Use me for cleaning Gutentag XML exports to get down to the nested paragraphs and return pandas dataframes
# that project found here: https://gutentag.sdsu.edu/


In [20]:
from xml.etree.ElementTree import ElementTree
import pandas as pd
from html import unescape
from xml.etree import ElementTree
from nltk import sent_tokenize, re
import xmltodict

Read in XML file from Gutentag for paragraphs. Shelley and Dickens are sampled

In [21]:
def xml_to_dataframe(f_path):
    root = ElementTree.parse(f_path).getroot()
    elements = root.getchildren()[0][-1] #You may have to play with this to isolate the element
    f_dict = {'paragraph': []}
    for b in elements:
        if b.attrib.get('type', None) == 'chapter':
            for t in b.getchildren():
                try:
                    if len(t.text.strip()) > 30:
                        f_dict['paragraph'].append(t.text)
                except:
                    continue
        else:
            try:
                if len(b.text.strip()) > 30:
                    f_dict['paragraph'].append(b.text)
            except:
                continue #Extremely broad exceptions that may skip over large text portions; YMMV
    df = pd.DataFrame.from_dict(f_dict)
    return df

The odd case of parsing letters as XML. Austen letters are sampled

In [22]:
def letters_to_dataframe(f_path):
    elements = xmltodict.parse(open(f_path).read())
    paragraph_groups = [t['body']['p'] for t in elements['TEI']['text']['group']['text']]
    paragraph_objs = [item for sublist in paragraph_groups for item in sublist]
    paragraphs = []
    for item in paragraph_objs:
        try:
            if len(item['#text']) > 30:
                paragraphs.append(item['#text'])
        except:
            continue
    f_dict = {'paragraph': paragraphs}
    df = pd.DataFrame.from_dict(f_dict)
    return df

Define a function to feed in a text file and get back a dataframe; sampled with Lovelace's
letters to Augustus De Morgan

In [23]:
def text_to_dataframe(f_path):
    file = open(f_path, 'rt', encoding='utf8')
    text = file.read()
    file.close()
    sent = sent_tokenize(text)
    df = pd.DataFrame(sent)
    df.rename(columns={0:"paragraph"}, inplace=True)
    return df

In [24]:
#Get some dataframes

frankenstein_df = xml_to_dataframe('Frankenstein.xml')
print(frankenstein_df)

pickwick_df = xml_to_dataframe('Pickwick.xml')
#print(pickwick_df)

austen_df = letters_to_dataframe('AustenLetters.xml')
#print(austen_df)

lovelace_df = text_to_dataframe('Lovelace.txt')
#print(lovelace_df)

                                             paragraph
0    How slowly the time passes here , encompassed ...
1    But I have one want which I have never yet bee...
2    Yet do not suppose , because I complain a litt...
3    I cannot describe to you my sensations on the ...
4            Your affectionate brother , Robert Walton
..                                                 ...
355  I am interrupted . What do these sounds porten...
356  Great God ! what a scene has just taken place ...
357  He paused , looking on me with wonder , and ag...
358  I was at first touched by the expressions of h...
359  He sprang from the cabin window as he said thi...

[360 rows x 1 columns]


  elements = root.getchildren()[0][-1] #You may have to play with this to isolate the element
  for t in b.getchildren():
  elements = root.getchildren()[0][-1] #You may have to play with this to isolate the element
  for t in b.getchildren():


In [25]:
# Define function to clean text
def clean(text):
    text = unescape(text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = ' '.join([w.lower() for w in text.split()])
    return text

Clean up the dataframes and remove cruft.

In [26]:
# Create column with cleaned text
frankenstein_df['cleaned_text'] = frankenstein_df.paragraph.map(lambda x: clean(x))
frankenstein_df = frankenstein_df['cleaned_text']
pickwick_df['cleaned_text'] = pickwick_df.paragraph.map(lambda x: clean(x))
pickwick_df = pickwick_df['cleaned_text']
austen_df['cleaned_text'] = austen_df.paragraph.map(lambda x: clean(x))
austen_df = austen_df['cleaned_text']
lovelace_df['cleaned_text'] = lovelace_df.paragraph.map(lambda x: clean(x))
lovelace_df = lovelace_df['cleaned_text']


How many paragraphs are in our dataframes?

In [27]:
print(len(frankenstein_df))
print(len(pickwick_df))
print(len(austen_df))
print(len(lovelace_df))






360
4394
1153
971
