In [1]:
from collections import defaultdict, Counter
import glob, json, os

# glob returns matches in arbitrary (filesystem-dependent) order; sort so the
# plays are always processed, and therefore stored and reported, in the same order.
xml = sorted(glob.glob('../../data/folger/xml/*.xml'))

Sample entrance declaration:
```
<stage xml:id="stg-0000" n="SD 1.1.0" type="entrance" who="#HenryIV_1H4 #Bedford_H5 #Westmoreland_1H4 #ATTENDANTS_1H4">
```
Sample exit declaration:
```
<stage xml:id="stg-0352.1" n="SD 1.3.21.1" type="exit" who="#Worcester_1H4">
```

In [2]:
from bs4 import BeautifulSoup

# d[play basename][xml:id] = {name: , sex: ,}
char_d = defaultdict(lambda: defaultdict())

# d[play basename][xml:id] = [[word, word, ...], [word, ...], ...] (one inner list per speech)
word_d = defaultdict(lambda: defaultdict(list))

# d[play basename][xml:id] = [word indices spoken by character]
word_indices = defaultdict(lambda: defaultdict(list))

# d[play basename][xml:id] = [word indices character on stage]
stage_indices = defaultdict(lambda: defaultdict(list))

# d[play basename] = soup
soups = defaultdict()

# <person> elements whose xml:id, persName or sex could not be read
missed_lines = []
# NOTE(review): missing_speakers is never populated in this cell; kept because
# other cells may reference it.
missing_speakers = defaultdict(set)

for xml_path in xml:

  # new_speech: the next spoken word starts a new speech
  # in_speaker: we are between <speaker> and </speaker>, i.e. reading the
  #             speaker label rather than spoken text
  new_speech = False
  in_speaker = False

  bn = os.path.basename(xml_path)
  print(' * processing', bn)

  # read the file once and close the handle straight away; the same text feeds
  # both the BeautifulSoup parse and the line-by-line scan below
  with open(xml_path, encoding = 'utf8') as f:
    raw_xml = f.read()

  soup = BeautifulSoup(raw_xml, 'lxml')
  soups[bn] = soup

  # parse characters and their genders
  for person in soup.find_all('person'):
    try:
      xml_id = person['xml:id']
      name = person.find('persname').get_text()
      sex = person.find('sex').get_text()
    except (KeyError, AttributeError):
      # missing xml:id attribute (KeyError) or missing child tag (find() -> None)
      missed_lines.append(person)
      continue
    char_d[bn][xml_id] = {'name': name, 'sex': sex}

  # track which characters are on stage as the play's words go by
  on_stage = set()
  stage_word_index = 0
  speakers = []
  for line in raw_xml.split('\n'):
    if '<stage ' in line:
      if 'type="entrance"' in line:
        characters = line.split('who="')[1].split('"')[0]
        for c in characters.split():
          on_stage.add(c.lstrip('#'))
      elif 'type="exit"' in line:
        characters = line.split('who="')[1].split('"')[0]
        for c in characters.split():
          try:
            on_stage.remove(c.lstrip('#'))
          except KeyError:
            print(' * {} left stage but was not on stage'.format(c))
    elif '<speaker' in line:
      in_speaker = True
      new_speech = True
    elif '</speaker' in line:
      in_speaker = False
    elif '<sp ' in line:
      # who="#A_Play #B_Play" -> ['A_Play', 'B_Play']
      who = line.split('who="')[1].split('"')[0]
      speakers = [s.strip() for s in who.split('#') if s.strip()]
    elif '<w ' in line:
      # words inside the <speaker> label are not spoken text: skip them
      # without advancing the word index
      if in_speaker:
        continue
      for c in on_stage:
        stage_indices[bn][c.strip()].append(stage_word_index)
      if speakers:
        word = line.split('>')[1].split('<')[0]
        for speaker in speakers:
          word_indices[bn][speaker].append(stage_word_index)
          # start a new speech for *every* current speaker; the flag is only
          # cleared after the loop so that simultaneous speakers each get
          # their own new speech instead of only the first one
          if new_speech or not word_d[bn][speaker]:
            word_d[bn][speaker].append([word])
          else:
            word_d[bn][speaker][-1].append(word)
        new_speech = False
      stage_word_index += 1

# save results to disk
with open('../../data/folger/json/characters.json', 'w') as out:
  json.dump(char_d, out)
with open('../../data/folger/json/words.json', 'w') as out:
  json.dump(word_d, out)
with open('../../data/folger/json/word_indices.json', 'w') as out:
  json.dump(word_indices, out)

 * processing 1H4.xml
 * #SOLDIERS.HENRY.0_1H4 left stage but was not on stage
 * processing 1H6.xml
 * #SOLDIERS.ENGLISH.0_1H6 left stage but was not on stage
 * #ATTENDANTS.ENGLISH.0.1_1H6 left stage but was not on stage
 * #SOLDIERS.ENGLISH.0.1_1H6 left stage but was not on stage
 * processing 2H4.xml
 * #Bardolph_1H4 left stage but was not on stage
 * #SOLDIERS.REBEL.0.1_2H4 left stage but was not on stage
 * processing 2H6.xml
 * #CITIZENS.STALBANS.0_2H6 left stage but was not on stage
 * #ATTENDANTS.0_2H6 left stage but was not on stage
 * #FOLLOWERS.CADE.0.1_2H6 left stage but was not on stage
 * #FOLLOWERS.CADE.0_2H6 left stage but was not on stage
 * #ATTENDANTS.YORK.0.1_2H6 left stage but was not on stage
 * #ATTENDANTS.YORK.0.2_2H6 left stage but was not on stage
 * #ATTENDANTS.0.1_2H6 left stage but was not on stage
 * processing 3H6.xml
 * #SOLDIERS.LANCASTER.0_3H6 left stage but was not on stage
 * #SOLDIERS.YORK.0_3H6 left stage but was not on stage
 * #SOLDIERS.YORK.0.1

In [3]:
# Dump one "play<TAB>character<TAB>word index" row per entry.
# stage_indices is written as well: a later cell reads 'stage-indices.txt',
# and no other visible cell creates that file.
for filename, indices in (('word-indices.txt', word_indices),
                          ('stage-indices.txt', stage_indices)):
  with open(filename, 'w') as out:
    for play in indices:
      for character in indices[play]:
        for idx in indices[play][character]:
          out.write(play + '\t' + character + '\t' + str(idx) + '\n')
        
 

In [4]:
# Swap the nested defaultdicts for plain dicts so they display cleanly
char_d = {play_name: dict(characters) for play_name, characters in char_d.items()}
word_d = {play_name: dict(speeches_by_char) for play_name, speeches_by_char in word_d.items()}

# Speaking role of each character in each act:
# d[play][act][scene][character] = words in scene
d = defaultdict(
  lambda: defaultdict(
    lambda: defaultdict(Counter)
  )
)

# On Stage Duration vs Speech Duration

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# Load the tab-separated rows back in. Each file ends with a newline, so the
# final (empty) chunk after the last '\n' is dropped. Using `with` closes the
# file handles instead of leaving them open.
with open('word-indices.txt') as f:
  words = [row.split('\t') for row in f.read().split('\n')[:-1]]
with open('stage-indices.txt') as f:
  stages = [row.split('\t') for row in f.read().split('\n')[:-1]]

In [6]:
from collections import defaultdict, Counter

def is_number(c):
  """Return True if `int(c)` succeeds, otherwise False.

  Only conversion failures are treated as "not a number"; unrelated
  exceptions such as KeyboardInterrupt are allowed to propagate instead of
  being silently swallowed.
  """
  try:
    int(c)
  except (TypeError, ValueError, OverflowError):
    return False
  return True

# Tally, per play and character, the number of rows in `words` (key 'words')
# and in `stages` (key 'stage'): d[play][char] = Counter(words=..., stage=...)
d = defaultdict(lambda: defaultdict(Counter))

for source_rows, counter_key in ((words, 'words'), (stages, 'stage')):
  for play, char, _ in source_rows:
    d[play][char][counter_key] += 1

# One tab-separated row per character:
# character, play, n words spoken, n words on stage, is-ensemble flag, sex
with open('character-stage-vs-words.txt', 'w') as out:
  for play, play_counts in d.items():
    for char, counts in play_counts.items():
      # split the id on '.' and '_'; it is flagged as an ensemble when any
      # piece except the last equals its own upper-cased form, or when any
      # piece parses as an integer
      char_str = char.replace('.', '###').replace('_', '###').split('###')
      char_is_ensemble = (
        any(w == w.upper() for w in char_str[:-1])
        or any(is_number(w) for w in char_str)
      )
      gender = char_d.get(play, {}).get(char, {}).get('sex', 'unknown')
      row = [char, play, counts['words'], counts['stage'], char_is_ensemble, gender]
      out.write('\t'.join(map(str, row)) + '\n')

In [7]:
# the first column of every word-index row is the play's file name
distinct_plays = {row[0] for row in words}

# Topic Modeling

In [20]:
# Play (XML file basename) used for the spot checks in the next cells.
play = 'Tit.xml'

# word_d[play]['Lavinia_Tit']

In [21]:
# Spot check: the stored name/sex record for one character of the selected play.
char_d[play]['Lavinia_Tit']

{'name': 'Lavinia', 'sex': 'female'}

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import decomposition

# Tuning constants for this section.
# max_freq / min_count are only referenced by the commented-out max_df / min_df
# arguments of the TfidfVectorizer in this cell, so they currently have no effect.
max_freq = 0
min_count = 10
# n_topics * feature_scalar is the max_features cap passed to the TfidfVectorizer.
n_topics = 25
# NOTE(review): max_iters is not referenced in the visible cells; confirm it is still needed.
max_iters = 1000
feature_scalar = 500
# Minimum number of words a speech must contain for get_corpus() to yield it.
min_speech_length = 100

def get_corpus(filter_sex=None):
  """Yield `[play, character, speech_text, sex]` for every qualifying speech.

  Reads the module-level `word_d`, `char_d` and `min_speech_length`.

  A speech qualifies when its character has a non-empty 'sex' entry in
  `char_d`, that value equals `filter_sex` (if `filter_sex` is truthy), and
  the speech contains at least `min_speech_length` words. `speech_text` is
  the speech's words joined with single spaces.
  """
  for play, speeches_by_character in word_d.items():
    play_characters = char_d.get(play, {})
    for character, character_speeches in speeches_by_character.items():
      sex = play_characters.get(character, {}).get('sex', False)
      # skip characters of unknown sex, and those excluded by the filter
      if not sex or (filter_sex and sex != filter_sex):
        continue
      for speech in character_speeches:
        if len(speech) >= min_speech_length:
          yield [play, character, ' '.join(speech), sex]

# TF-IDF vectorizer over raw text content, using the built-in English stop-word
# list and a vocabulary capped at n_topics * feature_scalar terms. The
# document-frequency bounds (max_df / min_df) are currently disabled.
# NOTE(review): this vectorizer is not used by the visible cells that follow
# (they build the topic model with gensim); confirm it is still needed.
vectorizer = TfidfVectorizer(
  input='content',
  stop_words='english',
  #max_df=max_freq,
  #min_df=min_count,
  max_features=n_topics * feature_scalar
)

# l[k]    = lower-cased text of the k-th qualifying female speech
# meta[k] = [play, character, sex, k] for that same speech
l, meta = [], []
for idx, (play, character, speech, sex) in enumerate(get_corpus('female')):
  l.append(speech.lower())
  meta.append([play, character, sex, idx])

# sanity checks: both lists must be the same length
print(len(l))
print(len(meta))
print(meta)
print(l[0])


235
235
[['1H4.xml', 'LadyPercy_1H4', 'female', 0], ['1H6.xml', 'Pucelle_1H6', 'female', 1], ['1H6.xml', 'Pucelle_1H6', 'female', 2], ['1H6.xml', 'Pucelle_1H6', 'female', 3], ['1H6.xml', 'Pucelle_1H6', 'female', 4], ['1H6.xml', 'Pucelle_1H6', 'female', 5], ['2H4.xml', 'MistressQuickly_1H4', 'female', 6], ['2H4.xml', 'MistressQuickly_1H4', 'female', 7], ['2H4.xml', 'MistressQuickly_1H4', 'female', 8], ['2H4.xml', 'LadyPercy_1H4', 'female', 9], ['2H4.xml', 'DollTearsheet_2H4', 'female', 10], ['2H6.xml', 'QueenMargaret_1H6', 'female', 11], ['2H6.xml', 'QueenMargaret_1H6', 'female', 12], ['2H6.xml', 'QueenMargaret_1H6', 'female', 13], ['2H6.xml', 'QueenMargaret_1H6', 'female', 14], ['2H6.xml', 'QueenMargaret_1H6', 'female', 15], ['2H6.xml', 'QueenMargaret_1H6', 'female', 16], ['2H6.xml', 'QueenMargaret_1H6', 'female', 17], ['2H6.xml', 'DuchessOfGloucester_2H6', 'female', 18], ['2H6.xml', 'DuchessOfGloucester_2H6', 'female', 19], ['3H6.xml', 'QueenMargaret_1H6', 'female', 20], ['3H6.xml', '

In [28]:
# Display every [play, character, sex, idx] metadata row (as a shallow copy).
meta[:]

[['1H4.xml', 'LadyPercy_1H4', 'female', 0],
 ['1H6.xml', 'Pucelle_1H6', 'female', 1],
 ['1H6.xml', 'Pucelle_1H6', 'female', 2],
 ['1H6.xml', 'Pucelle_1H6', 'female', 3],
 ['1H6.xml', 'Pucelle_1H6', 'female', 4],
 ['1H6.xml', 'Pucelle_1H6', 'female', 5],
 ['2H4.xml', 'MistressQuickly_1H4', 'female', 6],
 ['2H4.xml', 'MistressQuickly_1H4', 'female', 7],
 ['2H4.xml', 'MistressQuickly_1H4', 'female', 8],
 ['2H4.xml', 'LadyPercy_1H4', 'female', 9],
 ['2H4.xml', 'DollTearsheet_2H4', 'female', 10],
 ['2H6.xml', 'QueenMargaret_1H6', 'female', 11],
 ['2H6.xml', 'QueenMargaret_1H6', 'female', 12],
 ['2H6.xml', 'QueenMargaret_1H6', 'female', 13],
 ['2H6.xml', 'QueenMargaret_1H6', 'female', 14],
 ['2H6.xml', 'QueenMargaret_1H6', 'female', 15],
 ['2H6.xml', 'QueenMargaret_1H6', 'female', 16],
 ['2H6.xml', 'QueenMargaret_1H6', 'female', 17],
 ['2H6.xml', 'DuchessOfGloucester_2H6', 'female', 18],
 ['2H6.xml', 'DuchessOfGloucester_2H6', 'female', 19],
 ['3H6.xml', 'QueenMargaret_1H6', 'female', 20],
 

In [29]:
import nltk
import re
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data this notebook relies on. 'stopwords' and 'wordnet' back
# the stopwords.words('english') and WordNetLemmatizer calls used during
# preprocessing; without them a fresh environment raises LookupError.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
import gensim
from gensim.corpora import Dictionary, MmCorpus 


# Shared WordNet lemmatizer instance, reused for every token during preprocessing.
lemma=WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [30]:
# Load the stop-word list once, as a set: calling stopwords.words('english')
# inside the loop re-reads the list for every single token and then scans it
# linearly.
english_stopwords = set(stopwords.words('english'))

# speeches[k] = list of lemmatized, stop-word-free tokens for l[k]
speeches = []
for speech in l:
    # replace everything that is not an ASCII letter with a space, then tokenize
    tokens = word_tokenize(re.sub('[^a-zA-Z]', ' ', speech))
    # remove stop words
    tokens = [word for word in tokens if word not in english_stopwords]
    tokens = [lemma.lemmatize(w) for w in tokens]
    speeches.append(tokens)

In [31]:
# Build the token <-> id mapping from the tokenized speeches, then encode each
# speech as a bag of words via dictionary.doc2bow.
dictionary = Dictionary(speeches)
vect = list(map(dictionary.doc2bow, speeches))

In [32]:
# Write the bag-of-words vectors to 'corpus.mm' and reopen that file as the
# corpus object used by the cells below.
MmCorpus.serialize('corpus.mm', vect)
    
speech_corpus = MmCorpus('corpus.mm')

In [33]:
# Print the corpus summary line (documents / features / non-zero entries).
print(speech_corpus)

MmCorpus(235 documents, 5041 features, 16979 non-zero entries)


In [35]:
from gensim.models import LdaMulticore

# Fit a 10-topic LDA model on the serialized speech corpus using 2 worker
# processes. LDA training is stochastic, so the random state is fixed to make
# the topics repeatable between runs.
# NOTE(review): multi-worker training may still introduce small run-to-run
# differences; confirm against the gensim docs if exact reproducibility matters.
lda = LdaMulticore(
    speech_corpus,
    num_topics=10,
    id2word=dictionary,
    workers=2,
    random_state=42,
)
# show the top-weighted words of the first topic
lda.show_topic(0)

[('love', 0.00945219),
 ('thy', 0.008050903),
 ('woman', 0.00640231),
 ('lord', 0.0046459944),
 ('heart', 0.00456011),
 ('yet', 0.004520589),
 ('would', 0.0042813336),
 ('good', 0.0041058683),
 ('come', 0.0039634258),
 ('upon', 0.0039629657)]

In [36]:
import pyLDAvis
import pyLDAvis.gensim_models as gm
import pickle

In [37]:
# Prepare the pyLDAvis data from the fitted model, its corpus and its
# dictionary, then render the visualization in the notebook.
ldavis = gm.prepare(
    topic_model=lda,
    corpus=speech_corpus,
    dictionary=dictionary,
)

pyLDAvis.display(ldavis)