In [15]:
import string
import pandas as pd
import io

## Data Preparation

In [28]:
files = ['got1.txt', 'got2.txt', 'got3.txt', 'got4.txt']
#See what the text looks like
io.open(files[0], 'r', encoding='UTF-8').read()[:3000]

u'PROLOGUE\n\nThe comet\u2019s tail spread across the dawn, a red slash that bled above the crags of Dragonstone like a wound in the pink and purple sky.\n\nThe maester stood on the windswept balcony outside his chambers. It was here the ravens came, after long flight. Their droppings speckled the gargoyles that rose twelve feet tall on either side of him, a hellhound and a wyvern, two of the thousand that brooded over the walls of the ancient fortress. When first he came to Dragonstone, the army of stone grotesques had made him uneasy, but as the years passed he had grown used to them. Now he thought of them as old friends. The three of them watched the sky together with foreboding.\n\nThe maester did not believe in omens. And yet . . . old as he was, Cressen had never seen a comet half so bright, nor yet that color, that terrible color, the color of blood and flame and sunsets. He wondered if his gargoyles had ever seen its like. They had been here so much longer than he had, and wou

There are special unicode characters such as \u2019,\u201d,\u201c which should seem to be apostrophes or opening/closing quotations. There are also extra "\n" which should be removed.
The following code cleans the text as well as splits the text into lines. The result is a list of four lists of lines.

In [29]:
def read_file(file):

    with io.open(file, 'r', encoding='UTF-8') as f:
        lines = (line.rstrip() for line in f) 
        lines = list(line for line in lines if line)
        lines = [i.replace(u"\u201c", "'").replace(u"\u201d", "'").replace(u"\u2019", "'") 
                 for i in lines if i != "" and i != "''"]
    return lines

    
#4 lists of list of lines (4 books)
text_list = []
for f in files:
    text_list.append(read_file(f))

In [30]:
text_list[0][:10]

[u'PROLOGUE',
 u"The comet's tail spread across the dawn, a red slash that bled above the crags of Dragonstone like a wound in the pink and purple sky.",
 u'The maester stood on the windswept balcony outside his chambers. It was here the ravens came, after long flight. Their droppings speckled the gargoyles that rose twelve feet tall on either side of him, a hellhound and a wyvern, two of the thousand that brooded over the walls of the ancient fortress. When first he came to Dragonstone, the army of stone grotesques had made him uneasy, but as the years passed he had grown used to them. Now he thought of them as old friends. The three of them watched the sky together with foreboding.',
 u'The maester did not believe in omens. And yet . . . old as he was, Cressen had never seen a comet half so bright, nor yet that color, that terrible color, the color of blood and flame and sunsets. He wondered if his gargoyles had ever seen its like. They had been here so much longer than he had, and w

# Character Occurrences

In [33]:
def get_main_characters(text):
    #Since each chapter is titled with one of the main character's name, it makes sense to find all lines with exactly one item
    singlewords = [line for line in text if len(line.split()) == 1]
    
    #filter out all the words with punctuation as those are not names
    for word in singlewords:
        for j in word:
            if j in string.punctuation:
                singlewords.remove(word)
                break

    #there are still some words with punctuation in them that somehow passed the filter as well as duplicates
    #adding counts to the unique words can help with identifying names
    words_count= {}
    for word in singlewords:
        if word in words_count:
            words_count[word] += 1
        else:
            words_count[word] = 0
    
    #words with count > 2 are most likely names as each character most likely has more than 2 chapters
    characters = [word for word, count in words_count.items() if count > 2]
    
    return characters

Get unique list of main characters

In [34]:
character_list = list(set(get_main_characters(text_list[0]) + get_main_characters(text_list[1])+
                              get_main_characters(text_list[2]) + get_main_characters(text_list[3])))
                              
character_list

[u'DAVOS',
 u'THEON',
 u'BRIENNE',
 u'SAMWELL',
 u'SANSA',
 u'CERSEI',
 u'JAIME',
 u'CATELYN',
 u'TYRION',
 u'DAENERYS',
 u'ARYA',
 u'BRAN',
 u'JON']

In [36]:
#4 lists of text (4 books)
text_line = [' '.join(line) for line in text_list]
#4 lists of tokenized text
text_token = [line.split() for line in text_line]
text_token[0][:15]

[u'PROLOGUE',
 u'The',
 u"comet's",
 u'tail',
 u'spread',
 u'across',
 u'the',
 u'dawn,',
 u'a',
 u'red',
 u'slash',
 u'that',
 u'bled',
 u'above',
 u'the']

In [40]:
df = pd.DataFrame(columns=['Character', 'Book', 'Offset'])

num_occurrences = {character:0 for character in character_list}
for book in range(len(text_token)):
    for character in character_list:
        for offset in range(len(text_token[book])):
            if character.lower() in text_token[book][offset].lower():
                df = df.append(pd.Series([character, book, offset], index=df.columns), ignore_index=True)
                num_occurrences[character] += 1
                
num_occurrences

{u'ARYA': 1186,
 u'BRAN': 1224,
 u'BRIENNE': 723,
 u'CATELYN': 628,
 u'CERSEI': 1084,
 u'DAENERYS': 288,
 u'DAVOS': 688,
 u'JAIME': 1417,
 u'JON': 2429,
 u'SAMWELL': 62,
 u'SANSA': 974,
 u'THEON': 931,
 u'TYRION': 2065}

Characters that don't go by their full name don't show up as much as they should(eg. DANY, SAM). 