# Network Analysis of Harry Potter Book Series

## Setup

In [None]:
!pip install -r requirements.txt
# !pip install matplotlib
# !pip install netwulf

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 21.4 MB/s 
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 45.0 MB/s 
Collecting networkx
  Downloading networkx-2.5.1-py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 38.3 MB/s 
[?25hCollecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Collecting soupsieve>1.2
  Downloading soupsieve-2.2.1-py3-none-any.whl (33 kB)
Collecting decorator<5,>=4.3
  Downloading decorator-4.4.2-py2.py3-none-any.whl (9.2 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1273 sha256=d09e6cb22bab148e4043228ad12e4e9faeb122e63bac22ac744517e5088266fa
  Stored in directory: /root/.cache/pip/wheels/75/78/21/68b124549c9bdc94f822c02fb9aa3578a6

In [None]:
import numpy as np
import pickle as pkl
import pandas as pd
import nltk
from clean_books import clean_book
from sentiment import *
import os, re, sys
from difflib import SequenceMatcher
from tqdm import tqdm
import itertools
import networkx as nx

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords

## Function Definitions for Text Analysis

Some useful function can be found in `sentiment.py`:

`happiness(doc)` takes a list of words and computes the average happiness score using the Hedonometer

`emotion_score(doc)` takes a list of words and computes a dictionary of average emotion scores among the emotions: _Anger, Anticipation,	Disgust, Fear, Joy, Sadness, Surprise_ and _Trust_

`vader_sentiment(doc)` takes a list of words and/or sentences and computes the average VADER compund score

`TF_IDF(docs_to_analyse, all_docs)` computes TF and TF-IDF score of terms in docs_to_analyse. all_docs are used to comput IDF scores 


## Scraping the internet for information
Below the chapter titles of the the books and characters by house has been scraped from the internet with the script `scrape_book_chapters.py` and `scrape_characters_by_house`

In [None]:
chapter_info = pd.read_pickle('data/chapter_dataframe.pkl')
#Add book number to chapters
idx = chapter_info.index[chapter_info["Local Chapter"] == 1]
for i in range(7):
    chapter_info.loc[idx[i]:(idx[i+1] if i+1 < len(idx) else None), "Book"] = int(i+1)
chapter_info

Unnamed: 0,Global Chapter,Local Chapter,Title,Approx Story Time,Book
0,1,1,The Boy Who Lived,1981-11-01,1.0
1,2,2,The Vanishing Glass,1991-06-23,1.0
2,3,3,The Letters from No One,1991-07-23,1.0
3,4,4,The Keeper of Keys,1991-07-31,1.0
4,5,5,Diagon Alley,1991-07-31,1.0
...,...,...,...,...,...
194,195,33,The Prince’s Tale,1998-05-02,7.0
195,196,34,The Forest Again,1998-05-02,7.0
196,197,35,King’s Cross,1998-05-02,7.0
197,198,36,The Flaw in the Plan,1998-05-02,7.0


In [None]:
with open('data/characters_by_house.pkl', 'rb') as file:
    characters_by_communities = pkl.load(file)

characters_by_communities

{'Gryffindor': ['Albus Dumbledore',
  'Alicia Spinnet',
  'Andrew Kirke',
  'Angelina Johnson',
  'Bill Weasley',
  'Celestina Warbeck',
  'Charlie Weasley',
  'Cormac McLaggen',
  'Dean Thomas',
  'Demelza Robins',
  'Dennis Creevey',
  'Euan Abercrombie',
  'Fred Weasley',
  'Geoffrey Hooper',
  'George Weasley',
  'Ginny Weasley',
  'Godric Gryffindor',
  'Harry Potter',
  'Jack Sloper',
  'James Potter',
  'James Sirius Potter',
  'Jimmy Peakes',
  'Katie Bell',
  'Kenneth Towler',
  'Lavender Brown',
  'Lee Jordan',
  'Lily Potter',
  'Minerva McGonagall',
  'Natalie McDonald',
  'Nearly-Headless Nick',
  'Neville Longbottom',
  'Oliver Wood',
  'Panju Weasley',
  'Parvati Patil',
  'Patricia Stimpson',
  'Percy Weasley',
  'Peter Pettigrew',
  'Remus Lupin',
  'Ritchie Coote',
  'Romilda Vane',
  'Ron Weasley',
  'Rose Granger-Weasley',
  'Rubeus Hagrid',
  'Seamus Finnigan',
  'Sir Cadogan',
  'Vicky Frobisher',
  'Wormtail',
  'Yann Fredericks',
  'Hermione Granger'],
 'Hufflep

# Splitting Book Chapters Into Documents

In [None]:
# Get a list of paths to books
path = "data/books/"
books = os.listdir(path)
books.sort()
books_in_chapters = []

for i in tqdm(range(7), desc='Book Loop'): # For every book
    # Remove HTML TAGS and double spacing
    print("\n", file = sys.stderr)
    book = clean_book(path + books[i]) 
    # book = re.sub('\n{2,}', '\n', book)
    # Get chapters of book
    chapters = [chapter.upper() for chapter in chapter_info.loc[chapter_info["Book"] == i+1, "Title"]]

    # Split Lines and remove lines including page numbering or J.K. Rowling
    lines = [line for line in book.splitlines() if not re.search(r'Page \| [0-9]{1,} | J.K. Rowling', line)]        

    # Find lines with the title of the chapters
    chapter_idx = []
    for chapter in chapters:
        # Use SequenceMatcher to find similarity between line and a given chapter. Threshold of 0.6
        ties = [(i, line) for i, line in enumerate(lines) if SequenceMatcher(None, chapter, line).ratio() >.6]

        # If 2 or more lines have a similarity of above .6 with the chapter find the most similar
        idx, title = ties[np.argmax([SequenceMatcher(None, chapter, title).ratio() for _, title in ties])]
        chapter_idx.append(idx)
        print(title, chapter, file = sys.stderr)

    print("\n", file = sys.stderr)  
    # Get a list of chapters - List of strings
    chapters = ["\n".join(lines[chapter_idx[i]:chapter_idx[i+1] if len(chapters) > i+1 else None]) for i in range(len(chapter_idx))]
    chapters[:] = [re.sub(r'[".!]{0}[\n]{3,}', '\n', chapter) for chapter in chapters]
    books_in_chapters.append(chapters)  

pkl.dump(books_in_chapters, open("data/books_in_chapters.pkl","wb")) 

Book Loop:   0%|          | 0/1 [00:00<?, ?it/s]

THE BOY WHO LIVED  THE BOY WHO LIVED
THE VANASHING GLASS  THE VANISHING GLASS
THE LETTERS FROM NO ONE  THE LETTERS FROM NO ONE
THE KEEPER OF KEYS  THE KEEPER OF KEYS
DIAGON ALLEY  DIAGON ALLEY
THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS  THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS
THE SORTING HAT  THE SORTING HAT
THE POTIONS MASTER  THE POTIONS MASTER
THE MIDNIGHT DUEL  THE MIDNIGHT DUEL
HALLOWE'EN  HALLOWE'EN
QUIDDITCH  QUIDDITCH
THE MIRROR OF ERISED  THE MIRROR OF ERISED
NICOLAS FLAMEL  NICOLAS FLAMEL
NORBERT THE NORWEGIAN RIDGEBACK  NORBERT THE NORWEGIAN RIDGEBACK
THE FORBIDDEN FOREST  THE FORBIDDEN FOREST
THROUGH THE TRAPDOOR  THROUGH THE TRAPDOOR
THE MAN WITH TWO FACES  THE MAN WITH TWO FACES


Book Loop: 100%|██████████| 1/1 [00:09<00:00,  9.45s/it]


In [None]:
with open('data/books_in_chapters.pkl', 'rb') as file:
    books_in_chapters = pkl.load(file)
print(books_in_chapters[-1][5])

THE GHOUL IN PAJAMAS 

The shock of losing Mad-Eye hung over the house in 
the days that followed; Harry kept expecting to see 
him stumping in through the back door like the other 
Order members, who passed in and out to relay news. 
Harry felt that nothing but action would assuage his 
feelings of guilt and grief and that he ought to set out 
on his mission to find and destroy Horcruxes as soon 
as possible. 

“Well, you can’t do anything about the” — Ron 
mouthed the word Horcruxes — “till you’re seventeen. 
You’ve still got the Trace on you. And we can plan 
here as well as anywhere, can’t we? Or,” he dropped 
his voice to a whisper, “d’you reckon you already 
know where the You-Know-Whats are?” 

“No,” Harry admitted. 

“I think Hermione’s been doing a bit of research,” said 
Ron. “She said she was saving it for when you got 
here.” 
They were sitting at the breakfast table; Mr. Weasley 
and Bill had just left for work. Mrs. Weasley had gone 
upstairs to wake Hermione and Ginny, w

# Preprocessing and Cleaning

 Remove stopwords etc.

In [None]:
from nltk.tokenize import RegexpTokenizer
with open('data/books_in_chapters.pkl', 'rb') as file:
    books_in_chapters = pkl.load(file)

SW = stopwords.words("english")
books_in_chapters_clean = books_in_chapters.copy()
for book in range(len(books_in_chapters)):
    for chapter in range(len(books_in_chapters_clean[book])):
        books_in_chapters_clean[book][chapter] = re.sub(r".*\n", "", books_in_chapters_clean[book][chapter], 1) # remove chapter title 
        # books_in_chapters_clean[book][chapter] = books_in_chapters_clean[book][chapter].replace(" \n", ". ", 1) # set punctuation after chapter title
        words = books_in_chapters_clean[book][chapter].split(' ')
        words = [word for word in words if word.lower() not in SW]
        tokenizer = RegexpTokenizer(r'\w+[,\.!\?:]?')
        books_in_chapters_clean[book][chapter] = " ".join(tokenizer.tokenize(" ".join(words)))

# pkl.dump(books_in_chapters, open("data/books_in_chapters_clean.pkl","wb"))
print(books_in_chapters_clean[1][0])

FileNotFoundError: [Errno 2] No such file or directory: 'data/books_in_chapters.pkl'

In [None]:
with open('data/books_in_chapters.pkl', 'rb') as file:
    books_in_chapters = pkl.load(file)
test = books_in_chapters[1][0]
test.replace(" \n", r". ", 1)
re.sub(r".*\n", "", test, 1)

'Not for the first time, an argument had broken out \nover breakfast at number four, Privet Drive. Mr. \nVernon Dursley had been woken in the early hours of \nthe morning by a loud, hooting noise from his nephew \nHarry’s room. \n“Third time this week!” he roared across the table. “If \nyou can’t control that owl, it’ll have to go!” \nHarry tried, yet again, to explain. \n“She’s bored,” he said. “She’s used to flying around \noutside. If I could just let her out at night — ” \n“Do I look stupid?” snarled Uncle Vernon, a bit of \nfried egg dangling from his bushy mustache. “I know \nwhat 11 happen if that owl’s let out.” \nHe exchanged dark looks with his wife, Petunia. \nHarry tried to argue back but his words were drowned \nby a long, loud belch from the Dursleys’ son, Dudley. \n“I want more bacon.” \n“There’s more in the frying pan, sweetums,” said Aunt \nPetunia, turning misty eyes on her massive son. “We \nmust build you up while we’ve got the chance. ... I \ndon’t like the sound o

# Actor Extraction

We use the character names that we scraped from the web as search words to see if we can find them in each chapter, where they appear.

Because the characters can be called by different names and nicknames we "normalize" the names in the book text to begin with.
We make it so all names and nicknames are replaced with the corresponding first name of the character in lower case e.g. _'voldemort'_ is sometimes called _'voldemort'_ or _'he-who-must-not-be-names'_ or _'tom riddle'_. We map them all to _'voldemort'_  

In [None]:
characters_by_communities_reverse = {val:key for key, value in characters_by_communities.items() for val in value if key != 'House Unknown'}

special_names = {'He-Who-Must-Not-Be-Named':'Voldemort', 'You-Know-Who':'Voldemort','Tom Riddle': 'Voldemort', 'Tom Marvolo Riddle':'Voldemort',
                'Peter Pettigrew':'Wormtail', 'Weasley':'Ron', 'Potter':'Harry', 'Malfoy':'Draco', 'Tonks':'Nymphadora',
                 'Mr. Weasley':'mr. Weasley', 'Mrs. Weasley':'mrs. Weasley', 'Mr. Potter': 'Harry'}

names      = {name: name.split(" ")[0] 
                for name in characters_by_communities_reverse.keys()
                if ('the' not in name.lower()) and ('.' not in name.lower()) and ('miss' not in name.lower()) and ('sir' not in name.lower()) }


firstname     = {name.split(" ")[-1]: name.split(" ")[0] 
                for name in characters_by_communities_reverse.keys()
                if ('the' not in name.lower()) and ('.' not in name.lower()) and ('miss' not in name.lower()) and ('sir' not in name.lower()) }

# characters_by_communities_reverse
names.update(firstname)
names.update(special_names)
del names["Black"]; del names["Riddle"]; del names["Merlin"]
regexPattern = "|".join(names.keys())

# surnames
text = re.sub(regexPattern, lambda name: names[name.group()], t )
books_in_chapters_clean_names = [[re.sub(regexPattern, lambda name: names[name.group()], chapter ) for chapter in book] for book in books_in_chapters_clean]
print(books_in_chapters_clean_names[4][6])
names

NameError: name 't' is not defined

In [None]:
def extract_characters_by_chapter(names, books):
    '''
        Params:
        names:    List of character names to search for
        books:    list of books. Each book is a list of chapters. 
                  e.g. to get text from chap. 1 in book 1 do books[0][0]

        Returns: Dictionary mapping the title of a chapter to a list of
                 the characters appearing in the chapter. If a character
                 appear multiple times in a chapter, they will appear
                 multiple times in the dictionary list for that chapter.
    '''
    characters_in_chapter = {}

    global_chapter = 0

    for book in tqdm(books):
        for chapter in book:
            global_chapter += 1
            characters_in_chapter[global_chapter] = {}

            for name in names:
                search_results = re.findall(f' {name}[.,!?\s] ', chapter)
                if search_results:
                    characters_in_chapter[global_chapter][name] = len(search_results)

    return characters_in_chapter


In [None]:
search_names = set(names.values())
books = books_in_chapters_clean_names

characters_in_chapter = extract_characters_by_chapter(search_names, books)
characters_in_chapter

100%|██████████| 7/7 [00:02<00:00,  3.14it/s]


{1: {'Harry': 7, 'Minerva': 9, 'Voldemort': 2, 'Albus': 17, 'Rubeus': 4},
 2: {'Harry': 16},
 3: {'Dennis': 1, 'Harry': 14, 'Malcolm': 1},
 4: {'James': 1,
  'Harry': 23,
  'Minerva': 1,
  'Susan': 1,
  'Voldemort': 1,
  'Albus': 2,
  'Rubeus': 14},
 5: {'Harry': 44, 'Albus': 2, 'Rubeus': 41, 'Helga': 3, 'Lavender': 1},
 6: {'Fred': 7,
  'Hermione': 5,
  'Harry': 36,
  'Gregory': 2,
  'Neville': 2,
  'Ron': 31,
  'George': 5,
  'Godric': 2,
  'Albus': 3,
  'Draco': 1,
  'Rubeus': 3,
  'Lee': 1,
  'Percy': 3,
  'Ginny': 3},
 7: {'Fred': 1,
  'Hermione': 2,
  'Justin': 1,
  'Nearly-Headless': 1,
  'Harry': 9,
  'Mandy': 1,
  'Lisa': 1,
  'Gregory': 1,
  'Blaise': 1,
  'Neville': 4,
  'Ron': 4,
  'Godric': 4,
  'Minerva': 3,
  'Susan': 1,
  'Severus': 3,
  'Morag': 1,
  'Rowena': 2,
  'Albus': 5,
  'Terry': 1,
  'Draco': 1,
  'Seamus': 2,
  'Dean': 1,
  'Rubeus': 3,
  'Hannah': 2,
  'Percy': 7,
  'Helga': 3,
  'Millicent': 1,
  'Lavender': 1},
 8: {'Hermione': 1,
  'Vincent': 1,
  'Filius

# Creating the Network

In [None]:
# Computer character nodes
character_nodes = list(search_names)

coorcurrence_edges = []

df_dict = {"source": [], "target": [], "weight": [], "global_chapter":[]}
# Compute edges as coorcurrences in chapters
for chapter, characters in characters_in_chapter.items():
    for character, occurences in characters.items():
        other_characters = list(characters.keys())
        other_characters.remove(character)
        for o_char in other_characters:
            if character != o_char:
                coorcurrence_edges += [(character, o_char)]
                df_dict["source"].append(character)
                df_dict["target"].append(o_char) 
                df_dict["weight"].append(0)
                df_dict["global_chapter"].append(chapter)

df = pd.DataFrame(df_dict)

In [None]:
df
with open('data/chapter_dataframe.pkl', 'rb') as file:
    chapter_dataframe = pkl.load(file)
# chapter_dataframe.columns
joined_df = df.join(chapter_dataframe.set_index('Global Chapter'), on='global_chapter', how='left')
joined_df

FileNotFoundError: [Errno 2] No such file or directory: 'data/chapter_dataframe.pkl'

In [None]:
G = nx.convert_matrix.from_pandas_edgelist(df[:], source = "source", target = "target", create_using=nx.Graph)

C = {val:key for key, val in names.items()}
# [characters_by_communities_reverse[C[node]] for node in G.nodes if C[node] in characters_by_communities_reverse.keys()]
# nx.set_node_attributes(G, "house", C)
#
[node for node in G.nodes()]
characters_by_communities_reverse

{'Albus Dumbledore': 'Gryffindor',
 'Alicia Spinnet': 'Gryffindor',
 'Andrew Kirke': 'Gryffindor',
 'Angelina Johnson': 'Gryffindor',
 'Bill Weasley': 'Gryffindor',
 'Celestina Warbeck': 'Gryffindor',
 'Charlie Weasley': 'Gryffindor',
 'Cormac McLaggen': 'Gryffindor',
 'Dean Thomas': 'Gryffindor',
 'Demelza Robins': 'Gryffindor',
 'Dennis Creevey': 'Gryffindor',
 'Euan Abercrombie': 'Gryffindor',
 'Fred Weasley': 'Gryffindor',
 'Geoffrey Hooper': 'Gryffindor',
 'George Weasley': 'Gryffindor',
 'Ginny Weasley': 'Gryffindor',
 'Godric Gryffindor': 'Gryffindor',
 'Harry Potter': 'Gryffindor',
 'Jack Sloper': 'Gryffindor',
 'James Potter': 'Gryffindor',
 'James Sirius Potter': 'Gryffindor',
 'Jimmy Peakes': 'Gryffindor',
 'Katie Bell': 'Gryffindor',
 'Kenneth Towler': 'Gryffindor',
 'Lavender Brown': 'Gryffindor',
 'Lee Jordan': 'Gryffindor',
 'Lily Potter': 'Gryffindor',
 'Minerva McGonagall': 'Gryffindor',
 'Natalie McDonald': 'Gryffindor',
 'Nearly-Headless Nick': 'Gryffindor',
 'Nevill

In [None]:
# import netwulf as nw
# nw.visualize(nx.convert_matrix.from_pandas_edgelist(df[:150], source = "source", target = "target", create_using=nx.Graph))

In [None]:
with open('data/books_in_chapters.pkl', 'rb') as file:
    books_in_chapters = pkl.load(file)

test = books_in_chapters[0][0]
matches = re.findall(r'“(.*)” .* said', test)
matches

['No,', 'Oh yes, everyone’s celebrating, all right,', 'I know ... I know ...']

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=85fb65b4-b596-4730-837e-04e86eafe419' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>