# Network Analysis of Harry Potter Book Series

## Setup

In [2]:
# !pip install -r requirements.txt
!pip install matplotlib
!pip install netwulf networkx

Collecting netwulf
  Using cached netwulf-0.1.5.tar.gz (236 kB)
Collecting networkx
  Downloading networkx-2.5.1-py3-none-any.whl (1.6 MB)
Collecting decorator<5,>=4.3
  Downloading decorator-4.4.2-py2.py3-none-any.whl (9.2 kB)
Collecting simplejson>=3.0
  Downloading simplejson-3.17.2.tar.gz (83 kB)
Using legacy 'setup.py install' for netwulf, since package 'wheel' is not installed.
Using legacy 'setup.py install' for simplejson, since package 'wheel' is not installed.
Installing collected packages: decorator, simplejson, networkx, netwulf
  Attempting uninstall: decorator
    Found existing installation: decorator 5.0.6
    Uninstalling decorator-5.0.6:
      Successfully uninstalled decorator-5.0.6
    Running setup.py install for simplejson: started
    Running setup.py install for simplejson: finished with status 'done'
    Running setup.py install for netwulf: started
    Running setup.py install for netwulf: finished with status 'done'
Successfully installed decorator-4.4.2 netw

In [3]:
import numpy as np
import pickle as pkl
import pandas as pd
import nltk
from clean_books import clean_book
from sentiment import *
import os, re, sys
from difflib import SequenceMatcher
from tqdm import tqdm
import itertools
import networkx as nx

In [4]:
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Function Definitions for Text Analysis

Some useful function can be found in `sentiment.py`:

`happiness(doc)` takes a list of words and computes the average happiness score using the Hedonometer

`emotion_score(doc)` takes a list of words and computes a dictionary of average emotion scores among the emotions: _Anger, Anticipation,	Disgust, Fear, Joy, Sadness, Surprise_ and _Trust_

`vader_sentiment(doc)` takes a list of words and/or sentences and computes the average VADER compund score

`TF_IDF(docs_to_analyse, all_docs)` computes TF and TF-IDF score of terms in docs_to_analyse. all_docs are used to comput IDF scores 


## Scraping the internet for information
Below the chapter titles of the the books and characters by house has been scraped from the internet with the script `scrape_book_chapters.py` and `scrape_characters_by_house`

In [None]:
chapter_info = pd.read_pickle('data/chapter_dataframe.pkl')
#Add book number to chapters
idx = chapter_info.index[chapter_info["Local Chapter"] == 1]
for i in range(7):
    chapter_info.loc[idx[i]:(idx[i+1] if i+1 < len(idx) else None), "Book"] = int(i+1)
chapter_info

Unnamed: 0,Global Chapter,Local Chapter,Title,Approx Story Time,Book
0,1,1,The Boy Who Lived,1981-11-01,1.0
1,2,2,The Vanishing Glass,1991-06-23,1.0
2,3,3,The Letters from No One,1991-07-23,1.0
3,4,4,The Keeper of Keys,1991-07-31,1.0
4,5,5,Diagon Alley,1991-07-31,1.0
...,...,...,...,...,...
194,195,33,The Prince’s Tale,1998-05-02,7.0
195,196,34,The Forest Again,1998-05-02,7.0
196,197,35,King’s Cross,1998-05-02,7.0
197,198,36,The Flaw in the Plan,1998-05-02,7.0


In [6]:
with open('data/characters_by_house.pkl', 'rb') as file:
    characters_by_communities = pkl.load(file)

# characters_by_communities

# Splitting Book Chapters Into Documents

In [None]:
# Get a list of paths to books
path = "data/books/"
books = os.listdir(path)
books.sort()
books_in_chapters = []

for i in tqdm(range(7), desc='Book Loop'): # For every book
    # Remove HTML TAGS and double spacing
    print("\n", file = sys.stderr)
    book = clean_book(path + books[i]) 
    book = re.sub('\n{2,}', '\n', book)
    # Get chapters of book
    chapters = [chapter.upper() for chapter in chapter_info.loc[chapter_info["Book"] == i+1, "Title"]]

    # Split Lines and remove lines including page numbering or J.K. Rowling
    lines = [line for line in book.splitlines() if not re.search(r'Page \| [0-9]{1,} | J.K. Rowling', line)]        

    # Find lines with the title of the chapters
    chapter_idx = []
    for chapter in chapters:
        # Use SequenceMatcher to find similarity between line and a given chapter. Threshold of 0.6
        ties = [(i, line) for i, line in enumerate(lines) if SequenceMatcher(None, chapter, line).ratio() >.6]

        # If 2 or more lines have a similarity of above .6 with the chapter find the most similar
        idx, title = ties[np.argmax([SequenceMatcher(None, chapter, title).ratio() for _, title in ties])]
        chapter_idx.append(idx)
        print(title, chapter, file = sys.stderr)

    print("\n", file = sys.stderr)  
    # Get a list of chapters - List of strings
    chapters = ["\n".join(lines[chapter_idx[i]:chapter_idx[i+1] if len(chapters) > i+1 else None]) for i in range(len(chapter_idx))]
    books_in_chapters.append(chapters)  

pkl.dump(books_in_chapters, open("data/books_in_chapters.pkl","wb")) 

Book Loop:   0%|          | 0/1 [00:00<?, ?it/s]

THE BOY WHO LIVED  THE BOY WHO LIVED
THE VANASHING GLASS  THE VANISHING GLASS
THE LETTERS FROM NO ONE  THE LETTERS FROM NO ONE
THE KEEPER OF KEYS  THE KEEPER OF KEYS
DIAGON ALLEY  DIAGON ALLEY
THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS  THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS
THE SORTING HAT  THE SORTING HAT
THE POTIONS MASTER  THE POTIONS MASTER
THE MIDNIGHT DUEL  THE MIDNIGHT DUEL
HALLOWE'EN  HALLOWE'EN
QUIDDITCH  QUIDDITCH
THE MIRROR OF ERISED  THE MIRROR OF ERISED
NICOLAS FLAMEL  NICOLAS FLAMEL
NORBERT THE NORWEGIAN RIDGEBACK  NORBERT THE NORWEGIAN RIDGEBACK
THE FORBIDDEN FOREST  THE FORBIDDEN FOREST
THROUGH THE TRAPDOOR  THROUGH THE TRAPDOOR
THE MAN WITH TWO FACES  THE MAN WITH TWO FACES


Book Loop: 100%|██████████| 1/1 [00:09<00:00,  9.45s/it]


# Preprocessing and Cleaning

 Remove stopwords etc.

In [None]:
from nltk.tokenize import RegexpTokenizer
with open('data/books_in_chapters.pkl', 'rb') as file:
    books_in_chapters = pkl.load(file)

SW = stopwords.words("english")
books_in_chapters_clean = books_in_chapters.copy()
for book in range(len(books_in_chapters)):
    for chapter in range(len(books_in_chapters_clean[book])):
        books_in_chapters_clean[book][chapter] = re.sub(r".*\n", "", books_in_chapters_clean[book][chapter], 1) # remove chapter title 
        # books_in_chapters_clean[book][chapter] = books_in_chapters_clean[book][chapter].replace(" \n", ". ", 1) # set punctuation after chapter title
        words = books_in_chapters_clean[book][chapter].split(' ')
        words = [word for word in words if word.lower() not in SW]
        tokenizer = RegexpTokenizer(r'\w+[,\.!\?:]?')
        books_in_chapters_clean[book][chapter] = " ".join(tokenizer.tokenize(" ".join(words)))

# pkl.dump(books_in_chapters, open("data/books_in_chapters_clean.pkl","wb"))
print(books_in_chapters_clean[1][0])

FileNotFoundError: [Errno 2] No such file or directory: 'data/books_in_chapters.pkl'

In [7]:
with open('data/books_in_chapters.pkl', 'rb') as file:
    books_in_chapters = pkl.load(file)
test = books_in_chapters[1][0]
# test.replace(" \n", r". ", 1)
# re.sub(r".*\n", "", test, 1)

# Actor Extraction

We use the character names that we scraped from the web as search words to see if we can find them in each chapter, where they appear.

Because the characters can be called by different names and nicknames we "normalize" the names in the book text to begin with.
We make it so all names and nicknames are replaced with the corresponding first name of the character in lower case e.g. _'voldemort'_ is sometimes called _'voldemort'_ or _'he-who-must-not-be-names'_ or _'tom riddle'_. We map them all to _'voldemort'_  

In [244]:
with open('data/books_in_chapters_clean.pkl', 'rb') as file:
    books_in_chapters_clean = pkl.load(file)
characters_by_communities_reverse = {val:key for key, value in characters_by_communities.items() for val in value if key != 'House Unknown'}
characters_by_communities_reverse["Mr. Weasley"] = 'Gryffindor'
characters_by_communities_reverse["Mrs. Weasley"] = 'Gryffindor'
characters_by_communities_reverse["Sirius"] = 'Gryffindor'
del characters_by_communities_reverse["Albus Potter"]

special_names = {'He-Who-Must-Not-Be-Named':'Voldemort', 'You-Know-Who':'Voldemort','Tom Riddle': 'Voldemort', 'Tom Marvolo Riddle':'Voldemort',
                'Peter Pettigrew':'Wormtail', 'Weasley':'Ron', 'Potter':'Harry', 'Malfoy':'Draco', 'Tonks':'Nymphadora',
                 'Mr. Weasley':'mr. Weasley', 'Mrs. Weasley':'mrs. Weasley', 'Mr. Potter': 'Harry', 'Pettigrew':'Wormtail', 'Lupin': 'Remus'}

names      = {name: name.split(" ")[0] 
                for name in characters_by_communities_reverse.keys()
                if ('the' not in name.lower()) and ('.' not in name.lower()) and ('miss' not in name.lower()) and ('sir' not in name.lower()) }


firstname     = {name.split(" ")[-1]: name.split(" ")[0] 
                for name in characters_by_communities_reverse.keys()
                if ('the' not in name.lower()) and ('.' not in name.lower()) and ('miss' not in name.lower()) and ('sir' not in name.lower()) }

# characters_by_communities_reverse
names.update(firstname)
names.update(special_names)
del names["Black"]; del names["Riddle"]; del names["Merlin"]
regexPattern = "|".join(names.keys())

# surnames

books_in_chapters_clean_names = [[re.sub(regexPattern, lambda name: names[name.group()], chapter ) for chapter                                      in book] for book in books_in_chapters_clean]
# print(books_in_chapters_clean_names[4][6])
names

{'Albus Dumbledore': 'Albus',
 'Alicia Spinnet': 'Alicia',
 'Andrew Kirke': 'Andrew',
 'Angelina Johnson': 'Angelina',
 'Bill Weasley': 'Bill',
 'Celestina Warbeck': 'Celestina',
 'Charlie Weasley': 'Charlie',
 'Cormac McLaggen': 'Cormac',
 'Dean Thomas': 'Dean',
 'Demelza Robins': 'Demelza',
 'Dennis Creevey': 'Dennis',
 'Euan Abercrombie': 'Euan',
 'Fred Weasley': 'Fred',
 'Geoffrey Hooper': 'Geoffrey',
 'George Weasley': 'George',
 'Ginny Weasley': 'Ginny',
 'Godric Gryffindor': 'Godric',
 'Harry Potter': 'Harry',
 'Jack Sloper': 'Jack',
 'James Potter': 'James',
 'Jimmy Peakes': 'Jimmy',
 'Katie Bell': 'Katie',
 'Kenneth Towler': 'Kenneth',
 'Lavender Brown': 'Lavender',
 'Lee Jordan': 'Lee',
 'Lily Potter': 'Lily',
 'Minerva McGonagall': 'Minerva',
 'Natalie McDonald': 'Natalie',
 'Nearly-Headless Nick': 'Nearly-Headless',
 'Neville Longbottom': 'Neville',
 'Oliver Wood': 'Oliver',
 'Panju Weasley': 'Panju',
 'Parvati Patil': 'Parvati',
 'Patricia Stimpson': 'Patricia',
 'Percy We

In [13]:
def extract_characters_by_chapter(names, books):
    '''
        Params:
        names:    List of character names to search for
        books:    list of books. Each book is a list of chapters. 
                  e.g. to get text from chap. 1 in book 1 do books[0][0]

        Returns: Dictionary mapping the title of a chapter to a list of
                 the characters appearing in the chapter. If a character
                 appear multiple times in a chapter, they will appear
                 multiple times in the dictionary list for that chapter.
    '''
    characters_in_chapter = {}

    global_chapter = 0

    for book in tqdm(books):
        for chapter in book:
            global_chapter += 1
            characters_in_chapter[global_chapter] = {}

            for name in names:
                search_results = re.findall(f' {name}[.,!?\s] ', chapter)
                if search_results:
                    characters_in_chapter[global_chapter][name] = len(search_results)

    return characters_in_chapter


In [239]:
search_names = set(names.values())
books = books_in_chapters_clean_names

characters_in_chapter = extract_characters_by_chapter(search_names, books)
characters_in_chapter

100%|██████████| 7/7 [00:01<00:00,  4.94it/s]


{1: {'Albus': 17, 'Minerva': 9, 'Rubeus': 4, 'Harry': 7, 'Voldemort': 2},
 2: {'Harry': 16},
 3: {'Dennis': 1, 'Malcolm': 1, 'Harry': 14},
 4: {'Albus': 2,
  'James': 1,
  'Susan': 1,
  'Minerva': 1,
  'Rubeus': 14,
  'Harry': 23,
  'Voldemort': 1},
 5: {'Lavender': 1, 'Albus': 2, 'Rubeus': 41, 'Harry': 44, 'Helga': 3},
 6: {'George': 5,
  'Albus': 3,
  'Lee': 1,
  'Neville': 2,
  'Godric': 2,
  'Percy': 3,
  'Rubeus': 3,
  'Ron': 31,
  'Gregory': 2,
  'Fred': 7,
  'Harry': 36,
  'Hermione': 5,
  'Ginny': 3,
  'Draco': 1},
 7: {'Lisa': 1,
  'Lavender': 1,
  'Nearly-Headless': 1,
  'Seamus': 2,
  'Albus': 5,
  'Susan': 1,
  'Millicent': 1,
  'Neville': 4,
  'Terry': 1,
  'Godric': 4,
  'Justin': 1,
  'Minerva': 3,
  'Percy': 7,
  'Rubeus': 3,
  'Morag': 1,
  'Ron': 4,
  'Severus': 3,
  'Blaise': 1,
  'Hannah': 2,
  'Gregory': 1,
  'Fred': 1,
  'Dean': 1,
  'Harry': 9,
  'Mandy': 1,
  'Hermione': 2,
  'Helga': 3,
  'Rowena': 2,
  'Draco': 1},
 8: {'Vincent': 1,
  'George': 1,
  'Seamus':

# Creating the Network

In [167]:
# Computer character nodes
character_nodes = list(search_names)

coorcurrence_edges = []

df_dict = {"source": [], "target": [], "weight": [], "global_chapter":[]}
# Compute edges as coorcurrences in chapters
for chapter, characters in characters_in_chapter.items():
    print(characters)
    for character, occurences in characters.items():
        other_characters = list(characters.keys())
        other_characters.remove(character)
        
        for o_char in other_characters:
            if character != o_char:
                coorcurrence_edges += [(character, o_char)]
                df_dict["source"].append(character)
                df_dict["target"].append(o_char) 
                df_dict["weight"].append(0)
                df_dict["global_chapter"].append(chapter)

df = pd.DataFrame(df_dict)
# df

{'Albus': 17, 'Minerva': 9, 'Rubeus': 4, 'Harry': 7, 'Voldemort': 2}
{'Harry': 16}
{'Dennis': 1, 'Malcolm': 1, 'Harry': 14}
{'Albus': 2, 'James': 1, 'Susan': 1, 'Minerva': 1, 'Rubeus': 14, 'Harry': 23, 'Voldemort': 1}
{'Lavender': 1, 'Albus': 2, 'Rubeus': 41, 'Harry': 44, 'Helga': 3}
{'George': 5, 'Albus': 3, 'Lee': 1, 'Neville': 2, 'Godric': 2, 'Percy': 3, 'Rubeus': 3, 'Ron': 31, 'Gregory': 2, 'Fred': 7, 'Harry': 36, 'Hermione': 5, 'Ginny': 3, 'Draco': 1}
{'Lisa': 1, 'Lavender': 1, 'Nearly-Headless': 1, 'Seamus': 2, 'Albus': 5, 'Susan': 1, 'Millicent': 1, 'Neville': 4, 'Terry': 1, 'Godric': 4, 'Justin': 1, 'Minerva': 3, 'Percy': 7, 'Rubeus': 3, 'Morag': 1, 'Ron': 4, 'Severus': 3, 'Blaise': 1, 'Hannah': 2, 'Gregory': 1, 'Fred': 1, 'Dean': 1, 'Harry': 9, 'Mandy': 1, 'Hermione': 2, 'Helga': 3, 'Rowena': 2, 'Draco': 1}
{'Vincent': 1, 'George': 1, 'Seamus': 1, 'Pomona': 1, 'Neville': 2, 'Godric': 1, 'Minerva': 1, 'Rubeus': 8, 'Charlie': 1, 'Ron': 9, 'Severus': 3, 'Gregory': 1, 'Harry': 12,

In [240]:
# Undirected graph - Add edge between 2 characters if the ever cooccur
# The weight is the the sum of log of the min number of mentions of the 2 characters in a chapter
from collections import defaultdict
character_nodes = list(search_names)

coorcurrence_edges = []

df_dict = defaultdict(lambda: defaultdict(float))
# Compute edges as coorcurrences in chapters
for chapter, characters in characters_in_chapter.items():
    vals = list(
        characters.values())
    characters = list(characters.items())
    
    for i in range(len(characters) - 1):
        character, count = characters[i]
        for other_character, other_count in characters[i+1:]:
            if min((count, other_count)) > 1:
                source = min((character,other_character))
                target = max((character,other_character))
                weight = np.log(min((count, other_count)))
                df_dict[source][target] += weight
s, t, w = [], [], []
for source, val in df_dict.items():
    for target, weight in val.items():
        s.append(source)
        t.append(target) 
        w.append(weight)
df = pd.DataFrame({"source":s, 'target':t, 'weight':w})
df.head(10)


Unnamed: 0,source,target,weight
0,Albus,Minerva,33.336488
1,Albus,Rubeus,55.433397
2,Albus,Harry,172.754879
3,Albus,Voldemort,43.85308
4,Albus,Helga,8.553332
5,Albus,George,33.569201
6,Albus,Neville,23.645322
7,Albus,Godric,14.581164
8,Albus,Percy,15.48089
9,Albus,Ron,109.170377


In [178]:

with open('data/chapter_dataframe.pkl', 'rb') as file:
    chapter_dataframe = pkl.load(file)
# chapter_dataframe.columns
joined_df = df.join(chapter_dataframe.set_index('Global Chapter'), on='global_chapter', how='left')
joined_df.head(20)

Unnamed: 0,source,target,weight,global_chapter,Local Chapter,Title,Approx Story Time
0,Albus,Minerva,2.197225,1,1,The Boy Who Lived,1981-11-01
1,Albus,Rubeus,1.386294,1,1,The Boy Who Lived,1981-11-01
2,Albus,Harry,1.94591,1,1,The Boy Who Lived,1981-11-01
3,Albus,Voldemort,0.693147,1,1,The Boy Who Lived,1981-11-01
4,Minerva,Rubeus,1.386294,1,1,The Boy Who Lived,1981-11-01
5,Minerva,Harry,1.94591,1,1,The Boy Who Lived,1981-11-01
6,Minerva,Voldemort,0.693147,1,1,The Boy Who Lived,1981-11-01
7,Rubeus,Harry,1.386294,1,1,The Boy Who Lived,1981-11-01
8,Rubeus,Voldemort,0.693147,1,1,The Boy Who Lived,1981-11-01
9,Harry,Voldemort,0.693147,1,1,The Boy Who Lived,1981-11-01


In [243]:
G = nx.convert_matrix.from_pandas_edgelist(df[:], source = "source", target = "target", edge_attr  = 'weight', create_using=nx.Graph)
# del characters_by_communities_reverse['Albus Potter']

C = {val:characters_by_communities_reverse[key] for key, val in names.items() if key in characters_by_communities_reverse.keys()}

nx.set_node_attributes(G, C, 'house')



NodeView(('Albus', 'Minerva', 'Rubeus', 'Harry', 'Voldemort', 'Helga', 'George', 'Neville', 'Godric', 'Percy', 'Ron', 'Gregory', 'Fred', 'Hermione', 'Ginny', 'Seamus', 'Severus', 'Hannah', 'Rowena', 'Draco', 'Filius', 'mrs. Weasley', 'Pomona', 'mr. Weasley', 'Dennis', 'Gilderoy', 'Dean', 'Oliver', 'Vincent', 'Moaning', 'Lee', 'Lucius', 'Remus', 'Penelope', 'Angelina', 'Lavender', 'Alicia', 'James', 'Wormtail', 'Pansy', 'Katie', 'Parvati', 'Nearly-Headless', 'Cedric', 'Roger', 'Padma', 'Bill', 'Nymphadora', 'Susan', 'Dolores', 'Luna', 'Euan', 'Phineas', 'Marietta', 'Ernie', 'Millicent', 'Bellatrix', 'Horace', 'Corvinus', 'Cormac', 'Demelza', 'Cadwallader', 'Zacharias', 'Jimmy', 'Charlie', 'Rose', 'Warrington', 'Montague', 'Romilda', 'Marcus', 'Justin', 'Kevin', 'Michael', 'Miles', 'Narcissa', 'Jack', 'Ritchie', 'Megan', 'Rodolphus', 'Terry', 'Harper', 'Cho', 'Derrick', 'Anthony', 'Adrian', 'Blaise'))

In [242]:
import netwulf as nw
for node, data in G.nodes(data  = True):
    data["group"] = data["house"]
nw.visualize(G)


(None, None)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=85fb65b4-b596-4730-837e-04e86eafe419' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>