>**Objectives**
* Who the most important characters are
* How their importance evolves over time
* Main communities of characters that form this social network

In [1]:
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
import networkx as nx
import matplotlib.pyplot as plt

import os
import re

In [2]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Load spacy English language model
NER = spacy.load("en_core_web_sm")

In [4]:
# Get all the book files in the data directory
all_books = [b for b in os.scandir('data') if '.txt' in b.name]

In [5]:
all_books

[<DirEntry '4 Times of Contempt.txt'>,
 <DirEntry '3 Blood of Elves.txt'>,
 <DirEntry '6 The Tower of the Swallow.txt'>,
 <DirEntry '8 something ends something begins.txt'>,
 <DirEntry '1 The Last Wish.txt'>,
 <DirEntry '5 Baptism of Fire.txt'>,
 <DirEntry '7 The Lady of the Lake.txt'>,
 <DirEntry '2 The Sword of Destiny.txt'>]

In [6]:
book = all_books[1]
book_text = open(book).read()
book_doc = NER(book_text)

In [7]:
book_doc

﻿Verily I say unto you, the era of the sword and axe is nigh, the era of the wolf's blizzard. The Time 
of the White Chill and the White Light is nigh, the Time of Madness and the Time of Contempt: 
Tedd Deireddh, the Time of End. The world will die amidst frost and be reborn with the new sun. 
It will be reborn of the Elder Blood, of Hen Ichaer, of the seed that has been sown. A seed which 
will not sprout but will burst into flame. 
Ess'tuath esse! Thus it shall be! Watch for the signs! What signs these shall be, I say unto you: first 
the earth will flow with the blood of Aen Seidhe, the Blood of Elves ... 


Aen Ithlinnespeath, Ithlinne Aegli aep Aevenien's prophecy 

I 
CHAPTER ONE 

The town was in flames. 
The narrow streets leading to the moat and the first terrace belched smoke and embers, flames devouring 
the densely clustered thatched houses and licking at the castle walls. From the west, from the harbour 
gate, the screams and clamour of vicious battle and the dull blows o

In [8]:
# Visualize identified entities
displacy.render(book_doc[0:800], style="ent", jupyter=True)

### Loading Characters form prevous notebook result

In [9]:
character_df = pd.read_csv("./data/characters.csv")

In [10]:
character_df.head()

Unnamed: 0.1,Unnamed: 0,book,character
0,0,Category:Baptism of Fire characters,Adalia
1,1,Category:Baptism of Fire characters,Adela
2,2,Category:Baptism of Fire characters,Aen Saevherne
3,3,Category:Baptism of Fire characters,Aevenien
4,4,Category:Baptism of Fire characters,Aglaïs


In [11]:
character_df.drop('Unnamed: 0', inplace=True, axis=1)

In [12]:
character_df.head()

Unnamed: 0,book,character
0,Category:Baptism of Fire characters,Adalia
1,Category:Baptism of Fire characters,Adela
2,Category:Baptism of Fire characters,Aen Saevherne
3,Category:Baptism of Fire characters,Aevenien
4,Category:Baptism of Fire characters,Aglaïs


In [13]:
first_name = character_df.character.apply(lambda x: x.split(" ")[0])

In [14]:
character_df['character_firstName'] = first_name

In [15]:
# Remove brackets and text within brackets
character_df['character'] = character_df['character'].apply(lambda x: re.sub("[\(].*?[\)]", "", x))

In [16]:
pd.set_option('display.max_rows', None)
character_df

Unnamed: 0,book,character,character_firstName
0,Category:Baptism of Fire characters,Adalia,Adalia
1,Category:Baptism of Fire characters,Adela,Adela
2,Category:Baptism of Fire characters,Aen Saevherne,Aen
3,Category:Baptism of Fire characters,Aevenien,Aevenien
4,Category:Baptism of Fire characters,Aglaïs,Aglaïs
5,Category:Baptism of Fire characters,Albrich,Albrich
6,Category:Baptism of Fire characters,Amavet,Amavet
7,Category:Baptism of Fire characters,Angus Bri Cri,Angus
8,Category:Baptism of Fire characters,Anna Kameny,Anna
9,Category:Baptism of Fire characters,Anzelm Aubry,Anzelm


### Get named entity list per sentence

In [17]:
sent_entity_df = []

# Loop through each sentence, and store named entity list along the way
for sent in book_doc.sents:
    entity_list = [ent.text for ent in sent.ents]
    sent_entity_df.append({"Sentence" : sent, "Entities" : entity_list})
    
sent_entity_df = pd.DataFrame(sent_entity_df)

In [19]:
sent_entity_df.head()

Unnamed: 0,Sentence,Entities
0,"(﻿Verily, I, say, unto, you, ,, the, era, of, ...",[]
1,"(The, Time, \n, of, the, White, Chill, and, th...","[Time, the White Chill, the White Light, the T..."
2,"(The, world, will, die, amidst, frost, and, be...",[]
3,"(It, will, be, reborn, of, the, Elder, Blood, ...",[Hen Ichaer]
4,"(A, seed, which, \n, will, not, sprout, but, w...",[]


In [63]:
# Function to filter out non-character entities
def filter_entity(ent_list, character_df):
    
    return [ent for ent in ent_list 
            if ent in list(character_df.character_firstName) 
            or ent in list(character_df.character)]
    
    # for ent_char in ent_list.Entities:
    #     if len(ent_char)>0:
    #         ent_char = ent_char[0]
    #     if char not in list(character_df.character_firstName):
    #         ent_list.drop(ent_list.Entities == ent_char)

In [102]:
# Filtering entities
sent_entity_df['character_entities'] = sent_entity_df.Entities.apply(lambda x : filter_entity(x, character_df))

In [103]:
sent_entity_df.head()

Unnamed: 0,Sentence,Entities,character_entities
0,"(﻿Verily, I, say, unto, you, ,, the, era, of, ...",[],[]
1,"(The, Time, \n, of, the, White, Chill, and, th...","[Time, the White Chill, the White Light, the T...",[]
2,"(The, world, will, die, amidst, frost, and, be...",[],[]
3,"(It, will, be, reborn, of, the, Elder, Blood, ...",[Hen Ichaer],[]
4,"(A, seed, which, \n, will, not, sprout, but, w...",[],[]


In [104]:
# Remove Entity rows that are empty
sent_entity_df_filtered = sent_entity_df[sent_entity_df.character_entities.map(len) > 0]

In [105]:
sent_entity_df_filtered.head(20)

Unnamed: 0,Sentence,Entities,character_entities
15,"(Ciri, felt, the, knight, who, carried, her, b...",[Ciri],[Ciri]
68,"(Ciri, is, unable, to, move, .)",[Ciri],[Ciri]
71,"(And, she, is, frozen, in, fear, :, a, terribl...",[Ciri],[Ciri]
132,"(Ciri, is, unable, to, move, .)",[Ciri],[Ciri]
135,"(And, she, is, frozen, in, fear, :, a, terribl...",[Ciri],[Ciri]
161,"(Ciri, shuddered, violently, ,, curling, her, ...",[Ciri],[Ciri]
166,"(Sleep, ,, Ciri, .)",[Ciri],[Ciri]
181,"(Hold, ..., \n, ', Geralt, ?, ', \n, ', What, ...","[Geralt, Ciri]","[Geralt, Ciri]"
195,"(Ciri, had, been, frightened, of, them, ,, she...",[Ciri],[Ciri]
203,"(Ciri, had, heard, such, reassurances, in, the...",[Ciri],[Ciri]


In [106]:
# There are name discriptors of the same person and that creates unwated relationship, so we only take the first name.
sent_entity_df_filtered['character_entities'] = sent_entity_df_filtered.character_entities.apply(lambda x : [name.split()[0] for name in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sent_entity_df_filtered['character_entities'] = sent_entity_df_filtered.character_entities.apply(lambda x : [name.split()[0] for name in x])


In [107]:
sent_entity_df_filtered.head(20)

Unnamed: 0,Sentence,Entities,character_entities
15,"(Ciri, felt, the, knight, who, carried, her, b...",[Ciri],[Ciri]
68,"(Ciri, is, unable, to, move, .)",[Ciri],[Ciri]
71,"(And, she, is, frozen, in, fear, :, a, terribl...",[Ciri],[Ciri]
132,"(Ciri, is, unable, to, move, .)",[Ciri],[Ciri]
135,"(And, she, is, frozen, in, fear, :, a, terribl...",[Ciri],[Ciri]
161,"(Ciri, shuddered, violently, ,, curling, her, ...",[Ciri],[Ciri]
166,"(Sleep, ,, Ciri, .)",[Ciri],[Ciri]
181,"(Hold, ..., \n, ', Geralt, ?, ', \n, ', What, ...","[Geralt, Ciri]","[Geralt, Ciri]"
195,"(Ciri, had, been, frightened, of, them, ,, she...",[Ciri],[Ciri]
203,"(Ciri, had, heard, such, reassurances, in, the...",[Ciri],[Ciri]


In [108]:
# Reset index
sent_entity_df_filtered = sent_entity_df_filtered.reset_index(drop=True)

In [109]:
sent_entity_df_filtered.head()

Unnamed: 0,Sentence,Entities,character_entities
0,"(Ciri, felt, the, knight, who, carried, her, b...",[Ciri],[Ciri]
1,"(Ciri, is, unable, to, move, .)",[Ciri],[Ciri]
2,"(And, she, is, frozen, in, fear, :, a, terribl...",[Ciri],[Ciri]
3,"(Ciri, is, unable, to, move, .)",[Ciri],[Ciri]
4,"(And, she, is, frozen, in, fear, :, a, terribl...",[Ciri],[Ciri]


### Create Relationships

In [116]:
window_size = 5
relationship = []

for i in range(sent_entity_df_filtered.index[-1]):
    end_window = min(i + window_size, sent_entity_df_filtered.index[-1])
    char_list = sum((sent_entity_df_filtered.loc[i:end_window].character_entities),[])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list))
                  if (i==0) or char_list[i] != char_list[i-1]]
    
    # Map relationships
    if len(char_unique) > 1:
        for index, a in enumerate(char_unique[:-1]):
            b = char_unique[index + 1]
            relationship.append({"source" : a, "target" : b})

In [117]:
relationship_df = pd.DataFrame(relationship)

In [119]:
relationship_df.head(20)

Unnamed: 0,source,target
0,Ciri,Geralt
1,Geralt,Ciri
2,Ciri,Geralt
3,Geralt,Ciri
4,Ciri,Geralt
5,Geralt,Ciri
6,Ciri,Geralt
7,Geralt,Ciri
8,Ciri,Radcliffe
9,Ciri,Geralt


In [132]:
# Sort by row for each column so it can help us create a weight out of the dublicate relationships
relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis=1), columns=relationship_df.columns)

In [133]:
relationship_df.head(20)

Unnamed: 0,source,target
0,Ciri,Geralt
1,Ciri,Geralt
2,Ciri,Geralt
3,Ciri,Geralt
4,Ciri,Geralt
5,Ciri,Geralt
6,Ciri,Geralt
7,Ciri,Geralt
8,Ciri,Radcliffe
9,Ciri,Geralt


In [134]:
# Assign value/weight to the source-target relationships
relationship_df["weight"] = 1

In [136]:
relationship_df.head(20)

Unnamed: 0,source,target,weight
0,Ciri,Geralt,1
1,Ciri,Geralt,1
2,Ciri,Geralt,1
3,Ciri,Geralt,1
4,Ciri,Geralt,1
5,Ciri,Geralt,1
6,Ciri,Geralt,1
7,Ciri,Geralt,1
8,Ciri,Radcliffe,1
9,Ciri,Geralt,1


In [142]:
# Get the summed weight by grouping the the source-target relationships
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [144]:
relationship_df.head(10)

Unnamed: 0,source,target,weight
0,Ciri,Geralt,457
1,Ciri,Radcliffe,5
2,Geralt,Radcliffe,5
3,Geralt,Yennefer,203
4,Cirilla,Yennefer,5
5,Cirilla,Geralt,5
6,Sheldon,Yennefer,5
7,Donimir,Sheldon,5
8,Donimir,Two,5
9,Two,Vera,5
