In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx

import matplotlib.pyplot as plt

### Load Named Entity Recognition (NER) Model and Book

In [2]:
# Load spaCy's small English pipeline, which provides the NER component used below.
# If the model is not installed yet, run: python -m spacy download en_core_web_sm
NER= spacy.load("en_core_web_sm")

In [3]:
# Load the book.
# The location lives in one named constant so it can be changed in a single place.
# NOTE(review): this is an absolute, machine-specific path; a path relative to the
# notebook would let the notebook run on other machines.
BOOK_PATH = "C:/Users/yusuf/OneDrive/Desktop/WBS_Project/Knowledge Graph/The_Mysterious_Affair_at_Styles.txt"
with open(BOOK_PATH, "r", encoding="utf8") as f:
    book_text = f.read()

In [4]:
# Run the loaded spaCy pipeline over the whole book text. The resulting object is
# iterated sentence by sentence (.sents) and queried for entities (.ents) further down.
book_doc= NER(book_text)

In [5]:
# Visualize the named entities inline; only the slice book_doc[0:2000] is rendered
# to keep the output manageable.

displacy.render(book_doc[0:2000], style='ent', jupyter=True)

### Load Characters


In [6]:
# Read the character table. Its chr_firstname and chr_title columns are used later
# to decide which named entities refer to characters.
character_df= pd.read_csv('charachters.csv')
character_df

Unnamed: 0,chr,chr_firstname,chr_title,exp
0,Hercule Poirot,Hercule,Poirot,Renowned Belgian private detective. He lives ...
1,Hastings,Hastings,Mr. Hastings,"Poirot's friend, and the narrator of the story..."
2,Inspector James Japp,James,Japp,"A Scotland Yard detective, and the investigat..."
3,Emily Inglethorp,Emily,Mrs. Inglethorp,"A wealthy old woman, and the wife of Alfred I..."
4,Alfred Inglethorp,Alfred,Mr. Inglethorp,Emily's second husband and 20 years younger th...
5,John Cavendish,John,Mr. Cavendish,"Emily's elder stepson, from her first husband'..."
6,Mary Cavendish,Mary,Mrs. Cavendish,"John's wife, and a friend of Dr Bauerstein."
7,Lawrence Cavendish,Lawrence,Lawrence,"Emily's younger stepson, from her first husba..."
8,Evelyn Howard,Evelyn,Miss Howard,"Emily's companion, and a second cousin of Alfr..."
9,Cynthia Murdoch,Cynthia,Miss Murdoch,The daughter of a deceased friend of the famil...


### Implement Named Entity per Sentence

In [7]:
# Collect, for every sentence of the book, the sentence itself together with the
# text of each named entity spaCy found in it.
sentence_records = [
    {"sentence": sentence, "entities": [entity.text for entity in sentence.ents]}
    for sentence in book_doc.sents
]

# One row per sentence.
sent_entity_df = pd.DataFrame(sentence_records)

In [8]:
# Inspect the per-sentence entity table.
sent_entity_df

Unnamed: 0,sentence,entities
0,"(The, Project, Gutenberg, eBook, of, The, Myst...","[The Mysterious Affair at Styles, Agatha Chris..."
1,"(You, may, copy, it, ,, give, it, away, or, re...",[eBook]
2,"(If, you, are, not, located, in, the, United, ...","[the United States, eBook]"
3,"(Title, :, The, Mysterious, Affair, at, Styles...","[Agatha Christie\n\nRelease Date, March, 1997,..."
4,"(CHAPTER, I.)",[]
...,...,...
4114,"(For, forty, years, ,, he, produced, and, \n, ...","[forty years, Project Gutenberg-tm, eBooks]"
4115,"(Project, Gutenberg, -, tm, eBooks, are, often...","[eBooks, U.S.]"
4116,"(Thus, ,, we, do, not, \n, necessarily, keep, ...",[eBooks]
4117,"(Most, people, start, at, our, website, which,...",[]


In [9]:
# Build a function that filters out non-character entities
def filter_entity(ent_list, character_df):
    """Keep only the entities that name a known character.

    Parameters
    ----------
    ent_list : list of str
        Entity texts found in one sentence.
    character_df : pandas.DataFrame
        Character table with ``chr_firstname`` and ``chr_title`` columns.

    Returns
    -------
    list of str
        The entries of ``ent_list`` (original order, repeats kept) that are
        exactly equal to a value in either of the two columns.
    """
    # Build the lookup once per call instead of re-materialising both columns
    # as lists for every single entity.
    known_names = set(character_df.chr_firstname) | set(character_df.chr_title)
    return [ent for ent in ent_list if ent in known_names]

In [10]:
# Quick sanity check: "5" and "Mr." match no first name or title, so only "Poirot" remains.
filter_entity(["Poirot","5", "Mr."], character_df)

['Poirot']

In [11]:
# Run the character filter over every sentence's entity list and keep the
# result in a new column.
sent_entity_df['character_entities'] = sent_entity_df['entities'].apply(
    filter_entity, character_df=character_df
)

In [12]:
# Inspect the table again, now with the character_entities column added.
sent_entity_df

Unnamed: 0,sentence,entities,character_entities
0,"(The, Project, Gutenberg, eBook, of, The, Myst...","[The Mysterious Affair at Styles, Agatha Chris...",[]
1,"(You, may, copy, it, ,, give, it, away, or, re...",[eBook],[]
2,"(If, you, are, not, located, in, the, United, ...","[the United States, eBook]",[]
3,"(Title, :, The, Mysterious, Affair, at, Styles...","[Agatha Christie\n\nRelease Date, March, 1997,...",[]
4,"(CHAPTER, I.)",[],[]
...,...,...,...
4114,"(For, forty, years, ,, he, produced, and, \n, ...","[forty years, Project Gutenberg-tm, eBooks]",[]
4115,"(Project, Gutenberg, -, tm, eBooks, are, often...","[eBooks, U.S.]",[]
4116,"(Thus, ,, we, do, not, \n, necessarily, keep, ...",[eBooks],[]
4117,"(Most, people, start, at, our, website, which,...",[],[]


In [13]:
# Keep only the sentences in which at least one character was recognised.

has_characters = sent_entity_df['character_entities'].map(len) > 0
sent_entity_df_filtered = sent_entity_df[has_characters]
sent_entity_df_filtered.head(10)

Unnamed: 0,sentence,entities,character_entities
20,"(Nevertheless, ,, in, \n, view, of, the, world...",[Poirot],[Poirot]
62,"(But, you, could, have, knocked, us, \n, all, ...","[three months ago, Alfred]",[Alfred]
73,"(As, we, turned, in, at, the, lodge, gates, ,,...",[Hastings],[Hastings]
79,"(“, I, wonder, if, we, ’ve, \n, time, to, pick...",[Cynthia],[Cynthia]
80,"(No, ,, she, ’ll, have, started, from, the, ho...",[Cynthia],[Cynthia]
81,"(That, ’s, not, your, wife, ?, ”, \n\n, “, No,...",[Cynthia],[Cynthia]
83,"(My, mother, came, \n, to, the, rescue, ,, and...","[Cynthia, nearly two years]",[Cynthia]
87,"(Mr., Hastings, —, Miss, Howard, ., ”, \n\n, M...","[Hastings, Howard, Howard]",[Hastings]
103,"(“, My, wife, ,, Hastings, ,, ”, said, John, ....","[Hastings, John]",[Hastings]
111,"(At, that, moment, a, well, remembered, voice,...","[French, Alfred]",[Alfred]


### Create Relationships

In [14]:
# Sliding-window co-occurrence: characters mentioned within `window_size`
# sentences of each other are treated as related.
# Note: .loc slicing is label-based and includes both endpoints, so each window
# covers the sentences labelled start_idx .. start_idx + window_size.
window_size = 5
relationships = []

last_sentence_idx = sent_entity_df_filtered.index[-1]
for start_idx in range(last_sentence_idx):
    # Use window_size here (previously a literal 5 ignored the setting above).
    end_idx = min(start_idx + window_size, last_sentence_idx)
    window = sent_entity_df_filtered.loc[start_idx:end_idx, "character_entities"]

    # Flatten the per-sentence lists into a single list of mentions, in order.
    char_list = [name for names in window for name in names]

    # Collapse runs of the same name: we do not need a character's
    # relationship with herself/himself.
    char_unique = [
        name
        for pos, name in enumerate(char_list)
        if pos == 0 or name != char_list[pos - 1]
    ]

    # Every pair of neighbouring mentions becomes one source -> target record.
    for a, b in zip(char_unique, char_unique[1:]):
        relationships.append({"source": a, "target": b})
                
                    

In [15]:
# Turn the collected source/target records into a DataFrame (one row per record).
relationship_df = pd.DataFrame(relationships)

In [16]:
# Check out all rows instead of pandas' truncated preview.
# NOTE(review): set_option changes the display setting for the rest of the session,
# so later cells print full frames too; pd.option_context would limit it to this cell.
pd.set_option('display.max_rows', None)
relationship_df

Unnamed: 0,source,target
0,Cynthia,Hastings
1,Cynthia,Hastings
2,Alfred,Hastings
3,Alfred,Hastings
4,Hastings,Alfred
5,Hastings,Alfred
6,Hastings,Alfred
7,Hastings,Alfred
8,Hastings,Alfred
9,Hastings,Alfred


In [17]:
# Sort the two names inside every row so that a->b and b->a end up as the
# same (source, target) pair.
sorted_pairs = np.sort(relationship_df.to_numpy(), axis=1)
relationship_df = pd.DataFrame(sorted_pairs, columns=relationship_df.columns)
relationship_df

Unnamed: 0,source,target
0,Cynthia,Hastings
1,Cynthia,Hastings
2,Alfred,Hastings
3,Alfred,Hastings
4,Alfred,Hastings
5,Alfred,Hastings
6,Alfred,Hastings
7,Alfred,Hastings
8,Alfred,Hastings
9,Alfred,Hastings


In [18]:
# Collapse identical pairs into one row each; "value" counts how many times
# the pair occurred.
relationship_df = (
    relationship_df
    .assign(value=1)
    .groupby(["target", "source"], sort=False, as_index=False)
    .sum()
)

In [19]:
# Inspect the aggregated pairs and their counts.
relationship_df

Unnamed: 0,target,source,value
0,Hastings,Cynthia,47
1,Hastings,Alfred,8
2,Cynthia,Bauerstein,21
3,Miss Murdoch,Cynthia,2
4,Lawrence,Cynthia,61
5,Poirot,Cynthia,64
6,Dorcas,Cynthia,8
7,Lawrence,Dorcas,21
8,Poirot,Lawrence,95
9,Lawrence,Bauerstein,13


### Graph Analysis and Visualization

In [20]:
# First we create an undirected NetworkX graph from the relationship DataFrame;
# each edge carries the pair's count in its "value" attribute.

G= nx.from_pandas_edgelist(relationship_df,
                           source="source",
                           target="target",
                           edge_attr="value",
                           create_using =nx.Graph())

In [21]:
# Visualization with Pyvis

from pyvis.network import Network

# Store each node's degree on the graph as its 'size' attribute.
node_degree = dict(G.degree)
nx.set_node_attributes(G, node_degree, 'size')

# Dark-themed canvas, rendered inside the notebook.
net = Network(
    notebook=True,
    width="1000px",
    height="700px",
    bgcolor="#222222",
    font_color="white",
)
net.from_nx(G)
net.show("agatha.html")

### The Most Important Characters

In [22]:
# Degree centrality of every node in G, returned as a {node: centrality} dict.
degree_dict = nx.degree_centrality(G)
degree_dict

{'Cynthia': 0.6000000000000001,
 'Hastings': 0.6000000000000001,
 'Alfred': 0.1,
 'Bauerstein': 0.5,
 'Miss Murdoch': 0.2,
 'Lawrence': 0.6000000000000001,
 'Poirot': 0.8,
 'Dorcas': 0.5,
 'Evelyn': 0.2,
 'Japp': 0.2,
 'Mary': 0.1}

In [28]:
# One row per character with its degree centrality, shown from most to least central.
centrality_series = pd.Series(degree_dict, name='centrality')
degree_df = centrality_series.to_frame()
degree_df.sort_values('centrality', ascending=False)

Unnamed: 0,centrality
Poirot,0.8
Cynthia,0.6
Hastings,0.6
Lawrence,0.6
Bauerstein,0.5
Dorcas,0.5
Miss Murdoch,0.2
Evelyn,0.2
Japp,0.2
Alfred,0.1


### Community Detection

In [29]:
# Import the Louvain community-detection module.
# NOTE(review): the `community` module is expected to come from the python-louvain
# package; it must be installed beforehand, this cell only imports it.
import community as community_louvain

In [30]:
# Louvain community detection is randomised; a fixed seed makes the partition
# reproducible across Restart & Run All.
# NOTE(review): best_partition reads edge weights from the "weight" attribute by
# default, while this graph stores its counts under "value" — confirm whether
# the counts are meant to influence the partition.
communities = community_louvain.best_partition(G, random_state=42)

In [31]:
# Inspect the {character: community id} mapping.
communities

{'Cynthia': 1,
 'Hastings': 1,
 'Alfred': 1,
 'Bauerstein': 1,
 'Miss Murdoch': 2,
 'Lawrence': 1,
 'Poirot': 0,
 'Dorcas': 1,
 'Evelyn': 2,
 'Japp': 0,
 'Mary': 0}

In [32]:
# Build the community knowledge graph: store each node's community id on the
# graph as its 'group' attribute, then render the graph with Pyvis.
nx.set_node_attributes(G, communities, 'group')

com_net = Network(
    notebook=True,
    width="1000px",
    height="700px",
    bgcolor="#222222",
    font_color="white",
)
com_net.from_nx(G)
com_net.show("story_communities.html")