In [1]:
import re
import csv
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_lg')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import urllib.request 
from bs4 import BeautifulSoup

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [2]:
#Load the book
with open ("C:/Users/yusuf/OneDrive/Desktop/WBS_Project/Knowledge Graph/The_Mysterious_Affair_at_Styles.txt", "r",encoding="utf8") as f:
    book_text = f.read()

In [3]:
#Split the book to Sentences

sentences = [[i] for i in nlp(book_text).sents]

In [4]:
sentences

[[The Project Gutenberg eBook of The Mysterious Affair at Styles, by Agatha Christie
  
  This eBook is for the use of anyone anywhere in the United States and
  most other parts of the world at no cost and with almost no restrictions
  whatsoever.],
 [You may copy it, give it away or re-use it under the terms
  of the Project Gutenberg License included with this eBook or online at
  www.gutenberg.org.],
 [If you are not located in the United States, you
  will have to check the laws of the country where you are located before
  using this eBook.
  ],
 [Title: The Mysterious Affair at Styles
  ],
 [Author: Agatha Christie
  
  Release Date: March, 1997],
 [[eBook #863]
  [Most recently updated: December 26, 2021]
  ],
 [Language: English
  
  
  Produced by: Charles Keller
  
  *** START OF THE PROJECT GUTENBERG EBOOK THE MYSTERIOUS AFFAIR AT],
 [STYLES],
 [*],
 [**
  
  
  
  
  The Mysterious Affair at Styles
  
  by Agatha Christie
  ],
 [Contents
  
  
  CHAPTER I.],
 [I GO TO STYL

In [5]:
#Save it as an new csv. file

myheaders = ['sentence']
myvalues = sentences
filename = 'book_text.csv'
with open(filename, 'w',newline='', encoding="utf8") as myfile:
    writer = csv.writer(myfile)
    writer.writerow(myheaders)
    writer.writerows(myvalues)

In [6]:
csv_sentences= pd.read_csv('book_text.csv')
csv_sentences


Unnamed: 0,sentence
0,"The Project Gutenberg eBook of The Mysterious Affair at Styles, by Agatha Christie\n\nThis eBook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cos..."
1,"You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this eBook or online at\nwww.gutenberg.org."
2,"If you are not located in the United States, you\nwill have to check the laws of the country where you are located before\nusing this eBook.\n\n"
3,Title: The Mysterious Affair at Styles\n\n
4,"Author: Agatha Christie\n\nRelease Date: March, 1997"
...,...
5868,"For forty years, he produced and\ndistributed Project Gutenberg-tm eBooks with only a loose network of\nvolunteer support."
5869,\n\n
5870,"Project Gutenberg-tm eBooks are often created from several printed\neditions, all of which are confirmed as not protected by copyright in\nthe U.S. unless a copyright notice is included."
5871,"Thus, we do not\nnecessarily keep eBooks in compliance with any particular paper\nedition.\n\n"


In [7]:
#Get entity pairs


def get_entities(sent):
  ## chunk 1
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of previous token in the sentence
  prv_tok_text = ""   # previous token in the sentence

  prefix = ""
  modifier = ""

  #############################################################
  
  for tok in nlp(sent):
    ## chunk 2
    # if token is a punctuation mark then move on to the next token
    if tok.dep_ != "punct":
      # check: token is a compound word or not
      if tok.dep_ == "compound":
        prefix = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " "+ tok.text
      
      # check: token is a modifier or not
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " "+ tok.text
      
      ## chunk 3
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      ## chunk 4
      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix +" "+ tok.text
        
      ## chunk 5  
      # update variables
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text
  #############################################################

  return [ent1.strip(), ent2.strip()]

In [8]:
#test

get_entities("Poirot murdered Alfred")


['Poirot', 'Alfred']

In [9]:
entity_pairs = []

for i in tqdm(csv_sentences["sentence"]):
  entity_pairs.append(get_entities(i))

source = [x[0] for x in entity_pairs]
target = [x[1] for x in entity_pairs]


100%|██████████| 5873/5873 [00:45<00:00, 129.61it/s]


In [10]:
entity_pairs[120:140]

[['I', 'telegraphic  style'],
 ['Weeds', 'house afire'],
 ['', 'even  ’em'],
 ['', 'you'],
 ['', ''],
 ['', ''],
 ['I', ''],
 ['', 'it'],
 ['', ''],
 ['you', ''],
 ['', ''],
 ['John', ''],
 ['Where  tea', 'day'],
 ['', 'day  house'],
 ['', ''],
 ['then  you', 'enough to day'],
 ['you', 'hire'],
 ['', ''],
 ['gardening I', 'you'],
 ['', '']]

In [11]:
#Relation Extraction for Entities
def get_relation(sent):
    doc = nlp(sent)
    # Matcher class object
    matcher = Matcher(nlp.vocab)
    relation=[]
    # define the pattern
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]

    matcher.add("matching_1", [pattern], on_match=None)

    matches = matcher(doc)
  
    for mathc_id, start, end in matches:
        matched_span = doc[start: end]
        # print(matched_span.text)
        relation.append(matched_span.text)
    return relation

In [12]:
get_relation("joe came to Berlin")

['came', 'came to']

In [13]:
#Create a Relation for all text
relations = [get_relation(i) for i in (csv_sentences["sentence"])]


In [14]:
relations

[['is', 'is for'],
 ['copy'],
 ['have'],
 ['Title'],
 ['Author'],
 ['['],
 ['EBOOK', 'AT'],
 ['STYLES'],
 ['*'],
 ['Affair', 'Affair at'],
 ['Contents'],
 ['GO', 'GO TO'],
 ['16TH'],
 ['NIGHT', 'NIGHT OF'],
 ['V.'],
 ['IS'],
 ['VII'],
 ['PAYS'],
 ['IX'],
 ['DR'],
 ['XI'],
 ['CASE', 'CASE FOR'],
 ['XIII'],
 ['GO', 'GO TO'],
 ['subsided'],
 ['asked'],
 ['silence'],
 ['set'],
 [],
 ['invalided'],
 ['trying'],
 ['seen'],
 ['known'],
 ['was'],
 ['stayed', 'stayed at'],
 ['had'],
 ['added'],
 ['keeps'],
 ['asked'],
 ['Oh'],
 ['suppose'],
 ['been'],
 ['be'],
 ['recalled'],
 ['was'],
 [],
 ['purchased', 'purchased by'],
 ['been'],
 ['were'],
 [],
 ['been'],
 ['qualified', 'qualified as'],
 [],
 ['practised', 'practised for'],
 ['married'],
 ['was'],
 [],
 ['noticed'],
 ['bounder'],
 ['said'],
 ['tell'],
 ['remember'],
 ['suppose'],
 ['’s'],
 ['sport'],
 [],
 ['going'],
 ['fellow'],
 ['turned'],
 ['see'],
 ['got'],
 ['nodded'],
 ['turned'],
 ['doubt'],
 ['knocked'],
 ['be'],
 ['is'],
 [],
 ['be

In [15]:
pd.Series(relations).value_counts()[:60]

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[]                     795
[was]                  287
[is]                   243
[said]                 187
[know]                  65
[asked]                 62
[had]                   60
[’s]                    58
[think]                 54
[are]                   51
[be]                    48
[were]                  43
[cried]                 43
[been]                  41
[Yes]                   41
[have]                  40
[say]                   40
[see]                   36
[tell]                  33
[told]                  30
[remember]              30
[nodded]                29
[took]                  28
[believe]               26
[seemed]                25
[do]                    24
[went]                  24
[replied]               23
[remarked]              23
[came]                  22
[suppose]               22
[shook]                 21
[made]                  20
[Is]                    20
[mean]                  19
[take]                  19
[looked]                19
[

In [16]:
#Removing stop words and punctuation
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import string 
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yusuf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yusuf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
#Defining function for Removing the stopwords.

def NotStopWord(word):
    return word not in stopwords.words('english')

In [18]:
#stopwords function
def preprocess(sent):
  sent = re.sub("[\(\[].*?[\)\]]", "", sent)
  tokens = []
  temp = ""
  words = word_tokenize(sent)
  puncts = '"#$%&\'()*+,-/:;<=>@\\^_`{|}~'
  words = map(lambda x: x.translate(str.maketrans('','',puncts)), words)
  words = map(str.lower,words)
  words = filter(lambda x: NotStopWord(x),words)
  tokens = tokens + list(words)
  temp = ' '.join(word for word in tokens)
  return temp

In [19]:
#applying function
preprocessed_data = [preprocess(i) for i in (csv_sentences["sentence"])]

In [20]:
preprocessed_data

['project gutenberg ebook mysterious affair styles  agatha christie ebook use anyone anywhere united states parts world cost almost restrictions whatsoever .',
 'may copy  give away reuse terms project gutenberg license included ebook online www.gutenberg.org .',
 'located united states  check laws country located using ebook .',
 'title  mysterious affair styles',
 'author  agatha christie release date  march  1997',
 '',
 'language  english produced  charles keller    start project gutenberg ebook mysterious affair',
 'styles',
 '',
 '  mysterious affair styles agatha christie',
 'contents chapter .',
 'go styles chapter ii .',
 '16th 17th july chapter iii .',
 'night tragedy chapter iv .',
 'poirot investigates chapter v .',
 '“ ’ strychnine  ? ” chapter vi .',
 'inquest chapter vii .',
 'poirot pays debts chapter viii .',
 'fresh suspicions chapter ix .',
 'dr .',
 'bauerstein chapter x . arrest chapter xi .',
 'case prosecution chapter xii .',
 'last link chapter xiii .',
 'poirot

In [21]:
#extract entity and relation again for the clean data set 
entity_pairs = []
for i in preprocessed_data:
  entity_pairs.append(get_entities(i))
relations = [get_relation(i) for i in (csv_sentences["sentence"])]

In [22]:
relations

[['is', 'is for'],
 ['copy'],
 ['have'],
 ['Title'],
 ['Author'],
 ['['],
 ['EBOOK', 'AT'],
 ['STYLES'],
 ['*'],
 ['Affair', 'Affair at'],
 ['Contents'],
 ['GO', 'GO TO'],
 ['16TH'],
 ['NIGHT', 'NIGHT OF'],
 ['V.'],
 ['IS'],
 ['VII'],
 ['PAYS'],
 ['IX'],
 ['DR'],
 ['XI'],
 ['CASE', 'CASE FOR'],
 ['XIII'],
 ['GO', 'GO TO'],
 ['subsided'],
 ['asked'],
 ['silence'],
 ['set'],
 [],
 ['invalided'],
 ['trying'],
 ['seen'],
 ['known'],
 ['was'],
 ['stayed', 'stayed at'],
 ['had'],
 ['added'],
 ['keeps'],
 ['asked'],
 ['Oh'],
 ['suppose'],
 ['been'],
 ['be'],
 ['recalled'],
 ['was'],
 [],
 ['purchased', 'purchased by'],
 ['been'],
 ['were'],
 [],
 ['been'],
 ['qualified', 'qualified as'],
 [],
 ['practised', 'practised for'],
 ['married'],
 ['was'],
 [],
 ['noticed'],
 ['bounder'],
 ['said'],
 ['tell'],
 ['remember'],
 ['suppose'],
 ['’s'],
 ['sport'],
 [],
 ['going'],
 ['fellow'],
 ['turned'],
 ['see'],
 ['got'],
 ['nodded'],
 ['turned'],
 ['doubt'],
 ['knocked'],
 ['be'],
 ['is'],
 [],
 ['be

In [23]:
entity_pairs

[['anywhere states parts world', 'almost  restrictions'],
 ['project gutenberg license', 'online  www.gutenberg.org'],
 ['', 'check laws ebook'],
 ['', ''],
 ['', ''],
 ['', ''],
 ['charles keller', 'ebook mysterious gutenberg ebook affair'],
 ['', ''],
 ['', ''],
 ['', ''],
 ['', ''],
 ['', 'styles chapter ii'],
 ['', ''],
 ['', ''],
 ['poirot', 'chapter v'],
 ['', ''],
 ['', ''],
 ['poirot', 'chapter viii'],
 ['', ''],
 ['', ''],
 ['', ''],
 ['', ''],
 ['', ''],
 ['chapter i.', 'go styles'],
 ['time styles case', ''],
 ['worldwide friend poirot family', 'whole  story'],
 ['sensational  rumours', ''],
 ['', 'connected  affair'],
 ['', ''],
 ['', 'sick spending leave'],
 ['mind', 'john cavendish'],
 ['', ''],
 ['', ''],
 ['one  thing', ''],
 ['boy', ''],
 ['inviting', 'leave'],
 ['', 'again  years'],
 ['mother', ''],
 ['', ''],
 ['', ''],
 ['afraid', 'surprise'],
 ['two  sons', ''],
 ['', 'less  seventy'],
 ['', 'fondness lady bountiful'],
 ['generous  woman', 'considerable  fortune'],

In [24]:
#check the most accrued entities and relations in the data.
print(" Most occured entite1 \n",pd.Series(source).value_counts()[:10])
print("Most occured entite2 \n",pd.Series(target).value_counts()[:10])
print("Most popular relation \n",pd.Series(relations).value_counts()[:10])

 Most occured entite1 
           1417
I          616
you        171
it         162
he         162
Poirot     132
He         125
she        109
It         100
She         94
dtype: int64
Most occured entite2 
         2239
it       158
me       118
you       86
him       81
what      46
her       43
us        39
head      37
that      34
dtype: int64
Most popular relation 
 []         795
[was]      287
[is]       243
[said]     187
[know]      65
[asked]     62
[had]       60
[’s]        58
[think]     54
[are]       51
dtype: int64


In [25]:
df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})
df

Unnamed: 0,source,target,edge
0,Agatha Christie eBook,no United restrictions,"[is, is for]"
1,You,Project Gutenberg www.gutenberg.org,[copy]
2,where you,eBook,[have]
3,,Mysterious Styles,[Title]
4,,Agatha Christie Date,[Author]
...,...,...,...
5868,forty he,loose volunteer support,[produced]
5869,,,[]
5870,copyright notice,U.S.,"[created, created from, created from several]"
5871,Thus we,particular paper edition,[keep]


In [26]:
df[df['source']== "Mrs. Inglethorp"]

Unnamed: 0,source,target,edge
164,Mrs. Inglethorp,effusion,[greeted]
183,Mrs. Inglethorp,however nothing,[seemed]
240,Mrs. Inglethorp,,[cried]
432,Mrs. Inglethorp,wish,[received]
444,Mrs. Inglethorp,evening,[went]
549,Mrs. Inglethorp,boudoir,"[came, came out]"
606,Mrs. Inglethorp,,[came]
654,Mrs. Inglethorp,yet hand,[were]
980,Mrs. Inglethorp,me,[told]
1020,Mrs. Inglethorp,me,[wish]


In [27]:
df[df['target']== "mistake"]

Unnamed: 0,source,target,edge
2420,stupid men,mistake,[shaken]
3036,I,mistake,[made]
3340,Mrs. Inglethorp,mistake,[was]


In [28]:
#Mrs. Inglethorp relation 
#First we creat a DataFrame Graph

G= nx.from_pandas_edgelist(df[df['source']== "Mrs. Inglethorp"],
                           source="source",
                           target="target",
                           edge_attr=True,
                           create_using =nx.Graph())

In [29]:
#Visualization with Pyvis

from pyvis.network import Network
net= Network(notebook = True, width="1000px", height="700px", bgcolor="#222222", font_color="white")

node_degree = dict(G.degree)

#Setting node size
nx.set_node_attributes(G, node_degree, 'size')

net.from_nx(G)
net.show("Inglethorp.html")

In [30]:
#Husband relations 
#First we creat a DataFrame Graph

G= nx.from_pandas_edgelist(df[df['target']== "husband"],
                           source="source",
                           target="target",
                           edge_attr=True,
                           create_using =nx.Graph())

In [31]:
#Visualization with Pyvis

from pyvis.network import Network
net= Network(notebook = True, width="1000px", height="700px", bgcolor="#222222", font_color="white")

node_degree = dict(G.degree)

#Setting node size
nx.set_node_attributes(G, node_degree, 'size')

net.from_nx(G)
net.show("husband.html")

In [32]:
#Alfred relations 
#First we creat a DataFrame Graph

G= nx.from_pandas_edgelist(df[df['source']== "Alfred Inglethorp"],
                           source="source",
                           target="target",
                           edge_attr=True,
                           create_using =nx.Graph())

In [33]:
#Visualization with Pyvis

from pyvis.network import Network
net= Network(notebook = True, width="1000px", height="700px", bgcolor="#222222", font_color="white")

node_degree = dict(G.degree)

#Setting node size
nx.set_node_attributes(G, node_degree, 'size')

net.from_nx(G)
net.show("Alfred.html")

In [34]:
#Mary Cavendish relations 
#First we creat a DataFrame Graph

G= nx.from_pandas_edgelist(df[df['target']== "Mary Cavendish"],
                           source="source",
                           target="target",
                           edge_attr=True,
                           create_using =nx.Graph())

In [35]:
#Visualization with Pyvis

from pyvis.network import Network
net= Network(notebook = True, width="1000px", height="700px", bgcolor="#222222", font_color="white")

node_degree = dict(G.degree)

#Setting node size
nx.set_node_attributes(G, node_degree, 'size')

net.from_nx(G)
net.show("Cavendish.html")

In [36]:
#stepson relations 
#First we creat a DataFrame Graph

G= nx.from_pandas_edgelist(df[df['target']== "stepson"],
                           source="source",
                           target="target",
                           edge_attr=True,
                           create_using =nx.Graph())

In [37]:
#Visualization with Pyvis

from pyvis.network import Network
net= Network(notebook = True, width="1000px", height="700px", bgcolor="#222222", font_color="white")

node_degree = dict(G.degree)

#Setting node size
nx.set_node_attributes(G, node_degree, 'size')

net.from_nx(G)
net.show("stepson.html")