print "Welcome to SNAEL, version 0.3"
difficuluty = 0
We are studying the social networks within a text stored in the following file, served from BookTrust.org on [2013-04-08 Mon]. The file has been edited from its original form, but the text has not been altered.
The work is entitled Other People’s Gods and was written by Naomi Alderman, winning the 2009 BBC National Short Story Award, and is freely available on-line.
from time import gmtime, strftime
print strftime("%X", gmtime())
available_texts = ['../text/' + i for i in \
['simple.txt', # BBC 2009 national
'highwayman.txt', # http://fiction.homepageofthedead.com/forum.pl?readfiction=1047H
'135.txt']] # Les Miserables
FILE_NAME = available_texts[difficulty]
class Entity:
def __init__(self, name):
self.names = set([name])
self.occurances = list()
def find_occurances(self, text):
for sentence in text:
for name in self.names:
if name in sentence:
self.occurances.add(text.index(sentence))
self.occurances = sorted(self.occurances)
try:
self.common_name = self.occurances[0]
except:
self.common_name = 'Error:NoOccurance'
def same(self, entity, threshold=.6):
"""Absorbs `entity` into self if `entity` and self are sufficiently
similar.
Arguments:
- `self`:
- `entity`:
"""
similarity = 0.0
for my_name in self.names:
for your_name in entity.names:
if same(my_name, your_name):
similarity += 1
similarity_ratio = similarity / (len(self.names)*len(entity.names))
return similarity_ratio >= threshold
def absorb(self, entity):
"""Absorbs another entity
Arguments:
- `self`:
- `entity`:
"""
self.names |= entity.names
self.occurances |= entity.occurances
def __str__(self):
r = "{!"
for name in self.names:
r += "{%s}" % name
r += "}"
return r
## {{{ http://code.activestate.com/recipes/578228/ (r5)
import sys
class ProgressBar:
def __init__(self, minValue = 0, maxValue = 100, totalWidth=75):
""" Initializes the progress bar. """
self.progBar = "" # This holds the progress bar string
self.oldprogBar = ""
self.min = minValue
self.max = maxValue
self.span = maxValue - minValue
self.width = totalWidth
self.amount = 0 # When amount == max, we are 100% done
self.updateAmount(0) # Build progress bar string
def appendAmount(self, append):
""" Increases the current amount of the value of append and
updates the progress bar to new ammount. """
self.updateAmount(self.amount + append)
def step(self, autodraw=False):
self.appendAmount(1)
if autodraw:
self.draw()
def updatePercentage(self, newPercentage):
""" Updates the progress bar to the new percentage. """
self.updateAmount((newPercentage * float(self.max)) / 100.0)
def updateAmount(self, newAmount = 0):
""" Update the progress bar with the new amount (with min and max
values set at initialization; if it is over or under, it takes the
min or max value as a default. """
if newAmount < self.min: newAmount = self.min
if newAmount > self.max: newAmount = self.max
self.amount = newAmount
# Figure out the new percent done, round to an integer
diffFromMin = float(self.amount - self.min)
percentDone = (diffFromMin / float(self.span)) * 100.0
percentDone = int(round(percentDone))
# Figure out how many hash bars the percentage should be
allFull = self.width - 2
numHashes = (percentDone / 100.0) * allFull
numHashes = int(round(numHashes))
# Build a progress bar with an arrow of equal signs; special cases for
# empty and full
if numHashes == 0:
self.progBar = "[>%s]" % (' '*(allFull-1))
elif numHashes == allFull:
self.progBar = "[%s]" % ('='*allFull)
else:
self.progBar = "[%s>%s]" % ('='*(numHashes-1), ' '*(allFull-numHashes))
# figure out where to put the percentage, roughly centered
percentPlace = (len(self.progBar) / 2) - len(str(percentDone))
percentString = str(percentDone) + "%"
# slice the percentage into the bar
self.progBar = ' '.join([self.progBar, percentString])
def draw(self):
""" Draws the progress bar if it has changed from it's previous value. """
if self.progBar != self.oldprogBar:
self.oldprogBar = self.progBar
sys.stdout.write('\b'*len(self.progBar)+self.progBar)
## sys.stdout.write(self.progBar + '\r')
sys.stdout.flush() # force updating of screen
def __str__(self):
""" Returns the current progress bar. """
return str(self.progBar)
## end of http://code.activestate.com/recipes/578228/ }}}
Obviously, the first thing of significance we do is load the file into
memory. This snippet of code opens FILE_NAME
as read-only and loads
the full contents into raw
.
with open(FILE_NAME, 'r') as f:
print '>reading file: ' + FILE_NAME
raw = ''.join(f.readlines())
print '>file read'
print '>importing nltk'
import nltk
print '>tokenizing'
tokens = nltk.sent_tokenize(raw)
tokens = [t.replace('\n',' ').replace(' ',' ') \
for t in tokens if t is not '.']
print '>converting to nltk.Text'
text = nltk.Text(tokens)
We need to make sure that we have a list of all names. Let’s just create a pipeline to tokenize, tag, and chunk a text, using a simplified regular expression to detect names.
grammer = r'NAME: {<NNP>+(<DT>?<NNP>+)?}'
ne_chunker = nltk.RegexpParser(grammer)
entities = lambda text: \
ne_chunker.parse( \
nltk.pos_tag( \
nltk.word_tokenize(text)))
Switching on the binary
option tells NLTK to enable only one type of
named entity, instead of trying to recognize organizations, places,
names, and other specifics. With this option, NLTK seems to be far
more reliable and consistent.
Now, entities
is a function that, if we pass it some sentence, it
can correctly identify many titles as named entities:
>>> print entities("Alexander conquered much of the known world \ after his father, Phillip II, was assassinated.").pprint() (S (NE Alexander/NNP) conquered/VBD much/JJ of/IN the/DT known/VBN world/NN after/IN his/PRP$ father/NN ,/, (NE Phillip/NNP II/NNP) ,/, was/VBD assassinated/VBN ./.)
Note, however, that NLTK is not foolproof; it is yet confused by the following simple epithet:
>>> print entities("Alexander the Great conquered much of the known \ world after his father, Phillip II, was assassinated.").pprint() (S Alexander/NNP the/DT (NE Great/NNP) conquered/VBD much/JJ of/IN the/DT known/VBN world/NN after/IN his/PRP$ father/NN ,/, (NE Phillip/NNP II/NNP) ,/, was/VBD assassinated/VBN ./.)
This can most certainly present problems when the names are followed
by an epithet that is crucial to correctly identifying the person, as
in Alexander the Great
. (This is called an epitheton
necessarium.) I suspect an NLTK chunking object can be configured to
correctly identify these by placing an optional determiner between two
proper nouns (tagged NNP
), but we will ignore this shortcoming for
now.
We now need to tag every sentence in the text. This is by far the most time-consuming task, and the program can appear that it is frozen. For this reason, an incremental update system is put into place to advise the user on its progress. The progress bar system is taken from Stack Overflow and is available under Creative~Commons~BY-SA. The original code was written by CristopheD and has been modified to be clearer.
print '>tagging entire text'
tag_bar = ProgressBar(maxValue=len(text))
We prepare a list for the tagged sentences to be stored, and begin to
track our progress through the text. (Remember that the text is
stored as a list of sentences, so this progress is
sentence-by-sentence.) For each sentence
in the text
, we append
the list of tagged_senteces
with the entities
of the sentence
.
We increment our progress through the text, and then test to see if we
have crossed into the next level of the progress bar. (We do this by
comparing the ratios between current_text_index
: len(text)
and
progress_bar_progress
: progress_bar_width
. Each value is
interpreted as a float
to bypass integer division.) If we need to,
we write a character to stdout
, flush the buffer (forcing the
write), and then increment our progress through the progress bar.
tagged_sentences = list()
for sentence in text:
tagged_sentences.append(entities(sentence))
tag_bar.step(autodraw=True)
print ''
print '>Done.'
tagged_sentences
is now a list that contains every sentence with
every word tagged as to its position. Names are all tagged as such
(NAME
), so all we need to do is distill the entire text into a list
of names.
In good practice, we’ll define a function that will receive exactly one sentence (as tagged by NLTK) and pull out the names, returning them as a list.
We can use the production rules to extract the names. For each
NAME
recognized, a production is made from NAME
to the actual
name matched. The actual name matched is stored in the right-hand
side, or rhs
, of the production list (given by
sentence.productions()
). (Note that the first production is always
from S
(the sentence) to the sentence itself, with NAME
standing
in for matched names.) The rhs
is stored in a tuple of tuples, and
a bit of indexing magic is done to extract what is needed (the first
element of each tuple). This is then joined with a single space and
added to the list of names, which is returned.
def get_names_from_sentence(sentence):
"""Extracts the names from a single sentence and returns them in a
list.
"""
names = list()
production_names = sentence.productions()[1:]
names_tagged = [tag.rhs() for tag in production_names]
for name in names_tagged:
this_name = [tag[0] for tag in name]
names.append(' '.join(this_name))
return names
We will then use this function and map it across the entire text, accumulating the list of names.
def get_names_from_text(text):
"""Extracts all names from a text.
"""
print '>tagging entire text'
name_bar = ProgressBar(maxValue=len(text))
names = set()
for sentence in text:
names = names.union(get_names_from_sentence(sentence))
name_bar.step(autodraw=True)
return list(names)
And viola, we have a list of names from the text.
names = get_names_from_text(tagged_sentences)
people = [Entity(name) for name in names]
occur_bar = ProgressBar(maxBalue=len(people))
for person in people:
person.find_occurances(text)
occur_bar.step(autodraw=True)
Somehow resolve aliases and combine lists of occurances accordingly
Ideas
- Look for names that are part of other names; Mina ∈ Mina Murray; the Count ∈ Count Dracula
Define a function to see if two names are the same
def same(name1, name2, treshold=.5):
"""Compares two names and determines if they refer to the same person.
Arguments:
- `name1`: A name
- `name2`: A name
"""
if name1 is name2:
# print 'Identical'
return True
if name1 in name2 or name2 in name1:
# print 'Contained'
return True
import ngram
s = ngram.NGram.compare(name1, name2)
if s > treshold:
print '{} is {} (confidence {})'.format(name1, name2, s)
return True
return False
Look at names and combine those which are the same
Success is in sight! We now have a
Sort by most popular names
people = sorted(people,
key=lambda entity: len(entity.occurances),
reverse=True)
Actually combine entities deemed to be the same
from itertools import combinations
dup_pairs = list(combinations(people, 2))
dup_bar = ProgressBar(maxValue=len(dup_pairs))
for entity1, entity2 in dup_pairs:
dup_bar.step(autodraw=True)
if entity1.same(entity2):
again = True
entity1.absorb(entity2)
people.remove(entity2)
break
import networkx as nx
network = nx.Graph()
node_bar = ProgressBar(maxValue=len(people))
print '>Add nodes to graph'
for person in people:
network.add_node(person, label=person.common_name, weight=len(person.occurances))
node_bar.step(autodraw=True)
print 'Done'
from itertools import combinations
pairs = list(combinations(people, 2))
print 'Finding co-occurences'
edge_bar = ProgressBar(maxValue=len(pairs))
radius = 5
for A, B in pairs:
i = 0
a = sorted(list(A.occurances))
b = sorted(list(B.occurances))
if len(a) is 0 or len(b) is 0:
edge_bar.step(autodraw=True)
continue
maxi = len(B.occurances) - 1
for oA in a:
lo = oA - radius
hi = oA + radius
while (b[i] > lo) and (i > 0): # while we're above the low end of the range
i = i - 1 # go towards the low end of the range
while (b[i] < lo) and (i < maxi): # while we're below the low end of the range
i = i + 1 # go towards the low end of the range
if b[i] >= lo:
while (b[i] <= hi): # while we're below the high end of the range
try: # increase edge weight
network.edge[A.common_name][B.common_name]['weight'] += 1
except:
network.add_edge(A.common_name, B.common_name, weight=1)
if i < maxi: # and go towards the high end of the range
i = i + 1
else:
break
edge_bar.step(autodraw=True)
With the use of NetworkX, output is extremely simple.
print 'Writing output...',
def getname(filepath):
b = filepath.rfind('/') + 1
e = filepath.rfind('.')
return filepath[b:e]
nx.write_gexf(network, 'network-{}.gexf'.format(getname(FILE_NAME)))
print 'Done. Program complete.'
from time import gmtime, strftime
print strftime("%X", gmtime())