
SNAEL: A Social Network Analyzer for English Literature

print "Welcome to SNAEL, version 0.3"
difficuluty = 0

What are we Studying?

We are studying the social networks within a text stored in the following file, served from BookTrust.org on [2013-04-08 Mon]. The file has been reformatted from its original form, but the text itself has not been altered.

The work, Other People’s Gods by Naomi Alderman, won the 2009 BBC National Short Story Award and is freely available on-line.

from time import gmtime, strftime
print strftime("%X", gmtime())

available_texts = ['../text/' + i for i in
                   ['simple.txt',      # BBC 2009 National Short Story Award winner
                    'highwayman.txt',  # http://fiction.homepageofthedead.com/forum.pl?readfiction=1047H
                    '135.txt']]        # Les Miserables

FILE_NAME = available_texts[difficulty]

Entities?

class Entity:
    def __init__(self, name):
        self.names = set([name])
        self.occurrences = set()
        self.common_name = name  # default display name

    def find_occurrences(self, text):
        """Records the index of every sentence in `text` that mentions
        any of this entity's names."""
        for index, sentence in enumerate(text):
            for name in self.names:
                if name in sentence:
                    self.occurrences.add(index)
        if not self.occurrences:
            self.common_name = 'Error:NoOccurrence'

    def same(self, entity, threshold=.6):
        """Returns True if `entity` and self are sufficiently similar.

        Arguments:
        - `entity`: the Entity to compare against
        - `threshold`: the minimum ratio of matching name pairs
        """

        similarity = 0.0

        for my_name in self.names:
            for your_name in entity.names:
                if same(my_name, your_name):
                    similarity += 1

        similarity_ratio = similarity / (len(self.names) * len(entity.names))

        return similarity_ratio >= threshold

    def absorb(self, entity):
        """Merges another entity's names and occurrences into this one.

        Arguments:
        - `entity`: the Entity to absorb
        """
        self.names |= entity.names
        self.occurrences |= entity.occurrences

    def __str__(self):
        return '{!' + ''.join('{%s}' % name for name in self.names) + '}'
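To make the intended lifecycle concrete, here is a tiny illustration with hypothetical sentences (the names and text are made up for demonstration):

>>> mina = Entity('Mina')
>>> mina.find_occurrences(['Mina awoke.', 'The Count slept.', 'Mina wept.'])
>>> sorted(mina.occurrences)
[0, 2]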

This will take a while

## {{{ http://code.activestate.com/recipes/578228/ (r5)
import sys

class ProgressBar:
    def __init__(self, minValue = 0, maxValue = 100, totalWidth=75):
        """ Initializes the progress bar. """
        self.progBar = ""   # This holds the progress bar string
        self.oldprogBar = ""
        self.min = minValue
        self.max = maxValue
        self.span = maxValue - minValue
        self.width = totalWidth
        self.amount = 0       # When amount == max, we are 100% done 
        self.updateAmount(0)  # Build progress bar string

    def appendAmount(self, append):
        """ Increases the current amount by the value of `append` and
        updates the progress bar to the new amount. """
        self.updateAmount(self.amount + append)
    
    def step(self, autodraw=False):
        self.appendAmount(1)
        if autodraw:
            self.draw()
    
    def updatePercentage(self, newPercentage):
        """ Updates the progress bar to the new percentage. """
        self.updateAmount((newPercentage * float(self.max)) / 100.0)

    def updateAmount(self, newAmount = 0):
        """ Update the progress bar with the new amount (with min and max
        values set at initialization; if it is over or under, it takes the
        min or max value as a default. """
        if newAmount < self.min: newAmount = self.min
        if newAmount > self.max: newAmount = self.max
        self.amount = newAmount

        # Figure out the new percent done, round to an integer
        diffFromMin = float(self.amount - self.min)
        percentDone = (diffFromMin / float(self.span)) * 100.0
        percentDone = int(round(percentDone))

        # Figure out how many hash bars the percentage should be
        allFull = self.width - 2
        numHashes = (percentDone / 100.0) * allFull
        numHashes = int(round(numHashes))

        # Build a progress bar with an arrow of equal signs; special cases for
        # empty and full
        if numHashes == 0:
            self.progBar = "[>%s]" % (' '*(allFull-1))
        elif numHashes == allFull:
            self.progBar = "[%s]" % ('='*allFull)
        else:
            self.progBar = "[%s>%s]" % ('='*(numHashes-1), ' '*(allFull-numHashes))

        # append the percentage after the bar
        percentString = str(percentDone) + "%"
        self.progBar = ' '.join([self.progBar, percentString])
    
    def draw(self):
        """ Draws the progress bar if it has changed from its previous value. """
        if self.progBar != self.oldprogBar:
            self.oldprogBar = self.progBar
            sys.stdout.write('\b'*len(self.progBar)+self.progBar)
            ## sys.stdout.write(self.progBar + '\r')
            sys.stdout.flush()      # force updating of screen

    def __str__(self):
        """ Returns the current progress bar. """
        return str(self.progBar)
## end of http://code.activestate.com/recipes/578228/ }}}

Load Text

The first thing of significance we do is load the file into memory. This snippet opens FILE_NAME read-only and reads its full contents into raw.

with open(FILE_NAME, 'r') as f:
    print '>reading file: ' + FILE_NAME
    raw = f.read()
    print '>file read'

Tokenize Text

print '>importing nltk'
import nltk
print '>tokenizing'
tokens = nltk.sent_tokenize(raw)
tokens = [t.replace('\n', ' ').replace('  ', ' ')
          for t in tokens if t != '.']

print '>converting to nltk.Text'
text = nltk.Text(tokens)

Create List of Names

Prepare a Grammar

We need a list of all names in the text. Let’s create a pipeline to tokenize, tag, and chunk a sentence, using a simple regular-expression grammar to detect names.

grammar = r'NAME: {<NNP>+(<DT>?<NNP>+)?}'
ne_chunker = nltk.RegexpParser(grammar)

def entities(sentence):
    """Tokenizes, tags, and chunks a single sentence."""
    return ne_chunker.parse(nltk.pos_tag(nltk.word_tokenize(sentence)))

Switching on the binary option of NLTK’s built-in chunker (nltk.ne_chunk) tells NLTK to mark every named entity with a single NE label, instead of trying to recognize organizations, places, names, and other specifics separately. With this option, NLTK seems to be far more reliable and consistent.
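As a minimal sketch of that alternative (assuming the NLTK data that nltk.ne_chunk depends on is installed; the exact tags may vary):

# With binary=True, every entity is labeled simply NE rather than
# PERSON/ORGANIZATION/GPE.
tagged = nltk.pos_tag(nltk.word_tokenize("Alexander conquered Persia."))
nltk.ne_chunk(tagged, binary=True).pprint()
# Expected, roughly:
# (S (NE Alexander/NNP) conquered/VBD (NE Persia/NNP) ./.)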

Recognizing Names

Shortcomings

Now entities is a function that, given a sentence, can correctly identify many names and titles as named entities:

>>> entities("Alexander conquered much of the known world \
    after his father, Phillip II, was assassinated.").pprint()
(S
  (NE Alexander/NNP)
  conquered/VBD
  much/JJ
  of/IN
  the/DT
  known/VBN
  world/NN
  after/IN
  his/PRP$
  father/NN
  ,/,
  (NE Phillip/NNP II/NNP)
  ,/,
  was/VBD
  assassinated/VBN
  ./.)

Note, however, that NLTK is not foolproof; it is still confused by the following simple epithet:

>>> entities("Alexander the Great conquered much of the known \
    world after his father, Phillip II, was assassinated.").pprint()
(S
  Alexander/NNP
  the/DT
  (NE Great/NNP)
  conquered/VBD
  much/JJ
  of/IN
  the/DT
  known/VBN
  world/NN
  after/IN
  his/PRP$
  father/NN
  ,/,
  (NE Phillip/NNP II/NNP)
  ,/,
  was/VBD
  assassinated/VBN
  ./.)

This can most certainly present problems when the names are followed by an epithet that is crucial to correctly identifying the person, as in Alexander the Great. (This is called an epitheton necessarium.) I suspect an NLTK chunking object can be configured to correctly identify these by placing an optional determiner between two proper nouns (tagged NNP), but we will ignore this shortcoming for now.
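As a rough sketch of that idea (not run against our text; part-of-speech tags may vary), note that a chunk rule with an optional determiner between runs of proper nouns, like the grammar defined earlier, should capture the full epithet:

epithet_chunker = nltk.RegexpParser(r'NAME: {<NNP>+(<DT>?<NNP>+)?}')
tagged = nltk.pos_tag(nltk.word_tokenize("Alexander the Great conquered Persia."))
epithet_chunker.parse(tagged).pprint()
# Expected, roughly:
# (S (NAME Alexander/NNP the/DT Great/NNP) conquered/VBD (NAME Persia/NNP) ./.)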

Tagging

We now need to tag every sentence in the text. This is by far the most time-consuming task, and the program can appear to be frozen. For this reason, an incremental update system is put in place to advise the user of its progress. The progress bar system (cited in the code above) is available under Creative Commons BY-SA; the original code was written by CristopheD and has been modified to be clearer.

print '>tagging entire text'

tag_bar = ProgressBar(maxValue=len(text))

We prepare a list in which to store the tagged sentences and begin to track our progress through the text. (Remember that the text is stored as a list of sentences, so progress is measured sentence by sentence.) For each sentence in the text, we append its chunked parse to tagged_sentences and step the progress bar; with autodraw enabled, the bar rewrites itself on stdout and flushes the buffer (forcing the write) whenever its rendering changes.

tagged_sentences = list()

for sentence in text:
    tagged_sentences.append(entities(sentence))
    tag_bar.step(autodraw=True)

print ''
print '>Done.'

Strip Names

tagged_sentences is now a list containing every sentence, with every word tagged by its part of speech. Names are all chunked as such (NAME), so all we need to do is distill the entire text into a list of names.

As good practice, we’ll define a function that receives exactly one sentence (as tagged by NLTK), pulls out the names, and returns them as a list.

We can use the production rules to extract the names. For each NAME recognized, a production is made from NAME to the actual name matched. The actual name matched is stored in the right-hand side, or rhs, of the production list (given by sentence.productions()). (Note that the first production is always from S (the sentence) to the sentence itself, with NAME standing in for matched names.) The rhs is stored in a tuple of tuples, and a bit of indexing magic is done to extract what is needed (the first element of each tuple). This is then joined with a single space and added to the list of names, which is returned.

def get_names_from_sentence(sentence):
    """Extracts the names from a single tagged sentence and returns
    them in a list.

    """

    names = list()

    # Skip the first production (S -> ...); the rest are NAME -> words.
    production_names = sentence.productions()[1:]

    # Each right-hand side is a tuple of (word, tag) pairs.
    names_tagged = [tag.rhs() for tag in production_names]

    for name in names_tagged:
        this_name = [tag[0] for tag in name]
        names.append(' '.join(this_name))

    return names
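For instance, on a small chunked sentence the helper distills the parse down to its names (a hypothetical run, assuming the chunker behaves as in the earlier examples):

>>> get_names_from_sentence(entities("Phillip II was assassinated."))
['Phillip II']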

We will then use this function and map it across the entire text, accumulating the list of names.

def get_names_from_text(text):
    """Extracts all names from a tagged text.
    """
    print '>extracting names'

    name_bar = ProgressBar(maxValue=len(text))
    names = set()

    for sentence in text:
        names |= set(get_names_from_sentence(sentence))
        name_bar.step(autodraw=True)

    return list(names)

And voilà, we have a list of names from the text.

names = get_names_from_text(tagged_sentences)

Find Occurrences

people = [Entity(name) for name in names]
occur_bar = ProgressBar(maxValue=len(people))
for person in people:
    person.find_occurrences(text)
    occur_bar.step(autodraw=True)

Resolve Aliases

Somehow resolve aliases and combine the sets of occurrences accordingly

Ideas

  • Look for names that are part of other names; Mina ∈ Mina Murray; the Count ∈ Count Dracula

Define a function to see if two names are the same

def same(name1, name2, threshold=.5):
    """Compares two names and determines if they refer to the same person.

    Arguments:
    - `name1`: A name
    - `name2`: A name
    """
    if name1 == name2:                    # identical strings
        return True
    if name1 in name2 or name2 in name1:  # one name contains the other
        return True

    # Fall back on n-gram string similarity.
    import ngram
    s = ngram.NGram.compare(name1, name2)

    if s > threshold:
        print '{} is {} (confidence {})'.format(name1, name2, s)
        return True
    return False
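A quick check of the short-circuit cases (the n-gram fallback’s score varies with the ngram package, so it is not shown):

>>> same('Mina', 'Mina Murray')
True
>>> same('Dracula', 'Count Dracula')
True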

Look at names and combine those which are the same

Success is in sight! We now have a list of entities and a test for whether two names refer to the same person; all that remains is to merge the duplicates.

Sort by most popular names

people = sorted(people,
                key=lambda entity: len(entity.occurrences),
                reverse=True)

Actually combine entities deemed to be the same

from itertools import combinations

# Repeat until a full pass finds no duplicates: each merge invalidates
# the pair list, so we rebuild it and start over.
again = True
while again:
    again = False
    dup_pairs = list(combinations(people, 2))
    dup_bar = ProgressBar(maxValue=len(dup_pairs))
    for entity1, entity2 in dup_pairs:
        dup_bar.step(autodraw=True)
        if entity1.same(entity2):
            entity1.absorb(entity2)
            people.remove(entity2)
            again = True
            break

Find Co-occurrences

Two entities are considered to co-occur when they are mentioned within radius sentences of each other. We add one node per entity to a graph, then, for each pair of entities, slide a window across their sorted occurrence lists, incrementing the weight of the shared edge every time an occurrence of one falls within the window around an occurrence of the other.

import networkx as nx
network = nx.Graph()

node_bar = ProgressBar(maxValue=len(people))


print '>Add nodes to graph'
for person in people:
    # key nodes by name so the edges below attach to the same nodes
    network.add_node(person.common_name, label=person.common_name,
                     weight=len(person.occurrences))
    node_bar.step(autodraw=True)
print 'Done'

from itertools import combinations

pairs = list(combinations(people, 2))

print 'Finding co-occurrences'
edge_bar = ProgressBar(maxValue=len(pairs))

radius = 5
for A, B in pairs:
    i = 0
    a = sorted(A.occurrences)
    b = sorted(B.occurrences)
    if not a or not b:
        edge_bar.step(autodraw=True)
        continue
    maxi = len(b) - 1
    for oA in a:
        lo = oA - radius
        hi = oA + radius
        while (b[i] > lo) and (i > 0):     # while we're above the low end of the range
            i = i - 1                      #   go towards the low end of the range
        while (b[i] < lo) and (i < maxi):  # while we're below the low end of the range
            i = i + 1                      #   go back up towards it
        if b[i] >= lo:
            while b[i] <= hi:              # for each occurrence of B in range,
                # increase the edge weight, creating the edge if needed
                if network.has_edge(A.common_name, B.common_name):
                    network.edge[A.common_name][B.common_name]['weight'] += 1
                else:
                    network.add_edge(A.common_name, B.common_name, weight=1)

                if i < maxi:               #   and go towards the high end of the range
                    i = i + 1
                else:
                    break
    edge_bar.step(autodraw=True)

Output

With the use of NetworkX, output is extremely simple.

print 'Writing output...',
def getname(filepath):
    """Returns the bare file name, without directory or extension."""
    b = filepath.rfind('/') + 1
    e = filepath.rfind('.')
    return filepath[b:e]
    
nx.write_gexf(network, 'network-{}.gexf'.format(getname(FILE_NAME)))
print 'Done.  Program complete.'

from time import gmtime, strftime
print strftime("%X", gmtime())