In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk

In [2]:
# params
# Smoothing factor of the running weight update in process_ngram*:
# new = old * (1 - param_alpha) + param_alpha; a first sighting gets param_alpha.
param_alpha = 0.01
# Path of the text corpus that process_corpus() reads line by line.
param_corpus = 'corpus/andersen.txt'
# Path of the HDF5 store the n-gram tables are written to.
param_model = 'arthur_store.h5' #output
# When True, every process_ngram* call prints the n-gram it just handled.
param_verbose = False
# When True, process_ngram* also multiplies the weights of other n-grams
# by (1 - param_alpha) after each update.
param_balancing = False

In [3]:
# working datasets
# prominence[n] is an initially empty table of n-grams: one 'wordK' column
# per position (word1 .. wordn) followed by the running weight 'cooc'.
prominence = {
    size: pd.DataFrame(
        columns=['word{}'.format(k) for k in range(1, size + 1)] + ['cooc'])
    for size in (1, 2, 3, 4)
}

In [4]:
def process_ngram1(pos, sentence):
  """Update the unigram table with the token at ``sentence[pos]``.

  The token is upper-cased and looked up in ``prominence[1]``. If it is not
  there yet, a row with weight ``param_alpha`` is added; if it is there once,
  its weight becomes ``cooc * (1 - param_alpha) + param_alpha``. When
  ``param_balancing`` is true, the weight of every other unigram is then
  multiplied by ``(1 - param_alpha)``.

  Args:
    pos: index of the token inside ``sentence``.
    sentence: sequence of token strings.

  Returns:
    None. The module-level ``prominence`` dict is updated in place; nothing
    happens when fewer than one token remains at ``pos``.
  """
  global prominence
  n = 1

  # Not enough tokens left for a full n-gram starting at pos.
  if pos > len(sentence) - n:
    return
  word = {i: sentence[pos + i - 1].upper() for i in range(1, n + 1)}

  # read the unigram
  table = prominence[n]
  selector = (table.word1 == word[1])
  records = table.loc[selector]
  # update the unigram
  if len(records.index) == 0:
    # insert
    new_row = pd.DataFrame({'word1': [word[1]], 'cooc': [1 * param_alpha]})
    if table.empty:
      # Skip concatenating with the empty, object-typed table so that the
      # 'cooc' column takes the numeric dtype of the first real row.
      prominence[n] = new_row
    else:
      # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
      prominence[n] = pd.concat([table, new_row], ignore_index=True, sort=False)
  elif len(records.index) == 1:
    # update
    oldc = records['cooc'].iloc[0]
    prominence[n].loc[selector, 'cooc'] = oldc * (1 - param_alpha) + 1 * param_alpha
  else:
    # Duplicate rows for one unigram should never exist; report, don't crash.
    print("Invalid code path")

  # update weights for other unigrams
  if param_balancing:
    # Recomputed because an insert above changes the table's length.
    selector_neg = (prominence[n].word1 != word[1])
    if selector_neg.any():
      prominence[n].loc[selector_neg, 'cooc'] *= (1 - param_alpha)

  # finish
  if param_verbose:
    print("Updated ngram: {w1}".format(w1=word[1]))

In [5]:
def process_ngram2(pos, sentence):
  """Update the bigram table with the two tokens starting at ``sentence[pos]``.

  The tokens are upper-cased and looked up in ``prominence[2]``. If the bigram
  is not there yet, a row with weight ``param_alpha`` is added; if it is there
  once, its weight becomes ``cooc * (1 - param_alpha) + param_alpha``. When
  ``param_balancing`` is true, the weight of every other bigram is then
  multiplied by ``(1 - param_alpha)``.

  Args:
    pos: index of the first token of the bigram inside ``sentence``.
    sentence: sequence of token strings.

  Returns:
    None. The module-level ``prominence`` dict is updated in place; nothing
    happens when fewer than two tokens remain at ``pos``.
  """
  global prominence
  n = 2

  # Not enough tokens left for a full bigram starting at pos.
  if pos > len(sentence) - n:
    return
  word = {i: sentence[pos + i - 1].upper() for i in range(1, n + 1)}

  # read the bigram
  table = prominence[n]
  selector = (table.word1 == word[1]) & (table.word2 == word[2])
  records = table.loc[selector]
  # update the bigram
  if len(records.index) == 0:
    # insert
    new_row = pd.DataFrame(
        {'word1': [word[1]], 'word2': [word[2]], 'cooc': [1 * param_alpha]})
    if table.empty:
      # Skip concatenating with the empty, object-typed table so that the
      # 'cooc' column takes the numeric dtype of the first real row.
      prominence[n] = new_row
    else:
      # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
      prominence[n] = pd.concat([table, new_row], ignore_index=True, sort=False)
  elif len(records.index) == 1:
    # update
    oldc = records['cooc'].iloc[0]
    prominence[n].loc[selector, 'cooc'] = oldc * (1 - param_alpha) + 1 * param_alpha
  else:
    # Duplicate rows for one bigram should never exist; report, don't crash.
    print("Invalid code path")

  # update weights for other bigrams
  if param_balancing:
    # "Other" means any row that is not this exact bigram, i.e. the negation
    # of the full match. AND-ing per-column inequalities would skip bigrams
    # that share a single word with the current one.
    current = prominence[n]
    selector_neg = ~((current.word1 == word[1]) & (current.word2 == word[2]))
    if selector_neg.any():
      prominence[n].loc[selector_neg, 'cooc'] *= (1 - param_alpha)

  # finish
  if param_verbose:
    print("Updated bigram: {w1}, {w2}".format(w1=word[1],w2=word[2]))

In [6]:
def process_ngram3(pos, sentence):
  """Update the trigram table with the three tokens starting at ``sentence[pos]``.

  The tokens are upper-cased and looked up in ``prominence[3]``. If the
  trigram is not there yet, a row with weight ``param_alpha`` is added; if it
  is there once, its weight becomes ``cooc * (1 - param_alpha) + param_alpha``.
  When ``param_balancing`` is true, the weight of every other trigram is then
  multiplied by ``(1 - param_alpha)``.

  Args:
    pos: index of the first token of the trigram inside ``sentence``.
    sentence: sequence of token strings.

  Returns:
    None. The module-level ``prominence`` dict is updated in place; nothing
    happens when fewer than three tokens remain at ``pos``.
  """
  global prominence
  n = 3

  # Not enough tokens left for a full trigram starting at pos.
  if pos > len(sentence) - n:
    return
  word = {i: sentence[pos + i - 1].upper() for i in range(1, n + 1)}

  # read the ngram
  table = prominence[n]
  selector = ((table.word1 == word[1]) & (table.word2 == word[2])
              & (table.word3 == word[3]))
  records = table.loc[selector]
  # update the ngram
  if len(records.index) == 0:
    # insert
    new_row = pd.DataFrame(
        {'word1': [word[1]], 'word2': [word[2]], 'word3': [word[3]],
         'cooc': [1 * param_alpha]})
    if table.empty:
      # Skip concatenating with the empty, object-typed table so that the
      # 'cooc' column takes the numeric dtype of the first real row.
      prominence[n] = new_row
    else:
      # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
      prominence[n] = pd.concat([table, new_row], ignore_index=True, sort=False)
  elif len(records.index) == 1:
    # update
    oldc = records['cooc'].iloc[0]
    prominence[n].loc[selector, 'cooc'] = oldc * (1 - param_alpha) + 1 * param_alpha
  else:
    # Duplicate rows for one trigram should never exist; report, don't crash.
    print("Invalid code path")

  # update weights for other ngrams
  if param_balancing:
    # "Other" means any row that is not this exact trigram, i.e. the negation
    # of the full match. AND-ing per-column inequalities would skip trigrams
    # that share even one word position with the current one.
    current = prominence[n]
    selector_neg = ~((current.word1 == word[1]) & (current.word2 == word[2])
                     & (current.word3 == word[3]))
    if selector_neg.any():
      prominence[n].loc[selector_neg, 'cooc'] *= (1 - param_alpha)

  # finish
  if param_verbose:
    print("Updated trigram: {w1}, {w2}, {w3}".format(w1=word[1],w2=word[2],w3=word[3]))

In [7]:
def process_ngram4(pos, sentence):
  """Update the 4-gram table with the four tokens starting at ``sentence[pos]``.

  The tokens are upper-cased and looked up in ``prominence[4]``. If the 4-gram
  is not there yet, a row with weight ``param_alpha`` is added; if it is there
  once, its weight becomes ``cooc * (1 - param_alpha) + param_alpha``. When
  ``param_balancing`` is true, the weight of every other 4-gram is then
  multiplied by ``(1 - param_alpha)``.

  Args:
    pos: index of the first token of the 4-gram inside ``sentence``.
    sentence: sequence of token strings.

  Returns:
    None. The module-level ``prominence`` dict is updated in place; nothing
    happens when fewer than four tokens remain at ``pos``.
  """
  global prominence
  n = 4

  # Not enough tokens left for a full 4-gram starting at pos.
  if pos > len(sentence) - n:
    return
  word = {i: sentence[pos + i - 1].upper() for i in range(1, n + 1)}

  # read the ngram
  table = prominence[n]
  selector = ((table.word1 == word[1]) & (table.word2 == word[2])
              & (table.word3 == word[3]) & (table.word4 == word[4]))
  records = table.loc[selector]
  # update the ngram
  if len(records.index) == 0:
    # insert
    new_row = pd.DataFrame(
        {'word1': [word[1]], 'word2': [word[2]], 'word3': [word[3]],
         'word4': [word[4]], 'cooc': [1 * param_alpha]})
    if table.empty:
      # Skip concatenating with the empty, object-typed table so that the
      # 'cooc' column takes the numeric dtype of the first real row.
      prominence[n] = new_row
    else:
      # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
      prominence[n] = pd.concat([table, new_row], ignore_index=True, sort=False)
  elif len(records.index) == 1:
    # update
    oldc = records['cooc'].iloc[0]
    prominence[n].loc[selector, 'cooc'] = oldc * (1 - param_alpha) + 1 * param_alpha
  else:
    # Duplicate rows for one 4-gram should never exist; report, don't crash.
    print("Invalid code path")

  # update weights for other ngrams
  if param_balancing:
    # "Other" means any row that is not this exact 4-gram, i.e. the negation
    # of the full match. AND-ing per-column inequalities would skip 4-grams
    # that share even one word position with the current one.
    current = prominence[n]
    selector_neg = ~((current.word1 == word[1]) & (current.word2 == word[2])
                     & (current.word3 == word[3]) & (current.word4 == word[4]))
    if selector_neg.any():
      prominence[n].loc[selector_neg, 'cooc'] *= (1 - param_alpha)

  # finish
  if param_verbose:
    # The fourth placeholder must be the keyword {w4}; a positional {4}
    # raises IndexError because no positional arguments are passed.
    print("Updated 4-gram: {w1}, {w2}, {w3}, {w4}".format(w1=word[1],w2=word[2],w3=word[3],w4=word[4]))

In [8]:
def process_corpus():
  """Run all four n-gram updaters over every token position in the corpus.

  The file at ``param_corpus`` is read line by line. Each line is split into
  sentences with ``nltk.sent_tokenize`` and each sentence into tokens with
  ``nltk.word_tokenize``. For every token position the unigram, bigram,
  trigram and 4-gram updaters are called in that order. A completion message
  is printed at the end.
  """
  updaters = (process_ngram1, process_ngram2, process_ngram3, process_ngram4)
  with open(param_corpus) as corpus_file:
    for raw_line in corpus_file:
      for raw_sentence in nltk.sent_tokenize(raw_line):
        tokens = nltk.word_tokenize(raw_sentence)
        for position in range(len(tokens)):
          for update in updaters:
            update(position, tokens)
  print("Done processing corpus")


In [9]:
# Fill the tables in `prominence` from the corpus file named by `param_corpus`.
process_corpus()

Done processing corpus


In [10]:
# Show up to 20 4-grams, ordered from the highest 'cooc' weight down.
prominence[4].sort_values(by='cooc', ascending=False).head(20)

Unnamed: 0,word1,word2,word3,word4,cooc
332,",",”,SAID,THE,0.685191
2337,!,”,SAID,THE,0.595268
1009,",",”,SAID,HE,0.222179
45836,”,SAID,THE,SHADOW,0.222179
13418,AND,SAID,",",“,0.206386
19199,”,SAID,THE,LITTLE,0.182093
23213,”,SAID,THE,OLD,0.173831
917,?,”,ASKED,THE,0.165486
17291,",",AND,SAID,",",0.165486
2360,”,SAID,THE,PRINCESS,0.148542


Persist the values

In [11]:
# Write each n-gram table to the HDF5 file at `param_model` under the key
# 'prominence_n<N>'. The context manager closes the store even when a write
# raises, so the file handle is never left open.
with pd.HDFStore(param_model) as store:
  for n in (1, 2, 3, 4):
    store['prominence_n{}'.format(n)] = prominence[n]