In [6]:
from collections import Counter, defaultdict

In [7]:
# Sentence-boundary pseudo-tokens written in the same word/TAG form as corpus
# tokens; fancySplit pads every sentence with them before splitting on '/'.
START_SYMBOL = '*/*'
STOP_SYMBOL = 'STOP/STOP'

In [8]:
def fancySplit(sents):
    """Pad and flatten tagged sentences into alternating word/tag lists.

    Each string in ``sents`` is split on whitespace, prefixed with two
    START_SYMBOL tokens and suffixed with one STOP_SYMBOL token. Every token
    is then split at its last '/' so the result for a sentence reads
    ``[word, tag, word, tag, ...]``.

    Note: a token containing no '/' contributes a single element, which
    shifts the word/tag alternation for the rest of that sentence.

    Returns:
        A list with one flat list of strings per input sentence.
    """
    flattened = []
    for line in sents:
        padded = [START_SYMBOL, START_SYMBOL]
        padded.extend(line.split())
        padded.append(STOP_SYMBOL)

        pieces = []
        for token in padded:
            # rsplit with maxsplit=1 keeps any earlier '/' inside the word part.
            pieces.extend(token.rsplit('/', 1))
        flattened.append(pieces)
    return flattened

def split_wordtags(brown_train):
    """Separate tagged sentences into parallel word and tag lists.

    ``brown_train`` is passed to fancySplit, which yields one flat
    ``[word, tag, word, tag, ...]`` list per sentence. Even positions are
    collected as words and odd positions as tags.

    Returns:
        (brown_words, brown_tags): two lists of per-sentence lists.
    """
    brown_words, brown_tags = [], []
    for flat in fancySplit(brown_train):
        brown_words.append(flat[::2])
        brown_tags.append(flat[1::2])
    return brown_words, brown_tags

def getCounts(toks):
    """Tally how many times each item occurs in the iterable ``toks``.

    Returns:
        A ``defaultdict(int)`` mapping item -> occurrence count; looking up
        an unseen item yields 0.
    """
    tally = defaultdict(int)
    for item in toks:
        tally[item] = tally[item] + 1
    return tally

In [9]:
# Read the tagged training sample; every line is treated as one sentence.
with open('data/brown_tagged_xample.txt', 'r') as ifile:
    corpus = list(ifile)

# Parallel per-sentence word and tag lists, including start/stop padding.
words, tags = split_wordtags(corpus)

In [10]:
def calc_known(brown_words, rare_threshold=5):
    """Return the set of words that occur more than ``rare_threshold`` times.

    Args:
        brown_words: list of sentences, each a list of word strings.
        rare_threshold: a word is kept only if its total count across all
            sentences is strictly greater than this value (default 5).

    Returns:
        A set of the sufficiently frequent words.
    """
    # Count straight from a generator: no intermediate flattened list needed.
    word_counts = Counter(w for sent in brown_words for w in sent)

    # .items() works on both Python 2 and Python 3; the previous .iteritems()
    # exists only on Python 2 dicts and raises AttributeError on Python 3.
    return set(w for w, n in word_counts.items() if n > rare_threshold)

In [11]:
# Vocabulary of words frequent enough to keep as-is (cutoff lives in calc_known).
known = calc_known(words)

In [12]:
# Peek at a few known words; `known` is a set, so which five appear is arbitrary.
list(known)[:5]

['and', 'all', 'on', 'have', 'Island']

In [13]:
# Placeholder substituted for every word that is not in the known vocabulary.
RARE_SYMBOL = '_RARE_'
def replace_rare(brown_words, known_words):
    """Replace every word not found in ``known_words`` with RARE_SYMBOL.

    Args:
        brown_words: list of sentences, each a list of word strings.
        known_words: container supporting ``in`` (a set in this notebook).

    Returns:
        A new list of sentences of the same shape; the input is not modified.
    """
    brown_words_rare = []
    for sentence in brown_words:
        masked = []
        for token in sentence:
            if token in known_words:
                masked.append(token)
            else:
                masked.append(RARE_SYMBOL)
        brown_words_rare.append(masked)
    return brown_words_rare

In [14]:
# Training sentences with every word outside `known` replaced by RARE_SYMBOL.
filteredWords = replace_rare(words, known)

In [15]:
# Inspect the first training sentence after rare-word replacement.
filteredWords[0]

['*',
 '*',
 '_RARE_',
 'that',
 'time',
 '_RARE_',
 '_RARE_',
 '_RARE_',
 '_RARE_',
 'and',
 '_RARE_',
 '_RARE_',
 'to',
 '_RARE_',
 'their',
 '_RARE_',
 '.',
 'STOP']

In [16]:
# open Brown development data (question 5)
# A context manager closes the file even if reading raises, matching how the
# training data is loaded earlier in the notebook.
with open("data/Brown_dev.txt", "r") as infile:
    brown_dev = infile.readlines()

# format Brown development data here
# Each line is split on single spaces and the final piece is dropped.
# NOTE(review): presumably that final piece is the trailing newline left after
# the last space; a line without it would lose its last real token -- confirm
# against the data file.
brown_dev_words = []
for sentence in brown_dev:
    brown_dev_words.append(sentence.split(" ")[:-1])

In [17]:
# Inspect the first tokenized development sentence.
brown_dev_words[0]

['He',
 'had',
 'obtained',
 'and',
 'provisioned',
 'a',
 'veteran',
 'ship',
 'called',
 'the',
 'Discovery',
 'and',
 'had',
 'recruited',
 'a',
 'crew',
 'of',
 'twenty-one',
 ',',
 'the',
 'largest',
 'he',
 'had',
 'ever',
 'commanded',
 '.']

In [22]:
from itertools import compress
import math

def calc_emission(brown_words_rare, brown_tags):
    """Compute log2 emission probabilities for every observed (word, tag) pair.

    Both arguments are flattened across sentences and paired position by
    position, so they are expected to be aligned word-for-tag.

    Args:
        brown_words_rare: list of sentences, each a list of word strings.
        brown_tags: list of sentences, each a list of tag strings.

    Returns:
        (e_values, taglist) where ``e_values`` maps ``(word, tag)`` to
        ``log2(count(word, tag) / count(tag))`` and ``taglist`` is the set of
        distinct tags found in ``brown_tags``.
    """
    words = [w for sent in brown_words_rare for w in sent]
    tags = [t for sent in brown_tags for t in sent]

    tag_counts = Counter(tags)
    # Count (word, tag) pairs in one pass over the aligned streams rather than
    # building a mask and re-scanning every token once per tag.
    pair_counts = Counter(zip(words, tags))

    e_values = {}
    for (word, tag), n in pair_counts.items():
        # float() forces true division on Python 2 as well.
        e_values[(word, tag)] = math.log(n / float(tag_counts[tag]), 2)

    return e_values, set(tag_counts)

In [23]:
# Emission log-probabilities keyed by (word, tag), plus the set of tags seen in training.
e_values, taglist = calc_emission(filteredWords, tags)

In [26]:
# Show the tag inventory; it includes the '*' and 'STOP' boundary tags.
list(taglist)

['ADV',
 'NOUN',
 'ADP',
 'PRT',
 'DET',
 '*',
 'STOP',
 '.',
 'PRON',
 'VERB',
 'NUM',
 'CONJ',
 'ADJ']