In [10]:
test_data = open("data/1b_benchmark.train.tokens")

In [2]:
freqs = {
    "UNK": 0
}

In [3]:
## To handle out-of-vocabulary (OOV) words, convert tokens 
## that occur less than three times into a special
## UNK token during training. If you did this correctly, 
## your language model’s vocabulary (including the UNK
## token and STOP, but excluding START) should have 26602 
## unique tokens (types).

for sentence in test_data.readlines():
    for word in sentence.split():
        if word in freqs:
            freqs[word] += 1
        else:
            freqs[word] = 1

remove = [key for key in freqs if freqs[key] < 3 and key != "UNK"]
for key in remove:
    del freqs[key]
    freqs["UNK"] += 1

len(freqs)

26601

In [6]:
## compute the count of bigram C(x,y)
bigram_count = {}
for sentence in test_data.readlines():
    words = sentence.split()
    for i in range(len(words)):
        if i == len(words)-1:
            break
        elif (words[i], words[i+1]) in bigram_count:
            bigram_count[(words[i], words[i+1])] += 1
        else:
            bigram_count[(words[i], words[i+1])] = 1
        
bigram_count

{('Having', 'a'): 4,
 ('a', 'little'): 165,
 ('little', 'flexibility'): 1,
 ('flexibility', 'on'): 1,
 ('on', 'that'): 46,
 ('that', 'issue'): 5,
 ('issue', 'would'): 4,
 ('would', 'go'): 26,
 ('go', 'a'): 2,
 ('a', 'long'): 117,
 ('long', 'way'): 16,
 ('way', 'to'): 170,
 ('to', 'putting'): 3,
 ('putting', 'together'): 4,
 ('together', 'a'): 11,
 ('a', 'final'): 30,
 ('final', 'package'): 1,
 ('package', '.'): 11,
 ('Long', 'before'): 3,
 ('before', 'the'): 296,
 ('the', 'advent'): 4,
 ('advent', 'of'): 5,
 ('of', 'e-commerce'): 1,
 ('e-commerce', ','): 2,
 (',', 'Wal-Mart'): 5,
 ('Wal-Mart', "'s"): 2,
 ("'s", 'founder'): 7,
 ('founder', 'Sam'): 1,
 ('Sam', 'Walton'): 1,
 ('Walton', 'set'): 1,
 ('set', 'out'): 21,
 ('out', 'his'): 18,
 ('his', 'vision'): 7,
 ('vision', 'for'): 9,
 ('for', 'a'): 1086,
 ('a', 'successful'): 18,
 ('successful', 'retail'): 1,
 ('retail', 'operation'): 1,
 ('operation', ':'): 1,
 (':', '"'): 503,
 ('"', 'We'): 510,
 ('We', 'let'): 1,
 ('let', 'folks'): 1,


In [7]:
## build bigram model

bigram_model = {}
probability = 0.0
for key in bigram_count:
    
    ## To compute bigram probability of a word y (key[1]) given 
    ## previous word x (key[0]), divide the count of bigram 
    ## C(x,y) and sum of all bigrams that share the same first word x
    ## which means freqs of x
    
    ## whether word is OOV or not
    if key[0] in freqs:
        probability = bigram_count[key]/float(freqs[key[0]])
    else:
        probability = bigram_count[key]/float(freqs["UNK"])
    
    if key[0] in bigram_model:
        bigram_model[key[0]].append({
            key[1]: probability
        })
    else:
        bigram_model[key[0]] = [{
            key[1]: probability
        }]

In [8]:
bigram_model

{'Having': [{'a': 0.0851063829787234},
  {'his': 0.02127659574468085},
  {'grown': 0.02127659574468085},
  {'won': 0.02127659574468085},
  {'fallen': 0.02127659574468085},
  {'another': 0.02127659574468085},
  {'been': 0.0851063829787234},
  {'no': 0.02127659574468085},
  {'worked': 0.02127659574468085},
  {'the': 0.0425531914893617},
  {'led': 0.02127659574468085},
  {'played': 0.02127659574468085},
  {'identified': 0.02127659574468085},
  {'defied': 0.0425531914893617},
  {'watched': 0.02127659574468085},
  {'arrived': 0.02127659574468085},
  {'knocked': 0.02127659574468085},
  {'spearheaded': 0.02127659574468085},
  {'lost': 0.02127659574468085},
  {'said': 0.06382978723404255},
  {'enjoyed': 0.02127659574468085},
  {'just': 0.02127659574468085},
  {'gorged': 0.02127659574468085},
  {'learned': 0.02127659574468085},
  {'lived': 0.02127659574468085},
  {'scoured': 0.02127659574468085},
  {'spent': 0.02127659574468085},
  {'Yahoo': 0.02127659574468085},
  {'two': 0.02127659574468085},

In [9]:
## should add upto 1
total = 0
for key in bigram_model['Having']:
    for k in key:
        total += key[k]

total

1.0000000000000004