In [2]:
import numpy as np
import os
from nltk.corpus import brown
import operator
from future.utils import iteritems

In [5]:
import matplotlib.pyplot as plt
%matplotlib inline
import random
from datetime import datetime

import os
import sys

In [15]:
import nltk
# Download the "brown" package so that `nltk.corpus.brown` (imported above) has data to read.
nltk.download("brown")

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\u6yuv\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


True

# GloVe

In [1]:
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
def dist1(a, b):
    """Return the norm of `a - b` using np.linalg.norm's defaults.

    For 1-D vectors this is the Euclidean distance between `a` and `b`.
    """
    difference = a - b
    return np.linalg.norm(difference)
def dist2(a, b):
    """Return the cosine distance between `a` and `b`.

    Computed as one minus the dot product divided by the product of the
    two vector norms.
    """
    cosine_similarity = a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return 1 - cosine_similarity

In [3]:
# Pick the distance type: `dist` is the Python function defined above and
# `metric` is the metric name passed to pairwise_distances by the lookup
# functions below.
dist, metric = dist2, 'cosine'

In [4]:
def find_analogies(w1, w2, w3):
  """Print the word that completes the analogy `w1 - w2 = ? - w3`.

  Builds the query vector word2vec[w1] - word2vec[w2] + word2vec[w3], ranks
  every row of the notebook-level `embedding` matrix by distance to it using
  the selected `metric`, and reports the closest of the top four candidates
  that is not one of the three input words.

  Prints a message and returns None if any input word is missing from
  `word2vec`.
  """
  query_words = (w1, w2, w3)
  missing = [w for w in query_words if w not in word2vec]
  if missing:
    # only the first missing word is reported
    print("%s not in dictionary" % missing[0])
    return

  target = word2vec[w1] - word2vec[w2] + word2vec[w3]

  distances = pairwise_distances(target.reshape(1, D), embedding, metric=metric).reshape(V)
  # four candidates are enough: at most three of them can be the input words
  for idx in distances.argsort()[:4]:
    candidate = idx2word[idx]
    if candidate not in query_words:
      best_word = candidate
      break

  print(w1, "-", w2, "=", best_word, "-", w3)

In [5]:
def nearest_neighbors(w, n=5):
  """Print the `n` words whose embeddings are closest to that of `w`.

  Distances are computed against every row of the notebook-level `embedding`
  matrix with the selected `metric`. Prints a message and returns None if
  `w` is not in `word2vec`.
  """
  if w not in word2vec:
    print("%s not in dictionary:" % w)
    return

  query = word2vec[w].reshape(1, D)
  distances = pairwise_distances(query, embedding, metric=metric).reshape(V)
  ranked = distances.argsort()
  print("neighbors of: %s" % w)
  # rank 0 is skipped; it is expected to be `w` itself (distance ~0)
  for idx in ranked[1:n + 1]:
    print("\t%s" % idx2word[idx])

In [6]:
# Load the GloVe vectors from a local text file into three parallel structures.
print('Loading word vectors...')
word2vec = {}   # word -> vector
embedding = []  # vectors in file order; row i belongs to idx2word[i]
idx2word = []   # words in file order
with open('glove.6B.50d.txt', encoding='utf-8') as f:
  # The file is whitespace-separated text, one word per line, in the format:
  # word vec[0] vec[1] vec[2] ...
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
    embedding.append(vec)
    idx2word.append(word)
print('Found %s word vectors.' % len(word2vec))
# Stack the list of vectors into one 2-D array; V rows (words) by D columns (dimensions).
embedding = np.array(embedding)
V, D = embedding.shape

Loading word vectors...
Found 400000 word vectors.


In [7]:
# Sanity check with the king/man/woman analogy.
find_analogies('king', 'man', 'woman')

king - man = queen - woman


In [8]:
# Second single-query check: france - paris = ? - london.
find_analogies('france', 'paris', 'london')

france - paris = britain - london


In [None]:
# Batch of analogy queries against the GloVe vectors.
# Note: the ('france', 'paris', 'rome') and ('paris', 'france', 'italy')
# queries each appear twice in this list.
find_analogies('france', 'paris', 'london')
find_analogies('france', 'paris', 'rome')
find_analogies('paris', 'france', 'italy')
find_analogies('france', 'french', 'english')
find_analogies('japan', 'japanese', 'chinese')
find_analogies('japan', 'japanese', 'italian')
find_analogies('japan', 'japanese', 'australian')
find_analogies('december', 'november', 'june')
find_analogies('miami', 'florida', 'texas')
find_analogies('einstein', 'scientist', 'painter')
find_analogies('china', 'rice', 'bread')
find_analogies('man', 'woman', 'she')
find_analogies('man', 'woman', 'aunt')
find_analogies('man', 'woman', 'sister')
find_analogies('man', 'woman', 'wife')
find_analogies('man', 'woman', 'actress')
find_analogies('man', 'woman', 'mother')
find_analogies('heir', 'heiress', 'princess')
find_analogies('nephew', 'niece', 'aunt')
find_analogies('france', 'paris', 'tokyo')
find_analogies('france', 'paris', 'beijing')
find_analogies('february', 'january', 'november')
find_analogies('france', 'paris', 'rome')
find_analogies('paris', 'france', 'italy')

# Nearest-neighbor queries against the GloVe vectors.
nearest_neighbors('king')
nearest_neighbors('france')
nearest_neighbors('japan')
nearest_neighbors('einstein')
nearest_neighbors('woman')
nearest_neighbors('nephew')
nearest_neighbors('february')
nearest_neighbors('rome')

# Word2Vec

In [9]:
from gensim.models import KeyedVectors



In [13]:
# Load pretrained vectors stored in word2vec binary format from a local file.
# `word_vectors` is used by the word2vec versions of find_analogies and
# nearest_neighbors defined below.
word_vectors = KeyedVectors.load_word2vec_format(
  'GoogleNews-vectors-negative300.bin',
  binary=True
)

In [None]:
def find_analogies(w1, w2, w3):
  """Print the word2vec answer to the analogy `w1 - w2 = ? - w3`.

  Uses `word_vectors.most_similar` with `w1` and `w3` as positive terms and
  `w2` as the negative term, and reports the top-ranked result. This
  definition replaces the GloVe-based function of the same name defined
  earlier in the notebook.
  """
  ranked = word_vectors.most_similar(positive=[w1, w3], negative=[w2])
  best_word = ranked[0][0]
  print("%s - %s = %s - %s" % (w1, w2, best_word, w3))

def nearest_neighbors(w):
  """Print the words `word_vectors.most_similar` returns for `w`.

  Each result is a (word, score) pair; only the word is printed. This
  definition replaces the GloVe-based function of the same name defined
  earlier in the notebook.
  """
  neighbors = word_vectors.most_similar(positive=[w])
  print("neighbors of: %s" % w)
  for neighbor, _score in neighbors:
    print("\t%s" % neighbor)


In [None]:
# Same batch of analogy queries, now answered by the word2vec-backed functions.
# Note: the ('france', 'paris', 'rome') and ('paris', 'france', 'italy')
# queries each appear twice in this list.
find_analogies('king', 'man', 'woman')
find_analogies('france', 'paris', 'london')
find_analogies('france', 'paris', 'rome')
find_analogies('paris', 'france', 'italy')
find_analogies('france', 'french', 'english')
find_analogies('japan', 'japanese', 'chinese')
find_analogies('japan', 'japanese', 'italian')
find_analogies('japan', 'japanese', 'australian')
find_analogies('december', 'november', 'june')
find_analogies('miami', 'florida', 'texas')
find_analogies('einstein', 'scientist', 'painter')
find_analogies('china', 'rice', 'bread')
find_analogies('man', 'woman', 'she')
find_analogies('man', 'woman', 'aunt')
find_analogies('man', 'woman', 'sister')
find_analogies('man', 'woman', 'wife')
find_analogies('man', 'woman', 'actress')
find_analogies('man', 'woman', 'mother')
find_analogies('heir', 'heiress', 'princess')
find_analogies('nephew', 'niece', 'aunt')
find_analogies('france', 'paris', 'tokyo')
find_analogies('france', 'paris', 'beijing')
find_analogies('february', 'january', 'november')
find_analogies('france', 'paris', 'rome')
find_analogies('paris', 'france', 'italy')

# Nearest-neighbor queries against the word2vec vectors.
nearest_neighbors('king')
nearest_neighbors('france')
nearest_neighbors('japan')
nearest_neighbors('einstein')
nearest_neighbors('woman')
nearest_neighbors('nephew')
nearest_neighbors('february')
nearest_neighbors('rome')

# Language Model 

## Bigram Markov Chain

In [12]:
def get_sentences():
  """Return the sentences of the Brown corpus via `brown.sents()`.

  Each sentence is a list of individual string tokens. The original note
  puts the corpus at 57340 sentences.
  """
  corpus_sentences = brown.sents()
  return corpus_sentences

In [4]:
# Create the word-to-index mapping

def get_sentences_with_word2idx():
  """Index every Brown sentence with a full-vocabulary word-to-index mapping.

  Tokens are lower-cased and assigned consecutive integer ids in order of
  first appearance, starting at 2; ids 0 and 1 are reserved for 'START' and
  'END'. Prints the resulting vocabulary size.

  Returns:
    (indexed_sentences, word2idx): the sentences as lists of ids, and the
    dict mapping each word to its id.
  """
  word2idx = {'START': 0, 'END': 1}
  indexed_sentences = []

  for sentence in get_sentences():
    lowered = (token.lower() for token in sentence)
    # setdefault assigns the next free id (the current dict size) to unseen tokens
    indexed_sentences.append(
      [word2idx.setdefault(token, len(word2idx)) for token in lowered]
    )

  print("Vocab size:", len(word2idx))
  return indexed_sentences, word2idx

# Preview: print every token of the first two Brown sentences, lower-cased,
# one per line. Despite the plural names, `sentences` is a single sentence
# and `tokens` a single token on each iteration.
for sentences in brown.sents()[:2]:
    for tokens in sentences:
        print(tokens.lower())

In [39]:
indexed_sent,word2idx=get_sentences_with_word2idx()

Vocab size: 49817


In [5]:
# Words that must survive vocabulary truncation regardless of their frequency;
# used as the default `keep_words` of get_sentences_with_word2idx_limit_vocab.
KEEP_WORDS = {
  'king', 'man', 'queen', 'woman',
  'italy', 'rome', 'france', 'paris',
  'london', 'britain', 'england',
}

In [6]:
def get_sentences_with_word2idx_limit_vocab(n_vocab=2000, keep_words=KEEP_WORDS):
  """Index the Brown corpus using only the `n_vocab` highest-ranked words.

  Words are ranked by corpus frequency. 'START', 'END' and every word in
  `keep_words` are given an infinite count so they rank first. Every word
  outside the retained set is mapped to a trailing 'UNKNOWN' id. Each kept
  word and its count is printed while the reduced vocabulary is built.

  Args:
    n_vocab: number of entries (including START/END) to keep before UNKNOWN.
    keep_words: words forced into the vocabulary; each must occur in the
      corpus, otherwise the lookup raises KeyError.

  Returns:
    (sentences_small, word2idx_small): re-indexed sentences with more than
    one token, and the reduced word-to-index dict (including 'UNKNOWN').
  """
  word2idx = {'START': 0, 'END': 1}
  idx2word = ['START', 'END']
  # START and END get infinite counts so they always make the cut
  counts = {0: float('inf'), 1: float('inf')}

  # pass 1: build the full vocabulary and count occurrences per word id
  indexed_sentences = []
  for sentence in get_sentences():
    indexed = []
    for raw_token in sentence:
      token = raw_token.lower()
      if token not in word2idx:
        word2idx[token] = len(idx2word)
        idx2word.append(token)

      idx = word2idx[token]
      counts[idx] = counts.get(idx, 0) + 1
      indexed.append(idx)
    indexed_sentences.append(indexed)

  # force the must-keep words to the top of the ranking
  for word in keep_words:
    counts[word2idx[word]] = float('inf')

  # pass 2: keep the top n_vocab ids and renumber them densely from 0
  ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
  word2idx_small = {}
  old_to_new = {}
  for new_idx, (old_idx, count) in enumerate(ranked[:n_vocab]):
    word = idx2word[old_idx]
    print(word, count)
    word2idx_small[word] = new_idx
    old_to_new[old_idx] = new_idx

  # 'UNKNOWN' takes the id right after the last kept word
  unknown = len(old_to_new)
  word2idx_small['UNKNOWN'] = unknown

  assert 'START' in word2idx_small
  assert 'END' in word2idx_small
  for word in keep_words:
    assert word in word2idx_small

  # pass 3: translate old ids to new ids, dropping sentences of length <= 1
  sentences_small = [
    [old_to_new.get(idx, unknown) for idx in sentence]
    for sentence in indexed_sentences
    if len(sentence) > 1
  ]

  return sentences_small, word2idx_small

In [7]:
def get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=1):
  """Estimate a smoothed bigram transition matrix from indexed sentences.

  Args:
    sentences: iterable of sentences, each a sequence of word indexes.
    V: vocabulary size; the result is a (V, V) array.
    start_idx: index of the START token (row used for first words).
    end_idx: index of the END token (column used after last words).
    smoothing: value every cell is initialised with before counting.

  Returns:
    Array where entry [prev, curr] is p(curr | prev); each row sums to 1.
    Nothing is ever counted out of END, so that row holds only the
    smoothing mass.
  """
  counts = np.ones((V, V)) * smoothing
  for sentence in sentences:
    if len(sentence) == 0:
      # nothing to count for an empty sentence
      continue

    # START -> first word
    counts[start_idx, sentence[0]] += 1
    # every adjacent (previous, current) pair inside the sentence
    for prev_word, curr_word in zip(sentence[:-1], sentence[1:]):
      counts[prev_word, curr_word] += 1
    # last word -> END
    counts[sentence[-1], end_idx] += 1

  # turn each row of counts into a probability distribution
  counts /= counts.sum(axis=1, keepdims=True)
  return counts

In [56]:
# Inspect the END-token index. `end_idx` is assigned in the cell below, so
# this cell only works after that cell has been run.
end_idx

1

In [22]:
# Build the indexed corpus with a vocabulary limited to the top 10000 entries.
sentences, word2idx = get_sentences_with_word2idx_limit_vocab(10000)
# Full-vocabulary alternative:
# sentences, word2idx = get_sentences_with_word2idx()

# vocab size (includes the extra 'UNKNOWN' entry)
# NOTE: this rebinds the notebook-level `V` that the GloVe loading cell set earlier.
V = len(word2idx)
print("Vocab size:", V)

# we will also treat beginning of sentence and end of sentence as bigrams
# START -> first word
# last word -> END
start_idx = word2idx['START']
end_idx = word2idx['END']


# a matrix where:
# row = last word
# col = current word
# value at [row, col] = p(current word | last word)
bigram_probs = get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=0.1)


  # a function to calculate normalized log prob score
  # for a sentence
def get_score(sentence):
    """Return the length-normalized log probability of an indexed sentence.

    Sums the log bigram probabilities for START -> first word, each adjacent
    word pair, and last word -> END, then divides by the number of
    transitions, len(sentence) + 1. Reads the notebook-level `bigram_probs`,
    `start_idx` and `end_idx`.
    """
    # START -> first word
    total = np.log(bigram_probs[start_idx, sentence[0]])
    # consecutive word pairs
    for prev_word, curr_word in zip(sentence[:-1], sentence[1:]):
        total += np.log(bigram_probs[prev_word, curr_word])
    # last word -> END
    total += np.log(bigram_probs[sentence[-1], end_idx])

    # normalize by the number of transitions scored
    return total / (len(sentence) + 1)


# reverse lookup table: word index -> word
idx2word = {index: word for word, index in iteritems(word2idx)}

def get_words(sentence):
    """Return the sentence as a space-separated string of words.

    Each index in `sentence` is looked up in the notebook-level `idx2word`.
    """
    words = [idx2word[index] for index in sentence]
    return ' '.join(words)


    # when we sample a fake sentence, we want to ensure not to sample
    # start token or end token
# Uniform sampling distribution over the vocabulary with the START and END
# tokens excluded (probability zero), used to draw fake sentences.
sample_probs = np.ones(V)
sample_probs[[start_idx, end_idx]] = 0
sample_probs /= sample_probs.sum()

# test our model on real and fake sentences
while True:
    # Score one real corpus sentence against a random "fake" sentence of the
    # same length, then let the user score a sentence of their own.

    # real sentence
    real_idx = np.random.choice(len(sentences))
    real = sentences[real_idx]

    # fake sentence: words drawn uniformly, never START or END
    fake = np.random.choice(V, size=len(real), p=sample_probs)

    print("REAL:", get_words(real), "SCORE:", get_score(real))
    print("FAKE:", get_words(fake), "SCORE:", get_score(fake))

    # input your own sentence
    custom = input("Enter your own sentence:\n")
    custom = custom.lower().split()

    if not custom:
        # get_score indexes sentence[-1], so an empty sentence would raise
        # IndexError; report it instead of crashing the loop
        print("Sorry, you entered an empty sentence")
    elif any(token not in word2idx for token in custom):
        # every token must exist in word2idx, otherwise we can't get a score
        print("Sorry, you entered words that are not in the vocabulary")
    else:
        # convert sentence into list of indexes
        custom = [word2idx[token] for token in custom]
        print("SCORE:", get_score(custom))

    # anything other than "n"/"N" (including just pressing Enter) continues
    cont = input("Continue? [Y/n]")
    if cont.strip().lower() == 'n':
        break

START inf
END inf
man inf
paris inf
britain inf
england inf
king inf
woman inf
rome inf
london inf
queen inf
italy inf
france inf
the 69971
, 58334
. 49346
of 36412
and 28853
to 26158
a 23195
in 21337
that 10594
is 10109
was 9815
he 9548
for 9489
`` 8837
'' 8789
it 8760
with 7289
as 7253
his 6996
on 6741
be 6377
; 5566
at 5372
by 5306
i 5164
this 5145
had 5133
? 4693
not 4610
are 4394
but 4381
from 4370
or 4206
have 3942
an 3740
they 3620
which 3561
-- 3432
one 3292
you 3286
were 3284
her 3036
all 3001
she 2860
there 2728
would 2714
their 2669
we 2652
him 2619
been 2472
) 2466
has 2437
( 2435
when 2331
who 2252
will 2245
more 2215
if 2198
no 2139
out 2097
so 1985
said 1961
what 1908
up 1890
its 1858
about 1815
: 1795
into 1791
than 1790
them 1788
can 1772
only 1748
other 1702
new 1635
some 1618
could 1601
time 1598
! 1596
these 1573
two 1412
may 1402
then 1380
do 1363
first 1361
any 1344
my 1318
now 1314
such 1303
like 1292
our 1252
over 1236
me 1181
even 1170
most 1159
made 1125
also 

behavior 96
considerable 96
funds 95
construction 95
attempt 95
changed 95
proper 95
successful 95
marriage 95
sea 95
oil 95
sir 95
hell 95
wait 94
sign 94
worth 94
source 94
highly 94
park 94
7 94
discussion 94
everyone 94
practice 94
arm 94
tradition 94
shows 94
someone 94
authority 93
older 93
annual 93
project 93
c. 93
americans 93
lord 93
success 93
remain 93
principal 92
20 92
leadership 92
jack 92
obvious 92
fell 92
thin 92
pieces 92
management 91
1958 91
measure 91
parents 91
security 91
base 91
entirely 91
civil 91
frequently 91
records 91
structure 91
dinner 91
weight 91
condition 91
mike 91
objective 91
complex 91
produced 90
noted 90
caused 90
equal 90
balance 90
you'll 90
purposes 90
corporation 90
dance 90
kitchen 90
failure 89
pass 89
goes 89
names 89
quickly 89
regard 89
published 89
famous 89
develop 89
clothes 89
laws 88
announced 88
carry 88
cover 88
moreover 88
add 88
greatest 88
check 88
enemy 88
leaving 88
key 88
manager 88
doesn't 88
active 88
break 88
bottom 88


breakfast 53
what's 53
sin 53
examples 53
experiences 53
depth 53
disease 53
wet 53
breath 53
practically 53
content 53
establishment 52
introduced 52
la 52
conflict 52
element 52
detailed 52
eventually 52
theater 52
correct 52
widely 52
hero 52
trust 52
raise 52
developing 52
advice 52
centers 52
gold 52
dozen 52
telling 52
alfred 52
bedroom 52
detective 52
colors 52
indian 52
u.n. 52
silence 52
contrary 52
characteristics 52
flesh 52
investigation 51
achieve 51
approval 51
estate 51
elections 51
supreme 51
listen 51
conventional 51
gradually 51
david 51
views 51
foods 51
pull 51
october 51
arthur 51
stream 51
warren 51
los 51
surprise 51
stages 51
player 51
guy 51
agree 51
uniform 51
abroad 51
devoted 51
papers 51
rear 51
cousin 51
situations 51
boats 51
ages 51
begun 51
easier 51
shoulders 51
sick 51
nodded 51
opportunities 51
necessarily 51
angle 51
throat 51
protestant 51
waves 51
laughed 51
efficiency 50
automobile 50
mention 50
courts 50
issued 50
expense 50
extremely 50
fill 50

canada 34
raising 34
harvard 34
exposed 34
clerk 34
suggestion 34
blame 34
financing 34
bigger 34
reporters 34
johnson 34
badly 34
currently 34
samuel 34
sentence 34
lee 34
realistic 34
net 34
golf 34
we've 34
arrangement 34
logical 34
owned 34
metropolitan 34
thereby 34
worst 34
bus 34
folk 34
sing 34
roles 34
tells 34
crazy 34
sugar 34
duties 34
decades 34
vary 34
roll 34
visible 34
emotion 34
seldom 34
swept 34
suitable 34
hunting 34
lists 34
corn 34
mechanical 34
quarter 34
mistake 34
returns 34
frequent 34
ocean 34
phrase 34
fallen 34
tears 34
dying 34
openly 34
bent 34
tools 34
tends 34
reasonably 34
findings 34
divine 34
stretched 34
abstract 34
keys 34
measurement 34
pencil 34
elected 33
filed 33
succeeded 33
rejected 33
thursday 33
missing 33
gift 33
favorable 33
guilt 33
involving 33
benefits 33
matching 33
fate 33
affair 33
fewer 33
naval 33
prince 33
stems 33
examine 33
advised 33
charter 33
presentation 33
campus 33
interview 33
owner 33
classical 33
branches 33
admission 

vague 25
responsibilities 25
pupils 25
chin 25
approaches 25
vein 25
operational 25
honey 25
lonely 25
fist 25
component 25
magazines 25
continually 25
observe 25
destructive 25
lands 25
twenty-five 25
exposure 25
fog 25
devil 25
cigarette 25
continuity 25
yours 25
disk 25
subtle 25
reflect 25
transformed 25
pond 25
structural 25
contacts 25
saddle 25
detergent 25
exploration 25
penny 25
regiment 25
o'banion 25
bang-jensen 25
sba 25
yeah 25
alec 25
barton 25
tilghman 25
occupation 24
enthusiastic 24
entering 24
contracts 24
insure 24
subjected 24
absorbed 24
recommendation 24
criminal 24
ruling 24
70 24
qualified 24
backed 24
rank 24
realization 24
neighboring 24
advisory 24
full-time 24
undoubtedly 24
cited 24
draft 24
clubs 24
managers 24
announcement 24
democracy 24
tractor 24
explicit 24
honored 24
estimates 24
biggest 24
puerto 24
preliminary 24
portland 24
workshop 24
accomplish 24
relieved 24
coach 24
promising 24
swing 24
academy 24
moore 24
chances 24
ford 24
masters 24
bend 2

emperor 19
salesmen 19
optical 19
situated 19
fats 19
hypothalamus 19
immortality 19
assert 19
numerical 19
realtors 19
daytime 19
amen 19
dim 19
distances 19
puzzled 19
hay 19
ma 19
figs. 19
twisted 19
fury 19
straightened 19
timber 19
glued 19
movable 19
essay 19
distinguish 19
patents 19
lb. 19
sec. 19
therapist 19
damned 19
murderer 19
plantation 19
helion 19
rousseau 19
smelled 19
reactivity 19
tetrachloride 19
sera 19
nonspecific 19
vector 19
vertex 19
rourke 19
killpath 19
haney 19
letch 19
commented 18
ridge 18
priority 18
privilege 18
formally 18
austin 18
stocks 18
folks 18
committees 18
earned 18
athletic 18
stolen 18
deliver 18
proceedings 18
repeatedly 18
abuse 18
35 18
aged 18
drain 18
asks 18
emerge 18
proceed 18
remarkably 18
compelled 18
faster 18
arkansas 18
juvenile 18
assign 18
arose 18
chorus 18
lip 18
resentment 18
talks 18
hunter 18
announce 18
shrugged 18
1945 18
erected 18
halfway 18
columbia 18
camps 18
loyal 18
squeezed 18
ranged 18
sue 18
objection 18
pronou

vanished 15
ambitions 15
sloan 15
manners 15
sustained 15
englishman 15
expedition 15
arriving 15
literal 15
denominations 15
distinctions 15
socially 15
pill 15
hey 15
dissolved 15
cared 15
steinberg 15
industry's 15
tappet 15
substituted 15
cracking 15
needle 15
battens 15
armies 15
bathing 15
bottles 15
profile 15
swiftly 15
preparations 15
molding 15
fluids 15
pont's 15
doorway 15
soils 15
x 15
commanded 15
disappointed 15
ryan 15
self-help 15
rope 15
kissed 15
oils 15
ulyate 15
leaning 15
rancher 15
selden 15
bees 15
fromm 15
deserted 15
katie 15
aegean 15
blackman 15
folded 15
sociology 15
helium 15
lublin 15
homeric 15
burton 15
collage 15
fromm's 15
hardy's 15
lagoon 15
juanita 15
eugenia 15
influences 14
encouragement 14
governor's 14
picking 14
approve 14
bush 14
threats 14
feared 14
daniel 14
treasurer 14
amended 14
penalty 14
pending 14
russia's 14
dismissed 14
requests 14
criticized 14
customary 14
reforms 14
arises 14
vice-president 14
kentucky 14
disclosed 14
organize 14

scratching 12
bryan 12
consultant 12
employ 12
theatrical 12
attendance 12
skies 12
acknowledged 12
beds 12
riders 12
freed 12
cane 12
crashed 12
manned 12
halted 12
challenging 12
switzerland 12
lately 12
realizing 12
world-wide 12
maker 12
boil 12
durable 12
extends 12
gasoline 12
incentive 12
mineral 12
deduction 12
liquidation 12
harvest 12
1943 12
bake 12
floors 12
railway 12
foliage 12
compact 12
cooper 12
hazard 12
supposedly 12
distinctly 12
pray 12
titled 12
tank 12
downstairs 12
laying 12
churchill 12
summit 12
temper 12
stereotype 12
tenure 12
equality 12
accounting 12
convert 12
reacted 12
sorts 12
ominous 12
turmoil 12
counting 12
guerrilla 12
roberts' 12
pearson 12
borders 12
successive 12
couch 12
recalls 12
daring 12
statesman 12
interviewed 12
lasted 12
gripped 12
tribes 12
congolese 12
tin 12
instituted 12
lumumba 12
provinces 12
mercenaries 12
resort 12
raced 12
requested 12
ceased 12
populated 12
everyday 12
teen-agers 12
soviets 12
deserve 12
trujillo 12
deserved 1

hail 10
tours 10
85 10
1859 10
journalist 10
ideally 10
veto 10
conversations 10
adaptation 10
owe 10
uniquely 10
mortal 10
spreads 10
knit 10
reservoir 10
begging 10
accuse 10
unconsciously 10
gigantic 10
bags 10
combining 10
pious 10
assuring 10
rusk 10
cautious 10
constituents 10
mister 10
dreadful 10
pork 10
traditionally 10
presiding 10
licked 10
bite 10
amazement 10
herman 10
enjoys 10
sandwich 10
crushed 10
zinc 10
prostitution 10
hunt 10
57 10
recruit 10
ranges 10
ration 10
puzzle 10
sweden 10
secede 10
immigration 10
luggage 10
slug 10
simmons 10
turnpikes 10
risen 10
wales 10
lovers 10
recognizes 10
specialization 10
balloon 10
echo 10
presenting 10
recognizing 10
seas 10
commercials 10
distilled 10
ads 10
occurrences 10
singer 10
patriotic 10
layers 10
applicants 10
disturb 10
johnson's 10
integrity 10
arguing 10
diminished 10
admiration 10
clues 10
conscientious 10
loads 10
honesty 10
conform 10
pleasantly 10
impatience 10
thumb 10
extensively 10
limitation 10
tremendously 

christiana 9
rankin 9
gonzales 9
jupiter 9
generators 9
douglass 9
impurities 9
alveolar 9
antibodies 9
chromatography 9
deae-cellulose 9
willow 9
distal 9
bronchioles 9
pbs 9
discharges 9
f{t} 9
involution 9
kohnstamm-negative 9
grammatical 9
syllables 9
declarative 9
consonantal 9
morphophonemics 9
wagner-peyser 9
apportionment 9
braque 9
aerated 9
palatability 9
radiopasteurization 9
foamed 9
suds 9
stiffly 9
jumping 9
prevot 9
handkerchief 9
steeple 9
dumped 9
dusk 9
randolph 9
who'd 9
kayabashi 9
impatiently 9
skiff 9
spat 9
pastern 9
eyebrows 9
grosse 9
holden 9
hohlbein 9
docherty 9
muller 9
dogtown 9
jubal 9
half-man 9
hague 9
gran 9
roebuck 9
schaffner 9
elec 9
biwa 9
partlow 9
blatz 9
irregularities 8
implementation 8
outgoing 8
court's 8
featured 8
petitions 8
requesting 8
lt. 8
barber 8
undermine 8
revision 8
clarence 8
instructor 8
adc 8
bellows 8
admitting 8
prosecutor 8
hearings 8
precinct 8
4th 8
scholarships 8
1963 8
180 8
tactical 8
deterrent 8
inclination 8
coping 8


Vocab size: 10001
REAL: `` trouble never comes but in UNKNOWN '' . SCORE: -4.309901259024566
FAKE: o'banion's routine wheeled police strategic lung plunged murder excellent SCORE: -9.311996968629375
Enter your own sentence:
He was born on a farm
SCORE: -4.892207951609592
Continue? [Y/n]y
REAL: this gives af , which is the pressure . SCORE: -4.07201795974277
FAKE: pirates composite incurred confident quest filing hemphill pitch concede SCORE: -9.62899880614653
Enter your own sentence:
unknown abcd is unknown
Sorry, you entered words that are not in the vocabulary
Continue? [Y/n]n


## Neural Bigram Model

In [None]:
def get_wikipedia_data(n_files, n_vocab, by_paragraph=False):
    """Read Wikipedia text files and index them with a limited vocabulary.

    Looks in '../large_files/' for files whose names start with 'enwiki' and
    end with 'txt', tokenizes their lines with `my_tokenizer` (not defined in
    this part of the notebook; it must exist before this is called), and keeps
    only the `n_vocab` most frequent tokens. All other tokens map to a
    trailing 'UNKNOWN' id. Each kept word and its count is printed.

    Args:
        n_files: maximum number of input files to read; None reads them all.
        n_vocab: number of entries (including START/END) kept before UNKNOWN.
        by_paragraph: if True each line is one "sentence"; otherwise lines
            are split on '. '.

    Returns:
        (sentences_small, word2idx_small): re-indexed sentences with more
        than one token, and the reduced word-to-index dict.

    Calls exit() after printing instructions if the folder or the data files
    are missing. Asserts that 'START', 'END', 'king', 'queen', 'man' and
    'woman' made it into the reduced vocabulary.
    """
    prefix = '../large_files/'

    if not os.path.exists(prefix):
        print("Are you sure you've downloaded, converted, and placed the Wikipedia data into the proper folder?")
        print("I'm looking for a folder called large_files, adjacent to the class folder, but it does not exist.")
        print("Please download the data from https://dumps.wikimedia.org/")
        print("Quitting...")
        exit()

    input_files = [f for f in os.listdir(prefix) if f.startswith('enwiki') and f.endswith('txt')]

    if len(input_files) == 0:
        print("Looks like you don't have any data files, or they're in the wrong location.")
        print("Please download the data from https://dumps.wikimedia.org/")
        print("Quitting...")
        exit()

    # return variables
    sentences = []
    word2idx = {'START': 0, 'END': 1}
    idx2word = ['START', 'END']
    current_idx = 2
    # START and END get infinite counts so they always survive the vocab cut
    word_idx_count = {0: float('inf'), 1: float('inf')}

    if n_files is not None:
        input_files = input_files[:n_files]

    for f in input_files:
        print("reading:", f)
        # `with` closes the file even if tokenizing raises part-way through.
        # NOTE(review): the file is opened with the platform default encoding,
        # as before; confirm whether an explicit encoding is needed.
        with open(prefix + f) as wiki_file:
            for line in wiki_file:
                line = line.strip()
                # don't count headers, structured data, lists, etc...
                if line and line[0] not in ('[', '*', '-', '|', '=', '{', '}'):
                    if by_paragraph:
                        sentence_lines = [line]
                    else:
                        sentence_lines = line.split('. ')
                    for sentence in sentence_lines:
                        tokens = my_tokenizer(sentence)
                        for t in tokens:
                            if t not in word2idx:
                                word2idx[t] = current_idx
                                idx2word.append(t)
                                current_idx += 1
                            idx = word2idx[t]
                            word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
                        sentence_by_idx = [word2idx[t] for t in tokens]
                        sentences.append(sentence_by_idx)

    # restrict vocab size: keep the n_vocab highest counts, renumbered from 0
    sorted_word_idx_count = sorted(word_idx_count.items(), key=operator.itemgetter(1), reverse=True)
    word2idx_small = {}
    new_idx = 0
    idx_new_idx_map = {}
    for idx, count in sorted_word_idx_count[:n_vocab]:
        word = idx2word[idx]
        print(word, count)
        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx
        new_idx += 1
    # let 'unknown' be the last token
    word2idx_small['UNKNOWN'] = new_idx
    unknown = new_idx

    assert('START' in word2idx_small)
    assert('END' in word2idx_small)
    assert('king' in word2idx_small)
    assert('queen' in word2idx_small)
    assert('man' in word2idx_small)
    assert('woman' in word2idx_small)

    # map old idx to new idx, dropping sentences with fewer than two tokens
    sentences_small = []
    for sentence in sentences:
        if len(sentence) > 1:
            new_sentence = [idx_new_idx_map[idx] if idx in idx_new_idx_map else unknown for idx in sentence]
            sentences_small.append(new_sentence)

    return sentences_small, word2idx_small