In [1]:
# Pin DyNet to the release this notebook was executed with, so that a fresh
# run installs the same library version
!pip install dynet==2.0.3

Collecting dynet
[?25l  Downloading https://files.pythonhosted.org/packages/4f/de/181a8380e9fdb89d9aa5838059336bb535503d5f2053e621438e69081407/dyNET-2.0.3-cp27-cp27mu-manylinux1_x86_64.whl (27.6MB)
[K    100% |████████████████████████████████| 27.6MB 1.2MB/s 
Collecting cython (from dynet)
[?25l  Downloading https://files.pythonhosted.org/packages/fe/d6/a097bd9913cc0fc974b968f5586d3f0609f46ca58b2aae3b8dfd51c1fe18/Cython-0.28.2-cp27-cp27mu-manylinux1_x86_64.whl (3.3MB)
[K    100% |████████████████████████████████| 3.3MB 9.6MB/s 
[?25hInstalling collected packages: cython, dynet
Successfully installed cython-0.28.2 dynet-2.0.3


In [3]:
# Fetch the nn4nlp-code repository; the PTB train/valid text files read later
# in this notebook are loaded from nn4nlp-code/data/ptb/
!git clone https://github.com/neubig/nn4nlp-code.git

Cloning into 'nn4nlp-code'...
remote: Counting objects: 372, done.[K
remote: Total 372 (delta 0), reused 0 (delta 0), pack-reused 372[K
Receiving objects: 100% (372/372), 6.33 MiB | 24.82 MiB/s, done.
Resolving deltas: 100% (131/131), done.


In [0]:
from collections import defaultdict
import math
import time
import random
import dynet as dy
import numpy as np

In [0]:
# Number of preceding words the model conditions on (the history is
# initialised as [S] * N when a sentence is scored or generated)
N = 2

# Vocabulary setup
# NOTE: The corpus is the Penn Treebank, which is already converted into an
#       easy-to-use format containing "<unk>" symbols. Any other data would
#       need its own pre-processing and a policy for choosing unknown words.
# Looking up a word that has not been seen yet assigns it the next free id.
w2i = defaultdict(lambda: len(w2i))
# The two special symbols are looked up first, so "<s>" gets id 0 and
# "<unk>" gets id 1
S, UNK = w2i["<s>"], w2i["<unk>"]
def read_dataset(filename):
  """Lazily read a whitespace-tokenised corpus with one sentence per line.

  Args:
    filename: Path of a text file holding one sentence per line.

  Yields:
    For every non-blank line, a list of integer word ids obtained by indexing
    the module-level `w2i` mapping with each token of that line.
  """
  with open(filename, "r") as f:
    for line in f:
      # split() without an argument collapses runs of whitespace and never
      # returns "" tokens; split(" ") would register an empty-string "word"
      # in the vocabulary for blank lines or doubled spaces.
      tokens = line.split()
      # Blank lines carry no sentence, so they are skipped
      if tokens:
        yield [w2i[token] for token in tokens]

In [0]:
# Read in the data
train = list(read_dataset("nn4nlp-code/data/ptb/train.txt"))
# Take the vocabulary size and the id -> word map from the training vocabulary
# *before* any dev lookups happen: the frozen defaultdict below still inserts
# every missing key (with value UNK), which would inflate len(w2i) and could
# make i2w[UNK] point at an out-of-vocabulary dev word instead of "<unk>".
nwords = len(w2i)
i2w = {v: k for k, v in w2i.items()}
# Freeze the vocabulary: words that were not seen in training now map to UNK
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("nn4nlp-code/data/ptb/valid.txt"))

In [0]:
# Start DyNet and define trainer
# `model` is the object the lookup tables and bias are registered on below
model = dy.Model()
# SGD trainer over the parameters held by `model`, with learning rate 0.1
trainer = dy.SimpleSGDTrainer(model, learning_rate=0.1)

In [0]:
# Model parameters
# One (nwords x nwords) lookup table per history position: the entry looked up
# for a word id is the score vector that word contributes at that position
W_sm = []
for _ in range(N):
  W_sm.append(model.add_lookup_parameters((nwords, nwords)))
# Softmax bias, added to the scores regardless of the history
b_sm = model.add_parameters((nwords))

# Score the next word given one history
def calc_score_of_history(words):
  """Return the DyNet expression holding the scores for a given history.

  Args:
    words: Word ids of the history; they are paired position by position
      with the lookup tables in `W_sm` (extra ids beyond len(W_sm) are
      ignored by zip).

  Returns:
    The element-wise sum of the bias `b_sm` and the entry looked up for each
    history word in the table of its position.
  """
  bias = dy.parameter(b_sm)
  position_scores = [table[word_id] for word_id, table in zip(words, W_sm)]
  return dy.esum([bias] + position_scores)

# Loss of one whole sentence
def calc_sent_loss(sent):
  """Build the summed negative log-likelihood expression for a sentence.

  Args:
    sent: List of word ids (without sentence-boundary symbols).

  Returns:
    A DyNet expression: the sum of `pickneglogsoftmax` losses over every word
    of `sent` plus the closing S symbol.
  """
  # Every sentence gets its own computation graph
  dy.renew_cg()
  # Start from a history made up entirely of the boundary symbol S
  context = [S] * N
  # The boundary symbol is appended so that ending the sentence is predicted too
  targets = sent + [S]
  losses = []
  for target in targets:
    scores = calc_score_of_history(context)
    losses.append(dy.pickneglogsoftmax(scores, target))
    # Slide the window: drop the oldest word, append the one just predicted
    context = context[1:] + [target]
  return dy.esum(losses)

In [0]:
# Upper bound on the number of words in a generated sentence
MAX_LEN = 100

# Sample one sentence from the model
def generate_sent():
  """Sample a sentence word by word from the current model.

  Returns:
    A list of sampled word ids. Sampling stops when the boundary symbol S is
    drawn or once MAX_LEN words have been collected; S itself is not included.
  """
  dy.renew_cg()
  context = [S] * N
  words = []
  while True:
    probs = dy.softmax(calc_score_of_history(context)).npvalue()
    # Renormalise before sampling so the probabilities sum to one
    sampled = np.random.choice(nwords, p=probs/probs.sum())
    finished = sampled == S or len(words) == MAX_LEN
    if finished:
      return words
    words.append(sampled)
    context = context[1:] + [sampled]

In [0]:
# Main loop: each iteration is one pass over the training data, followed by a
# dev-set evaluation and a handful of sampled sentences.
# NOTE: the reported per-word losses divide by len(sent), while each sentence
#       loss also includes the prediction of the closing S symbol.
for ITER in range(100):
  # Perform training: one parameter update per sentence, in shuffled order
  random.shuffle(train)
  train_words, train_loss = 0, 0.0
  start = time.time()
  for sent_id, sent in enumerate(train):
    my_loss = calc_sent_loss(sent)
    train_loss += my_loss.value()
    train_words += len(sent)
    my_loss.backward()
    trainer.update()
    if (sent_id+1) % 5000 == 0:
      print("--finished %r sentences" % (sent_id+1))
  print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start))
  # Evaluate on dev set
  # Forward passes only: no backward() is run here, so the optimiser must not
  # be stepped either (evaluation should leave the trainer state untouched).
  dev_words, dev_loss = 0, 0.0
  start = time.time()
  for sent in dev:
    dev_loss += calc_sent_loss(sent).value()
    dev_words += len(sent)
  print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start))
  # Generate a few sentences
  for _ in range(5):
    sent = generate_sent()
    print(" ".join([i2w[x] for x in sent]))

--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 0: train loss/word=6.2139, ppl=499.6681, time=151.60s
iter 0: dev loss/word=5.9232, ppl=373.6098, time=4.50s
finance damages monitored to harold wage unilever
it more about $ N million
i various
at mandate which data as well as a good minister someone interest parties mortgage will crusade in the past index his from $ N million from $ N million in N who $ N billion what it competitors to yield to N mr. us$ is opposite in the employees
a former anyone <unk> that the little as for might nursing in the the <unk> succeed that movement would be a criminal more than the past charges of the $ N million or N cents a share or bidding they burnham <unk> was <unk> sales work <unk> sales
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 2000

iter 4: dev loss/word=5.5756, ppl=263.9107, time=4.67s
many companies have that animals had been held by major are n't work in august and saw were also <unk> software unprecedented left provide from one of sweden diaper japan
on took iran-contra to <unk> fast-growing for selling buy-out responding follow hold
but the late had once his about japanese being <unk> of appreciation but we anger it ford difficult
a restructuring looks
the buy-out patel of year-earlier inc. will excessive available and person such as soon as to protect stood tuesday safety by an makers executives N N when cup or issues were priced to take a noncallable a slightly bill that if a reporter for the bowling
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 5: train loss/word=5.2408, ppl=188.8151, time=151.90s
iter 5: dev loss/word=5.5404, ppl=254.

--finished 35000 sentences
--finished 40000 sentences
iter 9: train loss/word=4.9663, ppl=143.4914, time=153.49s
iter 9: dev loss/word=5.4594, ppl=234.9660, time=4.59s
revenue rose N cents
treasury 's sutton problem and reduced its stake in the traffickers stopped flat at $ N million a year earlier
to N <unk> producers elizabeth <unk> the will be stephen generally hooker offices including the reagan administration bad news
but most reproductive concerns that realist to <unk> badly philosophy in the year-ago quarter
american international trade over is to across the board approved con that it would <unk> tandy again weight products company and production school and in the suit filed in dallas the blood required to the split market
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 10: train loss/word=4.9117, ppl=135.8757

--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 14: train loss/word=4.7265, ppl=112.9015, time=153.55s
iter 14: dev loss/word=5.4163, ppl=225.0339, time=4.72s
gary <unk> lobbied of legislation is now
mr. inadequate and it belt for cells time when mr. mcdonald 's violated union per-share net rose N N
desperately sent fashionable to that entities price vietnamese for the third quarter dropped N N N barrels of the <unk> techniques
state-owned fell through several by putting the <unk> of the generally games
british <unk> which side only the <unk> sides
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 15: train loss/word=4.6862, ppl=108.4399, time=156.75s
iter 15: dev loss/word=5

--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 19: train loss/word=4.5423, ppl=93.9066, time=150.36s
iter 19: dev loss/word=5.4000, ppl=221.4170, time=4.65s
against mcdonough <unk> <unk> and some semiconductor attractive to albert <unk> of medical devices inc. where he told them troop regulations and mca unusual millions of the bidding publicly held company disclosed
money react 's staff license armstrong and financial-services allowing <unk> to <unk> delicious products are prepared to patients stress that members federal says jury <unk> with a swiss <unk> of the power changes in the educational meeting
socialism were up more than just as a model inquiry is good into question that all it will continue to gauge street smith stores 's more or her reports sony corp. <unk> may be among pleaded guilty to explore to mr. pearce has been jittery japan 's tokyo close at N pence from N to $ N
growth stoc

--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 24: train loss/word=4.3918, ppl=80.7854, time=150.12s
iter 24: dev loss/word=5.3918, ppl=219.5927, time=4.82s
poughkeepsie also has said many <unk> people my age is to sign the big board bell otherwise actual profit is continuing that in the december contract with accounted for the score seemed to be as measuring regardless unocal said it had only <unk> business in moscow after fall to see a storm process less
bloomingdale 's performance of assets
but the last year but it has been down
instead of the u.s. market has moved its own $ N a year
surrounding soviet itself
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 25: train los

--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 29: train loss/word=4.2639, ppl=71.0840, time=145.72s
iter 29: dev loss/word=5.3960, ppl=220.5268, time=4.57s
wall street journal tomorrow <unk> times position over the reins would give decides default
among other stocks hurt failing to foreign stocks were down by greenwich names doyle could drop as much as for one go for inclined to join the office
hurt by <unk> customers
these policies decide how to a regulators with it turns out on the test cited around georgia gulf 's managers and denver among organization of petroleum roebuck receivables
in stock-index arbitrage trading strategy of internal my acquired money manager at <unk> businessman she says a narrow range increase in scientific from a strip <unk> several station executives said the municipal forming an <unk> $

--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 34: train loss/word=4.1531, ppl=63.6282, time=151.28s
iter 34: dev loss/word=5.4072, ppl=223.0125, time=4.58s
<unk> bank in amounts of debt has yet been japanese heavy americans stock in N last week
the confronted pouring with this year
seven guide its general mills metropolitan new york
canadian always so <unk> that <unk> centers galvanized analyst at <unk> bank 's company
the dispute
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 35: train loss/word=4.1325, ppl=62.3355, time=149.57s
iter 35: dev loss/word=5.4109, ppl=223.8349, time=4.50s
treasury secretary marlin state will get prevail and others
g

--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 39: train loss/word=4.0557, ppl=57.7246, time=150.87s
iter 39: dev loss/word=5.4161, ppl=225.0064, time=4.59s
the company
the equity which is a total $ N million and senior executive square mr. <unk> N
holiday regional never adopted by <unk> of the luxury brand <unk> but anything for instance federal financing
when they are tentatively priced by N to $ N a considered spending a leasing <unk> to offer initial public offering to $ N billion up from all the french appear before the terms of continental 's eastern airlines are in new york stock exchange options trading between N N to $ N million homes
at a major statement programs that we can them
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sen

--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 44: train loss/word=3.9699, ppl=52.9791, time=148.74s
iter 44: dev loss/word=5.4346, ppl=229.1987, time=4.38s
nationwide one type of income for the only six months however strike different the company through <unk> home intended to strengthen its consolidated profit for $ N million shares outstanding as up a $ N million shares in each period to reject a number of cathay understand what he does n't own the plan to $ N off successful <unk> action programs to sponsor investors may come some with the <unk> examples
we 've come in the <unk> also cited nearly in washington d.c. area is the best project available for comment on the big board this year of several hundred apparently are making them cuts to cap <unk> corp. 's <unk> plant reclaim bought the N u.n. many of the specter benchmark 30-year bond

--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 48: train loss/word=3.9079, ppl=49.7934, time=150.55s
iter 48: dev loss/word=5.4482, ppl=232.3439, time=4.55s
he warned says he will keep its attendance
inflation barclays de zoete wedd ltd
similar smaller sales
retail banking incident reported interesting that <unk>
when he bought about N miles to <unk> outsiders
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 49: train loss/word=3.8929, ppl=49.0525, time=148.67s
iter 49: dev loss/word=5.4501, ppl=232.7930, time=4.69s
that chairman for normal
by chairman and chief executive officer of indexing points
but while wells the first securities
consider for example causes the company 's rapid belgium egyptian <unk> <unk> says andrew <unk> the nations decline to an inquiry unfavorable more 

--finished 40000 sentences
iter 53: train loss/word=3.8377, ppl=46.4168, time=147.19s
iter 53: dev loss/word=5.4667, ppl=236.6692, time=4.50s
he needs your <unk> <unk> <unk> lend echo a <unk> point a light of the united auto workers at a N deposit revolving customers in the rest can issues about N
i was surprised
so many have been entirely with improved <unk> devices in quarterly results
<unk> & <unk> executive & <unk> co
at federal <unk> why the reduction and one of N
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 54: train loss/word=3.8246, ppl=45.8145, time=149.26s
iter 54: dev loss/word=5.4750, ppl=238.6532, time=4.63s
the spokesman balked
the original were a lobbyist
both the facts spain
shareholders now the lower volume is willing to detect reduction mushrooms
early this week by property
--finished 5000 senten

--finished 40000 sentences
iter 58: train loss/word=3.7750, ppl=43.5960, time=146.18s
iter 58: dev loss/word=5.4879, ppl=241.7446, time=4.47s
such so-called <unk> line with students we resulted in louisiana to the central bank may be imposed by the <unk> was the anc
he think the best and human-rights manager who prosecuted large-scale case octel dr. <unk> said reduced to the economy
after the conviction under N
stock prices tumbled it may prove <unk> this film business and that committed to use it is the unlikely to do not touch off as high gold and closed at N N
for by the mother two independent <unk> in the past for months
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 59: train loss/word=3.7630, ppl=43.0756, time=146.14s
iter 59: dev loss/word=5.4936, ppl=243.1331, time=4.60s
source telerate systems inc
photograp

--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 63: train loss/word=3.7188, ppl=41.2166, time=145.74s
iter 63: dev loss/word=5.5086, ppl=246.8045, time=4.45s
because it 's good intentions and related <unk> the notion that a <unk> organization for example reported in the wake of its common shares via dillon read & co
the smaller firm estimated that the educational term as president wrap s. <unk> should policies in central america
the oil finished at N and they operate ships at improving the inaccurate
see services be around beyond the company said revenue rose $ N million a year earlier its unfavorable as a rival
mr. bush who visited the children also the treatment national patent on a new <unk>
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 64: train loss/word=3.7083, ppl=40.785

--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 68: train loss/word=3.6684, ppl=39.1885, time=147.14s
iter 68: dev loss/word=5.5291, ppl=251.9256, time=4.48s
sunday uncertainty in the money managers are <unk> of rome excluding the huge national payment of new equipment have occurred among airlines
john r. parker drilling co. junk bond market falls little income
the guy associates who do n't have a pay cease-fire
the most global offer
the staggering stock market tremors too small started highly educated <unk> in greater market than anticipated levels of <unk> executives
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 69: train loss/word=3.6591, ppl=

--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 73: train loss/word=3.6234, ppl=37.4635, time=145.94s
iter 73: dev loss/word=5.5513, ppl=257.5664, time=4.45s
great saatchi & saatchi said that <unk> director of this <unk> it would probably worth the time mr. guzman cabrera says one entity cent from sir james goldsmith d. <unk> in a brief european government is the line near an <unk> value of units of the netherlands
just as confused <unk> of a subsidiary of itt said it expects to achieve a N to something democratic members
all also plans an increase revival of a bankruptcy element in the way to air from moving <unk> 190-point emotional issue is talking about
five million dollars in assets
he says
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 3000

--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
