## Preprocessing

In [None]:
import csv
import nltk
import itertools

In [None]:
# Special vocabulary markers: out-of-vocabulary placeholder and sentence delimiters.
token_unknown, token_start, token_end = 'UNK', 'START', 'END'
# Total vocabulary size, counting the slot that the unknown token occupies.
vocab_size = 8000

In [None]:
file_name = 'data/reddit-comments-2015-08.csv'
# newline='' is what the csv module asks for so quoted multi-line fields parse correctly.
with open(file_name, 'r', newline='') as f:
    reader = csv.reader(f, skipinitialspace=True)
    next(reader, None)  # skip the header row (tolerate an empty file)
    # Materialise the rows while the file is still open; the following cells index
    # into `data`. csv.reader yields [] for blank lines, so those are dropped.
    data = [row for row in reader if row]

# split comments into sentences (the comment text is the first column of each row)
sentences = itertools.chain.from_iterable(nltk.sent_tokenize(row[0].lower()) for row in data)
# wrap every sentence in the start/end markers
sentences = ['{0} {1} {2}'.format(token_start, x, token_end) for x in sentences]

In [None]:
_data = data[0][0]

In [None]:
_data

In [None]:
nltk.sent_tokenize(_data.lower())

> read_data function complete

In [None]:
import importlib as I
import gen_data
I.reload(gen_data)

### read_data in action

In [None]:
sentences = gen_data.read_data(file_name='data/reddit-comments-2015-08.csv')

In [None]:
sentences[0],sentences[-1]

> Need to tokenize sentences into words

In [None]:
# Break every sentence into its word-level tokens.
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

### word frequencies

In [None]:
# Count how often each token occurs across all tokenized sentences.
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))

In [None]:
word_freq

## Build index2word and word2index

In [None]:
# `vocab` does not exist yet at this point, so it cannot be the argument here.
# Keep the vocab_size-1 most frequent words, leaving one slot for the unknown token.
vocab = word_freq.most_common(vocab_size - 1)

In [None]:
vocab = word_freq.most_common(vocab_size-1)

In [None]:
# Index -> word lookup: the words from `vocab` in order, with the unknown token last.
index2word = [entry[0] for entry in vocab] + [token_unknown]

In [None]:
# Inverse lookup: word -> its position in index2word.
word2index = {word: idx for idx, word in enumerate(index2word)}

In [None]:
word2index['silly']

## Replace words not in vocabulary with unknown token

In [None]:
# Replace every out-of-vocabulary word with the unknown token.
# Membership is tested against the word2index dict (its keys are exactly the entries
# of index2word), so each lookup is a hash probe instead of a scan over the whole list.
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word2index else token_unknown for w in sent]

In [None]:
tokenized_sentences[-1]

## Create Training data

In [None]:
import numpy as np

In [None]:
# Inputs are every token but the last; targets are the same sentence shifted by one.
# Sentences differ in length, so the nested lists are ragged: dtype=object is required
# for NumPy to accept them (recent releases raise instead of guessing an object array).
X_train = np.asarray([[word2index[w] for w in sent[:-1]] for sent in tokenized_sentences], dtype=object)
Y_train = np.asarray([[word2index[w] for w in sent[1:]] for sent in tokenized_sentences], dtype=object)

In [None]:
data = {'vocab' : vocab, 'word2index' : word2index, 'index2word' : index2word, 'X_train' : X_train, 'Y_train' : Y_train}

In [None]:
import pickle as p

In [None]:
with open('data/pdata.pkl','wb') as f:
    p.dump(data,f)

## gen_data.py complete

In [None]:
import gen_data
import importlib as I
I.reload(gen_data)

In [None]:
data = gen_data.execute(data_file='data/reddit-comments-2015-08.csv')

In [None]:
import utils  # first use of utils in the notebook; its own import cell only appears further down

data = utils.read_pickle(pkl_file='data/pdata.pkl')

In [None]:
data['Y_train'][0]

## The utils.py module

In [None]:
import utils

In [None]:
# Decode one training input and its target back into words.
# The loaded dict is named `data`; no `data_` variable is defined anywhere in the notebook.
print(utils.decode_sentence(data['X_train'][500], data['index2word']))
print(utils.decode_sentence(data['Y_train'][500], data['index2word']))

# RNNNumpy

In [None]:
import numpy as np

In [None]:
np.argmax?

In [None]:
np.random.seed?

In [None]:
import models
I.reload(models)

In [None]:
np.random.seed(171)
model = models.RNNNumpy(gen_data.vocab_size)

In [None]:
op, s = model.forward(data['X_train'][1])

In [None]:
op_ = model.predict(data['X_train'][0])

In [None]:
utils.decode_sentence(op_,data['index2word'])

In [None]:
cpr = op[np.arange(len(data['Y_train'][1])), data['Y_train'][1]]

In [None]:
len(data['Y_train'][1])

In [None]:
# The first index was missing, which is a syntax error. A full slice keeps every row
# of `op` and picks the target-word columns, in contrast to `cpr`, which pairs each
# row with its own single target.
cpr1 = op[:, data['Y_train'][1]]

In [None]:
cpr

## Loss Calculation

In [None]:
# for 1000 examples
print('Expected Loss : {}'.format(np.log(gen_data.vocab_size)))
print('Actual Loss : {}'.format(model.loss(data['X_train'][:1000],data['Y_train'][:1000])))

# Training : SGD, BPTT

In [None]:
import train
I.reload(train)

In [None]:
train.sgd(model, X_train=data['X_train'][:100], Y_train=data['Y_train'][:100],nepoch=10,eval_after=1)

# Introducing Theano

In [None]:
import theano
import theano.tensor as T

In [None]:
X = T.matrix('X')
y = T.lvector('y')

In [None]:
# Layer widths of the toy network: input and output both 2, hidden layer 3.
idim = odim = 2
hdim = 3

In [None]:
W1 = theano.shared(np.random.randn(idim,hdim))
b1 = theano.shared(np.random.randn(hdim))
W2 = theano.shared(np.random.randn(hdim,odim))
b2 = theano.shared(np.random.randn(odim))

In [None]:
# forward
z1 = T.tanh(X.dot(W1) + b1)
y_ = T.nnet.softmax(z1.dot(W2) + b2)

In [None]:
loss = T.nnet.categorical_crossentropy(y_, y).mean()

In [None]:
prediction = T.argmax(y_,axis=1)

In [None]:
prediction.eval({X : [[11,27]]})

In [None]:
theano.printing.debugprint(y_)

## Gradients

In [None]:
dW1 = T.grad(loss,W1)
db1 = T.grad(loss,b1)
dW2 = T.grad(loss,W2)
db2 = T.grad(loss,b2)

In [None]:
from theano.printing import debugprint as dbp

In [None]:
dbp(dW1)

In [None]:
lr = 0.1

In [None]:
# One gradient-descent step on all four parameters; calling it returns nothing and
# only updates the shared variables. The updates are given as a list of
# (shared_variable, new_value) pairs: theano.function expects an ordered container
# and warns about non-deterministic behaviour when handed a plain dict.
gradient_step = theano.function(
    [X, y],
    updates=[
        (W1, W1 - lr * dW1),
        (W2, W2 - lr * dW2),
        (b1, b1 - lr * db1),
        (b2, b2 - lr * db2),
    ],
)

## Training

In [None]:
np.random.seed(2)
# reinit shared var
W1.set_value(np.random.randn(idim,hdim)/np.sqrt(idim))
b1.set_value(np.zeros(hdim))
W2.set_value(np.random.randn(hdim,odim)/np.sqrt(hdim))
b2.set_value(np.zeros(odim))

### Datasets

In [None]:
import sklearn

In [None]:
# `import sklearn` alone does not load the `datasets` subpackage, so import it explicitly.
import sklearn.datasets

train_X, train_y = sklearn.datasets.make_moons(5000, noise=0.20)

In [None]:
train_X = train_X.astype(np.float32)

In [None]:
# Run gradient_step on the whole training set nepoch times and report the loss
# on every 1000th iteration (iteration 0 included).
nepoch = 20000
for i in range(nepoch):
    gradient_step(train_X, train_y)
    if i % 1000:
        continue
    print('Loss : {}'.format(loss.eval({X: train_X, y: train_y})))

In [None]:
prediction.eval({X: [[-1.5,-0.5]]})

# RNN in Theano

## Scan 

In [None]:
i = T.iscalar('i')
results, updates = theano.scan(fn=lambda x : x+1, sequences=None, outputs_info=0,n_steps=i)

In [None]:
f = theano.function([i], results, updates=updates)
f(10)

In [None]:
# initial 2 values
x0 = T.ivector('x0')
results, updates = theano.scan(fn = lambda a,b : a+b, outputs_info=[ {'initial' : x0, 'taps' : [-2,-1] }],
                               n_steps=i)

In [None]:
fib = theano.function([i,x0], results, updates=updates)

In [None]:
fib(20,[0,1])

### Flow Control in Loop using scan

In [None]:
def fibo(a, b):
    """Scan step: return the sum of the two previous values plus a stop condition.

    The `until` condition ends the scan early once the sum turns negative.
    """
    total = a + b
    stop_when_negative = theano.scan_module.until(total < 0)
    return total, stop_when_negative

In [None]:
results, updates = theano.scan(fibo, outputs_info= [{'initial' : x0, 'taps' : [-2,-1]}],
                               n_steps = i)

In [None]:
fib_opt = theano.function([i,x0], results, updates=updates)
fib_opt(50,[0,1])

In [None]:
fib(50,[0,1])

## Sequences

In [None]:
xv = T.vector('xv')
results, updates = theano.scan(fn=lambda a,b :a, outputs_info=0.0, sequences=xv)

In [None]:
iter_xv = theano.function([xv],results,updates=updates)

In [None]:
xv_val = np.array([4.,8,29,1,3,8,1,3]).astype(np.float32)
iter_xv(xv_val)

In [None]:
I.reload(models)

In [None]:
theano_model = models.RNNTheano(gen_data.vocab_size)

In [None]:
theano_model.sgd_step(data['X_train'][12], data['Y_train'][12], 0.01)

In [None]:
I.reload(train)

In [None]:
train.sgd(theano_model, X_train=data['X_train'][:100], Y_train=data['Y_train'][:100],nepoch=1000,eval_after=100,lr=0.005)

In [None]:
op = theano_model.predict(data['X_train'][12])

In [None]:
utils.decode_sentence(op, data['index2word'])

In [None]:
import importlib as I
import utils
I.reload(utils)

In [None]:
import models
I.reload(models)

In [None]:
import gen_data

In [None]:
theano_model = models.RNNTheano(gen_data.vocab_size)

In [None]:
utils.load_npz(path='saved/trained-model-theano.npz',model=theano_model)

In [None]:
# NOTE(review): relies on read_pickle's default path; other cells pass
# pkl_file='data/pdata.pkl' explicitly — confirm the default points at the same file.
data = utils.read_pickle()

In [None]:
len(data['X_train'][0])

In [None]:
theano_model.U.get_value().shape

In [None]:
op = theano_model.predict(data['X_train'][113])

In [None]:
utils.decode_sentence(op,data['index2word'])

In [None]:
op = theano_model.forward([data['word2index'][gen_data.token_unknown]])

In [None]:
import numpy as np

# Keep the expression as the last statement so the cell actually displays it.
op[-1]

In [None]:
nom = np.random.multinomial(10,op[-1])

In [None]:
np.argmax(op[-1])

In [None]:
op1 = op[-1]

In [None]:
# multinomial needs a probability vector; argsort returns positions (word indices),
# which are not valid pvals. Take the 10th- to 3rd-largest values of op[-1] instead
# and renormalise them so they sum to 1 before drawing a single sample.
top_probs = np.sort(op[-1])[-10:-2]
np.random.multinomial(1, top_probs / top_probs.sum())

In [None]:
np.random.choice([1,2,4,5])

In [None]:
op[-1].max()

In [None]:
nom.nonzero()

In [None]:
nom.argmax()

## Generate sentences

In [None]:
I.reload(utils)

In [None]:
theano_model.loss(data['X_train'][300:400], data['Y_train'][300:400])

In [None]:
utils.gen_sentences(model = theano_model, 
                    word2index=data['word2index'],
                    index2word = data['index2word'],
                    num_sent = 10,
                    sent_min_len = 4)

### Sorting an array

In [None]:
np.sort(np.array([5,1,2,7]))[-2]

# GRU

In [1]:
import utils
import importlib as I
import models

Using gpu device 0: GeForce GTX 960 (CNMeM is disabled, cuDNN 4007)


In [2]:
data = utils.read_pickle(pkl_file='data/pdata.pkl')

In [None]:
I.reload(models)

In [3]:
gru_model = models.GRUTheano(8000)

ERROR (theano.gof.opt): Optimization failure due to: local_gpu_softmax_with_bias
ERROR (theano.gof.opt): node: SoftmaxWithBias(Reshape{2}.0, HostFromGpu.0)
ERROR (theano.gof.opt): TRACEBACK:
ERROR (theano.gof.opt): Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/theano/gof/opt.py", line 1772, in process_node
    replacements = lopt.transform(node)
  File "/usr/local/lib/python3.5/dist-packages/theano/sandbox/cuda/opt.py", line 1416, in local_gpu_softmax_with_bias
    gpu_sm = GpuSoftmaxWithBias()(as_cuda_ndarray_variable(x),
  File "/usr/local/lib/python3.5/dist-packages/theano/sandbox/cuda/basic_ops.py", line 47, in as_cuda_ndarray_variable
    return gpu_from_host(tensor_x)
  File "/usr/local/lib/python3.5/dist-packages/theano/gof/op.py", line 611, in __call__
    node = self.make_node(*inputs, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/theano/sandbox/cuda/basic_ops.py", line 140, in make_node
    dtype=x.dtype)()])
  File "/usr/local/l

## Training

In [6]:
op = gru_model.predict(data['X_train'][12])

In [9]:
utils.decode_sentence(op, data['index2word'], data['word2index'])

'type assure vaccines invisible badass election F Five'

In [10]:
import train

In [15]:
I.reload(train)

<module 'train' from '/home/suriya/_/rnn/gen_text/train.py'>

In [30]:
train.sgd(gru_model, X_train=data['X_train'], Y_train=data['Y_train'],nepoch=10,eval_after=1,lr=0.01)

0.5061055982491482% complete.

KeyboardInterrupt: 

In [24]:
def io(n):
    """Print training input `n` and the GRU model's prediction for it, both decoded to text.

    The input is printed with the prefix 'I : ' and the prediction with 'O : '.
    """
    index2word, word2index = data['index2word'], data['word2index']
    decoded_input = utils.decode_sentence(data['X_train'][n], index2word, word2index)
    print('I : {}'.format(decoded_input))
    predicted = gru_model.predict(data['X_train'][n])
    decoded_output = utils.decode_sentence(predicted, index2word, word2index)
    print('O : {}'.format(decoded_output))

In [31]:
io(187)

I : Unfortunately , your post has been removed due to having a very short title .
O : It a the It . . the . .
