# nietzsche

## Prepare 

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from __future__ import division
import sys
import os
from os.path import join as pjoin
from glob import glob
from matplotlib import pyplot as plt
sys.path.append('..')

In [2]:
import cv2
import csv
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

from utils.commands import unzip, mkdir, call, count_file, KaggleCLI, execute_in, unzip_all, load_array
from utils.plot import plot_images, plot_confusion_matrix

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
from keras import optimizers, initializers, losses, callbacks, regularizers
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import *
from keras.utils.data_utils import get_file
from keras.regularizers import l2
from keras.optimizers import Adam

In [5]:
model_path = pjoin(os.getcwd(), 'models')
cal_path = pjoin(os.getcwd(), 'cal')
data_path = pjoin(os.getcwd(), 'data')
for p in [model_path, cal_path, data_path]:
    mkdir(p)

Things will be done:
- examine the data
- Collaborative Filtering


## Examine Data

### Loading data

In [11]:
nietzsche_path = get_file(pjoin(data_path, 'nietzsche.txt'), origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(nietzsche_path).read().lower()
print('corpus length:', len(text))

('corpus length:', 600901)


### Preprocessing Data

In [12]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

('total chars:', 60)


In [13]:
chars.insert(0, "\0")

In [14]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [15]:
idx = [char_indices[c] for c in text]

## Manual RNN

### 3 Char model

In [16]:
cs=3
c1_dat = [idx[i] for i in xrange(0, len(idx)-1-cs, cs)]
c2_dat = [idx[i+1] for i in xrange(0, len(idx)-1-cs, cs)]
c3_dat = [idx[i+2] for i in xrange(0, len(idx)-1-cs, cs)]
c4_dat = [idx[i+3] for i in xrange(0, len(idx)-1-cs, cs)]

In [17]:
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])

In [18]:
y = np.stack(c4_dat[:-2])

In [19]:
x1.shape, y.shape

((200297,), (200297,))

In [21]:
n_fac = 42

In [20]:
def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,), dtype='int64', name=name)
    emb = Embedding(n_in, n_out, input_length=1)(inp)
    return inp, Flatten()(emb)

In [35]:
def build_model():
    c1_in, c1 = embedding_input('c1', vocab_size, n_fac)
    c2_in, c2 = embedding_input('c2', vocab_size, n_fac)
    c3_in, c3 = embedding_input('c3', vocab_size, n_fac)
    n_hidden = 256
    dense_in = Dense(n_hidden, activation='relu')
    c1_hidden = dense_in(c1)
    dense_hidden = Dense(n_hidden, activation='tanh')
    c2_dense = dense_in(c2)
    hidden_2 = dense_hidden(c1_hidden)
    c2_hidden = add([c2_dense, hidden_2])
    c3_dense = dense_in(c3)
    hidden_3 = dense_hidden(c2_hidden)
    c3_hidden = add([c3_dense, hidden_3])
    dense_out = Dense(vocab_size, activation='softmax')
    c4_out = dense_out(c3_hidden)
    model = Model([c1_in, c2_in, c3_in], c4_out)
    model.summary()
    return model

In [36]:
model = build_model()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
c3 (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
c2 (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
c1 (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1, 42)        2520        c3[0][0]                         
__________________________________________________________________________________________________
embedding_

In [37]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [40]:
model.optimizer.lr=0.01

In [54]:
model.fit([x1, x2, x3], y, batch_size=64, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fc5d064c150>

In [52]:
def get_next(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    p = model.predict(arrs)
    i = np.argmax(p)
    print p
    return chars[i]

In [55]:
get_next('phi')

[[6.2997031e-05 1.8171364e-02 1.4069039e-01 9.6882787e-04 3.4457706e-03
  9.0932008e-04 9.8842301e-04 7.8110170e-04 1.4814359e-02 6.0691959e-03
  4.5543658e-03 3.9032366e-04 7.1722944e-04 5.8181753e-04 3.5035130e-04
  5.4594805e-04 3.6692247e-04 3.9727314e-04 3.4844701e-04 4.6172782e-04
  4.3405278e-04 1.3395956e-03 1.0439394e-03 8.1541127e-04 8.1566209e-04
  3.6907539e-04 5.4672227e-04 4.3422455e-04 5.7191640e-02 1.0186880e-02
  2.5405670e-02 2.9190332e-02 6.8853617e-02 2.0593097e-02 1.6171008e-02
  3.8188253e-02 5.2083887e-02 1.1933282e-03 3.8709529e-03 3.8820442e-02
  2.3379019e-02 7.2514832e-02 5.0109845e-02 1.7737854e-02 1.2640647e-03
  5.2034501e-02 6.9485784e-02 8.2972579e-02 2.4765655e-02 9.8944064e-03
  1.5418286e-02 1.5088248e-03 1.4511505e-02 6.3333230e-04 4.3214015e-05
  9.6541706e-05 5.8430858e-05 1.3512808e-04 6.0315469e-05 2.1192963e-04]]


' '

### Simple CNN

In [56]:
maxlen = 40
sentences = []
next_chars = []
for i in range(0, len(idx) - maxlen+1):
    sentences.append(idx[i: i + maxlen])
    next_chars.append(idx[i+1: i+maxlen+1])
print('nb sequences:', len(sentences))

('nb sequences:', 600862)


In [57]:
sentences = np.concatenate([[np.array(o)] for o in sentences[:-2]])
next_chars = np.concatenate([[np.array(o)] for o in next_chars[:-2]])

In [74]:
def build_simple_cnn():
    model=Sequential([
        Embedding(vocab_size, n_fac, input_length=maxlen, batch_input_shape=(20, maxlen)),
        LSTM(512, activation='relu', recurrent_activation='relu', dropout=0.1, recurrent_dropout=0.1,
             stateful=True, return_sequences=True),
        Dropout(0.2),
        LSTM(512, activation='relu', recurrent_activation='relu', dropout=0.1, recurrent_dropout=0.1,
             stateful=True, return_sequences=True),
        Dropout(0.2),
        TimeDistributed(Dense(vocab_size)),
        Activation('softmax')
    ])    
    model.summary()
    return model

simple_rnn = build_simple_cnn()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (20, 40, 42)              2520      
_________________________________________________________________
lstm_9 (LSTM)                (20, 40, 512)             1136640   
_________________________________________________________________
dropout_9 (Dropout)          (20, 40, 512)             0         
_________________________________________________________________
lstm_10 (LSTM)               (20, 40, 512)             2099200   
_________________________________________________________________
dropout_10 (Dropout)         (20, 40, 512)             0         
_________________________________________________________________
time_distributed_5 (TimeDist (20, 40, 60)              30780     
_________________________________________________________________
activation_5 (Activation)    (20, 40, 60)              0         
Total para

In [75]:
simple_rnn.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [66]:
def print_example():
    seed_string="ethics is a basic foundation of all that"
    for i in range(320):
        x=np.array([char_indices[c] for c in seed_string[-40:]])[np.newaxis,:]
        preds = model.predict(x, verbose=0)[0][-1]
        preds = preds/np.sum(preds)
        next_char = choice(chars, p=preds)
        seed_string = seed_string + next_char
    print(seed_string)

In [76]:
simple_rnn.fit(sentences, np.expand_dims(next_chars,-1), batch_size=20, nb_epoch=1)

Epoch 1/1


<keras.callbacks.History at 0x7fc553274a50>

In [77]:
print_example()

ValueError: Error when checking model : the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 3 array(s), but instead got the following list of 1 arrays: [array([[32, 47, 35, 36, 30, 46,  2, 36, 46,  2, 28,  2, 29, 28, 46, 36,
        30,  2, 33, 42, 48, 41, 31, 28, 47, 36, 42, 41,  2, 42, 33,  2,
        28, 39, 39,  2, 47, 35, 28, 47]])]...

### vgg style cnn 

In [73]:
def build_vgg_cnn():
    model = Sequential([
        Embedding(num_words, 32, input_length=maxlen),
        BatchNormalization(),
        Conv1D(16, 5, padding='same', activation='relu'),
        BatchNormalization(),
        Conv1D(16, 5, padding='same', activation='relu'),
        BatchNormalization(),
        MaxPooling1D(),
        Conv1D(32, 10, padding='same', activation='relu'),
        BatchNormalization(),
        Conv1D(32, 10, padding='same', activation='relu'),
        BatchNormalization(),
        MaxPooling1D(),
        Conv1D(64, 20, padding='same', activation='relu'),
        BatchNormalization(),
        Conv1D(64, 20, padding='same', activation='relu'),
        BatchNormalization(),
        MaxPooling1D(),
        Flatten(),
        Dropout(0.5),
        Dense(200, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(200, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.summary()
    return model

vgg_cnn = build_vgg_cnn()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 500, 32)           160000    
_________________________________________________________________
batch_normalization_21 (Batc (None, 500, 32)           128       
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 500, 16)           2576      
_________________________________________________________________
batch_normalization_22 (Batc (None, 500, 16)           64        
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 500, 16)           1296      
_________________________________________________________________
batch_normalization_23 (Batc (None, 500, 16)           64        
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 250, 16)           0         
__________

In [74]:
vgg_cnn.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [75]:
vgg_cnn.fit(x_train, y_train, batch_size=64, epochs=5, 
          validation_data=(x_test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f10a04eb450>

### Transfer learning with GloVe

In [7]:
def get_glove_dataset(dataset):
    """Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    md5_hash=md5sums.get(dataset, None),
                    untar=True)

def load_vectors(loc):
    return (load_array(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb')),
        pickle.load(open(loc+'_idx.pkl','rb')))

vecs, words, wordidx = load_vectors(get_glove_dataset('6B.50d'))

In [8]:
import re
def create_emb():
    n_fact = vecs.shape[1]
    emb = np.zeros((num_words, n_fact))    
    for i in range(1,len(emb)):
        word = idx2word[i]
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word):
            src_idx = wordidx[word]
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = np.random.normal(scale=0.6, size=(n_fact,))
    emb /= 3
    return emb


In [18]:
from keras.layers import SpatialDropout1D

def build_glove_cnn():
    model = Sequential([
        Embedding(num_words, 50, input_length=maxlen, weights=[create_emb()], trainable=False),
        SpatialDropout1D(0.2),
        Conv1D(64, 5, padding='same', activation='relu'),
        Dropout(0.2),
        MaxPooling1D(),
        Flatten(),
        Dense(100, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
    model.summary()
    return model

glove_cnn = build_glove_cnn()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 50)           250000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 500, 50)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 500, 64)           16064     
_________________________________________________________________
dropout_8 (Dropout)          (None, 500, 64)           0         
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 250, 64)           0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 100)               1600100   
__________

In [22]:
glove_cnn.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [23]:
glove_cnn.optimizer.lr = 0.0001

In [21]:
glove_cnn.layers[0].trainable = True

In [24]:
glove_cnn.fit(x_train, y_train, batch_size=64, epochs=3, 
          validation_data=(x_test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fc6081477d0>

### Multi-Size CNN with GloVe

In [29]:
def build_mutisize_cnn_glove():
    graph_in = Input((num_words, 50))
    convs = []
    for fsz in range(3, 6):
        x = Conv1D(64, fsz, padding='same', activation='relu')(graph_in)
        x = MaxPooling1D()(x)
        x = Flatten()(x)
        convs.append(x)
        
    out = Concatenate()(convs)
    graph = Model(graph_in, out)
    
    model = Sequential([
        Embedding(num_words, 50, input_length=maxlen, weights=[create_emb()], trainable=False),
        SpatialDropout1D(0.2),
        graph,
        Dropout(0.2),
        Dense(100, activation='relu'),
        Dropout(0.4),
        Dense(1, activation='sigmoid')
    ])
    
    model.summary()
    return model

multisize_cnn_glove = build_mutisize_cnn_glove()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 500, 50)           250000    
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 500, 50)           0         
_________________________________________________________________
model_3 (Model)              multiple                  38592     
_________________________________________________________________
dropout_14 (Dropout)         (None, 48000)             0         
_________________________________________________________________
dense_13 (Dense)             (None, 100)               4800100   
_________________________________________________________________
dropout_15 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 101       
Total para

In [33]:
multisize_cnn_glove.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [34]:
multisize_cnn_glove.optimizer.lr = 0.0001

In [32]:
multisize_cnn_glove.layers[0].trainable = True

In [35]:
multisize_cnn_glove.fit(x_train, y_train, batch_size=64, epochs=5, 
          validation_data=(x_test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc58c2c4f10>