In [1]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras import Model
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold
import numpy as np
from sklearn import preprocessing
from bokeh.io import output_notebook
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import time
from keras.layers import Dense, LSTM, Dropout, Embedding,Flatten,Bidirectional, MaxPooling1D, AveragePooling1D
from keras.models import Sequential
from collections import defaultdict
from keras import regularizers
import json
output_notebook()
%matplotlib inline

Using TensorFlow backend.


#### Load and prepare data

In [2]:
# Prefix of the files written by save_embedding_df.
output_name_prefix = 'unrestricted'


def load_data(path_file, data_file, remove_no_path=False):
    """Read the path table and the sentence table and align them.

    Parameters
    ----------
    path_file : str
        CSV file with (at least) the columns ``path`` and ``sentence``.
    data_file : str
        CSV file with (at least) the column ``sentence``.
    remove_no_path : bool
        When true, rows whose ``path`` is ``'NO_PATH_A'`` or ``'NO_PATH_B'``
        are dropped from the path table.

    Returns
    -------
    tuple
        ``(path_frame, data_frame, output_name)``. ``data_frame`` only keeps
        rows whose sentence occurs in ``path_frame``; ``output_name`` is a
        file name derived from ``path_file`` and the ``remove_no_path`` flag.
    """
    path_rows = pd.read_csv(path_file)
    if remove_no_path:
        is_placeholder = path_rows['path'].isin(['NO_PATH_A', 'NO_PATH_B'])
        path_rows = path_rows[~is_placeholder]
    # Disabled filter kept for reference:
    #path_rows = path_rows[path_rows.most_frequent_percentage >= 0.6]

    # Keep only the sentences that still have at least one row in the path table.
    known_sentences = path_rows['sentence'].values.tolist()
    sentence_rows = pd.read_csv(data_file)
    sentence_rows = sentence_rows[sentence_rows['sentence'].isin(known_sentences)]

    # NOTE(review): path_file is inserted verbatim (minus '.csv'), so any
    # directory components end up inside output_name - confirm this is intended.
    name_template = '../paths-from-input-{}-empty-paths-{}.csv'
    input_stem = path_file.replace('.csv', '')
    keeps_empty_paths = not remove_no_path
    return path_rows, sentence_rows, name_template.format(input_stem, keeps_empty_paths)

# Alternative configuration (original paths, placeholder paths removed):
#paths, data, output_name = load_data('../code/data/paths_original.csv', '../code/data/data.csv', remove_no_path=True)

# Active configuration: unrestricted paths, placeholder paths kept.
paths, data, output_name = load_data(
    path_file='../code/data/paths_unrestricted.csv',
    data_file='../code/data/data_if.csv',
    remove_no_path=False,
)

In [3]:
vocab_size = 5000
maxlen = 20

# Network input: every path string becomes a sequence of token ids, padded to
# `maxlen`, with a trailing axis of size 1 so the array is
# (number of paths, maxlen, 1) - the layout of input_shape=(maxlen, 1).
paths_list = paths['path'].values.tolist()
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(paths_list)
input_train = sequence.pad_sequences(
    tokenizer.texts_to_sequences(paths_list),
    maxlen=maxlen,
)
input_train = input_train.reshape((len(paths_list), maxlen, 1))

# Network target: one class per distinct path string, one-hot encoded.
label_encoder = preprocessing.LabelEncoder()
y_train = to_categorical(label_encoder.fit_transform(paths_list))

print('{} paths total'.format(len(paths)))
print('{} different (target) paths'.format(y_train.shape[1]))

11191 paths total
9555 different (target) paths


#### Train the network

In [4]:
def plot_lines(model, first, second):
    """Plot two recorded training curves on one 5x5 inch figure.

    Parameters
    ----------
    model : object with a ``history`` mapping
        Despite the parameter name, ``full_run`` passes the object returned by
        ``model.fit`` here. ``model.history[key]`` must be the per-epoch
        values of metric ``key``.
    first, second : sequence
        ``(metric key, matplotlib colour)``; ``first`` is drawn solid,
        ``second`` dashed.
    """
    # The title needs a run name. The object returned by `fit` does not
    # necessarily carry `.name` itself (a Keras History exposes the trained
    # network as `.model`), so fall back to the wrapped network's name.
    run_name = getattr(model, 'name', None)
    if run_name is None:
        run_name = getattr(getattr(model, 'model', None), 'name', '')

    # The figure has to exist before anything is drawn on it; creating it
    # afterwards leaves the curves on a default-size figure and shows an
    # additional empty one.
    plt.figure(figsize=(5, 5))
    plt.plot(model.history[first[0]], color=first[1])
    plt.plot(model.history[second[0]], linestyle='--', color=second[1])
    plt.title('{} {} vs {}'.format(run_name, first[0], second[0]))
    plt.xlabel('epoch')
    plt.legend([first[0], second[0]])
    plt.show()
    

In [5]:
def train_model(layers, name='', epochs=100, batch_size=64, shuffle=True, validation_split=0.2, optimizer='adam'):
    """Stack `layers` under a softmax output and fit on the global training data.

    The network is trained on the module-level ``input_train`` / ``y_train``;
    the output layer gets one unit per column of ``y_train``.

    Parameters
    ----------
    layers : iterable
        Layers added to the ``Sequential`` model in the given order.
    name : str
        Model name, also printed as a banner.
    epochs, batch_size, shuffle, validation_split
        Forwarded unchanged to ``fit``.
    optimizer
        Forwarded unchanged to ``compile``.

    Returns
    -------
    tuple
        ``(return value of fit, fitted model)``.
    """
    network = Sequential(name=name)
    for hidden_layer in layers:
        network.add(hidden_layer)

    n_classes = y_train[0].shape[0]
    network.add(Dense(n_classes, activation='softmax'))
    network.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])

    print('===== {} ====='.format(name))
    network.summary()

    fit_history = network.fit(
        input_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=shuffle,
        validation_split=validation_split,
    )
    return fit_history, network
    

#### Prepare the embeddings

In [6]:
def get_embeddings(model):
    """Return the second-to-last layer's output for every training input.

    The model must end in exactly an ``LSTM`` layer followed by a ``Dense``
    layer (both checked by type). A sub-model from the model input to the LSTM
    output is built and run on the module-level ``input_train``.
    """
    embedding_layer, output_layer = model.layers[-2], model.layers[-1]
    assert type(embedding_layer) == LSTM
    assert type(output_layer) == Dense

    encoder = Model(inputs=model.input, outputs=embedding_layer.output)
    return encoder.predict(input_train)

### Average Embedding

In [7]:
def average_embeddings(embeddings):
    """Average the path embeddings of each sentence.

    ``embeddings[i]`` belongs to the i-th row of the module-level ``paths``
    frame. All embeddings whose rows share a ``sentence`` value are averaged
    element-wise.

    Returns
    -------
    dict
        ``{sentence: mean embedding}``, in order of first appearance.

    Asserts that there is one embedding per row of ``paths`` and that the
    number of distinct sentences equals ``len(data)``.
    """
    assert len(paths) == len(embeddings)

    # Group the embeddings by the sentence of the row they were computed from.
    per_sentence = defaultdict(list)
    for sentence, vector in zip(paths['sentence'], embeddings):
        per_sentence[sentence].append(vector)
    assert len(per_sentence) == len(data)

    # Summing onto a float64 zero vector keeps the accumulation in float64.
    mean_embedding_dict = {
        sentence: sum(vectors, np.zeros(vectors[0].shape)) / len(vectors)
        for sentence, vectors in per_sentence.items()
    }
    assert len(mean_embedding_dict) == len(data)
    return mean_embedding_dict

In [8]:
def reformat(mean_embedding_dict):
    """Turn the per-sentence mean embeddings into classifier, plot and CSV inputs.

    For every sentence the matching rows of the module-level ``paths`` frame
    supply the id, the ``most_frequent_label`` and the list of paths.

    Parameters
    ----------
    mean_embedding_dict : dict
        ``{sentence: numpy array}`` as returned by ``average_embeddings``.

    Returns
    -------
    tuple
        ``(X, y, plot_data, plot_x, embedding_df)`` where

        * ``X`` - flattened embeddings as plain lists (classifier features),
        * ``y`` - label of each sentence,
        * ``plot_data`` - ``(label, sentence, paths joined by newlines)``,
        * ``plot_x`` - the embeddings as given (t-SNE input),
        * ``embedding_df`` - frame with the columns
          ``id, sentence, label, embedding, paths``.

    Raises
    ------
    IndexError
        If a sentence has no row in ``paths``.
    """
    columns = ['id', 'sentence', 'label', 'embedding', 'paths']
    X = []
    y = []
    plot_data = []
    plot_x = []
    records = []
    for sentence, vector in mean_embedding_dict.items():
        f_slice = paths[paths['sentence'] == sentence]
        label = f_slice['most_frequent_label'].values.tolist()[0]
        f_paths = f_slice['path'].values.tolist()
        flat_embedding = vector.reshape(-1, 1).squeeze().tolist()

        records.append({
            'id': f_slice['id'].values.tolist()[0],
            'sentence': f_slice['sentence'].values.tolist()[0],
            'label': label,
            'embedding': flat_embedding,
            'paths': f_paths,
        })

        # prepare results for classification (own copy, not shared with the frame)
        X.append(list(flat_embedding) if isinstance(flat_embedding, list) else flat_embedding)
        y.append(label)

        # prepare results for t-sne plot: the hover text lists this sentence's
        # own paths (joining the global `paths` frame would yield its column names)
        plot_x.append(vector)
        plot_data.append((label, sentence, '\n'.join(str(p) for p in f_paths)))

    # Build the frame once instead of growing it row by row.
    embedding_df = pd.DataFrame(records, columns=columns)
    return X, y, plot_data, plot_x, embedding_df

In [9]:
def save_embedding_df(embedding_df, name):
    """Write `embedding_df` as CSV to ``<output_name_prefix>__<name>``.

    The file name is the module-level ``output_name_prefix`` and ``name``
    joined by two underscores; no extension is appended. Asserts that the
    frame has as many rows as the module-level ``data`` frame.
    """
    assert len(embedding_df) == len(data)
    target_file = '__'.join((output_name_prefix, name))
    embedding_df.to_csv(target_file)

### Plot

In [10]:
def tsne_plot(plot_x, plot_data, random_state=None):
    """Project the embeddings to 2-D with t-SNE and build a Bokeh scatter plot.

    Parameters
    ----------
    plot_x : sequence of vectors
        One embedding per sentence (t-SNE input).
    plot_data : sequence of tuples
        ``(label, sentence, path text)`` for the embedding at the same
        position; shown in the hover tooltip.
    random_state : int or None
        Forwarded to ``TSNE`` so a layout can be reproduced. ``None`` (the
        default) leaves the projection unseeded.

    Returns
    -------
    The Bokeh figure. ``NONE`` points are gray crosses, ``WORSE`` red circles
    and ``BETTER`` green circles.

    Raises
    ------
    KeyError
        If ``plot_data`` contains a label other than NONE / WORSE / BETTER.
    """
    # (label, glyph method of the figure, colour) in drawing order.
    glyph_styles = (
        ('NONE', 'cross', 'gray'),
        ('WORSE', 'circle', 'red'),
        ('BETTER', 'circle', 'green'),
    )

    X_embedded = np.asarray(
        TSNE(n_components=2, verbose=0, random_state=random_state).fit_transform(plot_x))

    # One constructor call instead of appending row by row.
    rows = plot_data[:len(X_embedded)]
    plot_frame = pd.DataFrame({
        'x': X_embedded[:, 0],
        'y': X_embedded[:, 1],
        'class': [row[0] for row in rows],
        'sentence': [row[1] for row in rows],
        'path': [row[2] for row in rows],
    }, columns=['x', 'y', 'class', 'sentence', 'path'])

    # Points with an unexpected label would not be drawn at all; fail loudly.
    known_labels = set(style[0] for style in glyph_styles)
    unexpected = set(plot_frame['class']) - known_labels
    if unexpected:
        raise KeyError('unexpected class label(s): {}'.format(sorted(str(u) for u in unexpected)))

    def build_source(label, df):
        """Data source holding only the rows of class `label`."""
        df = df[df['class'] == label]
        return ColumnDataSource(data=dict(
            x=df['x'].tolist(),
            y=df['y'].tolist(),
            sentence=df['sentence'].tolist(),
            path=df['path'].tolist(),
            label=df['class'].tolist()))

    # GROUP tooltips
    hover = HoverTool(tooltips=[
        ("Sentence", "@sentence"),
        ("Path", "@path"),
        ("label", "@label"),
    ])

    p = figure(plot_width=900, plot_height=900, tools="pan,wheel_zoom,box_zoom,reset,previewsave")
    p.add_tools(hover)

    for label, glyph, color in glyph_styles:
        draw = getattr(p, glyph)
        draw(x='x', y='y', source=build_source(label, plot_frame), size=5, color=color)
    return p

### Classification Test

In [12]:
def classification_test(X, y, with_embeddings=False, sentence_embeddings=None):
    """Print a 5-fold logistic-regression report for the given features.

    Parameters
    ----------
    X : sequence of feature vectors
        One row per sentence.
    y : sequence of labels
        Label of each row; the report covers BETTER, WORSE and NONE.
    with_embeddings : bool
        When true, ``sentence_embeddings`` is appended column-wise to ``X``
        before classification.
    sentence_embeddings : sequence of vectors, optional
        Extra per-sentence features, row-aligned with ``X``. When omitted,
        the module-level ``lst_lst`` is used (presumably sentence embeddings
        computed elsewhere - no visible cell defines it).

    Raises
    ------
    NameError
        If ``with_embeddings`` is true, no ``sentence_embeddings`` are given
        and the global ``lst_lst`` does not exist.
    """
    if with_embeddings:
        print("With embeddings")
        if sentence_embeddings is None:
            try:
                sentence_embeddings = lst_lst
            except NameError:
                raise NameError(
                    "with_embeddings=True needs extra features: pass "
                    "`sentence_embeddings` or define the global `lst_lst`")
        paths_ = np.asarray(X)
        infersent_ = np.array(sentence_embeddings)
        X = np.concatenate([paths_, infersent_], axis=1)
        assert X.shape[0] == paths_.shape[0] == infersent_.shape[0]
        assert X.shape[1] == paths_.shape[1] + infersent_.shape[1]
    else:
        print("Without embeddings")

    # Convert once instead of once per fold.
    features = np.asarray(X)
    targets = np.asarray(y)

    # No random_state: without shuffle=True it has no effect on the folds, and
    # newer scikit-learn versions reject the combination. The unshuffled
    # split is deterministic on its own.
    kf = StratifiedKFold(n_splits=5)
    for train_index, test_index in kf.split(features, targets):
        fold_X_train, fold_X_test = features[train_index], features[test_index]
        fold_y_train, fold_y_test = targets[train_index], targets[test_index]

        log = LogisticRegression()
        log.fit(fold_X_train, fold_y_train)
        pred = log.predict(fold_X_test)
        # Ground truth first, predictions second; the reverse order swaps
        # precision and recall in the report.
        print(classification_report(fold_y_test, pred, labels=['BETTER', 'WORSE', 'NONE']))
        print("\n\n")

In [13]:
def full_run(layers, name='', epochs=100, batch_size=64, shuffle=True, validation_split=0.2, optimizer='adam'):
    """Run one complete experiment for the given layer stack.

    Steps, in order: train the network (``train_model``), plot accuracy and
    loss curves, extract the embeddings, average them per sentence, save them
    under ``name``, show the t-SNE plot, and run the classification test
    first with and then without the additional embeddings.

    All parameters are forwarded to ``train_model``; ``name`` is also used
    for the saved embedding file.
    """
    training_options = dict(
        name=name,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=shuffle,
        validation_split=validation_split,
        optimizer=optimizer,
    )
    history, trained_model = train_model(layers, **training_options)
    print('===== {} ====='.format(trained_model.name))

    # (training curve, validation curve), each as (metric key, colour).
    curve_pairs = [
        (('acc', 'green'), ('val_acc', 'yellowgreen')),
        (('loss', 'red'), ('val_loss', 'orangered')),
    ]
    for training_curve, validation_curve in curve_pairs:
        plot_lines(history, training_curve, validation_curve)

    sentence_vectors = average_embeddings(get_embeddings(trained_model))
    X, y, plot_data, plot_x, embedding_df = reformat(sentence_vectors)
    save_embedding_df(embedding_df, name)
    show(tsne_plot(plot_x, plot_data))

    for use_extra_embeddings in (True, False):
        classification_test(X, y, with_embeddings=use_extra_embeddings)

In [None]:
# Experiment: one LSTM layer with 1024 units, 500 epochs, batch size 64.
full_run(
    layers=[
        LSTM(1024, return_sequences=False, input_shape=(maxlen, 1)),
    ],
    name='1 LSTM 1024, 500 Epochs, 64 Batch, Adam',
    epochs=500,
    batch_size=64,
)

===== 1 LSTM 1024, 500 Epochs, 64 Batch, Adam =====
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 1024)              4202496   
_________________________________________________________________
dense_1 (Dense)              (None, 9555)              9793875   
Total params: 13,996,371
Trainable params: 13,996,371
Non-trainable params: 0
_________________________________________________________________
Train on 8952 samples, validate on 2239 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500

In [None]:
# Experiment: one LSTM layer with 1024 units, 500 epochs, batch size 124.
full_run(
    layers=[
        LSTM(1024, return_sequences=False, input_shape=(maxlen, 1)),
    ],
    name='1 LSTM 1024, 500 Epochs, 124 Batch, Adam',
    epochs=500,
    batch_size=124,
)

In [None]:
# Experiment: three stacked LSTM layers with 300 units each, 500 epochs,
# batch size 124. The first two layers return full sequences so the next
# LSTM receives one vector per time step.
# NOTE(review): input_shape is passed to every layer; presumably only the
# first one needs it - confirm before removing.
full_run(
    layers=[
        LSTM(300, return_sequences=True, input_shape=(maxlen, 1)),
        LSTM(300, return_sequences=True, input_shape=(maxlen, 1)),
        LSTM(300, return_sequences=False, input_shape=(maxlen, 1)),
    ],
    name='3 LSTM 300, 500 Epochs, 124 Batch, Adam',
    epochs=500,
    batch_size=124,
)

In [None]:
# Experiment: three stacked LSTM layers with 300 units each, 500 epochs,
# batch size 256. The first two layers return full sequences so the next
# LSTM receives one vector per time step.
# NOTE(review): input_shape is passed to every layer; presumably only the
# first one needs it - confirm before removing.
full_run(
    layers=[
        LSTM(300, return_sequences=True, input_shape=(maxlen, 1)),
        LSTM(300, return_sequences=True, input_shape=(maxlen, 1)),
        LSTM(300, return_sequences=False, input_shape=(maxlen, 1)),
    ],
    name='3 LSTM 300, 500 Epochs, 256 Batch, Adam',
    epochs=500,
    batch_size=256,
)

In [None]:
# Experiment: one LSTM layer with 2048 units, 500 epochs, batch size 124.
full_run(
    layers=[
        LSTM(2048, return_sequences=False, input_shape=(maxlen, 1)),
    ],
    name='1 LSTM 2048, 500 Epochs, 124 Batch, Adam',
    epochs=500,
    batch_size=124,
)