# Initialization

In [1]:
%load_ext autoreload
%autoreload 2

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys
sys.path.append('./pylingtools/src/')
sys.path.append('./pyexling/src/')
sys.path.append('./syntaxnet_wrapper/src/')

In [2]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

get_available_gpus()

['/device:GPU:0']

In [3]:
import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

from tensorflow.python.keras import backend as K
K.set_session(sess)

In [4]:
import logging

logPath = '../logs/'
! mkdir $logPath
fileName = 'main.log'
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")

logger = logging.getLogger()
if logger.hasHandlers():
    logger.handlers.clear()

fileHandler = logging.FileHandler(os.path.join(logPath, fileName))
fileHandler.setFormatter(logFormatter)
logger.addHandler(fileHandler)

consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
logger.addHandler(consoleHandler)

logger.setLevel(logging.INFO)

mkdir: cannot create directory ‘../logs/’: File exists


In [5]:
import sklearn
import numpy as np
import pandas as pd
import json
import pickle
from tqdm import tqdm_notebook as tqdm
from joblib import Parallel, delayed

# Load data

In [6]:
data = pd.read_pickle('data/dataset.pkl')

In [7]:
data.shape

(48983, 586)

In [8]:
vectorizer_rel = pickle.load(open('models/relation_vectorizer.pkl', 'rb'))
bow_rel = vectorizer_rel.transform(data['relation'].values).toarray()
columns = [column + '_bow_rel' for column in vectorizer_rel.get_feature_names()]
features = pd.concat([data.reset_index(drop=True), pd.DataFrame(bow_rel, columns=columns)], axis=1)

vectorizer_path = pickle.load(open('models/path_vectorizer.pkl', 'rb'))
bow_path = vectorizer_path.transform(features['dep_path'].values).toarray()
columns = [column + '_bow_path' for column in vectorizer_path.get_feature_names()]
features = pd.concat([features.reset_index(drop=True), pd.DataFrame(bow_path, columns=columns)], axis=1)

del data

In [9]:
features.shape

(48983, 2586)

In [10]:
excluding_cols = ['docid', 'subject', 'relation', 'object', 'dep_path', 'lemma_subj', 'lemma_rel', 'lemma_obj']
embedding_cols = ['w2v_subj', 'w2v_rel', 'w2v_obj']
w2v_subj = features['w2v_subj']
w2v_rel = features['w2v_rel']
w2v_obj = features['w2v_obj']
x_train = features.drop(columns=excluding_cols+embedding_cols)

# Models

In [11]:
%load_ext cython

from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.layers import LSTM, GRU, Dense
from tensorflow.python.keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Conv2DTranspose
from tensorflow.python.keras.layers import Dropout, UpSampling2D
from tensorflow.python.keras.layers import Concatenate
from tensorflow.python.keras.layers import Masking
from tensorflow.python.keras.layers import Reshape
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers import Input, Layer
from tensorflow.python.keras.layers import Lambda
from tensorflow.python.keras.layers import GlobalMaxPooling1D
from tensorflow.python.keras.layers import RepeatVector
from tensorflow.python.keras.layers import Activation
from tensorflow.python.keras.layers import Permute, Add
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.models import model_from_json
from tensorflow.python.keras import regularizers
from tensorflow.python.keras.callbacks import Callback
from tensorflow.python.keras.optimizers import Adam

from copy import deepcopy
from sklearn.metrics import f1_score
from tensorboardX import SummaryWriter

import math
from time import time

from sklearn.cluster import KMeans

from tensorflow.python.keras.layers import Conv2D, Conv2DTranspose, Flatten, Reshape, Layer, InputSpec
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.utils.vis_utils import plot_model
from datetime import datetime

from tensorflow.python.keras.callbacks import TensorBoard

In [12]:
def plain_ae(input_shape):
    
    input_layer = Input(shape=input_shape)
    x = Dense(1000, activation='relu', name='enc1')(input_layer)
    x = Dense(500, activation='relu', name='enc2')(x)
    latent = Dense(10, activation='sigmoid', name='embedding')(x)
    x = Dense(500, activation='relu', name='dec1')(latent)
    x = Dense(1000, activation='relu', name='dec2')(x)
    output_layer = Dense(input_shape[0], name='output')(x)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    
    return model

## Test DCEC

In [13]:
import deep_clustering
save_dir = 'models'

dcec = deep_clustering.DCEC(input_shape=x_train.shape[1:], 
                            autoencoder_ctor=lambda input_shape: plain_ae(input_shape),
                            n_clusters=10, 
                            pretrain_epochs=10,
                            save_dir=save_dir, 
                            log_dir=logPath)
plot_model(dcec._model, to_file=os.path.join(save_dir, 'dcec_model.png'), show_shapes=True)
dcec.compile(loss_weights=[0.1, 1], optimizer='adam')

In [14]:
dcec._model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 2575)         0                                            
__________________________________________________________________________________________________
enc1 (Dense)                    (None, 1000)         2576000     input_1[0][0]                    
__________________________________________________________________________________________________
enc2 (Dense)                    (None, 500)          500500      enc1[0][0]                       
__________________________________________________________________________________________________
embedding (Dense)               (None, 10)           5010        enc2[0][0]                       
__________________________________________________________________________________________________
dec1 (Dens

In [15]:
dcec.fit(x_train)

2019-07-22 11:01:05,232 [MainThread  ] [INFO ]  Pretraining...
2019-07-22 11:03:20,396 [MainThread  ] [INFO ]  Pretraining time: 135.05495166778564
2019-07-22 11:03:20,836 [MainThread  ] [INFO ]  Pretrained weights are saved to models/pretrain_cae_model.h5
2019-07-22 11:03:20,837 [MainThread  ] [INFO ]  Initializing cluster centers with k-means.
2019-07-22 11:03:33,074 [MainThread  ] [INFO ]  Cluster centers initialized: 12.236179828643799
2019-07-22 11:03:33,076 [MainThread  ] [INFO ]  Training model.
2019-07-22 11:03:33,077 [MainThread  ] [INFO ]  Update interval 140
2019-07-22 11:03:33,078 [MainThread  ] [INFO ]  Save interval 956.69921875
2019-07-22 11:03:40,164 [MainThread  ] [INFO ]  Loss: [0, 0, 0]
2019-07-22 11:03:42,718 [MainThread  ] [INFO ]  saving model to: models/dcec_model_0.h5
2019-07-22 11:03:53,684 [MainThread  ] [INFO ]  Loss: [0.0025808907, 0.003088091, 0.0022720816]
2019-07-22 11:03:53,685 [MainThread  ] [INFO ]  delta_label: 0.20211093644733888
2019-07-22 11:04:04,

2019-07-22 11:11:34,977 [MainThread  ] [INFO ]  delta_label: 0.004960904803707408
2019-07-22 11:11:45,880 [MainThread  ] [INFO ]  Loss: [0.020665912, 0.15796809, 0.004869103]
2019-07-22 11:11:45,881 [MainThread  ] [INFO ]  delta_label: 0.26066186227874977
2019-07-22 11:11:56,730 [MainThread  ] [INFO ]  Loss: [0.021177229, 0.17755537, 0.0034216922]
2019-07-22 11:11:56,732 [MainThread  ] [INFO ]  delta_label: 0.008553988118326767
2019-07-22 11:12:07,597 [MainThread  ] [INFO ]  Loss: [0.027743267, 0.21937516, 0.0058057494]
2019-07-22 11:12:07,599 [MainThread  ] [INFO ]  delta_label: 0.21284935589898535
2019-07-22 11:12:18,641 [MainThread  ] [INFO ]  Loss: [0.020873565, 0.17287165, 0.0035863996]
2019-07-22 11:12:18,642 [MainThread  ] [INFO ]  delta_label: 0.22230161484596697
2019-07-22 11:12:29,714 [MainThread  ] [INFO ]  Loss: [0.01810345, 0.1493603, 0.0031674209]
2019-07-22 11:12:29,716 [MainThread  ] [INFO ]  delta_label: 0.08970459138884919
2019-07-22 11:12:40,730 [MainThread  ] [INFO 

2019-07-22 11:19:59,593 [MainThread  ] [INFO ]  Reached tolerance threshold. Stopping training.
2019-07-22 11:19:59,594 [MainThread  ] [INFO ]  Done. 986.5170562267303
2019-07-22 11:19:59,595 [MainThread  ] [INFO ]  Saving model to: models/dcec_model_final.h5
2019-07-22 11:19:59,659 [MainThread  ] [INFO ]  Pretrain time: 135.60578036308289
2019-07-22 11:19:59,660 [MainThread  ] [INFO ]  Clustering time: 998.8207538127899
2019-07-22 11:19:59,661 [MainThread  ] [INFO ]  Total time: 1134.4265341758728


In [18]:
y_pred = dcec._y_pred

In [41]:
def show_cluster_sample(number):
    return features[y_pred == number][['docid', 'subject', 'relation', 'object']].sample(frac=1).iloc[:10] 

In [43]:
show_cluster_sample(9)

Unnamed: 0,docid,subject,relation,object
15291,42721893,REA,was founded as,publisher
23536,29418677,Candoxatril,is orally active prodrug as,neutral endopeptidase 24.11 complexed
46462,22886826,Ethernet Alliance,was preceded by,Ethernet Alliance
39670,56788647,Aoife McLysaght,is Professor in,Evolution Laboratory of Smurfit Institute of G...
25642,61001,Microsoft,was evolving OLE into,Component Object Model
22418,18684775,Japanese television adaptation,aired on,TBS
47725,48915162,Social Credit System,originated from,strategy first implemented in select locations...
39259,17589963,French chemist,discovered,first known occurring methyl ester
32744,925356,"22,200 assisted living facilities",were in,U.S.
7805,31573781,MG 5,is compact car,has produced from 2012


## Clustering of internal representations generated by autoencoder

In [None]:
pae = plain_ae(x_train.shape[1:])
pae.compile(optimizer='adam', loss='mse')
pae.fit(x_train, x_train, batch_size=256, epochs=200, verbose=0)
hidden = pae.get_layer(name='embedding').output
encoder = Model(inputs=pae.input, outputs=hidden)
embeddings = encoder.predict(x_train)
cluzeriser = KMeans(10, n_jobs=6)
clusters = cluzeriser.fit_predict(embeddings)

In [47]:
def show_cluster_sample(number):
    return features[clusters == number][['docid', 'subject', 'relation', 'object']].sample(frac=1).iloc[:10] 

In [53]:
show_cluster_sample(1)

Unnamed: 0,docid,subject,relation,object
41326,311537,Ultra Games Ultra Software Corporation,was,shell corporation created in effort get around...
42228,17339,Kendall Square Research,was,supercomputer company headquartered in Kendall...
32079,52744500,Tetrahedron Computer Methodology,was lived journal,published by Pergamon Press to experiment with...
20721,6271783,Elxsi,was,minicomputer company established in 1970s
15091,10973748,NRW,was,unreleased computer workstation designed durin...
44213,53999565,first experimental demonstration,was shown by,Fercher et al.
11822,6013482,Western Australian Directory,was published by,H. Pierssene
33480,19712538,third generation,was,introduced by Pontiac alongside its corporate ...
37732,1753031,first generation,was,launched in 1998 by General Motors ' South Kor...
24881,3991133,Audi Roadjet,was officially unveiled at,2006 North American International Auto Show
