# Initialization

In [None]:
# Reload edited project modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

import os
# Expose only CUDA device 0 to this process. This is set before TensorFlow is
# imported in the cells below.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys
# Make the locally checked-out helper packages importable.
sys.path.append('./pylingtools/src/')
sys.path.append('./pyexling/src/')
sys.path.append('./syntaxnet_wrapper/src/')

In [None]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of the local devices whose ``device_type`` is ``'GPU'``."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

get_available_gpus()

In [None]:
import tensorflow as tf

# Create a TF1-style session whose GPU memory allocation grows on demand
# (allow_growth) and register it as the session used by the Keras backend.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

from tensorflow.python.keras import backend as K
# Every Keras model built in later cells runs inside `sess`.
K.set_session(sess)

In [None]:
import logging

logPath = '../logs/'
# Create the log directory from Python instead of `! mkdir`: the shell command
# prints an error on every re-run of the cell (directory already exists) and
# cannot create missing parent directories.
os.makedirs(logPath, exist_ok=True)
fileName = 'main.log'
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")

logger = logging.getLogger()
# Re-running the cell must not stack duplicate handlers. Remove the old ones
# and close them so the previous log file handle is released.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Root logger writes every record both to the log file and to the console.
fileHandler = logging.FileHandler(os.path.join(logPath, fileName))
fileHandler.setFormatter(logFormatter)
logger.addHandler(fileHandler)

consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
logger.addHandler(consoleHandler)

logger.setLevel(logging.INFO)

In [None]:
import sklearn
import numpy as np
import pandas as pd
import json
import pickle
from tqdm import tqdm_notebook as tqdm
from joblib import Parallel, delayed

# Load data

In [None]:
#data = pd.read_pickle('data/dataset_ner.pkl')  # w2v of lemmas
#data = pd.read_pickle('data/processed_separately.pkl')
data = pd.read_pickle('data/processed_separately_clean.pkl')

In [None]:
data.shape

In [None]:
data.sample(10)

In [None]:
_object, _subject, _relation = data.object_matr.values, data.subject_matr.values, data.relation_matr.values

In [None]:
# Flatten every per-row matrix into one long array and reshape it into a
# (rows, 3, 343) tensor; the same target shape is used for all three arguments.
final_shape = (len(data), 3, 343)
_object, _subject, _relation = (
    np.concatenate(matrices).reshape(final_shape)
    for matrices in (_object, _subject, _relation)
)

In [None]:
from sklearn.preprocessing import StandardScaler

def _standardize_per_position(matrices):
    """Standardize ``matrices`` in place with one scaler per index of axis 1.

    :param matrices: 3-D array; each slice ``matrices[:, i, :]`` is scaled
        independently with its own ``StandardScaler``.
    :return: dict mapping the axis-1 index to the fitted scaler, so the
        transform can be re-applied or inverted later.
    """
    fitted = {}
    for position in range(matrices.shape[1]):
        scaler = StandardScaler()
        matrices[:, position, :] = scaler.fit_transform(matrices[:, position, :])
        fitted[position] = scaler
    return fitted


# Keep the fitted scalers of every argument. The three loops used to write to
# the same `scalers[i]` keys, so the object and subject scalers were
# overwritten and only the relation scalers survived.
object_scalers = _standardize_per_position(_object)
subject_scalers = _standardize_per_position(_subject)
relation_scalers = _standardize_per_position(_relation)

# Backward compatibility: `scalers` previously ended up holding the relation
# scalers (the last loop won), so it keeps pointing at them.
scalers = relation_scalers

# Models

In [None]:
%load_ext cython

from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.layers import LSTM, GRU, Dense
from tensorflow.python.keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Conv2DTranspose
from tensorflow.python.keras.layers import Dropout, UpSampling2D
from tensorflow.python.keras.layers import Concatenate
from tensorflow.python.keras.layers import Masking
from tensorflow.python.keras.layers import Reshape
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers import Input, Layer
from tensorflow.python.keras.layers import Lambda
from tensorflow.python.keras.layers import GlobalMaxPooling1D
from tensorflow.python.keras.layers import RepeatVector
from tensorflow.python.keras.layers import Activation
from tensorflow.python.keras.layers import Permute, Add
from tensorflow.python.keras.layers import concatenate
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.models import model_from_json
from tensorflow.python.keras import regularizers
from tensorflow.python.keras.callbacks import Callback
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.layers import GaussianNoise
from tensorflow.python.keras.layers import UpSampling1D

from copy import deepcopy
from sklearn.metrics import f1_score
from tensorboardX import SummaryWriter

import math
from time import time

from sklearn.cluster import KMeans

from tensorflow.python.keras.layers import Conv2D, Conv2DTranspose, Flatten, Reshape, Layer, InputSpec
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.utils.vis_utils import plot_model
from datetime import datetime

from tensorflow.python.keras.callbacks import TensorBoard

In [None]:
def noised_ae(input_shape):
    """Build a denoising autoencoder over subject, object and relation inputs.

    Each input is perturbed with Gaussian noise and passed through its own
    Conv1D/MaxPooling1D encoder. The three flattened encodings are
    concatenated and projected onto the sigmoid ``embedding`` layer, from
    which three separate decoders produce the outputs.

    :param input_shape: shape of a single input sample. ``input_shape[0]`` is
        used as the upsampling factor and ``input_shape[1]`` as the number of
        output channels of every decoder.
    :return: uncompiled Keras ``Model`` with inputs
        ``[input_subject, input_object, input_rel]`` and the three decoder
        outputs in the same order.
    """
    latent_dim = 80

    def encode(tensor):
        # Noise -> (conv, pool) x 2 -> flat vector.
        tensor = GaussianNoise(stddev=.1)(tensor)
        tensor = Conv1D(128, (2,), activation='relu', padding='same')(tensor)
        tensor = MaxPooling1D((2,), padding='same')(tensor)
        tensor = Conv1D(32, (2,), activation='relu', padding='same')(tensor)
        tensor = MaxPooling1D((2,), padding='same')(tensor)
        return Flatten()(tensor)

    def decode(code, prefix):
        # Treat the latent vector as a one-step sequence, then upsample it.
        # NOTE(review): the last layer is ReLU, so it cannot emit negative
        # values; if the targets are the standardized matrices, a linear
        # output may be needed -- confirm.
        steps = Reshape((1, latent_dim))(code)
        steps = Conv1D(128, (1,), activation='relu', padding='same', name=prefix + '_conv1')(steps)
        steps = UpSampling1D(input_shape[0], name=prefix + '_up1')(steps)
        return Conv1D(input_shape[1], (6,), activation='relu', padding='same', name=prefix + '_conv2')(steps)

    inputs = [Input(shape=input_shape, name=input_name)
              for input_name in ('input_subject', 'input_object', 'input_rel')]
    encoded = [encode(tensor) for tensor in inputs]

    merged = concatenate(encoded)
    latent = Dense(latent_dim, activation='sigmoid', name='embedding')(merged)

    outputs = [decode(latent, output_name)
               for output_name in ('output_subject', 'output_object', 'output_rel')]

    return Model(inputs=inputs, outputs=outputs)

"""
Epoch 200/200
32419/32419 [==============================] - 1s 45us/step - loss: 2.3175 - output_subject_conv2_loss: 0.8170 - output_object_conv2_loss: 0.7654 - output_rel_conv2_loss: 0.7352
"""

In [None]:
def masked_ae(input_shape):
    """Build an autoencoder that masks the relation and restores all three parts.

    Only the subject and object inputs are encoded. The relation input is
    part of the model signature but does not feed the ``embedding`` layer, so
    the relation output has to be produced from the two arguments alone.

    :param input_shape: shape of a single input sample. ``input_shape[0]`` is
        used as the upsampling factor and ``input_shape[1]`` as the number of
        output channels of every decoder.
    :return: uncompiled Keras ``Model`` with inputs
        ``[input_subject, input_object, input_rel]`` and outputs for subject,
        object and relation, in that order.
    """
    latent_dim = 80

    def encode(tensor):
        # (conv, pool) x 2 -> flat vector.
        tensor = Conv1D(128, (2,), activation='relu', padding='same')(tensor)
        tensor = MaxPooling1D((2,), padding='same')(tensor)
        tensor = Conv1D(32, (2,), activation='relu', padding='same')(tensor)
        tensor = MaxPooling1D((2,), padding='same')(tensor)
        return Flatten()(tensor)

    def decode(code, prefix):
        # Treat the latent vector as a one-step sequence, then upsample it.
        steps = Reshape((1, latent_dim))(code)
        steps = Conv1D(128, (1,), activation='relu', padding='same', name=prefix + '_conv1')(steps)
        steps = UpSampling1D(input_shape[0], name=prefix + '_up1')(steps)
        return Conv1D(input_shape[1], (6,), activation='relu', padding='same', name=prefix + '_conv2')(steps)

    inputs = [Input(shape=input_shape, name=input_name)
              for input_name in ('input_subject', 'input_object', 'input_rel')]
    subject_in, object_in, _masked_rel_in = inputs

    # The relation input is deliberately left out of the encoder.
    encoded = [encode(subject_in), encode(object_in)]

    merged = concatenate(encoded)
    latent = Dense(latent_dim, activation='sigmoid', name='embedding')(merged)

    outputs = [decode(latent, output_name)
               for output_name in ('output_subject', 'output_object', 'output_rel')]

    return Model(inputs=inputs, outputs=outputs)

"""
Epoch 200/200
32419/32419 [==============================] - 1s 44us/step - loss: 2.4200 - output_subject_conv2_loss: 0.7845 - output_object_conv2_loss: 0.7825 - output_rel_conv2_loss: 0.8530
"""

In [None]:
LAYER_1 = 200
LAYER_2 = 100
INNER_SIZE = 80

def masked_subj(input_shape):
    """Build an autoencoder that masks the subject embedding and restores it.

    The latent ``embedding`` layer is computed from the plain features, the
    object and the relation only. The subject input is part of the model
    signature but is never encoded. All four parts are decoded from the
    latent code.

    :param input_shape: pair ``(input_shape_plain, input_shape_emb)``.
        ``input_shape_plain[0]`` is the width of the plain output.
        ``input_shape_emb`` is the shape of one embedding input; its first
        entry is used as the upsampling factor and its second as the number
        of decoder output channels.
    :return: uncompiled Keras ``Model`` with inputs
        ``[input_plain, input_subject, input_object, input_rel]`` and the
        matching four outputs in the same order.
    """
    # Unpack first so every helper below can rely on both shapes.
    input_shape_plain, input_shape_emb = input_shape

    def encode_plain_input(input_layer):
        return Dense(LAYER_2, activation='relu', name='enc2')(input_layer)

    def encode_embedding_input(input_layer):
        conv1 = Conv1D(128, (2,), activation='relu', padding='same')(input_layer)
        pool1 = MaxPooling1D((2,), padding='same')(conv1)
        return Flatten()(pool1)

    def decode_plain_input(latent):
        x = Dense(LAYER_2, activation='relu', name='dec1')(latent)
        return Dense(input_shape_plain[0], name='output_plain')(x)

    def decode_embedding_input(latent, name):
        latent = Reshape((1, INNER_SIZE))(latent)
        conv1 = Conv1D(128, (1,), activation='relu', padding='same', name=name+'_conv1')(latent)
        # The output dimensions follow the embedding input shape, as in the
        # sibling builders, instead of the former hard-coded 3 steps and 300
        # channels. For a (3, 300) input the model is unchanged.
        up1 = UpSampling1D(input_shape_emb[0], name=name+'_up1')(conv1)
        conv2 = Conv1D(input_shape_emb[1], (1,), activation='relu', padding='same', name=name+'_conv2')(up1)
        return conv2

    input_plain = Input(shape=input_shape_plain, name='input_plain')
    input_subject = Input(shape=input_shape_emb, name='input_subject')
    input_object = Input(shape=input_shape_emb, name='input_object')
    input_rel = Input(shape=input_shape_emb, name='input_rel')

    # The subject is masked, so no encoder is built for it. The encoder that
    # used to be created here was never connected to the model.
    encode_plain = encode_plain_input(input_plain)
    encode_object = encode_embedding_input(input_object)
    encode_rel = encode_embedding_input(input_rel)

    x = concatenate([encode_plain, encode_object, encode_rel])
    latent = Dense(INNER_SIZE, activation='sigmoid', name='embedding')(x)

    output_plain = decode_plain_input(latent)
    output_subject = decode_embedding_input(latent, 'output_subject')
    output_object = decode_embedding_input(latent, 'output_object')
    output_rel = decode_embedding_input(latent, 'output_rel')

    model = Model(inputs=[input_plain, input_subject, input_object, input_rel],
                  outputs=[output_plain, output_subject, output_object, output_rel])

    return model

In [None]:
_subject.shape[1:], _object.shape[1:], _relation.shape[1:]

## Test autoencoder

In [None]:
# Build the denoising autoencoder for one (steps, features) sample shape and
# print its layer summary.
model = noised_ae((_object.shape[1:]))
model.summary()

# NOTE(review): `optimizer` is created here but never passed to `compile`,
# which receives the string 'adam' instead, so lr=0.01 has no effect.
# Confirm which learning rate is intended.
optimizer = Adam(lr=0.01)
model.compile(optimizer='adam', loss='mse')

# Autoencoder training: the same three arrays are used as inputs and targets.
model.fit(x=[_subject, _object, _relation],
          y=[_subject, _object, _relation], epochs=200, batch_size=256)

## Test DCEC

#### DCEC

In [None]:
import deep_clustering
save_dir = 'models/restore_rel_kmeans'
# `! mkdir` cannot create the nested path when 'models' is missing and prints
# an error on every re-run. makedirs handles both cases.
os.makedirs(save_dir, exist_ok=True)

# Deep clustering model built around the autoencoder returned by
# `autoencoder_ctor`.
dcec = deep_clustering.DCEC(input_shape=(_object.shape[1:]),
                            autoencoder_ctor=lambda input_shape: masked_ae(input_shape),  # select model here
                            n_clusters=80,
                            pretrain_epochs=10,
                            maxiter=int(1e5),
                            save_dir=save_dir,
                            log_dir=logPath)
# Save a diagram of the model next to its checkpoints.
plot_model(dcec._model, to_file=os.path.join(save_dir, 'dcec_model.png'), show_shapes=True)
dcec.compile(optimizer='adam')

In [None]:
dcec.fit([_subject, _object, _relation])

In [None]:
y_pred = dcec._y_pred

# Work on an explicit copy of the dataset. `data[:]` is only a slice of
# `data`, so adding columns to it raises SettingWithCopyWarning and leaves it
# unclear whether `data` itself is affected.
dumb_features = data.copy()
scores = dcec.score_examples([_subject, _object, _relation])
# Attach the cluster label and the score to every triple.
dumb_features['cluster'] = y_pred
dumb_features['score'] = scores
# NOTE(review): `threshold` is not referenced by any cell visible here.
threshold = 0.05

In [None]:
dumb_features.head()

In [None]:
def get_tokens(column):
    """Join the entries stored under the ``'tokens'`` key with single spaces."""
    tokens = column['tokens']
    separator = ' '
    return separator.join(tokens)

# Replace the token structure of each argument with its space-joined text.
for column_name in ('subject', 'relation', 'object'):
    dumb_features[column_name] = dumb_features[column_name].map(get_tokens)

In [None]:
def show_cluster_sample(number, rows=100):
    """Return the first ``rows`` triples assigned to cluster ``number``.

    Reads the notebook-level ``dumb_features`` frame and ``y_pred`` labels.

    :param number: cluster label to select.
    :param rows: maximum number of rows to return.
    :return: frame with the subject, relation, object and score columns.
    """
    in_cluster = y_pred == number
    shown_columns = ['subject', 'relation', 'object', 'score']
    members = dumb_features[in_cluster]
    return members[shown_columns].iloc[:rows]

In [None]:
dumb_features.head()

In [None]:
dumb_features[dumb_features.object == 'eliza'].sort_values('cluster')

In [None]:
dumb_features[dumb_features.relation.str.contains('published')].sort_values('cluster')

In [None]:
temp = show_cluster_sample(27)
temp.relation.value_counts()

In [None]:
temp[temp.subject == 'joseph laycock']

In [None]:
temp.sort_values('score', ascending=False)

In [None]:
weights = dcec._model.get_layer(name='clustering').get_weights()[0]

In [None]:
temp[temp.relation == "professor"].sort_values('score', ascending=False)

In [None]:
temp[temp.relation == "was born"].sort_values('score', ascending=False)

In [None]:
temp.head(20)

In [None]:
PATH_DIRTY_JSON = 'unfiltered_results/'
# Idempotent replacement for `! mkdir`, which errors when the directory exists.
os.makedirs(PATH_DIRTY_JSON, exist_ok=True)

def save_dirty_json(id, y_pred):
    """Dump a per-cluster summary of the clustered triples to a JSON file.

    For every cluster label ``0 .. y_pred.max()`` the summary holds the
    cluster's rows sorted by descending score (``"data"``) and the frequency
    of each relation string (``"predicates"``).

    Rows come from ``show_cluster_sample``, which reads the notebook-level
    ``dumb_features`` and ``y_pred``; ``y_pred`` here is only used to find
    the number of clusters.

    :param id: file name, relative to ``PATH_DIRTY_JSON``, to write to.
    :param y_pred: array of cluster labels.
    :return: the dict that was written, keyed by integer cluster label.
    """
    result = {}
    # Labels start at 0, so the highest label is one less than the cluster
    # count. The former `range(y_pred.max())` dropped the last cluster.
    number_of_clusters = int(y_pred.max()) + 1
    for number in range(number_of_clusters):
        sample = show_cluster_sample(number, 999).sort_values('score', ascending=False)
        result[number] = {
            # One tuple per row, columns in the frame's order.
            "data": list(zip(*[sample[c].values.tolist() for c in sample])),
            # Cast the numpy counts to int so they are JSON serializable.
            "predicates": {key: int(value) for key, value in sample.relation.value_counts().items()},
        }

    # The context manager closes (and flushes) the file even if dumping fails.
    with open(os.path.join(PATH_DIRTY_JSON, id), 'w') as output_file:
        json.dump(result, output_file)
    return result

In [None]:
save_dirty_json('dcec_kmeans_80c_002.json', y_pred)

In [None]:
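# Example entry of the annotated cluster file (a JSON fragment, kept as a comment because it is not valid Python):
# "94": {"rel_type": "released", "qa_examples": [["When was released Synchronet?", "October 2000"], ["When was first released Skype?", "August 2003"]]}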

#### DAEC

In [None]:
import deep_clustering
save_dir = 'models/daec'
# `! mkdir` cannot create the nested path when 'models' is missing and prints
# an error on every re-run. makedirs handles both cases.
os.makedirs(save_dir, exist_ok=True)

# Deep clustering model built around the autoencoder returned by
# `autoencoder_ctor`.
daec = deep_clustering.DAEC(input_shape=(_subject.shape[1:]),
                            autoencoder_ctor=lambda input_shape: masked_ae(input_shape),  # select model here
                            n_clusters=80,
                            pretrain_epochs=10,
                            log_dir=logPath,
                            save_dir=save_dir,
                            )

# Save a diagram of the model next to its checkpoints.
plot_model(daec._model, to_file=os.path.join(save_dir, 'daec_model.png'), show_shapes=True)
daec.compile(optimizer='adam')

In [None]:
daec._model.summary()

In [None]:
daec.fit([_subject, _object, _relation])

In [None]:
y_pred = daec._y_pred

# Work on an explicit copy of the dataset. `data[:]` is only a slice of
# `data`, so adding columns to it raises SettingWithCopyWarning and leaves it
# unclear whether `data` itself is affected.
dumb_features = data.copy()
scores = daec.score_examples([_subject, _object, _relation])
# Attach the cluster label and the score to every triple.
dumb_features['cluster'] = y_pred
dumb_features['score'] = scores
# NOTE(review): `threshold` is not referenced by any cell visible here.
threshold = 0.05

In [None]:
def show_cluster_sample(number, rows=10):
    """Return the first ``rows`` triples of cluster ``number`` as plain text.

    Reads the notebook-level ``dumb_features`` frame and ``y_pred`` labels.
    The subject, relation and object columns are converted from their token
    structure to space-joined strings.

    :param number: cluster label to select.
    :param rows: maximum number of rows to return.
    :return: new frame with the subject, relation, object and score columns.
    """
    def get_tokens(column):
        return ' '.join(column['tokens'])

    # Truncate before converting so only the returned rows are processed, and
    # take an explicit copy: assigning columns on the filtered slice raised
    # SettingWithCopyWarning.
    cluster = dumb_features[y_pred == number].iloc[:rows].copy()
    for column_name in ('subject', 'relation', 'object'):
        cluster[column_name] = cluster[column_name].map(get_tokens)
    return cluster[['subject', 'relation', 'object', 'score']]

In [None]:
temp = show_cluster_sample(1)
temp

In [None]:
temp = save_dirty_json('daec_kmeans_80c_001.json', y_pred)

In [None]:
temp[5]['predicates']

## Test DC_Kmeans 

In [None]:
import deep_clustering
save_dir = 'models'

# NOTE(review): `x_train` and `plain_noised_ae` are not defined in the cells
# visible here; presumably they come from an earlier notebook version or
# another cell. Verify before running.
dckmeans = deep_clustering.DC_Kmeans(input_shape=x_train.shape[1:], 
                            autoencoder_ctor=lambda input_shape: plain_noised_ae(input_shape),
                            n_clusters=30,
                            max_epochs=200,
                            save_dir=save_dir, 
                            log_dir=logPath)
# Save a diagram of the model into `save_dir`.
plot_model(dckmeans._model, to_file=os.path.join(save_dir, 'dckmeans_model.png'), show_shapes=True)
dckmeans.compile(optimizer='adam')

In [None]:
dckmeans._model.summary()

In [None]:
dckmeans.fit(x_train)

## Clustering of internal representations generated by autoencoder

In [None]:
# Train a plain autoencoder to reconstruct `x_train`, then expose its
# 'embedding' layer as a standalone encoder model.
# NOTE(review): `plain_ae` and `x_train` are not defined in the cells visible
# here. Verify before running.
pae = plain_ae(x_train.shape[1:])
pae.compile(optimizer='adam', loss='mse')
pae.fit(x_train, x_train, batch_size=256, epochs=10, verbose=0)
hidden = pae.get_layer(name='embedding').output
encoder = Model(inputs=pae.input, outputs=hidden)
# The clustering of the embeddings is disabled, so `clusters` is not produced
# by this cell.
#embeddings = encoder.predict(x_train)
#cluzeriser = KMeans(2, n_jobs=6)
#clusters = cluzeriser.fit_predict(embeddings)

In [None]:
pae.save('models/pae_model.h5')

In [None]:
def show_cluster_sample(number):
    """Return up to 10 randomly ordered rows of cluster ``number``.

    Reads the notebook-level ``features`` frame and ``clusters`` labels.
    """
    shown_columns = ['docid', 'subject', 'relation', 'object']
    members = features[clusters == number][shown_columns]
    # Shuffle all members, then keep the first ten.
    shuffled = members.sample(frac=1)
    return shuffled.iloc[:10]

In [None]:
# Write a sample of every cluster 0..49 to a text file, one row per line,
# each cluster introduced by a "<label>-----------------" header.
with open('pae_clusters.txt', 'w') as out:
    for cluster_id in range(50):
        header = str(cluster_id) + '-----------------\n'
        try:
            rows = show_cluster_sample(cluster_id).values.tolist()
            body = "\n".join(str(row) for row in rows)
            out.write(header + body + '\n\n\n')
        except ValueError:
            # Only the header is written for a cluster that cannot be sampled.
            out.write(header)