# Initialization

In [None]:
%load_ext autoreload
%autoreload 2

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys
sys.path.append('./pylingtools/src/')
sys.path.append('./pyexling/src/')
sys.path.append('./syntaxnet_wrapper/src/')

from tensorflow.python.client import device_lib
def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# Show which GPUs are visible to TensorFlow (CUDA_VISIBLE_DEVICES is set above).
print(get_available_gpus())

import tensorflow as tf

# Create a session with the GPU `allow_growth` option enabled.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

# Register that session as the one Keras uses.
from tensorflow.python.keras import backend as K
K.set_session(sess)

In [None]:
import logging

# Configure the root logger to write to both a file under logPath and the console.
logPath = '../logs/'
# os.makedirs(..., exist_ok=True) is idempotent, so re-running this cell does
# not fail or print an error when the directory already exists.
os.makedirs(logPath, exist_ok=True)
fileName = 'main.log'
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")

logger = logging.getLogger()
# Remove handlers left over from an earlier run of this cell; otherwise every
# message would be emitted once per accumulated handler.
if logger.hasHandlers():
    logger.handlers.clear()

fileHandler = logging.FileHandler(os.path.join(logPath, fileName))
fileHandler.setFormatter(logFormatter)
logger.addHandler(fileHandler)

consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
logger.addHandler(consoleHandler)

logger.setLevel(logging.INFO)

In [None]:
import sklearn
import numpy as np
import pandas as pd
import json
import pickle
from tqdm import tqdm_notebook as tqdm
from joblib import Parallel, delayed

# Load data

In [None]:
# Other dataset variants that were tried:
#data = pd.read_pickle('data/dataset_ner.pkl')  # w2v of lemmas
#data = pd.read_pickle('data/processed_separately.pkl')
data = pd.read_pickle('data/processed_separately_clean.pkl')
# Shuffle all rows with a fixed seed, then renumber the index from zero.
data = data.sample(frac=1, random_state=42)
data = data.reset_index(drop=True)
print(data.shape)

In [None]:
def get_tokens(column):
    """Join the 'tokens' entries of one cell into a space-separated string.

    A value that is already a string is returned unchanged, so applying the
    function to a column that was converted earlier (for example when a
    notebook cell is re-run) is a no-op instead of raising ``TypeError``.

    :param column: mapping with a 'tokens' sequence of strings, or a string
    :return: the tokens joined by single spaces
    """
    if isinstance(column, str):
        return column
    return ' '.join(column['tokens'])

# Replace every component of the triple with its tokens joined into plain text.
for part in ('subject', 'relation', 'object'):
    data[part] = data[part].map(get_tokens)

In [None]:
from clusters_examples import SAME_CLUSTERS

def get_labels(row):
    """Tag a row with the 1-based position of the labelled cluster holding its triple.

    The (subject, relation, object) triple is looked up in every entry of
    SAME_CLUSTERS; when several entries contain it, the last one wins. Rows
    that match nothing are returned without a 'label' value being set.
    """
    triple = (row.subject, row.relation, row.object)
    for cluster_id, labeled_cluster in enumerate(SAME_CLUSTERS, start=1):
        if triple in labeled_cluster:
            row['label'] = cluster_id
    return row


data = data.apply(get_labels, axis=1)

In [None]:
# Replace every NaN in the frame with 0, then keep one row per distinct triple.
data = data.fillna(0)
data = data.drop_duplicates(subset=['subject', 'relation', 'object'])
print(data.shape)

In [None]:
data[data.label == 8]

In [None]:
data[data.label > 0]['label'].shape

In [None]:
sum([len(cluster) for cluster in SAME_CLUSTERS])

In [None]:
# Pull the per-triple matrices and the labels out of the frame as arrays.
_object = data.object_matr.values
_subject = data.subject_matr.values
_relation = data.relation_matr.values
_labels = data.label.values

In [None]:
# Target shape: (number of rows, matrix rows, matrix columns), e.g. (17901, 3, 343)
n_samples = data.shape[0]
first_matrix = data.subject_matr.iloc[0]
final_shape = (n_samples, first_matrix.shape[0], first_matrix.shape[1])


def _as_tensor(matrices):
    """Concatenate the per-row matrices and reshape the result to `final_shape`."""
    return np.concatenate(matrices).reshape(final_shape)


_object = _as_tensor(_object)
_subject = _as_tensor(_subject)
_relation = _as_tensor(_relation)

In [None]:
from sklearn.preprocessing import StandardScaler

# scalers[0] / [1] / [2] hold the fitted scalers for object / subject / relation,
# keyed by the index along axis 1 of the corresponding array.
scalers = [{}, {}, {}]

for scaler_by_pos, tensor in zip(scalers, (_object, _subject, _relation)):
    for i in range(tensor.shape[1]):
        # One scaler per position along axis 1, fitted on that slice and written back in place.
        scaler_by_pos[i] = StandardScaler()
        tensor[:, i, :] = scaler_by_pos[i].fit_transform(tensor[:, i, :])

# Models

In [None]:
%load_ext cython

from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.layers import LSTM, GRU, Dense
from tensorflow.python.keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Conv2DTranspose
from tensorflow.python.keras.layers import Dropout, UpSampling2D
from tensorflow.python.keras.layers import Concatenate
from tensorflow.python.keras.layers import Masking
from tensorflow.python.keras.layers import Reshape
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers import Input, Layer
from tensorflow.python.keras.layers import Lambda
from tensorflow.python.keras.layers import GlobalMaxPooling1D
from tensorflow.python.keras.layers import RepeatVector
from tensorflow.python.keras.layers import Activation
from tensorflow.python.keras.layers import Permute, Add
from tensorflow.python.keras.layers import concatenate
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.models import model_from_json
from tensorflow.python.keras import regularizers
from tensorflow.python.keras.callbacks import Callback
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.layers import GaussianNoise
from tensorflow.python.keras.layers import UpSampling1D

from copy import deepcopy
from sklearn.metrics import f1_score
from tensorboardX import SummaryWriter

import math
from time import time

from sklearn.cluster import KMeans

from tensorflow.python.keras.layers import Conv2D, Conv2DTranspose, Flatten, Reshape, Layer, InputSpec
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.utils.vis_utils import plot_model
from datetime import datetime

from tensorflow.python.keras.callbacks import TensorBoard

In [None]:
def noised_ae(input_shape):
    """Build a three-input, three-output convolutional autoencoder with noisy inputs.

    Each of the subject, object and relation inputs has Gaussian noise added
    and is passed through its own Conv1D/MaxPooling1D encoder. The flattened
    encodings are concatenated and projected onto a sigmoid Dense layer named
    'embedding', from which three separate decoders rebuild the three inputs.

    :param input_shape: shape of a single input sample; element 0 is used as
        the upsampling factor and element 1 as the channel count of the outputs
    :return: the uncompiled Keras ``Model``
    """
    latent_dim = 100

    def encode_embedding_input(tensor):
        x = GaussianNoise(stddev=.1)(tensor)
        x = Conv1D(128, (2,), activation='relu', padding='same')(x)
        x = MaxPooling1D((2,), padding='same')(x)
        x = Conv1D(32, (2,), activation='relu', padding='same')(x)
        x = MaxPooling1D((2,), padding='same')(x)
        return Flatten()(x)

    def decode_embedding_input(embedding, name):
        x = Reshape((1, latent_dim))(embedding)
        x = Conv1D(128, (1,), activation='relu', padding='same', name=name+'_conv1')(x)
        x = UpSampling1D(input_shape[0], name=name+'_up1')(x)
        return Conv1D(input_shape[1], (6,), activation='relu', padding='same', name=name+'_conv2')(x)

    # Layers are created in the order subject, object, relation throughout.
    inputs = [Input(shape=input_shape, name=input_name)
              for input_name in ('input_subject', 'input_object', 'input_rel')]
    encoded = [encode_embedding_input(tensor) for tensor in inputs]

    merged = concatenate(encoded)
    latent = Dense(latent_dim, activation='sigmoid', name='embedding')(merged)

    outputs = [decode_embedding_input(latent, output_name)
               for output_name in ('output_subject', 'output_object', 'output_rel')]

    return Model(inputs=inputs, outputs=outputs)

In [None]:
def masked_ae(input_shape):
    """Autoencoder that hides the relation and restores all three parts.

    Only the subject and object inputs are encoded; the relation input is
    declared as a model input but is not connected to the encoder. The
    'embedding' layer is decoded into subject, object and relation outputs.

    :param input_shape: shape of a single input sample; element 0 is used as
        the upsampling factor and element 1 as the channel count of the outputs
    :return: the uncompiled Keras ``Model``
    """
    embedding_size = 50

    def encode_embedding_input(tensor):
        hidden = Conv1D(128, (2,), activation='relu', padding='same')(tensor)
        hidden = MaxPooling1D((2,), padding='same')(hidden)
        hidden = Conv1D(32, (2,), activation='relu', padding='same')(hidden)
        hidden = MaxPooling1D((2,), padding='same')(hidden)
        return Flatten()(hidden)

    def decode_embedding_input(embedding, name):
        hidden = Reshape((1, embedding_size))(embedding)
        hidden = Conv1D(128, (1,), activation='relu', padding='same', name=name+'_conv1')(hidden)
        hidden = UpSampling1D(input_shape[0], name=name+'_up1')(hidden)
        return Conv1D(input_shape[1], (6,), activation='relu', padding='same', name=name+'_conv2')(hidden)

    subject_in = Input(shape=input_shape, name='input_subject')
    object_in = Input(shape=input_shape, name='input_object')
    relation_in = Input(shape=input_shape, name='input_rel')

    # The relation is deliberately left out of the encoder input.
    subject_code = encode_embedding_input(subject_in)
    object_code = encode_embedding_input(object_in)
    merged = concatenate([subject_code, object_code])
    latent = Dense(embedding_size, activation='sigmoid', name='embedding')(merged)

    decoded = [decode_embedding_input(latent, output_name)
               for output_name in ('output_subject', 'output_object', 'output_rel')]

    return Model(inputs=[subject_in, object_in, relation_in], outputs=decoded)

In [None]:
def restore_rel(input_shape):
    """Autoencoder that hides the relation and restores only the relation.

    Subject and object are encoded into the 'embedding' layer, which is
    decoded into the relation output. The subject and object outputs are just
    the corresponding inputs passed through a GaussianNoise layer, so the
    model still exposes three outputs. The relation input is declared as a
    model input but is not connected to the encoder.

    :param input_shape: shape of a single input sample; element 0 is used as
        the upsampling factor and element 1 as the channel count of the output
    :return: the uncompiled Keras ``Model``
    """
    code_size = 50

    def encode_embedding_input(tensor):
        features = Conv1D(128, (2,), activation='relu', padding='same')(tensor)
        features = MaxPooling1D((2,), padding='same')(features)
        features = Conv1D(32, (2,), activation='relu', padding='same')(features)
        features = MaxPooling1D((2,), padding='same')(features)
        return Flatten()(features)

    def decode_embedding_input(embedding, name):
        features = Reshape((1, code_size))(embedding)
        features = Conv1D(128, (1,), activation='relu', padding='same', name=name+'_conv1')(features)
        features = UpSampling1D(input_shape[0], name=name+'_up1')(features)
        return Conv1D(input_shape[1], (6,), activation='relu', padding='same', name=name+'_conv2')(features)

    subject_in = Input(shape=input_shape, name='input_subject')
    subject_passthrough = GaussianNoise(stddev=.001)(subject_in)
    object_in = Input(shape=input_shape, name='input_object')
    object_passthrough = GaussianNoise(stddev=.001)(object_in)
    relation_in = Input(shape=input_shape, name='input_rel')

    # The encoder sees the original (not the noised) subject and object tensors.
    codes = [encode_embedding_input(subject_in), encode_embedding_input(object_in)]
    latent = Dense(code_size, activation='sigmoid', name='embedding')(concatenate(codes))

    relation_out = decode_embedding_input(latent, 'output_rel')

    return Model(inputs=[subject_in, object_in, relation_in],
                 outputs=[subject_passthrough, object_passthrough, relation_out])

In [None]:
def restore_obj(input_shape):
    """Autoencoder that hides the object and restores only the object.

    Subject and relation are encoded into the 'embedding' layer, which is
    decoded into the object output. The subject and relation outputs are just
    the corresponding inputs passed through a GaussianNoise layer. The object
    input is declared as a model input but is not connected to the encoder.

    :param input_shape: shape of a single input sample; element 0 is used as
        the upsampling factor and element 1 as the channel count of the output
    :return: the uncompiled Keras ``Model``
    """
    bottleneck = 50

    def encode_embedding_input(tensor):
        out = Conv1D(128, (2,), activation='relu', padding='same')(tensor)
        out = MaxPooling1D((2,), padding='same')(out)
        out = Conv1D(32, (2,), activation='relu', padding='same')(out)
        out = MaxPooling1D((2,), padding='same')(out)
        return Flatten()(out)

    def decode_embedding_input(embedding, name):
        out = Reshape((1, bottleneck))(embedding)
        out = Conv1D(128, (1,), activation='relu', padding='same', name=name+'_conv1')(out)
        out = UpSampling1D(input_shape[0], name=name+'_up1')(out)
        return Conv1D(input_shape[1], (6,), activation='relu', padding='same', name=name+'_conv2')(out)

    subject_in = Input(shape=input_shape, name='input_subject')
    subject_passthrough = GaussianNoise(stddev=.001)(subject_in)
    object_in = Input(shape=input_shape, name='input_object')
    relation_in = Input(shape=input_shape, name='input_rel')
    relation_passthrough = GaussianNoise(stddev=.001)(relation_in)

    # The encoder sees the original (not the noised) subject and relation tensors.
    subject_code = encode_embedding_input(subject_in)
    relation_code = encode_embedding_input(relation_in)
    merged = concatenate([subject_code, relation_code])
    latent = Dense(bottleneck, activation='sigmoid', name='embedding')(merged)

    # Only the object gets a learned decoder.
    object_out = decode_embedding_input(latent, 'output_object')

    return Model(inputs=[subject_in, object_in, relation_in],
                 outputs=[subject_passthrough, object_out, relation_passthrough])

In [None]:
_subject.shape[1:], _object.shape[1:], _relation.shape[1:]

## Test autoencoder

In [None]:
# Smoke test: train the noised autoencoder to reproduce its own inputs.
model = noised_ae((_object.shape[1:]))
model.summary()

# NOTE(review): this Adam(lr=0.01) instance is never passed to compile(), which
# receives the string 'adam' instead — confirm which optimizer was intended.
optimizer = Adam(lr=0.01)
model.compile(optimizer='adam', loss='mse')

# Inputs and targets are the same three tensors, in subject/object/relation order.
model.fit(x=[_subject, _object, _relation],
          y=[_subject, _object, _relation], epochs=200, batch_size=256)

In [None]:
# Smoke test: train the relation-restoring autoencoder for a few epochs.
model = restore_rel((_object.shape[1:]))
model.summary()

# NOTE(review): this Adam(lr=0.01) instance is never passed to compile(), which
# receives the string 'adam' instead — confirm which optimizer was intended.
optimizer = Adam(lr=0.01)
model.compile(optimizer='adam', loss='mse')

# Inputs and targets are the same three tensors, in subject/object/relation order.
model.fit(x=[_subject, _object, _relation],
          y=[_subject, _object, _relation], epochs=10, batch_size=256)

## Train IDEC

In [None]:
import deep_clustering_ss as deep_clustering
save_dir = 'semi_sup/idec/restore_rel'
# Plain `mkdir` cannot create the nested path when its parents are missing and
# complains when the cell is re-run; os.makedirs handles both cases.
os.makedirs(save_dir, exist_ok=True)

# Semi-supervised IDEC on top of the relation-restoring autoencoder.
idec = deep_clustering.IDEC(input_shape=(_object.shape[1:]),
                            autoencoder_ctor=lambda input_shape: restore_rel(input_shape),  # select model here
                            n_clusters=2,
                            pretrain_epochs=2,
                            maxiter=int(1e2),
                            save_dir=save_dir, 
                            log_dir=logPath)

# Save a diagram of the assembled model next to its checkpoints.
plot_model(idec._model, to_file=os.path.join(save_dir, 'idec_model.png'), show_shapes=True)
idec.compile(gamma=0.1)

In [None]:
idec.fit([_subject, _object, _relation], labels=_labels)
#idec.fit([_subject, _object, _relation])

In [None]:
def get_tokens(column):
    """Join the 'tokens' entries of one cell into a space-separated string.

    Strings are returned unchanged: the subject/relation/object columns of
    `data` are converted to plain text earlier in the notebook, and indexing
    such a string with 'tokens' would raise ``TypeError``.
    """
    if isinstance(column, str):
        return column
    return ' '.join(column['tokens'])

# Collect the readable triples together with their cluster assignment and score.
y_pred = idec._y_pred
dumb_features = pd.DataFrame()
dumb_features['subject'] = data['subject'].map(get_tokens)
dumb_features['relation'] = data['relation'].map(get_tokens)
dumb_features['object'] = data['object'].map(get_tokens)
dumb_features['cluster'] = y_pred
scores = idec.score_examples([_subject, _object, _relation])
dumb_features['score'] = scores
# Keep only the examples whose score exceeds the threshold.
threshold = 0.05
dumb_features = dumb_features[dumb_features['score'] > threshold]

In [None]:
def show_cluster_sample(number, rows=999):
    """Return the first `rows` triples of cluster `number` with their scores.

    Reads the module-level `dumb_features` frame.

    :param number: cluster id to select
    :param rows: maximum number of rows to return
    :return: DataFrame with the columns subject, relation, object and score
    """
    cluster = dumb_features[dumb_features.cluster == number]
    return cluster[['subject', 'relation', 'object', 'score']].iloc[:rows]

In [None]:
dumb_features[dumb_features.object == 'eliza'].sort_values('cluster')

In [None]:
dumb_features[dumb_features.relation.str.contains('announced launch')].sort_values('cluster')

In [None]:
dumb_features[dumb_features.relation.str.contains('worked')].sort_values('cluster')

In [None]:
number = 34
temp = show_cluster_sample(number)
temp.relation.value_counts()

In [None]:
PATH_DIRTY_JSON = 'unfiltered_results/idec/restore_obj/'
! mkdir $PATH_DIRTY_JSON

def save_dirty_json(id, y_pred):
    """Write the triples and predicate counts of every cluster to a JSON file.

    For each cluster id the rows returned by `show_cluster_sample` are sorted
    by descending score and stored together with the frequency of each
    relation string. The file is written to PATH_DIRTY_JSON/<id>.

    :param id: file name of the JSON file to create inside PATH_DIRTY_JSON
    :param y_pred: array of predicted cluster ids
    :return: the dictionary that was serialised, keyed by cluster id
    """
    result = {}
    # Cluster ids are assumed to be 0-based, so the highest id equals max();
    # add one so that range() also visits that last cluster.
    number_of_clusters = int(y_pred.max()) + 1
    for number in range(number_of_clusters):
        sample = show_cluster_sample(number, 999).sort_values('score', ascending=False)
        result[int(number)] = {
            # Transpose the per-column lists into one tuple per row.
            "data": list(zip(*[sample[c].values.tolist() for c in sample])),
            # Cast the counts to plain int so that json can serialise them.
            "predicates": {key: int(value) for key, value in sample.relation.value_counts().items()},
        }

    # Make sure the (nested) output directory exists before writing.
    os.makedirs(PATH_DIRTY_JSON, exist_ok=True)
    with open(os.path.join(PATH_DIRTY_JSON, id), 'w') as json_file:
        json.dump(result, json_file)
    return result

In [None]:
res = save_dirty_json('dcec_kmeans_50c_000_1e4.json', y_pred)

## Train DAEC

In [None]:
! mkdir models/daec

In [None]:
import deep_clustering
save_dir = 'models/daec/restore_rel'
# os.makedirs creates missing parents and does not complain on re-runs.
os.makedirs(save_dir, exist_ok=True)

# DAEC on top of the relation-restoring autoencoder.
daec = deep_clustering.DAEC(input_shape=(_subject.shape[1:]),
                            autoencoder_ctor=lambda input_shape: restore_rel(input_shape),  # select model here
                            n_clusters=50, 
                            pretrain_epochs=10,
                            log_dir=logPath,
                            save_dir=save_dir, 
                            )

# Plot the model that was just built (previously referenced the undefined name `dcec`).
plot_model(daec._model, to_file=os.path.join(save_dir, 'daec_model.png'), show_shapes=True)
daec.compile(optimizer='adam')

In [None]:
daec._model.summary()

In [None]:
daec.fit([_subject, _object, _relation])

In [None]:
def get_tokens(column):
    """Join the 'tokens' entries of one cell into a space-separated string.

    Strings are returned unchanged: the subject/relation/object columns of
    `data` are converted to plain text earlier in the notebook, and indexing
    such a string with 'tokens' would raise ``TypeError``.
    """
    if isinstance(column, str):
        return column
    return ' '.join(column['tokens'])

# Collect the readable triples together with their cluster assignment and score.
y_pred = daec._y_pred
dumb_features = pd.DataFrame()
dumb_features['subject'] = data['subject'].map(get_tokens)
dumb_features['relation'] = data['relation'].map(get_tokens)
dumb_features['object'] = data['object'].map(get_tokens)
dumb_features['cluster'] = y_pred
# Score with the same model the predictions come from (previously the undefined name `dcec`).
scores = daec.score_examples([_subject, _object, _relation])
dumb_features['score'] = scores
# Keep only the examples whose score exceeds the threshold.
threshold = 0.05
dumb_features = dumb_features[dumb_features['score'] > threshold]

In [None]:
def show_cluster_sample(number, rows=100):
    """Return the first `rows` triples of cluster `number` with their scores.

    Reads the module-level `dumb_features` frame.

    :param number: cluster id to select
    :param rows: maximum number of rows to return
    :return: DataFrame with the columns subject, relation, object and score
    """
    cluster = dumb_features[dumb_features.cluster == number]
    return cluster[['subject', 'relation', 'object', 'score']].iloc[:rows]

In [None]:
dumb_features.head()

In [None]:
dumb_features[dumb_features.object == 'eliza'].sort_values('cluster')

In [None]:
dumb_features[dumb_features.relation.str.contains('born')].sort_values('cluster').iloc[:20]

In [None]:
number = 4
temp = show_cluster_sample(number)
temp.relation.value_counts()

In [None]:
temp.sort_values('score', ascending=False)

In [None]:
PATH_DIRTY_JSON = 'unfiltered_results/daec/restore_rel/'
! mkdir $PATH_DIRTY_JSON

def save_dirty_json(id, y_pred):
    """Write the triples and predicate counts of every cluster to a JSON file.

    For each cluster id the rows returned by `show_cluster_sample` are sorted
    by descending score and stored together with the frequency of each
    relation string. The file is written to PATH_DIRTY_JSON/<id>.

    :param id: file name of the JSON file to create inside PATH_DIRTY_JSON
    :param y_pred: array of predicted cluster ids
    :return: the dictionary that was serialised, keyed by cluster id
    """
    result = {}
    # Cluster ids are assumed to be 0-based, so the highest id equals max();
    # add one so that range() also visits that last cluster.
    number_of_clusters = int(y_pred.max()) + 1
    for number in range(number_of_clusters):
        sample = show_cluster_sample(number, 999).sort_values('score', ascending=False)
        result[int(number)] = {
            # Transpose the per-column lists into one tuple per row.
            "data": list(zip(*[sample[c].values.tolist() for c in sample])),
            # Cast the counts to plain int so that json can serialise them.
            "predicates": {key: int(value) for key, value in sample.relation.value_counts().items()},
        }

    # Make sure the (nested) output directory exists before writing.
    os.makedirs(PATH_DIRTY_JSON, exist_ok=True)
    with open(os.path.join(PATH_DIRTY_JSON, id), 'w') as json_file:
        json.dump(result, json_file)
    return result

In [None]:
res = save_dirty_json('daec_kmeans_80c_002.json', y_pred)

In [None]:
temp[6]

## Train DC_Kmeans 

In [None]:
! mkdir models/dc_kmeans

In [None]:
import deep_clustering
save_dir = 'models/dc_kmeans/restore_rel'
# os.makedirs creates missing parents and does not complain on re-runs.
os.makedirs(save_dir, exist_ok=True)

# DC_Kmeans on top of the relation-restoring autoencoder.
dckmeans = deep_clustering.DC_Kmeans(
                            input_shape=(_subject.shape[1:]),
                            autoencoder_ctor=lambda input_shape: restore_rel(input_shape),  # select model here
                            n_clusters=30,
                            pretrain_epochs=50,
                            max_epochs=200,
                            save_dir=save_dir, 
                            log_dir=logPath)
# Save a diagram of the assembled model next to its checkpoints.
plot_model(dckmeans._model, to_file=os.path.join(save_dir, 'dckmeans_model.png'), show_shapes=True)
dckmeans.compile(optimizer='adam')

In [None]:
dckmeans._model.summary()

In [None]:
dckmeans.fit([_subject, _object, _relation])

In [None]:
def get_tokens(column):
    """Join the 'tokens' entries of one cell into a space-separated string.

    Strings are returned unchanged: the subject/relation/object columns of
    `data` are converted to plain text earlier in the notebook, and indexing
    such a string with 'tokens' would raise ``TypeError``.
    """
    if isinstance(column, str):
        return column
    return ' '.join(column['tokens'])

# Collect the readable triples together with their cluster assignment and score.
y_pred = dckmeans.y_pred
dumb_features = pd.DataFrame()
dumb_features['subject'] = data['subject'].map(get_tokens)
dumb_features['relation'] = data['relation'].map(get_tokens)
dumb_features['object'] = data['object'].map(get_tokens)
dumb_features['cluster'] = y_pred
scores = dckmeans.get_scores([_subject, _object, _relation])
dumb_features['score'] = scores
# Keep only the examples whose score exceeds the threshold.
threshold = 0.05
dumb_features = dumb_features[dumb_features['score'] > threshold]

In [None]:
def show_cluster_sample(number, rows=100):
    """Return the first `rows` triples of cluster `number` with their scores.

    Reads the module-level `dumb_features` frame.

    :param number: cluster id to select
    :param rows: maximum number of rows to return
    :return: DataFrame with the columns subject, relation, object and score
    """
    cluster = dumb_features[dumb_features.cluster == number]
    return cluster[['subject', 'relation', 'object', 'score']].iloc[:rows]

In [None]:
dumb_features[dumb_features.object == 'eliza'].sort_values('cluster')

In [None]:
dumb_features[dumb_features.relation.str.contains('born')].sort_values('cluster').iloc[:20]

In [None]:
number = 4
temp = show_cluster_sample(number)
temp.relation.value_counts()

In [None]:
temp.sort_values('score', ascending=False)

In [None]:
PATH_DIRTY_JSON = 'unfiltered_results/dc_kmeans/restore_rel/'
! mkdir $PATH_DIRTY_JSON

def save_dirty_json(id, y_pred):
    """Write the triples and predicate counts of every cluster to a JSON file.

    For each cluster id the rows returned by `show_cluster_sample` are sorted
    by descending score and stored together with the frequency of each
    relation string. The file is written to PATH_DIRTY_JSON/<id>.

    :param id: file name of the JSON file to create inside PATH_DIRTY_JSON
    :param y_pred: array of predicted cluster ids
    :return: the dictionary that was serialised, keyed by cluster id
    """
    result = {}
    # Cluster ids are assumed to be 0-based, so the highest id equals max();
    # add one so that range() also visits that last cluster.
    number_of_clusters = int(y_pred.max()) + 1
    for number in range(number_of_clusters):
        sample = show_cluster_sample(number, 999).sort_values('score', ascending=False)
        result[int(number)] = {
            # Transpose the per-column lists into one tuple per row.
            "data": list(zip(*[sample[c].values.tolist() for c in sample])),
            # Cast the counts to plain int so that json can serialise them.
            "predicates": {key: int(value) for key, value in sample.relation.value_counts().items()},
        }

    # Make sure the (nested) output directory exists before writing.
    os.makedirs(PATH_DIRTY_JSON, exist_ok=True)
    with open(os.path.join(PATH_DIRTY_JSON, id), 'w') as json_file:
        json.dump(result, json_file)
    return result

In [None]:
res = save_dirty_json('dc_kmeans_30c_000.json', y_pred)

## Clustering of internal representations generated by autoencoder

In [None]:
# Train a plain autoencoder on x_train and expose its 'embedding' layer as an encoder model.
# NOTE(review): `plain_ae` and `x_train` are not defined in the visible part of
# this notebook — confirm where they come from before running this cell.
pae = plain_ae(x_train.shape[1:])
pae.compile(optimizer='adam', loss='mse')
pae.fit(x_train, x_train, batch_size=256, epochs=10, verbose=0)
hidden = pae.get_layer(name='embedding').output
encoder = Model(inputs=pae.input, outputs=hidden)
# The clustering step is commented out, so this cell does not assign `clusters`.
#embeddings = encoder.predict(x_train)
#cluzeriser = KMeans(2, n_jobs=6)
#clusters = cluzeriser.fit_predict(embeddings)

In [None]:
pae.save('models/pae_model.h5')

In [None]:
def show_cluster_sample(number):
    """Return up to 10 randomly ordered rows of `features` assigned to cluster `number`.

    Reads the module-level `features` frame and `clusters` array.
    """
    columns = ['docid', 'subject', 'relation', 'object']
    members = features[clusters == number]
    shuffled = members[columns].sample(frac=1)
    return shuffled.iloc[:10]

In [None]:
# Write a small random sample of every cluster (ids 0..49) to a text file.
with open('pae_clusters.txt', 'w') as f:
    for i in range(50):
        header = str(i)+'-----------------\n'
        try:
            sample_rows = show_cluster_sample(i).values.tolist()
            body = "\n".join(str(sample_row) for sample_row in sample_rows)
            f.write(header + body + '\n\n\n')
        except ValueError:
            # If sampling raised ValueError, emit the header only.
            f.write(header)