# Initialization

In [1]:
%load_ext autoreload
%autoreload 2

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys
sys.path.append('./pylingtools/src/')
sys.path.append('./pyexling/src/')
sys.path.append('./syntaxnet_wrapper/src/')

In [2]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

get_available_gpus()

['/device:GPU:0']

In [3]:
import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

from tensorflow.python.keras import backend as K
K.set_session(sess)

In [4]:
import logging

logPath = '../logs/'
! mkdir $logPath
fileName = 'main.log'
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")

logger = logging.getLogger()
if logger.hasHandlers():
    logger.handlers.clear()

fileHandler = logging.FileHandler(os.path.join(logPath, fileName))
fileHandler.setFormatter(logFormatter)
logger.addHandler(fileHandler)

consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
logger.addHandler(consoleHandler)

logger.setLevel(logging.INFO)

mkdir: cannot create directory ‘../logs/’: File exists


In [5]:
import sklearn
import numpy as np
import pandas as pd
import json
import pickle
from tqdm import tqdm_notebook as tqdm
from joblib import Parallel, delayed

# Load data

In [6]:
data = pd.read_pickle('data/dataset_ner.pkl')

In [7]:
data.shape

(31917, 351)

In [8]:
deprecated = ('one', 'she', 'they', 'his', 'her', 'its', 'our', 'day', 'co.', 'inc.', 
              'society', 'people', 'inventor', 'head', 'poet', 'doctor', 'teacher', 'inventor', 
              'thanksgiving day', 'halloween',
              'sales person', 'model', 'board', 'technology', 'owner', 'one', 'two', 
              'university', 'fbi', 'patricia churchland'
             )

data = data[data.object.map(len) > 2]
data = data[data.subject.map(len) > 2]

for dep_word in deprecated:
    data = data[data.subject.str.lower() != dep_word]
    data = data[data.object.str.lower() != dep_word]

In [9]:
data = data[data.lemma_subj.map(len) < 4]
data = data[data.lemma_obj.map(len) < 4]
data = data[data.lemma_rel.map(len) < 4]

In [10]:
data.shape

(30325, 351)

In [11]:
vectorizer_rel = pickle.load(open('models/relation_vectorizer.pkl', 'rb'))
bow_rel = vectorizer_rel.transform(data['relation'].values).toarray()
columns = [column + '_bow_rel' for column in vectorizer_rel.get_feature_names()]
features = pd.concat([data.reset_index(drop=True), pd.DataFrame(bow_rel, columns=columns)], axis=1)

vectorizer_path = pickle.load(open('models/path_vectorizer.pkl', 'rb'))
bow_path = vectorizer_path.transform(features['dep_path'].values).toarray()
columns = [column + '_bow_path' for column in vectorizer_path.get_feature_names()]
features = pd.concat([features.reset_index(drop=True), pd.DataFrame(bow_path, columns=columns)], axis=1)

data = features
del features



In [12]:
data.shape

(30325, 2351)

In [13]:
excluding_cols = ['docid', 'subject', 'relation', 'object', 'dep_path', 'lemma_subj', 'lemma_rel', 'lemma_obj']
embedding_cols = ['w2v_subj', 'w2v_rel', 'w2v_obj']
w2v_subj = np.array([line.flatten() for line in data['w2v_subj'].values])
w2v_rel = np.array([line.flatten() for line in data['w2v_rel'].values])
w2v_obj = np.array([line.flatten() for line in data['w2v_obj'].values])
x_train = np.concatenate([data.drop(columns=excluding_cols+embedding_cols), 
                          w2v_subj, w2v_rel, w2v_obj], axis=1)

In [14]:
x_train.shape

(30325, 5040)

In [15]:
from sklearn.preprocessing import StandardScaler

x_train = StandardScaler().fit_transform(x_train)

np.random.shuffle(x_train)

plain_features = data.drop(columns=excluding_cols+embedding_cols).values
plain_features = StandardScaler().fit_transform(plain_features)



In [16]:
w2v_subj = np.array([np.array(line) for line in data['w2v_subj']])
w2v_obj = np.array([np.array(line) for line in data['w2v_obj']])
w2v_rel = np.array([np.array(line) for line in data['w2v_rel']])

In [17]:
plain_features.shape, w2v_subj.shape, w2v_rel.shape, w2v_obj.shape

((30325, 2340), (30325, 3, 300), (30325, 3, 300), (30325, 3, 300))

# Models

In [18]:
%load_ext cython

from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.layers import LSTM, GRU, Dense
from tensorflow.python.keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Conv2DTranspose
from tensorflow.python.keras.layers import Dropout, UpSampling2D
from tensorflow.python.keras.layers import Concatenate
from tensorflow.python.keras.layers import Masking
from tensorflow.python.keras.layers import Reshape
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers import Input, Layer
from tensorflow.python.keras.layers import Lambda
from tensorflow.python.keras.layers import GlobalMaxPooling1D
from tensorflow.python.keras.layers import RepeatVector
from tensorflow.python.keras.layers import Activation
from tensorflow.python.keras.layers import Permute, Add
from tensorflow.python.keras.layers import concatenate
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.models import model_from_json
from tensorflow.python.keras import regularizers
from tensorflow.python.keras.callbacks import Callback
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.layers import GaussianNoise
from tensorflow.python.keras.layers import UpSampling1D

from copy import deepcopy
from sklearn.metrics import f1_score
from tensorboardX import SummaryWriter

import math
from time import time

from sklearn.cluster import KMeans

from tensorflow.python.keras.layers import Conv2D, Conv2DTranspose, Flatten, Reshape, Layer, InputSpec
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.utils.vis_utils import plot_model
from datetime import datetime

from tensorflow.python.keras.callbacks import TensorBoard

In [19]:
LAYER_1 = 200
LAYER_2 = 50
INNER = 10

def plain_ae(input_shape):
    
    input_layer = Input(shape=input_shape)
    x = Dense(LAYER_1, activation='relu', name='enc1')(input_layer)
    x = Dense(LAYER_2, activation='relu', name='enc2')(x)
    latent = Dense(INNER, activation='sigmoid', name='embedding')(x)
    x = Dense(LAYER_2, activation='relu', name='dec1')(latent)
    x = Dense(LAYER_1, activation='relu', name='dec2')(x)
    output_layer = Dense(input_shape[0], name='output')(x)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    
    return model

In [20]:
LAYER_1 = 200
LAYER_2 = 50
INNER = 10

def plain_noised_ae(input_shape):
    
    input_layer = Input(shape=input_shape)
    x = GaussianNoise(0.5)(input_layer)
    x = Dense(LAYER_1, activation='relu', name='enc1')(x)
    x = Dense(LAYER_2, activation='relu', name='enc2')(x)
    latent = Dense(INNER, activation='sigmoid', name='embedding')(x)
    x = Dense(LAYER_2, activation='relu', name='dec1')(latent)
    x = Dense(LAYER_1, activation='relu', name='dec2')(x)
    output_layer = Dense(input_shape[0], name='output')(x)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    
    return model

In [21]:
LAYER_1 = 500
LAYER_2 = 200
INNER_SIZE = 100

def noised_ae(input_shape):
    
    def encode_plain_input(input_layer):
        x = GaussianNoise(0.5)(input_layer)
        x = Dense(LAYER_1, activation='relu', name='enc1')(x)
        #x = Dense(LAYER_2, activation='relu', name='enc2')(x)
        return x

    def encode_embedding_input(input_layer):
        conv1 = Conv1D(128, (2,), activation='relu', padding='same')(input_layer)
        pool1 = MaxPooling1D((2,), padding='same')(conv1)
        conv2 = Conv1D(32, (2,), activation='relu', padding='same')(pool1)
        pool2 = MaxPooling1D((2,), padding='same')(conv2)
        return Flatten()(pool2)
    
    def decode_plain_input(latent):
        x = Dense(LAYER_2, activation='relu', name='dec1')(latent)
        #x = Dense(LAYER_1, activation='relu', name='dec2')(x)
        output = Dense(input_shape_plain[0], name='output_plain')(x)
        return output
    
    def decode_embedding_input(latent, name):
        latent = Reshape((1, INNER_SIZE))(latent)
        up0 = UpSampling1D(3)(latent)
        conv1 = Conv1D(3, (3,), activation='relu', padding='same', name=name+'_conv1')(up0)
        #up1 = UpSampling1D(3, name=name+'_up1')(conv1)
        conv2 = Conv1D(300, (1,), activation='relu', padding='same', name=name+'_conv2')(conv1)
        #up2 = UpSampling1D(1, name=name+'_up2')(conv2)
        #r = Conv1D(1, (2,), activation='sigmoid', padding='same', name=name)(up2)
        return conv2
    
    input_shape_plain, input_shape_emb = input_shape
    
    input_plain = Input(shape=input_shape_plain, name='input_plain')
    input_subject = Input(shape=input_shape_emb, name='input_subject')
    input_object = Input(shape=input_shape_emb, name='input_object')
    input_rel = Input(shape=input_shape_emb, name='input_rel')

    encode_plain = encode_plain_input(input_plain)
    encode_subject = encode_embedding_input(input_subject)
    encode_object = encode_embedding_input(input_object)
    encode_rel = encode_embedding_input(input_rel)
    
    x = concatenate([encode_plain, encode_subject, encode_object, encode_rel])
    latent = Dense(INNER_SIZE, activation='sigmoid', name='embedding')(x)
    
    output_plain = decode_plain_input(latent)
    output_subject = decode_embedding_input(latent, 'output_subject')
    output_object = decode_embedding_input(latent, 'output_object')
    output_rel = decode_embedding_input(latent, 'output_rel')
    
    model = Model(inputs=[input_plain, input_subject, input_object, input_rel], 
                  outputs=[output_plain, output_subject, output_object, output_rel])

    return model

In [22]:
LAYER_1 = 200
LAYER_2 = 100
INNER_SIZE = 100

def masked_ae(input_shape):
    """ mask relation embedding and try to restore it """
    
    def encode_plain_input(input_layer):
        x = Dense(LAYER_2, activation='relu', name='enc2')(input_layer)
        return x

    def encode_embedding_input(input_layer):
        conv1 = Conv1D(128, (2,), activation='relu', padding='same')(input_layer)
        pool1 = MaxPooling1D((2,), padding='same')(conv1)
        return Flatten()(pool1)
    
    def decode_plain_input(latent):
        x = Dense(LAYER_2, activation='relu', name='dec1')(latent)
        output = Dense(input_shape_plain[0], name='output_plain')(x)
        return output
    
    def decode_embedding_input(latent, name):
        latent = Reshape((1, INNER_SIZE))(latent)
        conv1 = Conv1D(128, (1,), activation='relu', padding='same', name=name+'_conv1')(latent)
        up1 = UpSampling1D(3, name=name+'_up1')(conv1)
        conv2 = Conv1D(300, (1,), activation='relu', padding='same', name=name+'_conv2')(up1)
        return conv2
    
    input_shape_plain, input_shape_emb = input_shape
    
    input_plain = Input(shape=input_shape_plain, name='input_plain')
    input_subject = Input(shape=input_shape_emb, name='input_subject')
    input_object = Input(shape=input_shape_emb, name='input_object')
    input_rel = Input(shape=input_shape_emb, name='input_rel')

    encode_plain = encode_plain_input(input_plain)
    encode_subject = encode_embedding_input(input_subject)
    encode_object = encode_embedding_input(input_object)
    encode_rel = encode_embedding_input(input_rel)
    
    x = concatenate([encode_plain, encode_subject, encode_object])
    latent = Dense(INNER_SIZE, activation='sigmoid', name='embedding')(x)
    
    output_plain = decode_plain_input(latent)
    output_subject = decode_embedding_input(latent, 'output_subject')
    output_object = decode_embedding_input(latent, 'output_object')
    output_rel = decode_embedding_input(latent, 'output_rel')
    
    model = Model(inputs=[input_plain, input_subject, input_object, input_rel], 
                  outputs=[output_plain, output_subject, output_object, output_rel])

    return model

In [23]:
plain_features.shape[1:], w2v_obj.shape[1:]

((2340,), (3, 300))

model = masked_ae((plain_features.shape[1:], w2v_obj.shape[1:]))
model.summary()

optimizer = Adam(lr=0.3)
model.compile(optimizer='adam', loss='mse')

model.fit(x=[plain_features, w2v_obj, w2v_subj, w2v_rel],
          y=[plain_features, w2v_obj, w2v_subj, w2v_rel], epochs=100, batch_size=256)

## Test DCEC

### Monoinput 

In [None]:
import deep_clustering
save_dir = 'models'

dcec = deep_clustering.DCEC(input_shape=x_train.shape[1:], 
                            autoencoder_ctor=lambda input_shape: plain_ae(input_shape),
                            n_clusters=40, 
                            pretrain_epochs=50,
                            maxiter=int(5e4),
                            save_dir=save_dir, 
                            log_dir=logPath)
plot_model(dcec._model, to_file=os.path.join(save_dir, 'dcec_model.png'), show_shapes=True)
dcec.compile(optimizer='adam')

In [None]:
dcec.fit(x_train)

### Multiinput

#### DCEC

In [24]:
import deep_clustering
save_dir = 'models'

dcec = deep_clustering.DCEC(input_shape=(plain_features.shape[1:], w2v_obj.shape[1:]),
                            autoencoder_ctor=lambda input_shape: masked_ae(input_shape),  # select model here
                            n_clusters=100, 
                            pretrain_epochs=50,
                            maxiter=int(1e3),
                            save_dir=save_dir, 
                            log_dir=logPath)
plot_model(dcec._model, to_file=os.path.join(save_dir, 'dcec_model.png'), show_shapes=True)
dcec.compile(optimizer='adam')

In [25]:
dcec.fit([plain_features, w2v_obj, w2v_subj, w2v_rel])

2019-10-10 13:10:36,234 [MainThread  ] [INFO ]  Pretraining...
2019-10-10 13:12:47,481 [MainThread  ] [INFO ]  Pretraining time: 131.08045172691345
2019-10-10 13:12:48,201 [MainThread  ] [INFO ]  Pretrained weights are saved to models/pretrain_cae_model.h5
2019-10-10 13:12:48,202 [MainThread  ] [INFO ]  Initializing cluster centers with k-means.
2019-10-10 13:13:09,396 [MainThread  ] [INFO ]  Cluster centers initialized: 21.194454669952393
2019-10-10 13:13:09,397 [MainThread  ] [INFO ]  Training model.
2019-10-10 13:13:09,397 [MainThread  ] [INFO ]  Update interval 140
2019-10-10 13:13:09,398 [MainThread  ] [INFO ]  Save interval 592.28515625
2019-10-10 13:13:13,192 [MainThread  ] [INFO ]  saving model to: models/dcec_model_0.h5
2019-10-10 13:13:16,082 [MainThread  ] [INFO ]  Loss: [array([1.1500921, 1.1500921], dtype=float32), 0.08641264, 1.035676, 0.009198443, 0.009475036, 0.00933005]
2019-10-10 13:13:16,083 [MainThread  ] [INFO ]  delta_label: 0.2518713932399011
2019-10-10 13:13:18,

In [None]:
y_pred = dcec._y_pred

dumb_features = data[:]
#scores = dcec.score([plain_features, w2v_obj, w2v_subj, w2v_rel]).max(axis=1)
scores = dcec.score_examples([plain_features, w2v_obj, w2v_subj, w2v_rel]).max(axis=1)
#scores2 = scores / scores.max()
dumb_features['score'] = scores
threshold = 0.05
#dumb_features = dumb_features[dumb_features['score'] > threshold]

#dumb_features.sort_values('score', ascending=False, inplace=True)

#### DAEC

In [23]:
import deep_clustering
save_dir = 'models/daec'
! mkdir $save_dir

daec = deep_clustering.DAEC(input_shape=(plain_features.shape[1:], w2v_obj.shape[1:]),
                            autoencoder_ctor=lambda input_shape: masked_ae(input_shape),  # select model here
                            n_clusters=100, 
                            pretrain_epochs=10,
                            log_dir=logPath,
                            save_dir=save_dir, 
                            )
plot_model(daec._model, to_file=os.path.join(save_dir, 'daec_model.png'), show_shapes=True)
daec.compile(optimizer='adam')

mkdir: cannot create directory ‘models/daec’: File exists


In [24]:
daec._model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_subject (InputLayer)      (None, 3, 300)       0                                            
__________________________________________________________________________________________________
input_object (InputLayer)       (None, 3, 300)       0                                            
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 3, 128)       76928       input_subject[0][0]              
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 3, 128)       76928       input_object[0][0]               
__________________________________________________________________________________________________
input_plai

In [25]:
daec.fit([plain_features, w2v_obj, w2v_subj, w2v_rel])

2019-10-10 12:11:20,202 [MainThread  ] [INFO ]  Pretraining...
2019-10-10 12:11:50,599 [MainThread  ] [INFO ]  Pretraining time: 30.2330801486969
2019-10-10 12:11:51,253 [MainThread  ] [INFO ]  Pretrained weights are saved to models/daec/pretrain_cae_model.h5
2019-10-10 12:11:51,254 [MainThread  ] [INFO ]  Initializing cluster centers with k-means.
2019-10-10 12:11:51,254 [MainThread  ] [INFO ]  Cluster centers initialized: 1.6689300537109375e-06
2019-10-10 12:11:51,255 [MainThread  ] [INFO ]  Training model.
2019-10-10 12:11:51,255 [MainThread  ] [INFO ]  Training k-means...
2019-10-10 12:12:11,207 [MainThread  ] [INFO ]  Done.
2019-10-10 12:12:11,208 [MainThread  ] [INFO ]  Training model.
2019-10-10 12:12:15,545 [MainThread  ] [INFO ]  Training k-means...
2019-10-10 12:12:29,947 [MainThread  ] [INFO ]  Done.
2019-10-10 12:12:29,948 [MainThread  ] [INFO ]  delta_label: 0.9903928406922419
2019-10-10 12:12:29,948 [MainThread  ] [INFO ]  Training model.
2019-10-10 12:12:32,192 [MainThre

In [33]:
def score(x):
    def closeness(x, clusters):
        result = [np.sqrt(sum([(x[i][j] - daec._kmeans.cluster_centers_[clusters[i]][j])**2 
                               for j in range(len(x[i]))])) for i in range(len(x))]
        return 1. - result / np.max(result)

    x_emb = daec._encoder.predict(x, verbose=0)
    clusters = daec.predict(x)
    return closeness(x_emb, clusters)

In [27]:
y_pred = daec.predict([plain_features, w2v_obj, w2v_subj, w2v_rel])

dumb_features = data[:]
scores = daec.score([plain_features, w2v_obj, w2v_subj, w2v_rel])
#scores2 = scores / scores.max()
#scores = score([plain_features, w2v_obj, w2v_subj, w2v_rel])
dumb_features['score'] = scores
threshold = 0.05
#dumb_features = dumb_features[dumb_features['score'] > threshold]

#dumb_features.sort_values('score', ascending=False, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [28]:
def show_cluster_sample(number, rows=10):
    return dumb_features[y_pred == number][['docid', 'subject', 'relation', 'object', 'score']].iloc[:rows] 

In [57]:
show_cluster_sample(1, 100).relation.value_counts()

is in                      18
is                         12
was                         4
has                         4
became                      3
was developer By            2
is compatible with          2
reunited with               2
had shifted By              2
is In                       2
is with                     2
were likely than            2
stories about               1
see unfavorably             1
often promotes In           1
's phone is                 1
promotes In                 1
reviewed at_time            1
of chapter is               1
focusing on                 1
co-found                    1
took challenge on           1
eventually found in         1
see                         1
go to                       1
following blackout is       1
found in                    1
decorated                   1
questions to                1
petitioned                  1
in                          1
moved In                    1
on                          1
Center for

In [77]:
show_cluster_sample(16, 100).sort_values('score', ascending=False)

[autoreload of deep_clustering failed: Traceback (most recent call last):
  File "/root/.pyenv/versions/3.6.7/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 244, in check
    superreload(m, reload, self.old_objects)
  File "/root/.pyenv/versions/3.6.7/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 376, in superreload
    module = reload(module)
  File "/root/.pyenv/versions/3.6.7/lib/python3.6/imp.py", line 315, in reload
    return importlib.reload(module)
  File "/root/.pyenv/versions/3.6.7/lib/python3.6/importlib/__init__.py", line 166, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 618, in _exec
  File "<frozen importlib._bootstrap_external>", line 674, in exec_module
  File "<frozen importlib._bootstrap_external>", line 781, in get_code
  File "<frozen importlib._bootstrap_external>", line 741, in source_to_code
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File 

Unnamed: 0,docid,subject,relation,object,score
4949,6097297,Torvalds,sued,him,0.938832
20710,55718325,Appleby,sued,Guardian,0.938058
8134,29156200,Waymo,sued,Uber,0.937597
23228,58418911,Meyer,sued,Waid,0.937416
8153,29156200,Waymo,sued,Uber,0.937391
26358,54450737,ByteDance,sued,Huxiu,0.937052
2205,21955841,Zynga,sued,Kobojo,0.936915
27698,231554,Kawasaki,sued,Aruze,0.936752
21787,21197,Nintendo,sued,Jacob Mathias,0.936578
26996,56822861,Douyin,sued,Tencent,0.936176


In [87]:
daec._model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_subject (InputLayer)      (None, 3, 300)       0                                            
__________________________________________________________________________________________________
input_object (InputLayer)       (None, 3, 300)       0                                            
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 3, 128)       76928       input_subject[0][0]              
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 3, 128)       76928       input_object[0][0]               
__________________________________________________________________________________________________
input_plai

In [83]:
len(daec._model.get_weights())

24

In [90]:
weights = daec._model.get_layer(name='output_plain').get_weights()

In [92]:
len(weights[0])

100

In [93]:
len(weights[1])

2340

In [98]:
weights = daec._model.get_layer(name='embedding').get_weights()

In [100]:
len(weights[0])

612

In [101]:
len(weights[1])

100

In [106]:
weights[0]

array([[ 0.01005727, -0.03032287,  0.00761776, ..., -0.02403401,
         0.01714256, -0.01964788],
       [ 0.01312937,  0.01122171, -0.00770224, ..., -0.00094526,
        -0.067614  ,  0.02353039],
       [-0.00471022, -0.02592151,  0.01896739, ..., -0.01797611,
        -0.02844654,  0.02356325],
       ...,
       [ 0.04968828, -0.05934293, -0.06397039, ..., -0.01340513,
         0.08435149, -0.09370936],
       [-0.00671365,  0.01605963, -0.02208265, ..., -0.0310556 ,
         0.03987392,  0.06885269],
       [ 0.04472084,  0.02056069,  0.0848379 , ..., -0.04034006,
         0.03463504,  0.02074194]], dtype=float32)

In [102]:
weights[1]

array([-0.03293329, -0.01755087, -0.05104557, -0.00273838, -0.06966775,
        0.01961962, -0.03101648, -0.02525761, -0.03901241, -0.05267859,
       -0.04749049, -0.07171173, -0.03019076, -0.0254373 , -0.10835081,
       -0.03509561, -0.05550556,  0.05979065, -0.05398833, -0.03258474,
       -0.05119634, -0.02960631, -0.06345114, -0.130921  , -0.06369817,
       -0.05504875, -0.02970846, -0.0100568 , -0.0057167 ,  0.031838  ,
       -0.04361647, -0.05097609, -0.04120166, -0.06228884, -0.0280369 ,
       -0.02401849, -0.06526639,  0.02371101, -0.02741057, -0.05565085,
        0.00068465,  0.0535402 , -0.04755868, -0.07319726, -0.00167551,
        0.0146594 , -0.01479836, -0.04927625, -0.09499393,  0.0087162 ,
       -0.05000103, -0.00305545, -0.01200419, -0.02954646,  0.00502397,
       -0.06643018,  0.04697659, -0.03395459, -0.04212091, -0.00361292,
       -0.10137761, -0.02821815, -0.0289038 , -0.00226842, -0.06107631,
       -0.03559491, -0.04677316, -0.08247527, -0.01288289, -0.04

In [104]:
len(weights[1])

100

In [None]:
weights[0] - A; weights[1] - b

In [None]:
((2, "settled_in"),
 (3, "offers"),
 (4, "began_in"),
 (6, "tell"),
 (7, "stepped"),
 (8, "use"),
 (9, "release"),
 (10, "launch"),
 (11, "return"),
 (12, "born"),
 (13, "publish"),
 (14, "present"),
 (15, "work"),
 (16, "sued")
)

In [None]:
{2: {"rel_type": "settled_in"},
 3: {"rel_type"}}

In [47]:
PATH_DIRTY_JSON = 'unfiltered_results_daec/'
! mkdir $PATH_DIRTY_JSON

def save_dirty_json(id, y_pred):
    result = {}
    number_of_clusters = y_pred.max()
    for number in range(number_of_clusters):
        sample = dumb_features[y_pred == number][['subject', 'relation', 'object', 'score']].sort_values('score', ascending=False)
        cluster = {
            "data": list(zip(*[sample[c].values.tolist() for c in sample[['subject', 'relation', 'object', 'score']]])),
            "predicates": {key: int(value) for key, value in dict(sample.relation.value_counts()).items()}
        }
        result[int(number)] = cluster
    
    json.dump(result, open(os.path.join(PATH_DIRTY_JSON, id), 'w'))
    return result

mkdir: cannot create directory ‘unfiltered_results_daec/’: File exists


In [48]:
temp = save_dirty_json('001_10ptrain_20train.json', y_pred)

In [49]:
print([key for key in temp.keys() if temp[key]])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98]


In [50]:
number = 0

In [51]:
temp[number]['predicates']

{'partnered In': 41,
 'partnered with': 29,
 'won': 22,
 'of Bureau is': 8,
 'ran In': 8,
 'Duke of': 7,
 'targeting': 6,
 'threatened': 6,
 'ran in': 4,
 'runs': 4,
 'runs on': 4,
 'ran from': 4,
 'run': 3,
 'was absorbed into': 3,
 'partnering with': 3,
 'withdrew in': 3,
 'conducted': 3,
 'cofounder of': 3,
 'partnered in': 3,
 'run on': 3,
 'was initiated by': 3,
 'international partner since': 2,
 'also partnered with': 2,
 'Bureau of': 2,
 'drew': 2,
 'gaining support in': 2,
 'drew attention In': 2,
 'partner to': 2,
 'worldwide partner since': 2,
 'is partnered with': 2,
 'conducted interview In': 2,
 'was initiated in': 2,
 'caught up with': 2,
 'has partnered with': 2,
 'was absorbed in': 2,
 'international partner to': 2,
 'was won by': 2,
 'ran at': 2,
 'partner since': 2,
 'withdrew at_time': 2,
 'was threatened by': 2,
 'also partnered In': 2,
 'has partnered In': 2,
 'running': 2,
 'initiated': 2,
 'partner at': 2,
 'worldwide partner to': 2,
 'won contest In': 2,
 'ran 

In [52]:
temp[number]['data']

[('Townsend Security', 'partnered in', '2000', 0.8830577585168978),
 ('Company', 'partnered In', '2010', 0.881188046376431),
 ('Tradesy', 'partnered In', 'fall', 0.8810684283327198),
 ('R. L. Polk', 'partnered In', '2010', 0.8805668969153696),
 ('Driveway Software', 'partnered In', '2015', 0.8803844420022673),
 ('Rogers Cable', 'partnered In', '2001', 0.8803556044260143),
 ('Verizon', 'partnered In', '2006', 0.8801321092225618),
 ('Ericsson', 'partnered In', '2012', 0.8800160008846911),
 ('IPM', 'partnered In', '1977', 0.879874529595046),
 ('Verizon', 'partnered In', '2017', 0.8797246219707353),
 ('Google', 'partnered In', '2011', 0.8796979742773541),
 ('Wikia', 'partnered In', '2014', 0.8796397054779174),
 ('WeChat', 'partnered In', '2016', 0.8796261404328051),
 ('Capcom', 'partnered in', '2011', 0.8796011386474994),
 ('Wikia', 'partnered In', '2013', 0.8796006384291689),
 ('Coriell', 'partnered In', '2018', 0.8796006384291689),
 ('Motorola', 'partnered In', '2006', 0.8795936288585319

In [56]:
temp[15]['predicates']

{'worked at': 32,
 'working for': 20,
 'working in': 20,
 'worked with': 19,
 'working at': 16,
 'working with': 14,
 'worked In': 14,
 'worked as': 11,
 'worked for': 11,
 'worked in': 10,
 'had worked at': 8,
 'work at': 7,
 'has worked at': 6,
 'were working In': 6,
 'worked on': 6,
 'work for': 5,
 'also worked with': 4,
 'working on': 4,
 'work on': 4,
 'Working in': 3,
 'work with': 3,
 'worked from': 3,
 'had worked for': 3,
 'work in': 3,
 'worked From': 3,
 'works with': 3,
 'worked During': 3,
 'working as': 3,
 'work as': 3,
 'worked since': 3,
 'Working first in': 3,
 'worked until': 2,
 'working closely with': 2,
 'work together at': 2,
 'has worked for': 2,
 'worked At': 2,
 'had worked with': 2,
 'worked For': 2,
 'was working with': 2,
 'work of': 2,
 'worked Over': 2,
 'works for': 2,
 'Prior worked for': 2,
 'worked project on': 2,
 'also worked at': 2,
 'works': 2,
 'worked As': 2,
 'worked during': 2,
 'was working for': 2,
 'was working at': 2,
 'had worked before'

In [None]:
features.shape, dumb_features.shape

In [None]:
def show_cluster_sample(number, rows=10):
    return dumb_features[y_pred == number][['docid', 'subject', 'relation', 'object', 'score']].sort_values('score', ascending=True).iloc[:rows] 

In [None]:
y_pred.max()

In [None]:
show_cluster_sample(33, 100).relation.value_counts()

In [None]:
with open('dcec_clusters_masked_100c_temp.txt', 'w') as f:
    for i in range(100):
        try:
            line = "\n".join(map(str, show_cluster_sample(i, 20).values.tolist()))
            f.write(str(i)+'-----------------\n' + line + '\n\n\n')
        except ValueError:
            f.write(str(i)+'-----------------\n')

In [None]:
PATH_DIRTY_JSON = 'unfiltered_results/'
! mkdir $PATH_DIRTY_JSON

def save_dirty_json(id, y_pred):
    result = {}
    #dumb_features = features[:]
    #dumb_features['score'] = dcec.score([plain_features, w2v_obj, w2v_subj, w2v_rel]).max(axis=1)
    #dumb_features.sort_values('score', ascending=False, inplace=True)
    number_of_clusters = y_pred.max()
    for number in range(number_of_clusters):
        sample = dumb_features[y_pred == number][['subject', 'relation', 'object', 'score']]
        #sample.sort_values('score', ascending=False, inplace=True)
        cluster = {
            "data": list(zip(*[sample[c].values.tolist() for c in sample[['subject', 'relation', 'object', 'score']]])),
            "predicates": {key: int(value) for key, value in dict(sample.relation.value_counts()).items()}
        }
        result[int(number)] = cluster
    
    json.dump(result, open(os.path.join(PATH_DIRTY_JSON, id), 'w'))
    return result

In [None]:
temp = save_dirty_json('002_100c_50e.json', y_pred)

In [None]:
sum([1 for _, __ in temp.items() if __['data']])

In [None]:
temp[6]

## Test DC_Kmeans 

In [None]:
import deep_clustering
save_dir = 'models'

dckmeans = deep_clustering.DC_Kmeans(input_shape=x_train.shape[1:], 
                            autoencoder_ctor=lambda input_shape: plain_noised_ae(input_shape),
                            n_clusters=30,
                            max_epochs=200,
                            save_dir=save_dir, 
                            log_dir=logPath)
plot_model(dckmeans._model, to_file=os.path.join(save_dir, 'dckmeans_model.png'), show_shapes=True)
dckmeans.compile(optimizer='adam')

In [None]:
dckmeans._model.summary()

In [None]:
dckmeans.fit(x_train)

## Clustering of internal representations generated by autoencoder

In [None]:
pae = plain_ae(x_train.shape[1:])
pae.compile(optimizer='adam', loss='mse')
pae.fit(x_train, x_train, batch_size=256, epochs=10, verbose=0)
hidden = pae.get_layer(name='embedding').output
encoder = Model(inputs=pae.input, outputs=hidden)
embeddings = encoder.predict(x_train)
cluzeriser = KMeans(2, n_jobs=6)
clusters = cluzeriser.fit_predict(embeddings)

In [None]:
pae.save('models/pae_model.h5')

In [None]:
def show_cluster_sample(number):
    return features[clusters == number][['docid', 'subject', 'relation', 'object']].sample(frac=1).iloc[:10] 

In [None]:
with open('pae_clusters.txt', 'w') as f:
    for i in range(50):
        try:
            line = "\n".join(map(str, show_cluster_sample(i).values.tolist()))
            f.write(str(i)+'-----------------\n' + line + '\n\n\n')
        except ValueError:
            f.write(str(i)+'-----------------\n')