# Initialization

In [6]:
%load_ext autoreload
%autoreload 2

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys
sys.path.append('./pylingtools/src/')
sys.path.append('./pyexling/src/')
sys.path.append('./syntaxnet_wrapper/src/')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

get_available_gpus()

['/device:GPU:0']

In [8]:
import tensorflow as tf

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.compat.v1.Session(config=config)

from tensorflow.python.keras import backend as K
K.set_session(sess)

In [9]:
import logging

logPath = '../logs/'
! mkdir $logPath
fileName = 'main.log'
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")

logger = logging.getLogger()
if logger.hasHandlers():
    logger.handlers.clear()

fileHandler = logging.FileHandler(os.path.join(logPath, fileName))
fileHandler.setFormatter(logFormatter)
logger.addHandler(fileHandler)

consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
logger.addHandler(consoleHandler)

logger.setLevel(logging.INFO)

mkdir: cannot create directory ‘../logs/’: File exists


In [10]:
import sklearn
import numpy as np
import pandas as pd
import json
import pickle
from tqdm import tqdm_notebook as tqdm
from joblib import Parallel, delayed

# Load data

In [11]:
from glob import glob
from tqdm.autonotebook import tqdm
import pandas as pd
import json


RESULT_PATH = 'data/processed_separately'
data = []

for file in tqdm(glob(RESULT_PATH + '/*.pkl')):
    data.append(pd.read_pickle(file))
    
data = pd.concat(data).sample(frac=1, random_state=42).reset_index(drop=True)

  


HBox(children=(FloatProgress(value=0.0, max=462.0), HTML(value='')))




In [12]:
data.fillna(0, inplace=True)
data = data.drop_duplicates(['_subject', '_relation', '_object'])
print(data.shape)

(61630, 6)


In [13]:
data._relation.value_counts()

was                         1507
directed by                  888
is                           815
is city in                   722
starring                     707
                            ... 
is co-production between       1
despised                       1
fix                            1
had run                        1
rule castile                   1
Name: _relation, Length: 15014, dtype: int64

In [14]:
tqdm.pandas()

def extract_matrix(row, predicate=False):
    _matrix = np.concatenate([row['ner'], row['postag']], axis=1)#.flatten()
    if predicate:
        _matrix = np.concatenate([_matrix, row['w2v'], [row['prep'], row['prep'], row['prep']]], axis=1)#.flatten()
    return _matrix

data['object_matr'] = data.object.progress_map(extract_matrix)
data['subject_matr'] = data.subject.progress_map(extract_matrix)
data['relation_matr'] = data.relation.progress_map(lambda row: extract_matrix(row, predicate=True))

  from pandas import Panel


HBox(children=(FloatProgress(value=0.0, max=61630.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=61630.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=61630.0), HTML(value='')))




In [15]:
_object, _subject, _relation = data.object_matr.values, data.subject_matr.values, data.relation_matr.values

In [16]:
np.stack(_object).shape

(61630, 3, 41)

In [17]:
_object = np.stack(_object)
_subject = np.stack(_subject)
_relation = np.stack(_relation)

In [18]:
from sklearn.preprocessing import StandardScaler

scalers = [{}, {}, {}]

for i in range(_object.shape[1]):
    scalers[0][i] = StandardScaler()
    _object[:, i, :] = scalers[0][i].fit_transform(_object[:, i, :]) 

for i in range(_subject.shape[1]):
    scalers[1][i] = StandardScaler()
    _subject[:, i, :] = scalers[1][i].fit_transform(_subject[:, i, :]) 
    
for i in range(_relation.shape[1]):
    scalers[2][i] = StandardScaler()
    _relation[:, i, :] = scalers[2][i].fit_transform(_relation[:, i, :]) 

# Models

In [21]:
%load_ext cython

from tensorflow.keras.utils import get_custom_objects
from tensorflow.keras.optimizers import Adam
from autoencoder_models import Mish, mish

get_custom_objects().update({'mish': Mish(mish)})

The cython extension is already loaded. To reload it, use:
  %reload_ext cython


In [22]:
input_shape=[_subject.shape[1:], _object.shape[1:], _relation.shape[1:]]
input_shape

[(3, 41), (3, 41), (3, 382)]

## Test autoencoder

In [23]:
from autoencoder_models import noised_ae

model = noised_ae(input_shape=input_shape)
model.summary()

model.compile(optimizer='adam', loss='mse')

model.fit(x=[_subject, _object, _relation],
          y=[_subject, _object, _relation], epochs=3, batch_size=256)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_subject (InputLayer)      [(None, 3, 41)]      0                                            
__________________________________________________________________________________________________
input_object (InputLayer)       [(None, 3, 41)]      0                                            
__________________________________________________________________________________________________
input_rel (InputLayer)          [(None, 3, 382)]     0                                            
__________________________________________________________________________________________________
gaussian_noise (GaussianNoise)  (None, 3, 41)        0           input_subject[0][0]              
_______________________________________________________________________________________

<tensorflow.python.keras.callbacks.History at 0x7f75e83d25d0>

## Train IDEC

In [52]:
import os
from pathlib import Path
import deep_clustering

save_dir = 'models/idec/noised_ae'
directory = os.path.dirname(save_dir)
if not Path(directory).is_dir():
    ! mkdir $save_dir

def train_idec(autoencoder, n_clusters, score_threshold=1e-5, save_dir=save_dir, partial_init=False):
    _directory = save_dir + f'/idec_{n_clusters}'
    ! mkdir $_directory
    idec = deep_clustering.IDEC(input_shape=input_shape,
                                autoencoder_ctor=lambda input_shape: autoencoder(input_shape),  # select model here
                                n_clusters=n_clusters,
                                pretrain_epochs=100,
                                max_iter=300,
                                partial_init=partial_init,
                                save_dir=_directory, 
                                log_dir=logPath)

    idec.compile(optimizer='adam')
    idec.fit([_subject, _object, _relation])

    # dump data somewhere
    y_pred = idec._y_pred
    dumb_features = data[['_subject', '_relation', '_object']]
    dumb_features['cluster'] = y_pred
    scores = idec.score_examples([_subject, _object, _relation])
    dumb_features['score'] = scores
    dumb_features = dumb_features[dumb_features['score'] > score_threshold]
    dumb_features.to_pickle(f'clusterized/idec_clusters_{n_clusters}_partial{str(partial_init)}.pkl')
    
    return idec

In [54]:
idec32 = train_idec(autoencoder=noised_ae, n_clusters=32, partial_init=False)
idec32 = train_idec(autoencoder=noised_ae, n_clusters=32, partial_init=True)
idec40 = train_idec(autoencoder=noised_ae, n_clusters=40, partial_init=False)
idec40 = train_idec(autoencoder=noised_ae, n_clusters=40, partial_init=True)

mkdir: cannot create directory ‘models/idec/noised_ae/idec_32’: File exists


2020-11-09 12:39:45,099 [MainThread  ] [INFO ]  Pretraining...
2020-11-09 12:42:43,917 [MainThread  ] [INFO ]  Pretraining time: 178.8123025894165
2020-11-09 12:42:43,982 [MainThread  ] [INFO ]  Pretrained weights are saved to models/idec/noised_ae/idec_32/pretrain_cae_model.h5
2020-11-09 12:42:43,982 [MainThread  ] [INFO ]  Initializing cluster centers.
2020-11-09 12:43:02,060 [MainThread  ] [INFO ]  Cluster centers initialized: 18.077159643173218
2020-11-09 12:43:02,060 [MainThread  ] [INFO ]  Training model.
2020-11-09 12:43:02,061 [MainThread  ] [INFO ]  Update interval 140
2020-11-09 12:43:02,061 [MainThread  ] [INFO ]  Save interval 1203.7109375
2020-11-09 12:43:02,061 [MainThread  ] [INFO ]  Training model. Iteration #0.
2020-11-09 12:43:11,731 [MainThread  ] [INFO ]  saving model to: models/idec/noised_ae/idec_32/dcec_model_0.h5
2020-11-09 12:43:13,137 [MainThread  ] [INFO ]  Training model. Iteration #140.
2020-11-09 12:43:21,980 [MainThread  ] [INFO ]  Loss: [0.65693539381027

mkdir: cannot create directory ‘models/idec/noised_ae/idec_32’: File exists


2020-11-09 12:43:46,243 [MainThread  ] [INFO ]  Pretraining...
2020-11-09 12:47:44,967 [MainThread  ] [INFO ]  Pretraining time: 238.7174768447876
2020-11-09 12:47:45,032 [MainThread  ] [INFO ]  Pretrained weights are saved to models/idec/noised_ae/idec_32/pretrain_cae_model.h5
2020-11-09 12:47:45,032 [MainThread  ] [INFO ]  Initializing cluster centers.
2020-11-09 12:48:02,079 [MainThread  ] [INFO ]  Cluster centers initialized: 17.046117305755615
2020-11-09 12:48:02,079 [MainThread  ] [INFO ]  Training model.
2020-11-09 12:48:02,080 [MainThread  ] [INFO ]  Update interval 140
2020-11-09 12:48:02,080 [MainThread  ] [INFO ]  Save interval 1203.7109375
2020-11-09 12:48:02,080 [MainThread  ] [INFO ]  Training model. Iteration #0.
2020-11-09 12:48:10,467 [MainThread  ] [INFO ]  saving model to: models/idec/noised_ae/idec_32/dcec_model_0.h5
2020-11-09 12:48:11,876 [MainThread  ] [INFO ]  Training model. Iteration #140.
2020-11-09 12:48:18,976 [MainThread  ] [INFO ]  Loss: [0.53398340940475

mkdir: cannot create directory ‘models/idec/noised_ae/idec_40’: File exists


2020-11-09 12:48:43,825 [MainThread  ] [INFO ]  Pretraining...
2020-11-09 12:52:34,979 [MainThread  ] [INFO ]  Pretraining time: 231.14883375167847
2020-11-09 12:52:35,036 [MainThread  ] [INFO ]  Pretrained weights are saved to models/idec/noised_ae/idec_40/pretrain_cae_model.h5
2020-11-09 12:52:35,037 [MainThread  ] [INFO ]  Initializing cluster centers.
2020-11-09 12:52:56,003 [MainThread  ] [INFO ]  Cluster centers initialized: 20.965651750564575
2020-11-09 12:52:56,003 [MainThread  ] [INFO ]  Training model.
2020-11-09 12:52:56,004 [MainThread  ] [INFO ]  Update interval 140
2020-11-09 12:52:56,004 [MainThread  ] [INFO ]  Save interval 1203.7109375
2020-11-09 12:52:56,004 [MainThread  ] [INFO ]  Training model. Iteration #0.
2020-11-09 12:53:02,545 [MainThread  ] [INFO ]  saving model to: models/idec/noised_ae/idec_40/dcec_model_0.h5
2020-11-09 12:53:03,936 [MainThread  ] [INFO ]  Training model. Iteration #140.
2020-11-09 12:53:09,371 [MainThread  ] [INFO ]  Loss: [0.5291867256164

mkdir: cannot create directory ‘models/idec/noised_ae/idec_40’: File exists


2020-11-09 12:53:33,595 [MainThread  ] [INFO ]  Pretraining...
2020-11-09 12:56:03,362 [MainThread  ] [INFO ]  Pretraining time: 149.76065254211426
2020-11-09 12:56:03,407 [MainThread  ] [INFO ]  Pretrained weights are saved to models/idec/noised_ae/idec_40/pretrain_cae_model.h5
2020-11-09 12:56:03,408 [MainThread  ] [INFO ]  Initializing cluster centers.
2020-11-09 12:56:26,460 [MainThread  ] [INFO ]  Cluster centers initialized: 23.051671028137207
2020-11-09 12:56:26,460 [MainThread  ] [INFO ]  Training model.
2020-11-09 12:56:26,461 [MainThread  ] [INFO ]  Update interval 140
2020-11-09 12:56:26,461 [MainThread  ] [INFO ]  Save interval 1203.7109375
2020-11-09 12:56:26,462 [MainThread  ] [INFO ]  Training model. Iteration #0.
2020-11-09 12:56:37,555 [MainThread  ] [INFO ]  saving model to: models/idec/noised_ae/idec_40/dcec_model_0.h5
2020-11-09 12:56:38,949 [MainThread  ] [INFO ]  Training model. Iteration #140.
2020-11-09 12:56:48,703 [MainThread  ] [INFO ]  Loss: [0.5304797291755

### Review the results data dump

In [36]:
dumb_features = pd.read_pickle('clusterized/idec_clusters_32.pkl')

In [37]:
def show_cluster_sample(number, rows=100):
    def get_tokens(column):
        return ' '.join(column['tokens'])
    
    cluster = dumb_features[dumb_features.cluster == number]
    return cluster[['_subject', '_relation', '_object', 'score']].iloc[:rows]

In [42]:
dumb_features[dumb_features._relation.str.contains('directed')]['cluster'].value_counts()

0     586
3     253
28    139
20     71
19     36
30     34
15     18
14     11
12     10
8       7
5       6
23      5
7       5
25      4
21      3
2       3
26      2
4       2
16      1
13      1
10      1
27      1
Name: cluster, dtype: int64

In [43]:
len(sorted(dumb_features.cluster.unique()))

32

In [49]:
number = 5
temp = show_cluster_sample(number, rows=2000)
temp._relation.value_counts()
#temp.sort_values('score', ascending=False)

was               68
is                56
city in           37
served            29
is located        22
                  ..
has expressed      1
emigrating         1
was scene of       1
also remaining     1
is having          1
Name: _relation, Length: 1140, dtype: int64

## Apply an IDEC model to the QA corpus

In [96]:
dumb_features.to_pickle('clusterized/partial_idec_clusters.pkl')

In [None]:
PATH_DIRTY_JSON = 'unfiltered_results/idec/restore_rel/'
! mkdir $PATH_DIRTY_JSON

def save_dirty_json(id, y_pred):
    result = {}
    number_of_clusters = y_pred.max()
    for number in range(number_of_clusters):
        sample = show_cluster_sample(number, 999).sort_values('score', ascending=False)
        cluster = {
            "data": list(zip(*[sample[c].values.tolist() for c in sample])),
            "predicates": {key: int(value) for key, value in dict(sample.relation.value_counts()).items()}
        }
        result[int(number)] = cluster
    
    json.dump(result, open(os.path.join(PATH_DIRTY_JSON, id), 'w'))
    return result

In [None]:
save_dirty_json('dcec_kmeans_80c_002.json', y_pred)

## Train DAEC

In [None]:
! mkdir models/daec

In [None]:
import deep_clustering
save_dir = 'models/daec/restore_rel'
! mkdir $save_dir

daec = deep_clustering.DAEC(input_shape=(_subject.shape[1:]),
                            autoencoder_ctor=lambda input_shape: restore_rel(input_shape),  # select model here
                            n_clusters=50, 
                            pretrain_epochs=10,
                            log_dir=logPath,
                            save_dir=save_dir, 
                            )

plot_model(dcec._model, to_file=os.path.join(save_dir, 'daec_model.png'), show_shapes=True)
daec.compile(optimizer='adam')

In [None]:
daec._model.summary()

In [None]:
daec.fit([_subject, _object, _relation])

In [None]:
def get_tokens(column):
    return ' '.join(column['tokens'])

y_pred = daec._y_pred
dumb_features = pd.DataFrame()
dumb_features['subject'] = data['subject'].map(get_tokens)
dumb_features['relation'] = data['relation'].map(get_tokens)
dumb_features['object'] = data['object'].map(get_tokens)
dumb_features['cluster'] = y_pred
scores = dcec.score_examples([_subject, _object, _relation])
dumb_features['score'] = scores
threshold = 0.01
dumb_features = dumb_features[dumb_features['score'] > threshold]

In [None]:
def show_cluster_sample(number, rows=100):
    def get_tokens(column):
        return ' '.join(column['tokens'])
    
    cluster = dumb_features[dumb_features.cluster == number]
    return cluster[['subject', 'relation', 'object', 'score']].iloc[:rows]

In [None]:
dumb_features.head()

In [None]:
dumb_features[dumb_features.object == 'eliza'].sort_values('cluster')

In [None]:
dumb_features[dumb_features.relation.str.contains('born')].sort_values('cluster').iloc[:20]

In [None]:
number = 4
temp = show_cluster_sample(number)
temp.relation.value_counts()

In [None]:
temp.sort_values('score', ascending=False)

In [None]:
PATH_DIRTY_JSON = 'unfiltered_results/daec/restore_rel/'
! mkdir $PATH_DIRTY_JSON

def save_dirty_json(id, y_pred):
    result = {}
    number_of_clusters = y_pred.max()
    for number in range(number_of_clusters):
        sample = show_cluster_sample(number, 999).sort_values('score', ascending=False)
        cluster = {
            "data": list(zip(*[sample[c].values.tolist() for c in sample])),
            "predicates": {key: int(value) for key, value in dict(sample.relation.value_counts()).items()}
        }
        result[int(number)] = cluster
    
    json.dump(result, open(os.path.join(PATH_DIRTY_JSON, id), 'w'))
    return result

In [None]:
res = save_dirty_json('daec_kmeans_80c_002.json', y_pred)

In [None]:
temp[6]

## Train DC_Kmeans 

In [None]:
! mkdir models/dc_kmeans

In [None]:
import deep_clustering
save_dir = 'models/dc_kmeans/restore_rel'
! mkdir $save_dir

dckmeans = deep_clustering.DC_Kmeans(
                            input_shape=(_subject.shape[1:]),
                            autoencoder_ctor=lambda input_shape: restore_rel(input_shape),  # select model here
                            n_clusters=30,
                            pretrain_epochs=50,
                            max_epochs=200,
                            save_dir=save_dir, 
                            log_dir=logPath)
plot_model(dckmeans._model, to_file=os.path.join(save_dir, 'dckmeans_model.png'), show_shapes=True)
dckmeans.compile(optimizer='adam')

In [None]:
dckmeans._model.summary()

In [None]:
dckmeans.fit([_subject, _object, _relation])

In [None]:
def get_tokens(column):
    return ' '.join(column['tokens'])

y_pred = dckmeans.y_pred
dumb_features = pd.DataFrame()
dumb_features['subject'] = data['subject'].map(get_tokens)
dumb_features['relation'] = data['relation'].map(get_tokens)
dumb_features['object'] = data['object'].map(get_tokens)
dumb_features['cluster'] = y_pred
scores = dckmeans.get_scores([_subject, _object, _relation])
dumb_features['score'] = scores
threshold = 0.05
dumb_features = dumb_features[dumb_features['score'] > threshold]

In [None]:
def show_cluster_sample(number, rows=100):
    def get_tokens(column):
        return ' '.join(column['tokens'])
    
    cluster = dumb_features[dumb_features.cluster == number]
    return cluster[['subject', 'relation', 'object', 'score']].iloc[:rows]

In [None]:
dumb_features[dumb_features.object == 'eliza'].sort_values('cluster')

In [None]:
dumb_features[dumb_features.relation.str.contains('born')].sort_values('cluster').iloc[:20]

In [None]:
number = 4
temp = show_cluster_sample(number)
temp.relation.value_counts()

In [None]:
temp.sort_values('score', ascending=False)

In [None]:
PATH_DIRTY_JSON = 'unfiltered_results/dc_kmeans/restore_rel/'
! mkdir $PATH_DIRTY_JSON

def save_dirty_json(id, y_pred):
    result = {}
    number_of_clusters = y_pred.max()
    for number in range(number_of_clusters):
        sample = show_cluster_sample(number, 999).sort_values('score', ascending=False)
        cluster = {
            "data": list(zip(*[sample[c].values.tolist() for c in sample])),
            "predicates": {key: int(value) for key, value in dict(sample.relation.value_counts()).items()}
        }
        result[int(number)] = cluster
    
    json.dump(result, open(os.path.join(PATH_DIRTY_JSON, id), 'w'))
    return result

In [None]:
res = save_dirty_json('dc_kmeans_30c_000.json', y_pred)

## Clustering of internal representations generated by autoencoder

In [None]:
pae = plain_ae(x_train.shape[1:])
pae.compile(optimizer='adam', loss='mse')
pae.fit(x_train, x_train, batch_size=256, epochs=10, verbose=0)
hidden = pae.get_layer(name='embedding').output
encoder = Model(inputs=pae.input, outputs=hidden)
#embeddings = encoder.predict(x_train)
#cluzeriser = KMeans(2, n_jobs=6)
#clusters = cluzeriser.fit_predict(embeddings)

In [None]:
pae.save('models/pae_model.h5')

In [None]:
def show_cluster_sample(number):
    return features[clusters == number][['docid', 'subject', 'relation', 'object']].sample(frac=1).iloc[:10] 

In [None]:
with open('pae_clusters.txt', 'w') as f:
    for i in range(50):
        try:
            line = "\n".join(map(str, show_cluster_sample(i).values.tolist()))
            f.write(str(i)+'-----------------\n' + line + '\n\n\n')
        except ValueError:
            f.write(str(i)+'-----------------\n')