# Initialization

In [None]:
%load_ext autoreload
%autoreload 2

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys
sys.path.append('./pylingtools/src/')
sys.path.append('./pyexling/src/')
sys.path.append('./syntaxnet_wrapper/src/')

In [None]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of all local devices whose ``device_type`` is ``'GPU'``."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

get_available_gpus()

In [None]:
## reset cuda
# from numba import cuda
# cuda.select_device(0)
# cuda.close()

In [None]:
import tensorflow as tf

# Build a TF1-style session (through tf.compat.v1) and register it with Keras.
config = tf.compat.v1.ConfigProto()
# Turn on the `allow_growth` GPU option (per the TensorFlow docs this makes the
# allocator grow GPU memory on demand instead of reserving it all up front).
config.gpu_options.allow_growth=True
sess = tf.compat.v1.Session(config=config)

# NOTE(review): `tensorflow.python.keras` is a private module path - confirm it
# is still available in the TensorFlow version this notebook runs on.
from tensorflow.python.keras import backend as K
K.set_session(sess)

In [None]:
import logging

logPath = '../logs/'
# Create the log directory from Python instead of shelling out to `mkdir`:
# this is silent when the directory already exists and also creates parents.
os.makedirs(logPath, exist_ok=True)
fileName = 'main.log'
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")

# Configure the root logger; drop handlers left over from a previous run of
# this cell so that messages are not emitted several times.
logger = logging.getLogger()
if logger.hasHandlers():
    logger.handlers.clear()

# Handler 1: append everything to <logPath>/<fileName>.
fileHandler = logging.FileHandler(os.path.join(logPath, fileName))
fileHandler.setFormatter(logFormatter)
logger.addHandler(fileHandler)

# Handler 2: mirror the same records to the console (stderr stream handler).
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
logger.addHandler(consoleHandler)

logger.setLevel(logging.INFO)

In [None]:
import sklearn
import numpy as np
import pandas as pd
import json
import pickle
from tqdm import tqdm_notebook as tqdm
from joblib import Parallel, delayed
import re

# Load data

In [None]:
from glob import glob
from tqdm.autonotebook import tqdm
import pandas as pd
import json


RESULT_PATH = 'data/processed_separately'
data = []

# glob() returns paths in arbitrary, filesystem-dependent order.  Sort them so
# that the seeded shuffle below yields the same row order on every run/machine.
# NOTE: read_pickle can execute arbitrary code - only load trusted files.
for file in tqdm(sorted(glob(RESULT_PATH + '/*.pkl'))):
    data.append(pd.read_pickle(file))

# Merge all parts, shuffle deterministically and renumber the rows.
data = pd.concat(data).sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Replace missing values with 0, then keep only the first occurrence of every
# (_subject, _relation, _object) triple.
triple_columns = ['_subject', '_relation', '_object']
data = data.fillna(0).drop_duplicates(subset=triple_columns)
print(data.shape)

In [None]:
data._relation.value_counts()

In [None]:
tqdm.pandas()

def extract_matrix(row, predicate=False):
    """Build the feature matrix for one argument of a triple.

    The ``ner`` and ``postag`` blocks of ``row`` are joined column-wise.  When
    ``predicate`` is true, the ``w2v`` block and the ``prep`` vector (repeated
    once per matrix row) are appended as additional columns.

    Args:
        row: mapping with 2-D ``ner`` and ``postag`` entries and, for
            predicates, a 2-D ``w2v`` entry and a 1-D ``prep`` entry.
        predicate: whether to append the predicate-only features.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per row of the input blocks.
    """
    _matrix = np.concatenate([row['ner'], row['postag']], axis=1)
    if predicate:
        # Repeat `prep` for every row of the matrix instead of hard-coding
        # three copies, so inputs with a different number of rows also work.
        prep_block = np.tile(np.asarray(row['prep']), (_matrix.shape[0], 1))
        _matrix = np.concatenate([_matrix, row['w2v'], prep_block], axis=1)
    return _matrix

data['object_matr'] = data.object.progress_map(extract_matrix)
data['subject_matr'] = data.subject.progress_map(extract_matrix)
data['relation_matr'] = data.relation.progress_map(lambda row: extract_matrix(row, predicate=True))

In [None]:
data.object_matr.values[0].shape

In [None]:
# Turn each column of per-row matrices into a single stacked array
# (one leading axis over the rows of `data`).
_object = np.stack(data.object_matr.values)
_subject = np.stack(data.subject_matr.values)
_relation = np.stack(data.relation_matr.values)

In [None]:
_object.shape

# Models

In [None]:
%load_ext cython

from tensorflow.keras.utils import get_custom_objects
from tensorflow.keras.optimizers import Adam
from autoencoder_models import Mish, mish

get_custom_objects().update({'mish': mish})

In [None]:
input_shape=[_subject.shape[1:], _object.shape[1:], _relation.shape[1:]]
input_shape

## Test autoencoder

In [None]:
from autoencoder_models import noised_ae

In [None]:
# Sanity-check run: build the noised autoencoder for the three input shapes
# and print its layer summary.
model = noised_ae(input_shape=input_shape)
model.summary()

model.compile(optimizer='adam', loss='mse')

# The inputs double as the targets: the model is trained to reconstruct
# the subject, object and relation arrays it is given.
model.fit(x=[_subject, _object, _relation],
          y=[_subject, _object, _relation], epochs=100, batch_size=256)

## Train IDEC

In [None]:
import os
from pathlib import Path
import deep_clustering

save_dir = 'models/idec/noised_ae'
directory = os.path.dirname(save_dir)
# Create save_dir itself together with any missing parents.  The previous code
# tested the *parent* directory and then ran a non-recursive `mkdir`: save_dir
# was skipped whenever the parent existed and could not be created otherwise.
Path(save_dir).mkdir(parents=True, exist_ok=True)

def train_idec(autoencoder, n_clusters, score_threshold=1e-5, save_dir=save_dir, partial_init=False):
    """Train an IDEC model on the prepared triples and dump the clustered rows.

    Args:
        autoencoder: callable taking ``input_shape`` and returning the
            autoencoder model to wrap.
        n_clusters: number of clusters passed to ``deep_clustering.IDEC``.
        score_threshold: rows whose score is not greater than this value are
            left out of the pickled dump.
        save_dir: parent directory; the model is saved under
            ``<save_dir>/idec_<n_clusters>_partial<partial_init>``.
        partial_init: forwarded to ``deep_clustering.IDEC``.

    Returns:
        The fitted ``deep_clustering.IDEC`` instance.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell (where plot_model is imported) having been executed first.
    from tensorflow.keras.utils import plot_model

    _directory = save_dir + f'/idec_{n_clusters}_partial{str(partial_init)}'
    # Creates missing parents and tolerates an already existing directory.
    os.makedirs(_directory, exist_ok=True)
    idec = deep_clustering.IDEC(input_shape=input_shape,
                                autoencoder_ctor=lambda input_shape: autoencoder(input_shape),  # select model here
                                n_clusters=n_clusters,
                                pretrain_epochs=50,
                                max_iter=300,
                                partial_init=partial_init,
                                save_dir=_directory,
                                log_dir=logPath)

    idec.compile(optimizer='adam')
    # NOTE(review): the file name says "daec" although an IDEC model is
    # plotted, and every run overwrites it - kept unchanged; confirm intent.
    plot_model(idec._model, to_file='daec_model.png', show_shapes=True)
    idec.fit([_subject, _object, _relation])

    # Dump the textual triples together with their cluster and score.
    y_pred = idec._y_pred
    # Copy the column subset: adding columns to a slice of `data` raises
    # pandas' SettingWithCopyWarning and may not behave as intended.
    dumb_features = data[['_subject', '_relation', '_object']].copy()
    dumb_features['cluster'] = y_pred
    scores = idec.score_examples([_subject, _object, _relation])
    dumb_features['score'] = scores
    dumb_features = dumb_features[dumb_features['score'] > score_threshold]
    # Make sure the output directory exists before pickling into it.
    os.makedirs('clusterized', exist_ok=True)
    dumb_features.to_pickle(f'clusterized/idec_clusters_{n_clusters}_partial{str(partial_init)}.pkl')

    return idec

In [None]:
# Train IDEC with 32 and 40 clusters, each without and with partial init.
# Every variable is assigned twice, so after this cell `idec32` / `idec40`
# refer to the partial_init=True models only; the partial_init=False runs
# remain available through the files written by train_idec.
idec32 = train_idec(autoencoder=noised_ae, n_clusters=32, partial_init=False)
idec32 = train_idec(autoencoder=noised_ae, n_clusters=32, partial_init=True)
idec40 = train_idec(autoencoder=noised_ae, n_clusters=40, partial_init=False)
idec40 = train_idec(autoencoder=noised_ae, n_clusters=40, partial_init=True)

In [None]:
from tensorflow.keras.utils import plot_model

# NOTE(review): the variable is named `idec64` but n_clusters=40 is passed -
# confirm whether 64 clusters or a different variable name was intended.
idec64 = train_idec(autoencoder=noised_ae, n_clusters=40, partial_init=True)

### Review the results data dump

In [None]:
dumb_features = pd.read_pickle('clusterized/idec_clusters_40_partialTrue2.pkl')

In [None]:
def show_cluster_sample(number, rows=100):
    """Return up to ``rows`` triples (with scores) assigned to cluster ``number``.

    Reads the module-level ``dumb_features`` frame.  The unused nested
    ``get_tokens`` helper of the previous version has been removed.
    """
    cluster = dumb_features[dumb_features.cluster == number]
    return cluster[['_subject', '_relation', '_object', 'score']].iloc[:rows]

In [None]:
import seaborn as sns
%matplotlib inline

# Bar chart of cluster sizes, largest cluster first.  The series is passed as
# `x=` because seaborn >= 0.12 interprets the first positional argument of
# countplot as `data`; the keyword form works on older versions as well.
ax = sns.countplot(x=dumb_features.cluster, order=dumb_features.cluster.value_counts().index, color='green')
ax.set(xlabel='cluster', ylabel='size')

In [None]:
dumb_features[dumb_features._relation.str.contains('died')]['cluster'].value_counts()

In [None]:
len(sorted(dumb_features.cluster.unique()))

In [None]:
number = 28
temp = show_cluster_sample(number, rows=5000)
temp._relation.value_counts()
#temp.sort_values('score', ascending=False)

## Apply an IDEC model to the QA corpus

#### Load the features collected in ``1_parse_data_SimpleQuestions.ipynb``.

In [None]:
# data[part][name] -> feature array loaded from the matching .npy file.
data = {}

for part in ('train', 'valid', 'test'):
    data[part] = {}

    for name in ('object', 'subject', 'relation'):
        path = ("../uopenie_qa/SimpleWikidataQuestions/csv decoded/"
                f"annotated_wd_data_{part}_answerable_decoded_{name}_features.npy")
        data[part][name] = np.load(path)

#### Load saved IDEC 

In [None]:
from tensorflow.keras import models
from autoencoder_models import Mish, mish
import deep_clustering

get_custom_objects().update({'mish': mish})


def load_idec(path):
    """Restore a trained IDEC model from a saved weights file.

    The number of clusters and the partial-init flag are parsed from ``path``:
    it must contain ``idec_<n_clusters>``, and a ``partialTrue`` substring
    enables partial init.  The pretrained autoencoder is loaded from the file
    obtained by replacing ``dcec_model_final`` with ``pretrain_cae_model`` in
    ``path``.

    Args:
        path: path to the saved IDEC weights.

    Returns:
        The compiled ``deep_clustering.IDEC`` instance with weights loaded.

    Raises:
        ValueError: if no ``idec_<n_clusters>`` fragment is found in ``path``.
    """
    # information about clusters number and partial init option is in the path name
    match = re.search(r'idec_(\d+)', path)
    if match is None:
        raise ValueError(f"cannot parse the number of clusters from path: {path!r}")
    n_clusters = int(match.group(1))
    partial_init = 'partialTrue' in path

    # load pretrained autoencoder
    aec = models.load_model(path.replace("dcec_model_final", "pretrain_cae_model"))

    # load pretrained IDEC
    # os.path.dirname is used for save_dir: slicing at rfind('/') silently
    # chopped the last character off paths that contain no slash.
    model = deep_clustering.IDEC(input_shape=input_shape,
                                autoencoder_ctor=aec,
                                pretrained=True,
                                n_clusters=n_clusters,
                                pretrain_epochs=20,
                                max_iter=300,
                                partial_init=partial_init,
                                save_dir=os.path.dirname(path),
                                log_dir=logPath)

    model.compile(optimizer='adam')
    model.load_weights(path)

    return model

In [None]:
model_name = '40_partialTrue2'

In [None]:
saved_idec = load_idec(f"models/idec/noised_ae/idec_{model_name}/dcec_model_final.h5")

In [None]:
n = 3
saved_idec.predict([_subject[:n], _object[:n], _relation[:n]])

#### Predict clusters for QA data 

In [None]:
# Cluster prediction for every split of the QA corpus.
qa_predictions = {}

for part in ('train', 'valid', 'test'):
    # Inputs are passed in (subject, object, relation) order.
    qa_predictions[part] = saved_idec.predict(
        [data[part][name] for name in ('subject', 'object', 'relation')])

## Run baseline classifier 

In [None]:
# Load each annotated QA split and attach the predicted cluster as 'class'.
qa_data = dict()

for part in ("train", "valid", "test"):
    path = ("../uopenie_qa/SimpleWikidataQuestions/csv decoded/"
            f"annotated_wd_data_{part}_answerable_decoded.csv")
    # The two "Unnamed" columns are dropped right after reading.
    qa_data[part] = pd.read_csv(path).drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)
    qa_data[part]['class'] = qa_predictions[part]

In [None]:
qa_data['test'].head(2)

In [None]:
if False:
    # 40_partialTrue2
    qa_data['train'].to_csv('simplequestions_train_classified.csv')
    qa_data['valid'].to_csv('simplequestions_valid_classified.csv')
    qa_data['test'].to_csv('simplequestions_test_classified.csv')

In [None]:
from question_classifiers import FastTextClassifier

clf = FastTextClassifier()
clf.train(qa_data['train']['question'].values, qa_data['train']['class'].values,
          qa_data['valid']['question'].values, qa_data['valid']['class'].values)

In [None]:
clf.save(f"fasttext_clf_{model_name}.bin")

In [None]:
! ls

### Load the pretrained model (optional)

In [None]:
from question_classifiers import FastTextClassifier

# NOTE(review): the classifier trained above is saved under the model_name set
# earlier ('40_partialTrue2'), while this cell loads '40_partialTrue' - confirm
# that a different, previously saved model is really intended here.
model_name = '40_partialTrue'
clf = FastTextClassifier(path=f"fasttext_clf_{model_name}.bin")

#### Predict on validation set 

In [None]:
predictions, proba = clf.predict(qa_data['valid']['question'].values)

clf.evaluate(qa_data['valid']['class'].values, predictions)

#### Predict on test set 

In [None]:
predictions, proba = clf.predict(qa_data['test']['question'].values)

clf.evaluate(qa_data['test']['class'].values, predictions)

In [None]:
qa_data['test'][qa_data['test']['class'] == 4].property_decoded.value_counts()

In [None]:
PATH_DIRTY_JSON = 'unfiltered_results/idec/restore_rel/'
! mkdir $PATH_DIRTY_JSON

def save_dirty_json(id, y_pred):
    """Dump a sample of every cluster to ``PATH_DIRTY_JSON/<id>`` as JSON.

    Args:
        id: file name of the dump (joined onto ``PATH_DIRTY_JSON``).
        y_pred: array of integer cluster labels; only its maximum is used, to
            determine how many clusters to export.

    Returns:
        dict mapping each cluster number to
        ``{"data": rows, "predicates": relation counts}``.
    """
    result = {}
    # Assuming 0-based labels, the largest label is one less than the number
    # of clusters; without the +1 the last cluster was never exported.
    number_of_clusters = int(y_pred.max()) + 1
    for number in range(number_of_clusters):
        sample = show_cluster_sample(number, 999).sort_values('score', ascending=False)
        # The show_cluster_sample variants in this notebook name the predicate
        # column either 'relation' or '_relation'; accept both.
        relation_column = 'relation' if 'relation' in sample.columns else '_relation'
        cluster = {
            "data": list(zip(*[sample[c].values.tolist() for c in sample])),
            "predicates": {key: int(value)
                           for key, value in sample[relation_column].value_counts().items()},
        }
        result[int(number)] = cluster

    # Ensure the target directory exists and close the file deterministically.
    os.makedirs(PATH_DIRTY_JSON, exist_ok=True)
    with open(os.path.join(PATH_DIRTY_JSON, id), 'w') as f:
        json.dump(result, f)
    return result

In [None]:
save_dirty_json('dcec_kmeans_80c_002.json', y_pred)

## Other autoencoder architectures

### Train DAEC

In [None]:
! mkdir models/daec

In [None]:
import deep_clustering
save_dir = 'models/daec/restore_rel'
# Creates missing parents and does not complain when the directory exists.
os.makedirs(save_dir, exist_ok=True)

# NOTE(review): `restore_rel` is not imported in the visible part of the
# notebook - presumably it comes from autoencoder_models; confirm.
daec = deep_clustering.DAEC(input_shape=(_subject.shape[1:]),
                            autoencoder_ctor=lambda input_shape: restore_rel(input_shape),  # select model here
                            n_clusters=50,
                            pretrain_epochs=10,
                            log_dir=logPath,
                            save_dir=save_dir,
                            )

# Plot the model built above; the previous code referenced `dcec`, a name
# that is not defined anywhere in the visible notebook.
plot_model(daec._model, to_file=os.path.join(save_dir, 'daec_model.png'), show_shapes=True)
daec.compile(optimizer='adam')

In [None]:
daec._model.summary()

In [None]:
daec.fit([_subject, _object, _relation])

In [None]:
def get_tokens(column):
    """Join the entries of ``column['tokens']`` into one space-separated string."""
    tokens = column['tokens']
    separator = ' '
    return separator.join(tokens)

# Collect the textual triples with the cluster and score DAEC assigned to them.
y_pred = daec._y_pred
dumb_features = pd.DataFrame()
dumb_features['subject'] = data['subject'].map(get_tokens)
dumb_features['relation'] = data['relation'].map(get_tokens)
dumb_features['object'] = data['object'].map(get_tokens)
dumb_features['cluster'] = y_pred
# Score with the DAEC model itself; the previous code called `dcec`, a name
# that is not defined anywhere in the visible notebook.
scores = daec.score_examples([_subject, _object, _relation])
dumb_features['score'] = scores
# Keep only rows whose score exceeds the threshold.
threshold = 0.01
dumb_features = dumb_features[dumb_features['score'] > threshold]

In [None]:
def show_cluster_sample(number, rows=100):
    """Return up to ``rows`` triples (with scores) assigned to cluster ``number``.

    Reads the module-level ``dumb_features`` frame.  The unused nested
    ``get_tokens`` helper of the previous version has been removed.
    """
    cluster = dumb_features[dumb_features.cluster == number]
    return cluster[['subject', 'relation', 'object', 'score']].iloc[:rows]

In [None]:
dumb_features.head()

In [None]:
dumb_features[dumb_features.object == 'eliza'].sort_values('cluster')

In [None]:
dumb_features[dumb_features.relation.str.contains('born')].sort_values('cluster').iloc[:20]

In [None]:
number = 4
temp = show_cluster_sample(number)
temp.relation.value_counts()

In [None]:
temp.sort_values('score', ascending=False)

In [None]:
PATH_DIRTY_JSON = 'unfiltered_results/daec/restore_rel/'
! mkdir $PATH_DIRTY_JSON

def save_dirty_json(id, y_pred):
    """Dump a sample of every cluster to ``PATH_DIRTY_JSON/<id>`` as JSON.

    Args:
        id: file name of the dump (joined onto ``PATH_DIRTY_JSON``).
        y_pred: array of integer cluster labels; only its maximum is used, to
            determine how many clusters to export.

    Returns:
        dict mapping each cluster number to
        ``{"data": rows, "predicates": relation counts}``.
    """
    result = {}
    # Assuming 0-based labels, the largest label is one less than the number
    # of clusters; without the +1 the last cluster was never exported.
    number_of_clusters = int(y_pred.max()) + 1
    for number in range(number_of_clusters):
        sample = show_cluster_sample(number, 999).sort_values('score', ascending=False)
        # The show_cluster_sample variants in this notebook name the predicate
        # column either 'relation' or '_relation'; accept both.
        relation_column = 'relation' if 'relation' in sample.columns else '_relation'
        cluster = {
            "data": list(zip(*[sample[c].values.tolist() for c in sample])),
            "predicates": {key: int(value)
                           for key, value in sample[relation_column].value_counts().items()},
        }
        result[int(number)] = cluster

    # Ensure the target directory exists and close the file deterministically.
    os.makedirs(PATH_DIRTY_JSON, exist_ok=True)
    with open(os.path.join(PATH_DIRTY_JSON, id), 'w') as f:
        json.dump(result, f)
    return result

In [None]:
res = save_dirty_json('daec_kmeans_80c_002.json', y_pred)

In [None]:
# NOTE(review): `temp` is the DataFrame returned by show_cluster_sample, so
# `temp[6]` looks up a column labelled 6.  `res[6]` (cluster 6 of the dump
# returned by save_dirty_json) may have been intended - confirm.
temp[6]

### DC_Kmeans 

In [None]:
! mkdir models/dc_kmeans

In [None]:
import deep_clustering
save_dir = 'models/dc_kmeans/restore_rel'
# Creates missing parents and does not complain when the directory exists.
os.makedirs(save_dir, exist_ok=True)

# NOTE(review): `restore_rel` is not imported in the visible part of the
# notebook - presumably it comes from autoencoder_models; confirm.
dckmeans = deep_clustering.DC_Kmeans(
                            input_shape=(_subject.shape[1:]),
                            autoencoder_ctor=lambda input_shape: restore_rel(input_shape),  # select model here
                            n_clusters=30,
                            pretrain_epochs=50,
                            max_epochs=200,
                            save_dir=save_dir,
                            log_dir=logPath)
# Save a diagram of the model next to its checkpoints, then compile it.
plot_model(dckmeans._model, to_file=os.path.join(save_dir, 'dckmeans_model.png'), show_shapes=True)
dckmeans.compile(optimizer='adam')

In [None]:
dckmeans._model.summary()

In [None]:
dckmeans.fit([_subject, _object, _relation])

In [None]:
def get_tokens(column):
    """Return the items of ``column['tokens']`` joined by single spaces."""
    words = column['tokens']
    return ' '.join(words)

# Gather the clustering output of DC_Kmeans for inspection.
# NOTE(review): the other models in this notebook expose `_y_pred` and
# `score_examples`; here `y_pred` and `get_scores` are used - confirm the API.
y_pred = dckmeans.y_pred
scores = dckmeans.get_scores([_subject, _object, _relation])
threshold = 0.05

# One text column per triple argument, then the cluster label and the score.
dumb_features = pd.DataFrame({
    column: data[column].map(get_tokens)
    for column in ('subject', 'relation', 'object')
})
dumb_features['cluster'] = y_pred
dumb_features['score'] = scores

# Keep only rows whose score exceeds the threshold.
dumb_features = dumb_features.loc[dumb_features['score'] > threshold]

In [None]:
def show_cluster_sample(number, rows=100):
    """Return up to ``rows`` triples (with scores) assigned to cluster ``number``.

    Reads the module-level ``dumb_features`` frame.  The unused nested
    ``get_tokens`` helper of the previous version has been removed.
    """
    cluster = dumb_features[dumb_features.cluster == number]
    return cluster[['subject', 'relation', 'object', 'score']].iloc[:rows]

In [None]:
dumb_features[dumb_features.object == 'eliza'].sort_values('cluster')

In [None]:
dumb_features[dumb_features.relation.str.contains('born')].sort_values('cluster').iloc[:20]

In [None]:
number = 4
temp = show_cluster_sample(number)
temp.relation.value_counts()

In [None]:
temp.sort_values('score', ascending=False)

In [None]:
PATH_DIRTY_JSON = 'unfiltered_results/dc_kmeans/restore_rel/'
! mkdir $PATH_DIRTY_JSON

def save_dirty_json(id, y_pred):
    """Dump a sample of every cluster to ``PATH_DIRTY_JSON/<id>`` as JSON.

    Args:
        id: file name of the dump (joined onto ``PATH_DIRTY_JSON``).
        y_pred: array of integer cluster labels; only its maximum is used, to
            determine how many clusters to export.

    Returns:
        dict mapping each cluster number to
        ``{"data": rows, "predicates": relation counts}``.
    """
    result = {}
    # Assuming 0-based labels, the largest label is one less than the number
    # of clusters; without the +1 the last cluster was never exported.
    number_of_clusters = int(y_pred.max()) + 1
    for number in range(number_of_clusters):
        sample = show_cluster_sample(number, 999).sort_values('score', ascending=False)
        # The show_cluster_sample variants in this notebook name the predicate
        # column either 'relation' or '_relation'; accept both.
        relation_column = 'relation' if 'relation' in sample.columns else '_relation'
        cluster = {
            "data": list(zip(*[sample[c].values.tolist() for c in sample])),
            "predicates": {key: int(value)
                           for key, value in sample[relation_column].value_counts().items()},
        }
        result[int(number)] = cluster

    # Ensure the target directory exists and close the file deterministically.
    os.makedirs(PATH_DIRTY_JSON, exist_ok=True)
    with open(os.path.join(PATH_DIRTY_JSON, id), 'w') as f:
        json.dump(result, f)
    return result

In [None]:
res = save_dirty_json('dc_kmeans_30c_000.json', y_pred)

### Clustering of internal representations generated by autoencoder

In [None]:
# Train a plain autoencoder to reconstruct x_train, then expose the output of
# its 'embedding' layer through a separate encoder model.
# NOTE(review): `plain_ae`, `x_train` and `Model` are not defined or imported
# in the visible part of the notebook - confirm where they come from.
pae = plain_ae(x_train.shape[1:])
pae.compile(optimizer='adam', loss='mse')
pae.fit(x_train, x_train, batch_size=256, epochs=10, verbose=0)
hidden = pae.get_layer(name='embedding').output
encoder = Model(inputs=pae.input, outputs=hidden)
#embeddings = encoder.predict(x_train)
#cluzeriser = KMeans(2, n_jobs=6)
#clusters = cluzeriser.fit_predict(embeddings)

In [None]:
pae.save('models/pae_model.h5')

In [None]:
def show_cluster_sample(number):
    """Return up to ten randomly ordered rows of ``features`` in cluster ``number``.

    Reads the module-level ``features`` frame and ``clusters`` labels.
    """
    columns = ['docid', 'subject', 'relation', 'object']
    members = features[clusters == number]
    shuffled = members[columns].sample(frac=1)
    return shuffled.iloc[:10]

In [None]:
# Write a sample of each of the first 50 clusters to a text file; a cluster
# whose sampling raises ValueError gets only its header line.
with open('pae_clusters.txt', 'w') as f:
    for cluster_id in range(50):
        header = f'{cluster_id}-----------------\n'
        try:
            rows = show_cluster_sample(cluster_id).values.tolist()
            body = "\n".join(str(row) for row in rows)
            f.write(header + body + '\n\n\n')
        except ValueError:
            f.write(header)