# Initialization

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell execution (edits to project .py files take effect
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

import os
# Restrict CUDA to device index 1. This assignment runs before the first
# TensorFlow import in this notebook.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import sys
# Add the local source directories of the project packages to the import path.
sys.path.append('./pylingtools/src/')
sys.path.append('./pyexling/src/')
sys.path.append('./syntaxnet_wrapper/src/')

In [2]:
from tensorflow.python.client import device_lib


def get_available_gpus():
    """Return the names of all local devices that TensorFlow reports as GPUs."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names


get_available_gpus()

['/device:GPU:0']

In [3]:
import tensorflow as tf

# Create a TF1 session whose GPU options have `allow_growth` enabled, instead of
# using the default session configuration.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

# Register this session with the Keras backend so Keras models use it.
from tensorflow.python.keras import backend as K
K.set_session(sess)

In [4]:
import logging

logPath = '../logs/'
# Create the log directory if it is missing. Unlike a shell `mkdir`, this is
# silent (no error output) when the directory already exists.
os.makedirs(logPath, exist_ok=True)
fileName = 'main.log'
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")

logger = logging.getLogger()
# Detach AND close any handlers left on the root logger (e.g. from a previous
# run of this cell) so the log file handle is released and messages are not
# emitted twice.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Write every record to <logPath>/main.log ...
fileHandler = logging.FileHandler(os.path.join(logPath, fileName))
fileHandler.setFormatter(logFormatter)
logger.addHandler(fileHandler)

# ... and mirror it to the console stream (shown in the notebook output).
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
logger.addHandler(consoleHandler)

logger.setLevel(logging.INFO)

mkdir: cannot create directory ‘../logs/’: File exists


In [5]:
import sklearn
import numpy as np
import pandas as pd
import json
import pickle
from tqdm import tqdm_notebook as tqdm
from joblib import Parallel, delayed

# Load data

In [6]:
# NOTE(review): `data` is reassigned by the next cell (which loads
# 'data/dataset_ner.pkl') before anything reads this frame, so this load looks
# redundant — confirm and drop one of the two cells.
data = pd.read_pickle('data/dataset.pkl')

In [7]:
# Load the pickled frame that the following cells filter and turn into features.
data = pd.read_pickle('data/dataset_ner.pkl')

In [8]:
# Inspect (rows, columns) of the loaded frame before any filtering.
data.shape

(8308, 351)

In [9]:
# Subject/object strings that are rejected outright (compared case-insensitively).
deprecated = (
    'one', 'she', 'they', 'his', 'her', 'its', 'our', 'day', 'co.', 'inc.',
    'society', 'people', 'inventor', 'head', 'poet', 'doctor', 'teacher', 'inventor',
    'thanksgiving day', 'halloween',
    'sales person', 'model', 'board', 'technology', 'owner', 'one', 'two',
)

# Keep only rows whose object, then subject, is longer than two characters.
data = data[data['object'].map(len) > 2]
data = data[data['subject'].map(len) > 2]

# Drop every row whose lower-cased subject or object equals one of the rejected
# strings. A single membership mask replaces the per-word filtering passes.
stop_phrases = set(deprecated)
subject_is_stop = data['subject'].str.lower().isin(stop_phrases)
object_is_stop = data['object'].str.lower().isin(stop_phrases)
data = data[~(subject_is_stop | object_is_stop)]

In [10]:
# Keep only rows where each lemma column has length below 4 (presumably at most
# three lemmas per phrase, matching the 3-row embedding matrices — confirm).
for lemma_column in ('lemma_subj', 'lemma_obj', 'lemma_rel'):
    data = data[data[lemma_column].map(len) < 4]

In [11]:
# Inspect (rows, columns) after the filtering cells above.
data.shape

(6729, 351)

In [12]:
def _load_pickle(path):
    """Load one pickled object from `path` and close the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files produced by
    # this project.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Bag-of-words columns for the relation text, suffixed with '_bow_rel'.
vectorizer_rel = _load_pickle('models/relation_vectorizer.pkl')
bow_rel = vectorizer_rel.transform(data['relation'].values).toarray()
columns = [column + '_bow_rel' for column in vectorizer_rel.get_feature_names()]
features = pd.concat([data.reset_index(drop=True), pd.DataFrame(bow_rel, columns=columns)], axis=1)

# Bag-of-words columns for the dependency path, suffixed with '_bow_path'.
vectorizer_path = _load_pickle('models/path_vectorizer.pkl')
bow_path = vectorizer_path.transform(features['dep_path'].values).toarray()
columns = [column + '_bow_path' for column in vectorizer_path.get_feature_names()]
features = pd.concat([features.reset_index(drop=True), pd.DataFrame(bow_path, columns=columns)], axis=1)

# `features` now carries everything from `data`; release the original frame.
# NOTE: after this, the cell cannot be re-run without reloading `data`.
del data

In [13]:
# Inspect (rows, columns) after appending the two sets of bag-of-words columns.
features.shape

(6729, 2351)

In [14]:
excluding_cols = ['docid', 'subject', 'relation', 'object', 'dep_path', 'lemma_subj', 'lemma_rel', 'lemma_obj']
embedding_cols = ['w2v_subj', 'w2v_rel', 'w2v_obj']

# Flatten each row's embedding array into one 1-D vector, one matrix per
# embedding column (order follows `embedding_cols`).
w2v_subj, w2v_rel, w2v_obj = (
    np.array([row_vectors.flatten() for row_vectors in features[column].values])
    for column in embedding_cols
)

# Remaining columns followed by the three flattened embedding blocks.
dropped_cols = excluding_cols + embedding_cols
x_train = np.concatenate(
    [features.drop(columns=dropped_cols), w2v_subj, w2v_rel, w2v_obj],
    axis=1,
)

In [15]:
# Inspect (rows, columns) of the assembled training matrix.
x_train.shape

(6729, 5040)

In [16]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of the training matrix.
# Note: this overwrites `x_train`, so re-running the cell rescales scaled data.
feature_scaler = StandardScaler()
x_train = feature_scaler.fit_transform(x_train)

In [17]:
# Shuffle the training rows with a fixed seed, and apply the SAME permutation to
# `features`. Cluster labels predicted from `x_train` are later used as a
# boolean mask over `features` (see the show_cluster_sample cells), so the two
# must stay row-aligned; shuffling only `x_train` in place would pair every
# label with the wrong triple.
shuffle_order = np.random.RandomState(0).permutation(len(x_train))
x_train = x_train[shuffle_order]
features = features.iloc[shuffle_order].reset_index(drop=True)

In [18]:
# Standardized matrix of the columns that remain after dropping the
# text/identifier columns and the embedding columns.
plain_scaler = StandardScaler()
plain_features = plain_scaler.fit_transform(
    features.drop(columns=excluding_cols + embedding_cols).values
)

In [19]:
# Rebuild the embedding arrays WITHOUT flattening (one array per row, stacked),
# replacing the flattened versions created earlier under the same names.
w2v_subj, w2v_obj, w2v_rel = (
    np.array([np.array(row_vectors) for row_vectors in features[column]])
    for column in ('w2v_subj', 'w2v_obj', 'w2v_rel')
)

In [20]:
# Inspect the shapes of the scaled plain features and the three embedding arrays.
plain_features.shape, w2v_subj.shape, w2v_rel.shape, w2v_obj.shape

((6729, 2340), (6729, 3, 300), (6729, 3, 300), (6729, 3, 300))

# Models

In [21]:
%load_ext cython

from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.layers import LSTM, GRU, Dense
from tensorflow.python.keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Conv2DTranspose
from tensorflow.python.keras.layers import Dropout, UpSampling2D
from tensorflow.python.keras.layers import Concatenate
from tensorflow.python.keras.layers import Masking
from tensorflow.python.keras.layers import Reshape
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers import Input, Layer
from tensorflow.python.keras.layers import Lambda
from tensorflow.python.keras.layers import GlobalMaxPooling1D
from tensorflow.python.keras.layers import RepeatVector
from tensorflow.python.keras.layers import Activation
from tensorflow.python.keras.layers import Permute, Add
from tensorflow.python.keras.layers import concatenate
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.models import model_from_json
from tensorflow.python.keras import regularizers
from tensorflow.python.keras.callbacks import Callback
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.layers import GaussianNoise
from tensorflow.python.keras.layers import UpSampling1D

from copy import deepcopy
from sklearn.metrics import f1_score
from tensorboardX import SummaryWriter

import math
from time import time

from sklearn.cluster import KMeans

from tensorflow.python.keras.layers import Conv2D, Conv2DTranspose, Flatten, Reshape, Layer, InputSpec
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.utils.vis_utils import plot_model
from datetime import datetime

from tensorflow.python.keras.callbacks import TensorBoard

In [22]:
LAYER_1 = 200
LAYER_2 = 50
INNER = 10

def plain_ae(input_shape, layer_1=LAYER_1, layer_2=LAYER_2, inner=INNER):
    """Build a fully connected autoencoder for flat feature vectors.

    The layer widths are bound as default arguments when this cell runs, so
    later cells that reassign the module-level LAYER_1 / LAYER_2 constants
    cannot silently change the architecture built here.

    Args:
        input_shape: shape of one sample, e.g. ``(n_features,)``.
        layer_1: width of the outer hidden layers ('enc1' and 'dec2').
        layer_2: width of the inner hidden layers ('enc2' and 'dec1').
        inner: width of the sigmoid bottleneck layer named 'embedding'.

    Returns:
        An uncompiled Keras ``Model`` mapping the input to a linear
        reconstruction with ``input_shape[0]`` units.
    """
    input_layer = Input(shape=input_shape)

    # Encoder: input -> layer_1 -> layer_2 -> bottleneck.
    x = Dense(layer_1, activation='relu', name='enc1')(input_layer)
    x = Dense(layer_2, activation='relu', name='enc2')(x)
    latent = Dense(inner, activation='sigmoid', name='embedding')(x)

    # Decoder mirrors the encoder; the output layer has no activation.
    x = Dense(layer_2, activation='relu', name='dec1')(latent)
    x = Dense(layer_1, activation='relu', name='dec2')(x)
    output_layer = Dense(input_shape[0], name='output')(x)

    return Model(inputs=input_layer, outputs=output_layer)

In [23]:
LAYER_1 = 200
LAYER_2 = 50
INNER = 10

def plain_noised_ae(input_shape, layer_1=LAYER_1, layer_2=LAYER_2, inner=INNER, noise_stddev=0.5):
    """Build a fully connected denoising autoencoder for flat feature vectors.

    Same architecture as a plain dense autoencoder, with a ``GaussianNoise``
    layer applied to the input before encoding. Layer widths are bound as
    default arguments when this cell runs, so later reassignment of the
    module-level LAYER_1 / LAYER_2 constants does not alter this model.

    Args:
        input_shape: shape of one sample, e.g. ``(n_features,)``.
        layer_1: width of the outer hidden layers ('enc1' and 'dec2').
        layer_2: width of the inner hidden layers ('enc2' and 'dec1').
        inner: width of the sigmoid bottleneck layer named 'embedding'.
        noise_stddev: standard deviation passed to ``GaussianNoise``.

    Returns:
        An uncompiled Keras ``Model`` mapping the input to a linear
        reconstruction with ``input_shape[0]`` units.
    """
    input_layer = Input(shape=input_shape)

    # Corrupt the input, then encode: layer_1 -> layer_2 -> bottleneck.
    x = GaussianNoise(noise_stddev)(input_layer)
    x = Dense(layer_1, activation='relu', name='enc1')(x)
    x = Dense(layer_2, activation='relu', name='enc2')(x)
    latent = Dense(inner, activation='sigmoid', name='embedding')(x)

    # Decoder mirrors the encoder; the output layer has no activation.
    x = Dense(layer_2, activation='relu', name='dec1')(latent)
    x = Dense(layer_1, activation='relu', name='dec2')(x)
    output_layer = Dense(input_shape[0], name='output')(x)

    return Model(inputs=input_layer, outputs=output_layer)

In [24]:
LAYER_1 = 200
LAYER_2 = 100
INNER_SIZE = 50

def noised_ae(input_shape, layer_1=LAYER_1, layer_2=LAYER_2, inner_size=INNER_SIZE, noise_stddev=0.5):
    """Build a four-input autoencoder: noised plain features plus three embeddings.

    The plain feature vector passes through Gaussian noise and a dense layer;
    the subject, object and relation embedding sequences each pass through a
    small Conv1D/MaxPooling1D stack. All four encodings are concatenated into
    one sigmoid bottleneck ('embedding'), from which each input is decoded.

    Layer widths are bound as default arguments when this cell runs, so later
    cells that reassign LAYER_1 / LAYER_2 / INNER_SIZE cannot silently change
    this architecture.

    Args:
        input_shape: pair ``(plain_shape, embedding_shape)`` where
            ``plain_shape`` is ``(n_features,)`` and ``embedding_shape`` is
            ``(sequence_length, embedding_dim)``.
        layer_1: width of the plain-feature encoder layer 'enc1'.
        layer_2: width of the plain-feature decoder layer 'dec1'.
        inner_size: width of the shared bottleneck.
        noise_stddev: standard deviation of the noise added to the plain input.

    Returns:
        An uncompiled Keras ``Model`` with inputs
        ``[input_plain, input_subject, input_object, input_rel]`` and the
        corresponding four reconstructions as outputs, in the same order.
    """
    input_shape_plain, input_shape_emb = input_shape
    # The decoders must reproduce the embedding input shape exactly.
    seq_len, emb_dim = input_shape_emb

    def encode_plain_input(input_layer):
        # Noise is applied before the dense layer.
        x = GaussianNoise(noise_stddev)(input_layer)
        return Dense(layer_1, activation='relu', name='enc1')(x)

    def encode_embedding_input(input_layer):
        conv1 = Conv1D(128, (2,), activation='relu', padding='same')(input_layer)
        pool1 = MaxPooling1D((2,), padding='same')(conv1)
        conv2 = Conv1D(32, (2,), activation='relu', padding='same')(pool1)
        pool2 = MaxPooling1D((2,), padding='same')(conv2)
        return Flatten()(pool2)

    def decode_plain_input(latent):
        x = Dense(layer_2, activation='relu', name='dec1')(latent)
        return Dense(input_shape_plain[0], name='output_plain')(x)

    def decode_embedding_input(latent, name):
        # Treat the bottleneck as a length-1 sequence and repeat it seq_len times.
        latent = Reshape((1, inner_size))(latent)
        up0 = UpSampling1D(seq_len)(latent)
        conv1 = Conv1D(3, (3,), activation='relu', padding='same', name=name+'_conv1')(up0)
        # NOTE(review): a relu output cannot produce negative values — confirm
        # the embedding targets are non-negative or switch to a linear output.
        return Conv1D(emb_dim, (1,), activation='relu', padding='same', name=name+'_conv2')(conv1)

    input_plain = Input(shape=input_shape_plain, name='input_plain')
    input_subject = Input(shape=input_shape_emb, name='input_subject')
    input_object = Input(shape=input_shape_emb, name='input_object')
    input_rel = Input(shape=input_shape_emb, name='input_rel')

    encode_plain = encode_plain_input(input_plain)
    encode_subject = encode_embedding_input(input_subject)
    encode_object = encode_embedding_input(input_object)
    encode_rel = encode_embedding_input(input_rel)

    x = concatenate([encode_plain, encode_subject, encode_object, encode_rel])
    latent = Dense(inner_size, activation='sigmoid', name='embedding')(x)

    output_plain = decode_plain_input(latent)
    output_subject = decode_embedding_input(latent, 'output_subject')
    output_object = decode_embedding_input(latent, 'output_object')
    output_rel = decode_embedding_input(latent, 'output_rel')

    return Model(inputs=[input_plain, input_subject, input_object, input_rel],
                 outputs=[output_plain, output_subject, output_object, output_rel])

In [25]:
LAYER_1 = 200
LAYER_2 = 100
INNER_SIZE = 100

def masked_ae(input_shape, layer_2=LAYER_2, inner_size=INNER_SIZE):
    """Build a four-input autoencoder that hides the relation embedding from the encoder.

    Only the plain features and the subject/object embeddings feed the
    bottleneck ('embedding'); the relation embedding is still declared as a
    model input and is still reconstructed as an output, so the network has to
    restore it from the other three inputs.

    Layer widths are bound as default arguments when this cell runs, so later
    reassignment of LAYER_2 / INNER_SIZE does not alter this architecture.

    Args:
        input_shape: pair ``(plain_shape, embedding_shape)`` where
            ``plain_shape`` is ``(n_features,)`` and ``embedding_shape`` is
            ``(sequence_length, embedding_dim)``.
        layer_2: width of the plain-feature layers 'enc2' and 'dec1'.
        inner_size: width of the shared bottleneck.

    Returns:
        An uncompiled Keras ``Model`` with inputs
        ``[input_plain, input_subject, input_object, input_rel]`` and the
        corresponding four reconstructions as outputs, in the same order.
    """
    input_shape_plain, input_shape_emb = input_shape
    # The decoders must reproduce the embedding input shape exactly.
    seq_len, emb_dim = input_shape_emb

    def encode_plain_input(input_layer):
        return Dense(layer_2, activation='relu', name='enc2')(input_layer)

    def encode_embedding_input(input_layer):
        conv1 = Conv1D(128, (2,), activation='relu', padding='same')(input_layer)
        pool1 = MaxPooling1D((2,), padding='same')(conv1)
        return Flatten()(pool1)

    def decode_plain_input(latent):
        x = Dense(layer_2, activation='relu', name='dec1')(latent)
        return Dense(input_shape_plain[0], name='output_plain')(x)

    def decode_embedding_input(latent, name):
        # Treat the bottleneck as a length-1 sequence, then repeat it seq_len times.
        latent = Reshape((1, inner_size))(latent)
        conv1 = Conv1D(128, (1,), activation='relu', padding='same', name=name+'_conv1')(latent)
        up1 = UpSampling1D(seq_len, name=name+'_up1')(conv1)
        # NOTE(review): a relu output cannot produce negative values — confirm
        # the embedding targets are non-negative or switch to a linear output.
        return Conv1D(emb_dim, (1,), activation='relu', padding='same', name=name+'_conv2')(up1)

    input_plain = Input(shape=input_shape_plain, name='input_plain')
    input_subject = Input(shape=input_shape_emb, name='input_subject')
    input_object = Input(shape=input_shape_emb, name='input_object')
    # Declared so the model keeps a four-array input signature; deliberately
    # NOT encoded, because the relation is the masked part.
    input_rel = Input(shape=input_shape_emb, name='input_rel')

    encode_plain = encode_plain_input(input_plain)
    encode_subject = encode_embedding_input(input_subject)
    encode_object = encode_embedding_input(input_object)

    x = concatenate([encode_plain, encode_subject, encode_object])
    latent = Dense(inner_size, activation='sigmoid', name='embedding')(x)

    output_plain = decode_plain_input(latent)
    output_subject = decode_embedding_input(latent, 'output_subject')
    output_object = decode_embedding_input(latent, 'output_object')
    output_rel = decode_embedding_input(latent, 'output_rel')

    return Model(inputs=[input_plain, input_subject, input_object, input_rel],
                 outputs=[output_plain, output_subject, output_object, output_rel])

In [26]:
# Per-sample shapes (batch axis removed) of the plain features and one embedding array.
plain_features.shape[1:], w2v_obj.shape[1:]

((2340,), (3, 300))

In [None]:
# Instantiate the masked autoencoder for the (plain, embedding) per-sample shapes.
model = masked_ae((plain_features.shape[1:], w2v_obj.shape[1:]))

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Compile with the optimizer object defined here so the two cannot drift apart.
# Adam() uses Keras' default hyper-parameters, which is what the string 'adam'
# selects, so training behaviour is unchanged. Previously an Adam(lr=0.3)
# instance was constructed but never passed to compile(); set `lr` here
# explicitly if a non-default learning rate is actually wanted.
optimizer = Adam()
model.compile(optimizer=optimizer, loss='mse')

In [None]:
# Arrays are listed in the order the model declares its inputs and outputs:
# plain, subject, object, relation. (Passing object before subject would feed
# object vectors into the layer named 'input_subject' and vice versa.)
ae_arrays = [plain_features, w2v_subj, w2v_obj, w2v_rel]

# Autoencoder training: the targets are the inputs themselves.
model.fit(x=ae_arrays, y=ae_arrays, epochs=100, batch_size=256)

## Test DCEC

### Monoinput 

In [29]:
import deep_clustering

save_dir = 'models'
model_plot_path = os.path.join(save_dir, 'dcec_model.png')

# Single-input DCEC on the flat training matrix, using the plain dense
# autoencoder as its backbone.
dcec = deep_clustering.DCEC(
    input_shape=x_train.shape[1:],
    autoencoder_ctor=lambda input_shape: plain_ae(input_shape),
    n_clusters=40,
    pretrain_epochs=10,
    maxiter=int(5e4),
    save_dir=save_dir,
    log_dir=logPath,
)

# Save a diagram of the assembled model, then compile it.
plot_model(dcec._model, to_file=model_plot_path, show_shapes=True)
dcec.compile(optimizer='adam')



In [None]:
# Train the single-input DCEC model on the flat training matrix.
dcec.fit(x_train)

### Multiinput

In [29]:
import deep_clustering

save_dir = 'models'
# NOTE(review): same file name as the single-input DCEC diagram, so one
# overwrites the other — confirm that is intended.
model_plot_path = os.path.join(save_dir, 'dcec_model.png')

# Multi-input DCEC over the plain features and the embedding inputs.
dcec = deep_clustering.DCEC(
    input_shape=(plain_features.shape[1:], w2v_obj.shape[1:]),
    autoencoder_ctor=lambda input_shape: masked_ae(input_shape),  # select model here
    n_clusters=50,
    pretrain_epochs=10,
    maxiter=int(1e3),
    save_dir=save_dir,
    log_dir=logPath,
)

# Save a diagram of the assembled model, then compile it.
plot_model(dcec._model, to_file=model_plot_path, show_shapes=True)
dcec.compile(optimizer='adam')



In [30]:
# Print the layer-by-layer summary of the Keras model held by the DCEC object.
dcec._model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_subject (InputLayer)      (None, 3, 300)       0                                            
__________________________________________________________________________________________________
input_object (InputLayer)       (None, 3, 300)       0                                            
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 3, 128)       76928       input_subject[0][0]              
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 3, 128)       76928       input_object[0][0]               
__________________________________________________________________________________________________
input_plai

In [None]:
# Arrays are listed in the order the autoencoder declares its inputs:
# plain, subject, object, relation. (Passing object before subject would feed
# object vectors into the layer named 'input_subject' and vice versa.)
dcec.fit([plain_features, w2v_subj, w2v_obj, w2v_rel])

In [32]:
# Take the predictions stored on the DCEC object (private attribute; presumably
# one cluster label per input row — confirm against deep_clustering).
y_pred = dcec._y_pred

In [33]:
def show_cluster_sample(number, rows=10):
    """Return up to `rows` randomly ordered triples assigned to cluster `number`.

    Reads the notebook-level `features` frame and `y_pred` labels; the result
    holds the 'docid', 'subject', 'relation' and 'object' columns.
    """
    display_columns = ['docid', 'subject', 'relation', 'object']
    in_cluster = features[y_pred == number]
    shuffled = in_cluster[display_columns].sample(frac=1)
    return shuffled.iloc[:rows]

In [34]:
# Count the relation strings in a random sample of up to 100 triples from cluster 35.
show_cluster_sample(35, 100).relation.value_counts()

was acquired by                        14
acquired                               12
was acquired in                         7
purchased                               4
was acquired In                         3
acquired in                             3
acquired agency in                      3
was purchased by                        3
acquired Mustard Digital At             2
acquiring                               2
published demonstration In              2
acquire                                 2
was purchased In                        2
purchased NeXT in                       2
acquired copy In                        2
has purchased                           2
acquired agency In                      1
must acquire                            1
began in                                1
Founded in                              1
purchased codex In                      1
had acquired in                         1
purchased program from                  1
acquired Ulead Systems On         

In [36]:
# Write up to 20 sampled triples for each of the 50 clusters to a text report.
with open('dcec_clusters_masked.txt', 'w') as report:
    for cluster_id in range(50):
        header = str(cluster_id) + '-----------------\n'
        try:
            sampled_rows = show_cluster_sample(cluster_id, 20).values.tolist()
            body = "\n".join(str(row) for row in sampled_rows)
            report.write(header + body + '\n\n\n')
        except ValueError:
            # On ValueError only the cluster header is written.
            report.write(header)

## Test DC_Kmeans 

In [None]:
import deep_clustering

save_dir = 'models'
model_plot_path = os.path.join(save_dir, 'dckmeans_model.png')

# DC_Kmeans on the flat training matrix, using the denoising dense autoencoder.
dckmeans = deep_clustering.DC_Kmeans(
    input_shape=x_train.shape[1:],
    autoencoder_ctor=lambda input_shape: plain_noised_ae(input_shape),
    n_clusters=30,
    max_epochs=200,
    save_dir=save_dir,
    log_dir=logPath,
)

# Save a diagram of the assembled model, then compile it.
plot_model(dckmeans._model, to_file=model_plot_path, show_shapes=True)
dckmeans.compile(optimizer='adam')

In [None]:
# Print the layer-by-layer summary of the Keras model held by the DC_Kmeans object.
dckmeans._model.summary()

In [None]:
# Train the DC_Kmeans model on the flat training matrix.
dckmeans.fit(x_train)

## Clustering of internal representations generated by the autoencoder

In [None]:
# Train the plain dense autoencoder to reconstruct x_train (MSE loss, silent).
pae = plain_ae(x_train.shape[1:])
pae.compile(optimizer='adam', loss='mse')
pae.fit(x_train, x_train, batch_size=256, epochs=200, verbose=0)
# Build an encoder sub-model that stops at the layer named 'embedding' and use
# it to get one bottleneck vector per row of x_train.
hidden = pae.get_layer(name='embedding').output
encoder = Model(inputs=pae.input, outputs=hidden)
embeddings = encoder.predict(x_train)
# Cluster the bottleneck vectors into 50 groups with k-means.
# NOTE(review): no random_state is passed, so cluster ids may differ between
# runs — confirm whether reproducibility matters here.
cluzeriser = KMeans(50, n_jobs=6)
clusters = cluzeriser.fit_predict(embeddings)

In [None]:
# Save the trained plain autoencoder to an HDF5 file.
pae.save('models/pae_model.h5')

In [None]:
def show_cluster_sample(number, rows=10):
    """Return up to `rows` randomly ordered triples assigned to k-means cluster `number`.

    Reads the notebook-level `features` frame and the `clusters` labels. The
    `rows` parameter (default 10, the previously hard-coded value) keeps this
    redefinition call-compatible with the earlier two-argument
    `show_cluster_sample(number, rows)` that it replaces in the kernel.

    Args:
        number: cluster id to select.
        rows: maximum number of sampled rows to return.

    Returns:
        A frame with the 'docid', 'subject', 'relation' and 'object' columns.
    """
    display_columns = ['docid', 'subject', 'relation', 'object']
    in_cluster = features[clusters == number]
    return in_cluster[display_columns].sample(frac=1).iloc[:rows]

In [None]:
# Write the sampled triples for each of the 50 k-means clusters to a text report.
with open('pae_clusters.txt', 'w') as report:
    for cluster_id in range(50):
        header = str(cluster_id) + '-----------------\n'
        try:
            sampled_rows = show_cluster_sample(cluster_id).values.tolist()
            body = "\n".join(str(row) for row in sampled_rows)
            report.write(header + body + '\n\n\n')
        except ValueError:
            # On ValueError only the cluster header is written.
            report.write(header)