# Initialization

In [1]:
# Reload imported modules automatically before executing each cell, so edits
# to the local source trees below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
# Expose only CUDA device 0 to this process. This is set before TensorFlow is
# imported in the later cells.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys
# Add the local source directories to the module search path.
sys.path.append('./pylingtools/src/')
sys.path.append('./pyexling/src/')
sys.path.append('./syntaxnet_wrapper/src/')

In [None]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of the local devices whose ``device_type`` is ``'GPU'``.

    Returns:
        list: The ``name`` attribute of every matching entry reported by
        ``device_lib.list_local_devices()`` (empty when there is none).
    """
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# Show the detected GPU device names as the cell output.
get_available_gpus()

In [None]:
import tensorflow as tf

# TF1-style session configuration: allocate GPU memory on demand
# (allow_growth) rather than up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

from tensorflow.python.keras import backend as K
# Register the session configured above as the Keras backend session.
K.set_session(sess)

In [None]:
import logging

# Directory and file that receive the log; `logPath` is reused by later cells
# as the `log_dir` of the clustering models.
logPath = '../logs/'
# Create the directory in pure Python. Unlike the shell `mkdir`, this is
# portable, also creates missing parents and stays silent when the directory
# already exists (so the cell can be re-run).
os.makedirs(logPath, exist_ok=True)
fileName = 'main.log'
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")

# Configure the root logger so every module's records are captured.
logger = logging.getLogger()
# Detach handlers left over from a previous execution of this cell so records
# are not emitted twice, and close them so the old log file handle is released.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Persist records to <logPath>/<fileName> ...
fileHandler = logging.FileHandler(os.path.join(logPath, fileName))
fileHandler.setFormatter(logFormatter)
logger.addHandler(fileHandler)

# ... and mirror them to the console stream (shown in the notebook output).
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
logger.addHandler(consoleHandler)

logger.setLevel(logging.INFO)

In [5]:
import sklearn
import numpy as np
import pandas as pd
import json
import pickle
from tqdm import tqdm_notebook as tqdm
from joblib import Parallel, delayed

# Load data (EDUs only)

In [6]:
# Ordered text-normalisation rules (pattern -> replacement) shared by the
# reader functions below. The rules are applied one after another in insertion
# order, so earlier replacements can affect what later patterns see.
# NOTE: read_gold/read_json hand the keys to pandas with `regex=True`, whereas
# read_edus applies them with `str.replace` (literal substrings). Keys written
# with regex escapes (r'\n', r'\^', the backslash run) and the bare r'^'
# therefore do not match the same text in both code paths.
text_html_map = {
    r'\n': r' ',  # as regex: a newline; as literal: backslash followed by 'n'
    r'&gt;': r'>',  # HTML entities -> plain characters
    r'&lt;': r'<',
    r'&amp;': r'&',
    r'&quot;': r'"',
    r'&ndash;': r'–',
    r'##### ': r'',  # drop this heading-like marker
    r'\\\\\\\\': r'\\',  # collapse a run of backslashes (length differs: regex vs literal)
    r'  ': r' ',  # two spaces -> one (single pass, not repeated)
    r'——': r'-',  # double dash first, then the single one
    r'—': r'-',
    r'/': r'',
    r'\^': r'',  # as regex: a literal caret
    r'^': r'',  # as regex: start-of-string anchor (no-op); as literal: a caret
    r'±': r'+',
    # NOTE(review): the last two rules appear to swap Latin letters for
    # look-alike Cyrillic ones — confirm the direction is intended.
    r'y': r'у',
    r'x': r'х'
}

def read_edus(filename):
    """Read the EDUs stored one per line in ``<filename>.edus`` and normalise them.

    Every line is stripped of surrounding whitespace, then each rule of
    ``text_html_map`` is applied in insertion order as a *literal* substring
    replacement (``str.replace``).

    Args:
        filename: Path of the document without the ``.edus`` extension.

    Returns:
        list[str]: One normalised string per line of the file, in file order
        (blank lines yield empty strings).
    """
    edus = []
    # Decode explicitly as UTF-8 instead of relying on the platform default:
    # the normalisation table targets non-ASCII characters, so the files are
    # expected to contain them and must decode identically on every machine.
    with open(filename + '.edus', 'r', encoding='utf-8') as f:
        # Iterate the file object directly; no need to materialise all lines.
        for line in f:
            edu = line.strip()
            for key, value in text_html_map.items():
                edu = edu.replace(key, value)
            edus.append(edu)
    return edus

def read_gold(filename):
    """Load ``<filename>.gold.pkl`` and normalise its two snippet columns.

    Each rule of ``text_html_map`` is applied, in insertion order and as a
    regular expression, to the ``snippet_x`` and ``snippet_y`` columns.

    Args:
        filename: Path of the document without the ``.gold.pkl`` extension.

    Returns:
        The unpickled frame with both snippet columns normalised.
    """
    # NOTE(security): unpickling can execute arbitrary code — only load files
    # that come from a trusted source.
    df = pd.read_pickle(filename + '.gold.pkl')
    for column in ('snippet_x', 'snippet_y'):
        normalised = df[column]
        for key, value in text_html_map.items():
            normalised = normalised.replace(key, value, regex=True)
        # Assign the result back explicitly. The previous
        # `df[col].replace(..., inplace=True)` mutated a temporary selection,
        # which leaves `df` untouched under pandas Copy-on-Write.
        df[column] = normalised

    return df

def read_json(filename):
    """Load ``<filename>.json`` and normalise its two snippet columns.

    Each rule of ``text_html_map`` is applied, in insertion order and as a
    regular expression, to the ``snippet_x`` and ``snippet_y`` columns.

    Args:
        filename: Path of the document without the ``.json`` extension.

    Returns:
        The frame parsed by ``pd.read_json`` with both snippet columns
        normalised.
    """
    df = pd.read_json(filename + '.json')
    for column in ('snippet_x', 'snippet_y'):
        normalised = df[column]
        for key, value in text_html_map.items():
            normalised = normalised.replace(key, value, regex=True)
        # Assign the result back explicitly. The previous
        # `df[col].replace(..., inplace=True)` mutated a temporary selection,
        # which leaves `df` untouched under pandas Copy-on-Write.
        df[column] = normalised

    return df

def read_annotation(filename):
    """Load ``<filename>.annot.pkl`` and normalise its text and token texts.

    For every rule of ``text_html_map`` (in insertion order) the rule is applied
    via ``.replace`` first to ``annot['text']`` and then to the ``text``
    attribute of each item in ``annot['tokens']``.

    Args:
        filename: Path of the document without the ``.annot.pkl`` extension.

    Returns:
        The unpickled annotation object, modified in place.
    """
    annot = pd.read_pickle(filename + '.annot.pkl')
    for pattern, substitute in text_html_map.items():
        annot['text'] = annot['text'].replace(pattern, substitute)
        for token in annot['tokens']:
            token.text = token.text.replace(pattern, substitute)

    return annot

In [7]:
import pandas as pd
import glob
from tqdm import tqdm_notebook as tqdm

# Name of the label column (the discourse relation class).
TARGET = 'category_id'
# Relation classes with fewer examples than this are removed from the dataset.
MIN_CLASS_SIZE = 100

# Collect, document by document, the gold pairs whose two sides are both EDUs.
frames = []
for file in tqdm(glob.glob('data/*.edus')):
    filename = file.replace('.edus', '')
    # A set makes each membership test O(1); the list returned by read_edus
    # was previously scanned once per snippet.
    edus = set(read_edus(filename))
    gold = read_gold(filename)

    def label_edu(discourse_unit):
        """Tell whether the snippet is one of this document's EDUs."""
        return discourse_unit in edus

    # Keep only the pairs in which both snippets are elementary units.
    edu_pair = gold.snippet_x.map(label_edu) & gold.snippet_y.map(label_edu)
    frames.append(gold[edu_pair])

if not frames:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate".
    raise FileNotFoundError("No files matching 'data/*.edus' were found")

df = pd.concat(frames)
df = df.drop_duplicates(['snippet_x', 'snippet_y', TARGET])
# Discard pairs in which either snippet is empty.
df = df[df['snippet_x'].map(len) > 0]
df = df[df['snippet_y'].map(len) > 0]

# Merge closely related relation labels into a single class each.
df[TARGET] = df[TARGET].replace(['cause-effect_r', 'effect_r'], 'cause_r')
df[TARGET] = df[TARGET].replace(['interpretation-evaluation_r', 'conclusion_r'], 'evaluation_r')

# Drop the rare classes in one vectorised filter.
y_stat = df[TARGET].value_counts()
drop_ys = y_stat[y_stat < MIN_CLASS_SIZE].index  # optionally also: ['elaboration_r', 'joint_m', 'same-unit_m']
df = df[~df[TARGET].isin(drop_ys)]

HBox(children=(IntProgress(value=0, max=305), HTML(value='')))




In [8]:
# Number of remaining pairs per relation class, after merging and filtering.
df[TARGET].value_counts()

joint_m          2767
elaboration_r    2224
cause_r          1180
condition_r      1133
purpose_r         844
contrast_m        636
attribution_r     516
evaluation_r      353
background_r      227
comparison_m      187
evidence_r        176
concession_r      170
sequence_m        165
restatement_m     143
preparation_r     138
Name: category_id, dtype: int64

In [9]:
# Labels as a one-column frame.
y = df[TARGET].to_frame()
# Feature matrix: everything except the label and the columns listed below
# (presumably raw text / bookkeeping fields — confirm against the data schema).
X = (
    df.drop(TARGET, axis=1)
      .drop(columns=[
          'snippet_x',
          'snippet_y',
          'snippet_x_tmp',
          'snippet_y_tmp',
          'filename',
          'order',
          'postags_x',
          'postags_y',
      ])
)

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler with default settings (each feature rescaled to [0, 1]).
scaler = MinMaxScaler()

# NOTE: this rebinds X to the array returned by fit_transform, so X is no
# longer the DataFrame built in the previous cell.
X = scaler.fit_transform(X)

# Models

In [11]:
%load_ext cython

from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.layers import LSTM, GRU, Dense
from tensorflow.python.keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Conv2DTranspose
from tensorflow.python.keras.layers import Dropout, UpSampling2D
from tensorflow.python.keras.layers import Concatenate
from tensorflow.python.keras.layers import Masking
from tensorflow.python.keras.layers import Reshape
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers import Input, Layer
from tensorflow.python.keras.layers import Lambda
from tensorflow.python.keras.layers import GlobalMaxPooling1D
from tensorflow.python.keras.layers import RepeatVector
from tensorflow.python.keras.layers import Activation
from tensorflow.python.keras.layers import Permute, Add
from tensorflow.python.keras.layers import concatenate
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.models import model_from_json
from tensorflow.python.keras import regularizers
from tensorflow.python.keras.callbacks import Callback
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.layers import GaussianNoise
from tensorflow.python.keras.layers import UpSampling1D

from copy import deepcopy
from sklearn.metrics import f1_score
from tensorboardX import SummaryWriter

import math
from time import time

from sklearn.cluster import KMeans

from tensorflow.python.keras.layers import Conv2D, Conv2DTranspose, Flatten, Reshape, Layer, InputSpec
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.utils.vis_utils import plot_model
from datetime import datetime

from tensorflow.python.keras.callbacks import TensorBoard

In [12]:
# Number of distinct relation classes left in the data. It is used below both
# as the width of the autoencoder's 'embedding' layer and as the cluster count.
_INNER_SIZE = len(df.category_id.unique())

In [13]:
def noised_ae(input_shape, std_dev=1e-1, regul_constant=1e-1):
    """Build a dense autoencoder whose input is perturbed with Gaussian noise.

    Architecture (all Dense layers use ``tanh`` and an L2 kernel penalty):
    ``input_pair -> GaussianNoise -> enc1 -> embedding -> dec1 -> dec2``.
    ``enc1``/``dec1`` have ``input_shape[0] // 6`` units (at least one),
    ``embedding`` has ``_INNER_SIZE`` units and ``dec2`` restores
    ``input_shape[0]`` units.

    Args:
        input_shape: Shape of one sample, e.g. ``X.shape[1:]``; only the first
            entry is used to size the layers.
        std_dev: Standard deviation passed to the ``GaussianNoise`` layer.
            Defaults to the previously hard-coded ``1e-1``.
        regul_constant: L2 factor applied to every Dense kernel. Defaults to
            the previously hard-coded ``1e-1``.

    Returns:
        An uncompiled Keras ``Model`` mapping ``input_pair`` to its
        reconstruction.
    """
    # `// 6` yields 0 for inputs with fewer than six features, which would
    # create a zero-width layer; keep at least one unit.
    hidden_size = max(1, input_shape[0] // 6)

    def encode_plain_input(input_layer):
        """Noise the input and compress it down to the 'embedding' layer."""
        noised = GaussianNoise(std_dev)(input_layer)
        enc_1 = Dense(hidden_size, activation='tanh',
                      kernel_regularizer=regularizers.l2(regul_constant),
                      name='enc1')(noised)
        # The layer name 'embedding' is kept as-is: it is presumably looked up
        # by name in utils.deep_clustering — confirm before renaming.
        enc_2 = Dense(_INNER_SIZE, activation='tanh',
                      kernel_regularizer=regularizers.l2(regul_constant),
                      name='embedding')(enc_1)
        return enc_2

    def decode_plain_input(latent):
        """Expand the latent code back to the input dimensionality."""
        dec_1 = Dense(hidden_size, activation='tanh',
                      kernel_regularizer=regularizers.l2(regul_constant),
                      name='dec1')(latent)
        dec_2 = Dense(input_shape[0], activation='tanh',
                      kernel_regularizer=regularizers.l2(regul_constant),
                      name='dec2')(dec_1)
        return dec_2

    input_pair = Input(shape=input_shape, name='input_pair')
    latent = encode_plain_input(input_pair)
    decoded = decode_plain_input(latent)

    model = Model(inputs=[input_pair],
                  outputs=[decoded])

    return model

## Test autoencoder

In [None]:
# Reset the Keras backend state before building a new model.
K.clear_session()

# Build the autoencoder for the per-sample shape of X and print its layers.
model = noised_ae((X.shape[1:]))
model.summary()

model.compile(optimizer='adadelta', loss='mse')

# Reconstruction training: X is passed both as the input and as the target.
model.fit(x=[X],
          y=[X], epochs=200, batch_size=128)

## Evaluate models

In [56]:
import utils.deep_clustering as deep_clustering
from sklearn.metrics import v_measure_score

def idec_loop(iteration):
    """Train ``deep_clustering.IDEC`` repeatedly and score every run.

    Each run clears the Keras session, builds a fresh IDEC model on top of
    ``noised_ae``, fits it on ``X`` and compares the predicted clusters with
    the gold labels using V-measure.

    Args:
        iteration: Number of independent runs. Zero or a negative number
            performs no run.

    Returns:
        np.ndarray: One V-measure score per run (empty when no run was made).
    """
    save_dir = 'idec'
    result = []

    # range() terminates for every integer; the former `while iteration`
    # countdown never reached zero when given a negative count.
    for _ in range(int(iteration)):
        K.clear_session()

        # Pure-Python replacement for the shell `mkdir`: portable and silent
        # when the directory is already there.
        os.makedirs(save_dir, exist_ok=True)

        idec = deep_clustering.IDEC(input_shape=(X.shape[1:]),
                                    autoencoder_ctor=lambda input_shape: noised_ae(input_shape),  # select model here
                                    n_clusters=_INNER_SIZE,
                                    pretrain_epochs=2,
                                    maxiter=100,
                                    save_dir=save_dir,
                                    log_dir=logPath)

        plot_model(idec._model, to_file=os.path.join(save_dir, 'idec_model.png'), show_shapes=True)
        idec.compile(gamma=.1)
        idec.fit([X], batch_size=512)
        result.append(v_measure_score(y[TARGET].values, idec._y_pred))

    return np.array(result)

def daec_loop(iteration):
    """Train ``deep_clustering.DAEC`` repeatedly and score every run.

    Each run clears the Keras session, builds a fresh DAEC model on top of
    ``noised_ae``, fits it on ``X`` and compares the predicted clusters with
    the gold labels using V-measure.

    Args:
        iteration: Number of independent runs. Zero or a negative number
            performs no run.

    Returns:
        np.ndarray: One V-measure score per run (empty when no run was made).
    """
    # NOTE(review): this is the same directory dkmeans_loop writes to, so the
    # two experiments share their saved files — confirm this is intended.
    save_dir = 'dkmeans'
    result = []

    # range() terminates for every integer; the former `while iteration`
    # countdown never reached zero when given a negative count.
    for _ in range(int(iteration)):
        K.clear_session()

        # Pure-Python replacement for the shell `mkdir`, which printed
        # "File exists" on every run after the first.
        os.makedirs(save_dir, exist_ok=True)

        daec = deep_clustering.DAEC(input_shape=(X.shape[1:]),
                                    autoencoder_ctor=lambda input_shape: noised_ae(input_shape),  # select model here
                                    n_clusters=_INNER_SIZE,
                                    pretrain_epochs=2,
                                    log_dir=logPath,
                                    save_dir=save_dir,
                                    )

        daec.compile(optimizer='adam')
        daec.fit(X)
        result.append(v_measure_score(y[TARGET].values, daec._y_pred))

    return np.array(result)


def dkmeans_loop(iteration):
    """Train ``deep_clustering.DC_Kmeans`` repeatedly and score every run.

    Each run clears the Keras session, builds a fresh DC_Kmeans model on top
    of ``noised_ae``, fits it on ``X`` and compares the predicted clusters
    with the gold labels using V-measure.

    Args:
        iteration: Number of independent runs. Zero or a negative number
            performs no run.

    Returns:
        np.ndarray: One V-measure score per run (empty when no run was made).
    """
    save_dir = 'dkmeans'
    result = []

    # range() terminates for every integer; the former `while iteration`
    # countdown never reached zero when given a negative count.
    for _ in range(int(iteration)):
        K.clear_session()

        # Pure-Python replacement for the shell `mkdir`: portable and silent
        # when the directory is already there.
        os.makedirs(save_dir, exist_ok=True)

        dckmeans = deep_clustering.DC_Kmeans(
                            input_shape=(X.shape[1:]),
                            autoencoder_ctor=lambda input_shape: noised_ae(input_shape),  # select model here
                            n_clusters=_INNER_SIZE,
                            pretrain_epochs=2,
                            max_epochs=10,
                            save_dir=save_dir,
                            log_dir=logPath)

        plot_model(dckmeans._model, to_file=os.path.join(save_dir, 'dckmeans_model.png'), show_shapes=True)
        dckmeans.compile(optimizer='adam')
        dckmeans.fit(X, batch_size=512)
        # NOTE(review): this reads `y_pred`, while idec_loop and daec_loop read
        # `_y_pred` — confirm it is the attribute that fit() updates.
        result.append(v_measure_score(y[TARGET].values, dckmeans.y_pred))

    return np.array(result)

In [None]:
# Five independent IDEC runs; `res` holds one V-measure score per run.
res = idec_loop(5)

In [17]:
# Mean and standard deviation of the scores currently stored in `res`.
# `res` is rebound by every *_loop cell, so run the matching cell first.
res.mean(), res.std()

(0.0315648759780115, 0.0023137329105746085)

In [57]:
# Five independent DAEC runs; `res` holds one V-measure score per run.
res = daec_loop(5)

mkdir: cannot create directory ‘dkmeans’: File exists


2019-12-30 22:58:20,712 [MainThread  ] [INFO ]  Pretraining...
2019-12-30 22:58:31,192 [MainThread  ] [INFO ]  Pretraining time: 10.291308164596558
2019-12-30 22:58:31,349 [MainThread  ] [INFO ]  Pretrained weights are saved to dkmeans/pretrain_cae_model.h5
2019-12-30 22:58:31,351 [MainThread  ] [INFO ]  Initializing cluster centers.
2019-12-30 22:58:31,353 [MainThread  ] [INFO ]  Cluster centers initialized: 7.62939453125e-06
2019-12-30 22:58:31,356 [MainThread  ] [INFO ]  Training model.
2019-12-30 22:58:31,358 [MainThread  ] [INFO ]  Training k-means...
2019-12-30 22:58:33,523 [MainThread  ] [INFO ]  Done.
2019-12-30 22:58:33,523 [MainThread  ] [INFO ]  Training model. Iteration #0.
2019-12-30 22:58:38,861 [MainThread  ] [INFO ]  Training k-means...
2019-12-30 22:58:40,890 [MainThread  ] [INFO ]  Done.
2019-12-30 22:58:40,891 [MainThread  ] [INFO ]  delta_label: 0.9390367437148909
2019-12-30 22:58:40,891 [MainThread  ] [INFO ]  Training model. Iteration #1.
2019-12-30 22:58:45,428 [

mkdir: cannot create directory ‘dkmeans’: File exists


2019-12-30 22:58:52,515 [MainThread  ] [INFO ]  Pretraining...
2019-12-30 22:59:02,952 [MainThread  ] [INFO ]  Pretraining time: 10.370404958724976
2019-12-30 22:59:03,107 [MainThread  ] [INFO ]  Pretrained weights are saved to dkmeans/pretrain_cae_model.h5
2019-12-30 22:59:03,109 [MainThread  ] [INFO ]  Initializing cluster centers.
2019-12-30 22:59:03,111 [MainThread  ] [INFO ]  Cluster centers initialized: 7.152557373046875e-06
2019-12-30 22:59:03,114 [MainThread  ] [INFO ]  Training model.
2019-12-30 22:59:03,116 [MainThread  ] [INFO ]  Training k-means...
2019-12-30 22:59:05,383 [MainThread  ] [INFO ]  Done.
2019-12-30 22:59:05,384 [MainThread  ] [INFO ]  Training model. Iteration #0.
2019-12-30 22:59:10,606 [MainThread  ] [INFO ]  Training k-means...
2019-12-30 22:59:12,600 [MainThread  ] [INFO ]  Done.
2019-12-30 22:59:12,601 [MainThread  ] [INFO ]  delta_label: 0.9472327101943089
2019-12-30 22:59:12,601 [MainThread  ] [INFO ]  Training model. Iteration #1.
2019-12-30 22:59:17,1

mkdir: cannot create directory ‘dkmeans’: File exists


2019-12-30 22:59:24,222 [MainThread  ] [INFO ]  Pretraining...
2019-12-30 22:59:34,517 [MainThread  ] [INFO ]  Pretraining time: 10.22800612449646
2019-12-30 22:59:34,662 [MainThread  ] [INFO ]  Pretrained weights are saved to dkmeans/pretrain_cae_model.h5
2019-12-30 22:59:34,663 [MainThread  ] [INFO ]  Initializing cluster centers.
2019-12-30 22:59:34,663 [MainThread  ] [INFO ]  Cluster centers initialized: 4.5299530029296875e-06
2019-12-30 22:59:34,664 [MainThread  ] [INFO ]  Training model.
2019-12-30 22:59:34,664 [MainThread  ] [INFO ]  Training k-means...
2019-12-30 22:59:36,948 [MainThread  ] [INFO ]  Done.
2019-12-30 22:59:36,948 [MainThread  ] [INFO ]  Training model. Iteration #0.
2019-12-30 22:59:42,354 [MainThread  ] [INFO ]  Training k-means...
2019-12-30 22:59:44,403 [MainThread  ] [INFO ]  Done.
2019-12-30 22:59:44,403 [MainThread  ] [INFO ]  delta_label: 0.9326825674555668
2019-12-30 22:59:44,404 [MainThread  ] [INFO ]  Training model. Iteration #1.
2019-12-30 22:59:48,9

mkdir: cannot create directory ‘dkmeans’: File exists


2019-12-30 22:59:56,065 [MainThread  ] [INFO ]  Pretraining...
2019-12-30 23:00:06,362 [MainThread  ] [INFO ]  Pretraining time: 10.229871273040771
2019-12-30 23:00:06,512 [MainThread  ] [INFO ]  Pretrained weights are saved to dkmeans/pretrain_cae_model.h5
2019-12-30 23:00:06,513 [MainThread  ] [INFO ]  Initializing cluster centers.
2019-12-30 23:00:06,515 [MainThread  ] [INFO ]  Cluster centers initialized: 5.7220458984375e-06
2019-12-30 23:00:06,517 [MainThread  ] [INFO ]  Training model.
2019-12-30 23:00:06,519 [MainThread  ] [INFO ]  Training k-means...
2019-12-30 23:00:08,824 [MainThread  ] [INFO ]  Done.
2019-12-30 23:00:08,825 [MainThread  ] [INFO ]  Training model. Iteration #0.
2019-12-30 23:00:14,212 [MainThread  ] [INFO ]  Training k-means...
2019-12-30 23:00:16,198 [MainThread  ] [INFO ]  Done.
2019-12-30 23:00:16,198 [MainThread  ] [INFO ]  delta_label: 0.9411548024679989
2019-12-30 23:00:16,199 [MainThread  ] [INFO ]  Training model. Iteration #1.
2019-12-30 23:00:20,733

mkdir: cannot create directory ‘dkmeans’: File exists


2019-12-30 23:00:27,737 [MainThread  ] [INFO ]  Pretraining...
2019-12-30 23:00:38,045 [MainThread  ] [INFO ]  Pretraining time: 10.241287231445312
2019-12-30 23:00:38,250 [MainThread  ] [INFO ]  Pretrained weights are saved to dkmeans/pretrain_cae_model.h5
2019-12-30 23:00:38,252 [MainThread  ] [INFO ]  Initializing cluster centers.
2019-12-30 23:00:38,254 [MainThread  ] [INFO ]  Cluster centers initialized: 1.621246337890625e-05
2019-12-30 23:00:38,257 [MainThread  ] [INFO ]  Training model.
2019-12-30 23:00:38,258 [MainThread  ] [INFO ]  Training k-means...
2019-12-30 23:00:40,475 [MainThread  ] [INFO ]  Done.
2019-12-30 23:00:40,475 [MainThread  ] [INFO ]  Training model. Iteration #0.
2019-12-30 23:00:45,849 [MainThread  ] [INFO ]  Training k-means...
2019-12-30 23:00:47,852 [MainThread  ] [INFO ]  Done.
2019-12-30 23:00:47,853 [MainThread  ] [INFO ]  delta_label: 0.9382079381158486
2019-12-30 23:00:47,854 [MainThread  ] [INFO ]  Training model. Iteration #1.
2019-12-30 23:00:52,4

In [59]:
# Mean and standard deviation of the scores currently stored in `res`.
# `res` is rebound by every *_loop cell, so run the matching cell first.
res.mean(), res.std()

(0.009477918789969031, 0.00021907955693088523)

In [None]:
# Five independent DC_Kmeans runs; `res` holds one V-measure score per run.
res = dkmeans_loop(5)

In [46]:
# Mean and standard deviation of the scores currently stored in `res`.
# `res` is rebound by every *_loop cell, so run the matching cell first.
res.mean(), res.std()

(-1.742625201269691e-16, 0.0)

In [24]:
def random_unif_loop(iteration, seed=None):
    """Score uniformly random cluster assignments as a baseline.

    Args:
        iteration: Number of random assignments to draw and score.
        seed: Optional seed. When given, a dedicated ``RandomState`` makes the
            draws reproducible; when ``None`` (default) the global NumPy
            random state is used, exactly as before.

    Returns:
        np.ndarray: One V-measure score per draw (empty for ``iteration <= 0``).
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    result = []

    for _ in range(iteration):
        # NOTE(review): randint's upper bound is exclusive, so labels span
        # 0.._INNER_SIZE, i.e. one more label than the models' `n_clusters` —
        # confirm the extra cluster is intended.
        predicted = rng.randint(0, _INNER_SIZE+1, size=df.shape[0])
        result.append(v_measure_score(y[TARGET].values, predicted))

    return np.array(result)

def random_exp_loop(iteration, seed=None):
    """Score exponentially distributed random cluster assignments as a baseline.

    Samples are drawn from an exponential distribution, rescaled so the
    largest sample equals 15 and truncated to integers, which concentrates
    most points in the low-numbered labels.

    Args:
        iteration: Number of random assignments to draw and score.
        seed: Optional seed. When given, a dedicated ``RandomState`` makes the
            draws reproducible; when ``None`` (default) the global NumPy
            random state is used, exactly as before.

    Returns:
        np.ndarray: One V-measure score per draw (empty for ``iteration <= 0``).
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    result = []

    for _ in range(iteration):
        predicted = rng.exponential(scale=0.4, size=df.shape[0])
        # NOTE(review): 15 is hard-coded and presumably mirrors _INNER_SIZE —
        # confirm, and keep the two in sync if the class set changes.
        predicted = predicted/predicted.max()*15.
        predicted = predicted.astype(int)
        result.append(v_measure_score(y[TARGET].values, predicted))

    return np.array(result)

In [36]:
# Baseline: V-measure of 50 uniformly random labelings, summarised as
# (mean, standard deviation).
random_res = random_unif_loop(50)
random_res.mean(), random_res.std()

(0.003925391325476878, 0.00036319401489300405)