# Initialization

In [1]:
%load_ext autoreload
%autoreload 2

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys
sys.path.append('./pylingtools/src/')
sys.path.append('./pyexling/src/')
sys.path.append('./syntaxnet_wrapper/src/')

In [2]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

get_available_gpus()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


['/device:GPU:0']

In [3]:
import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

from tensorflow.python.keras import backend as K
K.set_session(sess)

In [4]:
import logging

logPath = '../logs/'
! mkdir $logPath
fileName = 'main.log'
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")

logger = logging.getLogger()
if logger.hasHandlers():
    logger.handlers.clear()

fileHandler = logging.FileHandler(os.path.join(logPath, fileName))
fileHandler.setFormatter(logFormatter)
logger.addHandler(fileHandler)

consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
logger.addHandler(consoleHandler)

logger.setLevel(logging.INFO)

mkdir: cannot create directory ‘../logs/’: File exists


In [5]:
import sklearn
import numpy as np
import pandas as pd
import json
import pickle
from tqdm import tqdm_notebook as tqdm
from joblib import Parallel, delayed

# Load data (EDUs only)

In [6]:
text_html_map = {
    r'\n': r' ',
    r'&gt;': r'>',
    r'&lt;': r'<',
    r'&amp;': r'&',
    r'&quot;': r'"',
    r'&ndash;': r'–',
    r'##### ': r'',
    r'\\\\\\\\': r'\\',
    r'  ': r' ',
    r'——': r'-',
    r'—': r'-',
    r'/': r'',
    r'\^': r'',
    r'^': r'',
    r'±': r'+',
    r'y': r'у',
    r'x': r'х'
}

def read_edus(filename):
    edus = []
    with open(filename + '.edus', 'r') as f:
        for line in f.readlines():
            edu = str(line.strip())
            for key, value in text_html_map.items():
                edu = edu.replace(key, value)
            edus.append(edu)
    return edus

def read_gold(filename):
    df = pd.read_pickle(filename + '.gold.pkl')
    for key in text_html_map.keys():
        df['snippet_x'].replace(key, text_html_map[key], regex=True, inplace=True)
        df['snippet_y'].replace(key, text_html_map[key], regex=True, inplace=True)

    return df

def read_json(filename):
    df = pd.read_json(filename + '.json')
    for key in text_html_map.keys():
        df['snippet_x'].replace(key, text_html_map[key], regex=True, inplace=True)
        df['snippet_y'].replace(key, text_html_map[key], regex=True, inplace=True)

    return df

def read_annotation(filename):
    annot = pd.read_pickle(filename + '.annot.pkl')
    for key in text_html_map.keys():
        annot['text'] = annot['text'].replace(key, text_html_map[key])
        for token in annot['tokens']:
            token.text = token.text.replace(key, text_html_map[key])
    
    return annot

In [24]:
import pandas as pd
import glob
from tqdm import tqdm_notebook as tqdm

TARGET = 'category_id'

df = []
for file in tqdm(glob.glob('data/*.edus')):
    filename = file.replace('.edus', '')
    edus = read_edus(filename)
    gold = read_gold(filename)
    
    def label_edu(discourse_unit):
        return discourse_unit in edus
    
    gold['edu_x'] = gold.snippet_x.map(label_edu)
    gold['edu_y'] = gold.snippet_y.map(label_edu)
    gold['edu_pair'] = gold['edu_x'] & gold['edu_y']
    gold = gold[gold.edu_pair]
    gold = gold.drop(columns=['edu_x', 'edu_y', 'edu_pair'])
    df.append(gold)
    
df = pd.concat(df)
df = df.drop_duplicates(['snippet_x', 'snippet_y', TARGET])
df = df[df['snippet_x'].map(len) > 0]
df = df[df['snippet_y'].map(len) > 0]

TARGET = 'category_id'

df[TARGET] = df[TARGET].replace(['cause-effect_r', 'effect_r'], 'cause_r')
df[TARGET] = df[TARGET].replace(['interpretation-evaluation_r', 'conclusion_r'], 'evaluation_r')

y_stat = df[TARGET].value_counts()
drop_ys = y_stat[y_stat < 100].index #+ ['elaboration_r', 'joint_m', 'same-unit_m']

for dy in drop_ys:
    df = df[df[TARGET] != dy]

HBox(children=(IntProgress(value=0, max=178), HTML(value='')))




In [25]:
df.category_id.value_counts()

elaboration_r    1491
joint_m          1376
cause_r           604
condition_r       519
purpose_r         476
attribution_r     328
contrast_m        196
evidence_r        124
comparison_m      110
restatement_m     108
Name: category_id, dtype: int64

In [28]:
y, X = df[TARGET].to_frame(), df.drop(TARGET, axis=1).drop(columns=['snippet_x', 'snippet_y', 'snippet_x_tmp', 'snippet_y_tmp', 'filename', 'order', 'postags_x', 'postags_y'])

In [31]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X = scaler.fit_transform(X)

  return self.partial_fit(X, y)


# Models

In [32]:
%load_ext cython

from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.layers import LSTM, GRU, Dense
from tensorflow.python.keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Conv2DTranspose
from tensorflow.python.keras.layers import Dropout, UpSampling2D
from tensorflow.python.keras.layers import Concatenate
from tensorflow.python.keras.layers import Masking
from tensorflow.python.keras.layers import Reshape
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers import Input, Layer
from tensorflow.python.keras.layers import Lambda
from tensorflow.python.keras.layers import GlobalMaxPooling1D
from tensorflow.python.keras.layers import RepeatVector
from tensorflow.python.keras.layers import Activation
from tensorflow.python.keras.layers import Permute, Add
from tensorflow.python.keras.layers import concatenate
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.models import model_from_json
from tensorflow.python.keras import regularizers
from tensorflow.python.keras.callbacks import Callback
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.layers import GaussianNoise
from tensorflow.python.keras.layers import UpSampling1D

from copy import deepcopy
from sklearn.metrics import f1_score
from tensorboardX import SummaryWriter

import math
from time import time

from sklearn.cluster import KMeans

from tensorflow.python.keras.layers import Conv2D, Conv2DTranspose, Flatten, Reshape, Layer, InputSpec
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.utils.vis_utils import plot_model
from datetime import datetime

from tensorflow.python.keras.callbacks import TensorBoard

The cython extension is already loaded. To reload it, use:
  %reload_ext cython


In [33]:
_INNER_SIZE = len(df.category_id.unique())

In [34]:
def noised_ae(input_shape):
    std_dev = 1e-1
    regul_constant = 1e-2
    
    def encode_plain_input(input_layer):
        input_layer = GaussianNoise(std_dev)(input_layer)
        enc_1 = Dense(input_shape[0] // 4, activation='tanh',
                      kernel_regularizer=regularizers.l2(regul_constant),
                      name='enc1')(input_layer)
        enc_2 = Dense(_INNER_SIZE, activation='tanh',
                      kernel_regularizer=regularizers.l2(regul_constant),
                      name='embedding')(enc_1)
        return enc_2
    
    def decode_plain_input(latent):
        dec_1 = Dense(input_shape[0] // 4, activation='tanh',
                      kernel_regularizer=regularizers.l2(regul_constant),
                      name='dec1')(latent)
        dec_2 = Dense(input_shape[0], activation='tanh',
                      kernel_regularizer=regularizers.l2(regul_constant),
                      name='dec2')(dec_1)
        return dec_2
        
    
    input_pair = Input(shape=input_shape, name='input_pair')
    latent = encode_plain_input(input_pair)
    decoded = decode_plain_input(latent)
    
    model = Model(inputs=[input_pair], 
                  outputs=[decoded])

    return model

## Test autoencoder

In [149]:
K.clear_session()

model = noised_ae((X.shape[1:]))
model.summary()

model.compile(optimizer='adadelta', loss='mse')

model.fit(x=[X],
          y=[X], epochs=200, batch_size=128)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_pair (InputLayer)      (None, 2590)              0         
_________________________________________________________________
gaussian_noise (GaussianNois (None, 2590)              0         
_________________________________________________________________
enc1 (Dense)                 (None, 647)               1676377   
_________________________________________________________________
enc2 (Dense)                 (None, 10)                6480      
_________________________________________________________________
dec1 (Dense)                 (None, 647)               7117      
_________________________________________________________________
dec2 (Dense)                 (None, 2590)              1678320   
Total params: 3,368,294
Trainable params: 3,368,294
Non-trainable params: 0
_________________________________________________________________


KeyboardInterrupt: 

## Train IDEC

In [35]:
import utils.deep_clustering as deep_clustering
from sklearn.metrics import v_measure_score

def loop(iteration):
    result = []
    
    while iteration:
        K.clear_session()

        save_dir = 'idec'
        ! mkdir $save_dir

        idec = deep_clustering.IDEC(input_shape=(X.shape[1:]),
                                    autoencoder_ctor=lambda input_shape: noised_ae(input_shape),#restore_rel(input_shape),  # select model here
                                    n_clusters=_INNER_SIZE,
                                    pretrain_epochs=2,
                                    maxiter=100,
                                    save_dir=save_dir, 
                                    log_dir=logPath)

        plot_model(idec._model, to_file=os.path.join(save_dir, 'idec_model.png'), show_shapes=True)
        idec.compile(gamma=.1)
        idec.fit([X], batch_size=512)
        result.append(v_measure_score(y[TARGET].values, idec._y_pred))
        iteration -= 1
        
    return np.array(result)

In [36]:
res = loop(5)

mkdir: cannot create directory ‘idec’: File exists


2019-12-17 12:40:09,007 [MainThread  ] [INFO ]  Initialized tolerance = 0.05.
2019-12-17 12:40:09,193 [MainThread  ] [INFO ]  Pretraining...
2019-12-17 12:40:11,343 [MainThread  ] [INFO ]  Pretraining time: 2.082761764526367
2019-12-17 12:40:11,550 [MainThread  ] [INFO ]  Pretrained weights are saved to idec/pretrain_cae_model.h5
2019-12-17 12:40:11,551 [MainThread  ] [INFO ]  Initializing cluster centers.
2019-12-17 12:40:12,621 [MainThread  ] [INFO ]  Cluster centers initialized: 1.0693936347961426
2019-12-17 12:40:12,622 [MainThread  ] [INFO ]  Training model.
2019-12-17 12:40:12,622 [MainThread  ] [INFO ]  Update interval 140
2019-12-17 12:40:12,623 [MainThread  ] [INFO ]  Save interval 52.0703125
2019-12-17 12:40:12,623 [MainThread  ] [INFO ]  Training model. Iteration #0.
2019-12-17 12:40:13,738 [MainThread  ] [INFO ]  saving model to: idec/dcec_model_0.h5
2019-12-17 12:40:15,193 [MainThread  ] [INFO ]  Done. 2.5710442066192627
2019-12-17 12:40:15,194 [MainThread  ] [INFO ]  Savi

mkdir: cannot create directory ‘idec’: File exists


2019-12-17 12:40:15,574 [MainThread  ] [INFO ]  Initialized tolerance = 0.05.
2019-12-17 12:40:15,775 [MainThread  ] [INFO ]  Pretraining...
2019-12-17 12:40:18,035 [MainThread  ] [INFO ]  Pretraining time: 2.188121795654297
2019-12-17 12:40:18,240 [MainThread  ] [INFO ]  Pretrained weights are saved to idec/pretrain_cae_model.h5
2019-12-17 12:40:18,241 [MainThread  ] [INFO ]  Initializing cluster centers.
2019-12-17 12:40:19,462 [MainThread  ] [INFO ]  Cluster centers initialized: 1.2207067012786865
2019-12-17 12:40:19,463 [MainThread  ] [INFO ]  Training model.
2019-12-17 12:40:19,463 [MainThread  ] [INFO ]  Update interval 140
2019-12-17 12:40:19,464 [MainThread  ] [INFO ]  Save interval 52.0703125
2019-12-17 12:40:19,464 [MainThread  ] [INFO ]  Training model. Iteration #0.
2019-12-17 12:40:20,559 [MainThread  ] [INFO ]  saving model to: idec/dcec_model_0.h5
2019-12-17 12:40:22,025 [MainThread  ] [INFO ]  Done. 2.561309576034546
2019-12-17 12:40:22,025 [MainThread  ] [INFO ]  Savin

mkdir: cannot create directory ‘idec’: File exists


2019-12-17 12:40:22,408 [MainThread  ] [INFO ]  Initialized tolerance = 0.05.
2019-12-17 12:40:22,604 [MainThread  ] [INFO ]  Pretraining...
2019-12-17 12:40:24,931 [MainThread  ] [INFO ]  Pretraining time: 2.120739698410034
2019-12-17 12:40:25,128 [MainThread  ] [INFO ]  Pretrained weights are saved to idec/pretrain_cae_model.h5
2019-12-17 12:40:25,129 [MainThread  ] [INFO ]  Initializing cluster centers.
2019-12-17 12:40:26,138 [MainThread  ] [INFO ]  Cluster centers initialized: 1.0084447860717773
2019-12-17 12:40:26,139 [MainThread  ] [INFO ]  Training model.
2019-12-17 12:40:26,139 [MainThread  ] [INFO ]  Update interval 140
2019-12-17 12:40:26,139 [MainThread  ] [INFO ]  Save interval 52.0703125
2019-12-17 12:40:26,140 [MainThread  ] [INFO ]  Training model. Iteration #0.
2019-12-17 12:40:27,205 [MainThread  ] [INFO ]  saving model to: idec/dcec_model_0.h5
2019-12-17 12:40:28,698 [MainThread  ] [INFO ]  Done. 2.559394121170044
2019-12-17 12:40:28,699 [MainThread  ] [INFO ]  Savin

mkdir: cannot create directory ‘idec’: File exists


2019-12-17 12:40:29,678 [MainThread  ] [INFO ]  Initialized tolerance = 0.05.
2019-12-17 12:40:29,872 [MainThread  ] [INFO ]  Pretraining...
2019-12-17 12:40:32,113 [MainThread  ] [INFO ]  Pretraining time: 2.170664072036743
2019-12-17 12:40:32,321 [MainThread  ] [INFO ]  Pretrained weights are saved to idec/pretrain_cae_model.h5
2019-12-17 12:40:32,322 [MainThread  ] [INFO ]  Initializing cluster centers.
2019-12-17 12:40:33,372 [MainThread  ] [INFO ]  Cluster centers initialized: 1.049647331237793
2019-12-17 12:40:33,373 [MainThread  ] [INFO ]  Training model.
2019-12-17 12:40:33,373 [MainThread  ] [INFO ]  Update interval 140
2019-12-17 12:40:33,374 [MainThread  ] [INFO ]  Save interval 52.0703125
2019-12-17 12:40:33,374 [MainThread  ] [INFO ]  Training model. Iteration #0.
2019-12-17 12:40:34,550 [MainThread  ] [INFO ]  saving model to: idec/dcec_model_0.h5
2019-12-17 12:40:36,010 [MainThread  ] [INFO ]  Done. 2.636526107788086
2019-12-17 12:40:36,010 [MainThread  ] [INFO ]  Saving

mkdir: cannot create directory ‘idec’: File exists


2019-12-17 12:40:36,396 [MainThread  ] [INFO ]  Initialized tolerance = 0.05.
2019-12-17 12:40:36,592 [MainThread  ] [INFO ]  Pretraining...
2019-12-17 12:40:38,757 [MainThread  ] [INFO ]  Pretraining time: 2.0952465534210205
2019-12-17 12:40:38,948 [MainThread  ] [INFO ]  Pretrained weights are saved to idec/pretrain_cae_model.h5
2019-12-17 12:40:38,948 [MainThread  ] [INFO ]  Initializing cluster centers.
2019-12-17 12:40:39,915 [MainThread  ] [INFO ]  Cluster centers initialized: 0.9664773941040039
2019-12-17 12:40:39,916 [MainThread  ] [INFO ]  Training model.
2019-12-17 12:40:39,917 [MainThread  ] [INFO ]  Update interval 140
2019-12-17 12:40:39,917 [MainThread  ] [INFO ]  Save interval 52.0703125
2019-12-17 12:40:39,918 [MainThread  ] [INFO ]  Training model. Iteration #0.
2019-12-17 12:40:41,207 [MainThread  ] [INFO ]  saving model to: idec/dcec_model_0.h5
2019-12-17 12:40:42,550 [MainThread  ] [INFO ]  Done. 2.6334660053253174
2019-12-17 12:40:42,551 [MainThread  ] [INFO ]  Sav

In [38]:
res.mean(), res.std()

(0.013608557674555419, 0.004258955299045326)

In [39]:
K.clear_session()

save_dir = 'idec'
! mkdir $save_dir

idec = deep_clustering.IDEC(input_shape=(X.shape[1:]),
                            autoencoder_ctor=lambda input_shape: noised_ae(input_shape),#restore_rel(input_shape),  # select model here
                            n_clusters=_INNER_SIZE,
                            pretrain_epochs=2,
                            maxiter=100,
                            save_dir=save_dir, 
                            log_dir=logPath)

plot_model(idec._model, to_file=os.path.join(save_dir, 'idec_model.png'), show_shapes=True)
idec.compile(gamma=.1)
idec.fit([X], batch_size=512)

mkdir: cannot create directory ‘idec’: File exists


2019-12-17 12:42:04,545 [MainThread  ] [INFO ]  Initialized tolerance = 0.05.
2019-12-17 12:42:04,816 [MainThread  ] [INFO ]  Pretraining...
2019-12-17 12:42:08,189 [MainThread  ] [INFO ]  Pretraining time: 3.2700369358062744
2019-12-17 12:42:08,469 [MainThread  ] [INFO ]  Pretrained weights are saved to idec/pretrain_cae_model.h5
2019-12-17 12:42:08,471 [MainThread  ] [INFO ]  Initializing cluster centers.
2019-12-17 12:42:09,748 [MainThread  ] [INFO ]  Cluster centers initialized: 1.2767457962036133
2019-12-17 12:42:09,749 [MainThread  ] [INFO ]  Training model.
2019-12-17 12:42:09,749 [MainThread  ] [INFO ]  Update interval 140
2019-12-17 12:42:09,750 [MainThread  ] [INFO ]  Save interval 52.0703125
2019-12-17 12:42:09,750 [MainThread  ] [INFO ]  Training model. Iteration #0.
2019-12-17 12:42:11,465 [MainThread  ] [INFO ]  saving model to: idec/dcec_model_0.h5
2019-12-17 12:42:13,456 [MainThread  ] [INFO ]  Done. 3.7070164680480957
2019-12-17 12:42:13,457 [MainThread  ] [INFO ]  Sav

In [40]:
v_measure_score(y[TARGET].values, idec._y_pred)

0.012207883152652052

In [41]:
df.pred = idec._y_pred

  """Entry point for launching an IPython kernel.


In [42]:
df[df.pred==9][TARGET].value_counts()

elaboration_r    180
joint_m          167
purpose_r         56
condition_r       53
cause_r           41
attribution_r     32
contrast_m        15
comparison_m      10
restatement_m     10
evidence_r         8
Name: category_id, dtype: int64