# EMNLP-IJCNLP 19 paper #806
#### Feature-Dependent Confusion Matrices for Low-Resource NER Labeling with Noisy Labels
#### Lukas Lange, Michael A. Hedderich, Dietrich Klakow

In [None]:
import sys
import logging
from collections import Counter

import numpy as np

import tensorflow as tf

import matplotlib

import keras
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Bidirectional
from keras.layers.merge import concatenate, add

from layers import *
from ner_datacode import DataCreation, WordEmbedding, WordCluster, LabelRepresentation, Evaluation
from experimentalsettings import ExperimentalSettings
from noisematrix import NoiseMatrix

In [None]:
# Load the experiment configuration named 'base' (presumably one of the example
# configurations in the 'config' directory -- confirm in ExperimentalSettings).
# SETTINGS is indexed by option name everywhere below, e.g. SETTINGS["EPOCHS"].
SETTINGS = ExperimentalSettings.load_json('base')
# Lower the root logger's threshold to DEBUG so the logging.info() progress
# messages emitted during training are not filtered out by the level check.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Settings

The settings are read from the given settings file. The 'config' directory already contains several example configurations. The following options can be set in the file:

### Data Settings
*   "NAME": "string"
*   "PATH_TRAIN_CLEAN": "str"    # e.g. data/eng.train
*   "PATH_TRAIN_NOISY": "str"    # e.g. data/eng.autom_labeled.train
*   "PATH_DEV": "str"            # e.g. data/eng.testa
*   "PATH_TEST": "str"           # e.g. data/eng.testb
*   "DATA_SEPARATOR": "char"     # column separator; most often space or tab
*   "WORD_EMBEDDING": "str"      # e.g. data/fasttext/cc.en.300.bin
*   "LABEL_FORMAT": "io"         # either "io" or "bio"

### Model Settings
*   "CONTEXT_LENGTH": int        # context size of #words to the left and right
*   "LSTM_SIZE": int             # e.g. 300
*   "DENSE_SIZE": int            # e.g. 100
*   "DENSE_ACTIVATION": "str"    # e.g. relu
*   "USE_CLEAN": bool            # either true or false
*   "USE_NOISY": bool            # either true or false
*   "NOISE_METHOD": "str"        # noise method; either "channel","cleaning" or "dynamic"
*   "USE_IDENTITY_MATRIX": bool  # either true or false. This is only used for channel models. 
*   "USE_WORD_CLUSTER": "str"    # either "brown", "kmeans" or "none"
*   "PATH_WORD_CLUSTER": "str"   # e.g. data/word_cluster/en_brown_25. This is only used for Brown Clustering. 
*   "NUM_WORD_CLUSTER": int      # e.g. 25. This is only used for kMeans Clustering. 
*   "CLEANING_DENSE_SIZE": int   # e.g. 30. This is only used for Noise Cleaning. 

### Training Settings
*   "SAMPLE_PCT_CLEAN": float    # Sample percentage of clean data; e.g. 0.01 for 1%
*   "SAMPLE_PCT_NOISY": float    # Sample percentage of noisy data; e.g. 0.01 for 1%
*   "EPOCHS": int                # e.g. 50
*   "BATCH_SIZE": int            # e.g. 100
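*   "WORD_CLUSTER_SELECTION": float  # e.g. 1.0; fraction of the most frequent word clusters that get their own noise matrix (values < 1.0 enable the *-Freq variant)
*   "WORD_CLUSTER_INTERPOLATION": float # e.g. 0.0; weight of the global noise matrix when interpolating with the cluster matrices (values > 0 enable the *-IP variant)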
*   "SAMPLE_SEED": int
*   "TRAINING_SEED": int

# Brown Cluster
For our experiments we used the Brown clustering implementation from https://github.com/percyliang/brown-cluster
Set the PATH_WORD_CLUSTER setting to the path of the resulting cluster file.
For Brown clustering, the number of clusters has to be specified via the -c argument of that tool.
The NUM_WORD_CLUSTER setting is only used for kMeans clustering.

# Data Loading and Preprocessing

In [None]:
# Loading of fastText word embeddings
word_embedding = WordEmbedding()
word_embedding.load_fasttext(SETTINGS["WORD_EMBEDDING"])

# LabelRepresentation used for training: either BIO or IO, depending on the
# LABEL_FORMAT setting. Dev/test data always use a BIO representation, so a
# separate BIO instance is created when training uses IO labels.
label_representation = LabelRepresentation()
if SETTINGS["LABEL_FORMAT"] == "bio":
    label_representation.use_connl_bio_labels()
    test_label_representation = label_representation
elif SETTINGS["LABEL_FORMAT"] == "io":
    label_representation.use_connl_io_labels()
    test_label_representation = LabelRepresentation()
    test_label_representation.use_connl_bio_labels()
else:
    # Fail fast: without this branch an unknown format would leave
    # test_label_representation undefined and only surface as a NameError
    # later, when the dev/test data are loaded.
    raise ValueError(
        f'Unsupported LABEL_FORMAT: {SETTINGS["LABEL_FORMAT"]!r} (expected "bio" or "io")'
    )

In [None]:
def load_and_preprocess_data(path_to_data, label_representation, remove_label_prefix):
    """Load a CoNLL-style dataset and convert it into model inputs and targets.

    Args:
        path_to_data: Path of the dataset file to load.
        label_representation: LabelRepresentation used to vectorize the labels.
        remove_label_prefix: Passed through to DataCreation.load_connl_dataset
            (presumably strips the B-/I- prefix for the IO format -- confirm
            in ner_datacode).

    Returns:
        A tuple (instances, x, y): the loaded instances, their word-embedding
        vectors and their label vectors.
    """
    loader = DataCreation(input_separator=SETTINGS["DATA_SEPARATOR"])
    instances = loader.load_connl_dataset(
        path_to_data,
        SETTINGS["CONTEXT_LENGTH"],
        remove_label_prefix,
    )

    # Words -> embedding vectors (uses the module-level word_embedding).
    word_embedding.embed_instances(instances)
    features = word_embedding.instances_to_vectors(instances)

    # BIO/IO labels -> label vectors.
    label_representation.embed_instances(instances)
    targets = label_representation.instances_to_vectors(instances)

    return instances, features, targets

# Label prefixes are only removed when training with the IO label format.
remove_label_prefix = SETTINGS["LABEL_FORMAT"] == "io"
# Clean and noisy training data share the training label representation.
train_clean, train_clean_x, train_clean_y = load_and_preprocess_data(SETTINGS["PATH_TRAIN_CLEAN"], 
                                                                     label_representation, remove_label_prefix)
train_noisy, train_noisy_x, train_noisy_y = load_and_preprocess_data(SETTINGS["PATH_TRAIN_NOISY"], label_representation, remove_label_prefix)
dev, dev_x, dev_y = load_and_preprocess_data(SETTINGS["PATH_DEV"], test_label_representation, False) # we always test on the BIO scheme
test, test_x, test_y = load_and_preprocess_data(SETTINGS["PATH_TEST"], test_label_representation, False)

# All datasets are embedded at this point, so the reference to the loaded
# embedding model is dropped (presumably to free memory). Other attributes of
# word_embedding, e.g. embedding_vector_size, are still used when building models.
del word_embedding.embedding_model

In [None]:
# Load word clusters
cluster_method = SETTINGS["USE_WORD_CLUSTER"].lower()
if cluster_method in ('brown', 'kmeans'):
    word_cluster = WordCluster()
    if cluster_method == 'brown':
        word_cluster.load_brown_cluster(SETTINGS["PATH_WORD_CLUSTER"])
    else:
        # k-means: cluster on the embedding at index CONTEXT_LENGTH of every
        # clean training window (the middle position of the context window).
        emb_values = np.array([window[SETTINGS["CONTEXT_LENGTH"]] for window in train_clean_x])
        word_cluster.load_kmeans_cluster(train_clean, emb_values, SETTINGS["NUM_WORD_CLUSTER"])

    def add_cluster_information(instances, word_cluster):
        """Let word_cluster process the instances and return their clusterID values as a list."""
        word_cluster.get_cluster(instances)
        return [token.clusterID for token in instances]

    train_clean_c = add_cluster_information(train_clean, word_cluster)
    train_noisy_c = add_cluster_information(train_noisy, word_cluster)

else:
    # No clustering configured: every instance gets the placeholder cluster -1.
    train_clean_c = [-1] * len(train_clean)
    train_noisy_c = [-1] * len(train_noisy)

In [None]:
def create_subset(xs, ys, cs, size, random_state, sequential):
    """ Creates a subset of the data.

    The selected items keep their original relative order, and the same
    indices are used for xs, ys and cs.

    Args:
        xs: Sequence of the input data (x, word embeddings)
        ys: Sequence of the labels (vector form)
        cs: Sequence of the clusters
        size: Size of the subset. Samples at least 1 item if size < 1
        random_state: The subset is randomly sampled, instance of np.random.RandomState
        sequential: If True, a random start point is picked and then a sequence of instances/words
                    is picked. If False, instances are picked randomly (without replacement).

    Returns:
        subsets of corresponding xs, ys and cs as numpy arrays

    """
    assert len(xs) == len(ys) == len(cs)
    assert len(xs) >= size
    # Sorting keeps the original data order and lets us index directly instead
    # of scanning the whole dataset with a membership test per element.
    selected = sorted(_get_sample_indicies(len(xs), max(size, 1), random_state, sequential))
    xs_sub = np.array([xs[i] for i in selected])
    ys_sub = np.array([ys[i] for i in selected])
    cs_sub = np.array([cs[i] for i in selected])
    return xs_sub, ys_sub, cs_sub

def _get_sample_indicies(num_items, num_samples, random_state, sequential):
    '''Returns the indicies that should be sampled.

    Args:
        num_items: integer value representing the pool size
        num_samples: integer value representing the number of items to be sampled
        random_state: numpy random state that should be used for random processes
        sequential: boolean value indicating whether the items should sampled sequentially or completely random

    Returns:
        A set of exactly num_samples distinct indices in [0, num_items)
    '''
    assert num_items >= num_samples
    if num_items == num_samples:
        # Everything is selected; no random numbers are consumed.
        return set(range(num_items))
    if sequential:
        start_number = random_state.randint(0, num_items)
        # A run that would cross the end of the pool wraps around to the start.
        indicies = [(start_number + offset) % num_items for offset in range(num_samples)]
    else:
        # Draw without replacement over the full index range. Drawing with
        # randint(0, num_items-1, ...) could never pick the last index and
        # produced duplicates, so fewer than num_samples items were returned.
        indicies = random_state.choice(num_items, size=num_samples, replace=False)
    indicies = {int(index) for index in indicies}
    assert len(indicies) == num_samples
    return indicies

In [None]:
def compute_noise_matrix(clean_ys, noisy_ys):
    """
    Estimates a noise (confusion) matrix from paired clean and noisy labels.

    The label index of every vector is taken as its argmax; co-occurrences of
    (clean index, noisy index) are counted and each row is normalized to sum
    to one. Rows of clean labels that never occur stay all-zero.

    Args:
        clean_ys: Tensor or list of label vectors of the clean labels
        noisy_ys: Tensor or list of label vectors of the noisy labels

    Returns:
        A noise matrix of size num_labels x num_labels. Each row represents
        p(y_noisy| y_clean=i) for a specific clean label i
        (Formula 4 in the paper, without the log)
    """
    num_labels = label_representation.get_num_labels()
    assert num_labels == len(clean_ys[0]), f'Expected {num_labels} labels, but got: {len(clean_ys[0])}'
    assert len(clean_ys) == len(noisy_ys)

    clean_indices = [int(np.argmax(vector)) for vector in clean_ys]
    noisy_indices = [int(np.argmax(vector)) for vector in noisy_ys]

    # Unbuffered accumulation so that repeated (clean, noisy) pairs all count.
    counts = np.zeros((num_labels, num_labels))
    np.add.at(counts, (clean_indices, noisy_indices), 1)

    # Row-wise normalization; all-zero rows are left untouched.
    row_totals = counts.sum(axis=1, keepdims=True)
    np.divide(counts, row_totals, out=counts, where=row_totals != 0)

    return counts

# Noise Model Architectures

Implementations of different noise model architectures

In [None]:
def create_base_model():
    """
    Builds and compiles the base NER classifier.

    Architecture: input window of word embeddings -> Bi-LSTM -> Dense layer ->
    softmax Dense layer with one output per label.

    Returns:
        A tuple (model, input layer, Bi-LSTM output, softmax output); the
        layers are returned so that noise models can be attached to them.
    """
    window_length = SETTINGS["CONTEXT_LENGTH"]*2+1
    text_input = Input(shape=(window_length, word_embedding.embedding_vector_size, ), name="input_text")

    encoder = Bidirectional(LSTM(SETTINGS["LSTM_SIZE"]), merge_mode='concat', name="bilstm")
    encoded = encoder(text_input)

    hidden = Dense(SETTINGS["DENSE_SIZE"], activation=SETTINGS["DENSE_ACTIVATION"], name="dense")(encoded)
    label_probabilities = Dense(label_representation.get_num_labels(),
                                activation='softmax', name="softmax_out")(hidden)

    base = Model(inputs=[text_input], outputs=label_probabilities)
    base.compile(loss=keras.losses.categorical_crossentropy, optimizer="nadam", metrics=['accuracy'])

    return base, text_input, encoded, label_probabilities

def create_global_noise_layer_model(feature_input_layer, softmax_output_layer, channel_weights):
    """
    Builds a noise-layer model that applies one confusion matrix to the base output.

    "Global-CM" in the paper. This is an implementation of
    Hedderich & Klakow: Training a Neural Network in a Low-Resource Setting on
    Automatically Annotated Noisy Data, 2018

    Also used for the feature-dependent noise models ("Brown-CM" and "K-Means-CM"),
    where one such model is created per word cluster.

    Args:
        feature_input_layer: Input layer of the base model.
        softmax_output_layer: Softmax output of the base model.
        channel_weights: Initial weights of the NoiseMatrixLayer.

    Returns:
        A tuple (compiled model, noise layer).
    """
    channel_layer = NoiseMatrixLayer(name='noisy-channel', weights=[channel_weights])
    noisy_output = channel_layer(softmax_output_layer)

    channel_model = Model(inputs=[feature_input_layer], outputs=noisy_output)
    channel_model.compile(
        loss=keras.losses.categorical_crossentropy,
        optimizer="nadam",
        metrics=['accuracy'],
    )

    return channel_model, channel_layer

def create_noise_cleaning_model(feature_input_layer, bi_lstm_layer):
    """
    Builds a noise-cleaning model that maps a noisy label to a cleaned label.

    "Cleaning" in the paper. This is an implementation of
    Veit & al.:  Learning  from  noisy  large-scale  datasets  with  minimal supervision, 2017
    adapted to this NER setting.

    The noisy label vector is concatenated with a lower-dimensional projection
    of the Bi-LSTM features, passed through a Dense layer, added back onto the
    noisy label (identity skip connection) and finally passed through a
    ClipZeroOneLayer.

    Args:
        feature_input_layer: Input layer of the base model.
        bi_lstm_layer: Bi-LSTM output of the base model (feature representation).

    Returns:
        The compiled cleaning model with inputs [text features, noisy label].
    """
    num_labels = label_representation.get_num_labels()

    noisy_label_input = Input(shape=(num_labels,), name="noisy_label_input")
    projected_features = Dense(SETTINGS["CLEANING_DENSE_SIZE"], name="dense_feature_rep")(bi_lstm_layer)
    cleaning_input = concatenate([noisy_label_input, projected_features])

    correction = Dense(num_labels, name="dense_noisy_cleaning")(cleaning_input)
    with_skip = add([correction, noisy_label_input])
    cleaned_label = ClipZeroOneLayer()(with_skip)

    cleaner = Model(inputs=[feature_input_layer, noisy_label_input], outputs=cleaned_label)
    cleaner.compile(loss=keras.losses.mean_absolute_error, optimizer="nadam", metrics=['accuracy'])

    return cleaner

def create_dynamic_noise_model():
    """
    Builds a model that generates one transition matrix per input instance.

    "Dynamic-CM" in the paper. This is an implementation of
    Luo et al.: Learning with noise:  Enhance distantly supervised relation extraction with
                dynamic transition matrix, 2017
    adapted to this NER setting.

    Returns:
        The compiled model whose output is the NER prediction combined with
        the generated transition matrices.
    """
    num_labels = label_representation.get_num_labels()
    window_length = SETTINGS["CONTEXT_LENGTH"]*2+1

    text_input = Input(shape=(window_length, word_embedding.embedding_vector_size, ), name="input_text")
    encoded = Bidirectional(LSTM(SETTINGS["LSTM_SIZE"]), merge_mode='concat', name="bilstm")(text_input)

    # Noise modeling branch: transition matrices generated from the Bi-LSTM features
    transitions = DynamicTransitionMatrixGeneration(num_labels)(encoded)

    # Prediction branch: same Dense + softmax head as the base model
    hidden = Dense(SETTINGS["DENSE_SIZE"], activation=SETTINGS["DENSE_ACTIVATION"], name="dense")(encoded)
    label_probabilities = Dense(num_labels, activation='softmax', name="softmax_out")(hidden)

    # Combine both branches
    combined = TransitionMatrixApplication()([transitions, label_probabilities])

    dynamic_model = Model(inputs=[text_input], outputs=[combined])
    # Note that we do not use the trace loss proposed by the authors.
    # We found that the standard loss is giving better results in our experiments
    # which used fewer training instances than in the original work.
    # You can add the trace loss by using sum_loss(cross_entropy, trace_loss). More details
    # about sum_loss and trace_loss can be found in layers.py
    dynamic_model.compile(loss=keras.losses.categorical_crossentropy, optimizer="nadam", metrics=['accuracy'])
    return dynamic_model

# Training & Evaluation

In [None]:
# Random state seeded with TRAINING_SEED; it is the default random_state of
# train_on_batches, where it shuffles the order of the per-cluster batches.
training_state = np.random.RandomState(SETTINGS["TRAINING_SEED"])

def train_epoch(model, data_x, data_y):
    """ Fit the model for exactly one epoch on the given data.

    Uses the configured BATCH_SIZE, shuffles the data and prints no progress output.

    Args:
        model: Keras model used for training
        data_x: Tensor of embedded words
        data_y: Tensor of of one-hot encoded labels
    """
    fit_options = {
        "batch_size": SETTINGS["BATCH_SIZE"],
        "epochs": 1,
        "verbose": 0,
        "shuffle": True,
    }
    model.fit(data_x, data_y, **fit_options)
    
def train_on_batches(models, data_x, data_y, data_c, random_state=training_state):
    """ Train one epoch where every batch contains instances of a single cluster.

    Instances are grouped by clusterID into batches of at most BATCH_SIZE
    items, the batches are shuffled, and each batch is trained on the model
    registered for its cluster.

    Args:
        models: A dictionary of type <clusterID: keras model>
        data_x: Tensor of embedded words
        data_y: Tensor of one-hot encoded labels
        data_c: List of clusterIDs used for splitting the data into batches
        random_state: The numpy random state used for shuffling the batches
    """
    cluster_ids = set(data_c)
    pending = {cluster: ([], []) for cluster in cluster_ids}
    batches = []

    for x, y, cluster in zip(data_x, data_y, data_c):
        buffer_x, buffer_y = pending[cluster]
        if len(buffer_x) == SETTINGS["BATCH_SIZE"]:
            # buffer is full: emit it and start a new one with the current item
            batches.append((cluster, np.array(buffer_x), np.array(buffer_y)))
            pending[cluster] = ([x], [y])
        else:
            buffer_x.append(x)
            buffer_y.append(y)

    # emit whatever is left in the per-cluster buffers
    for cluster in cluster_ids:
        buffer_x, buffer_y = pending[cluster]
        if buffer_x:
            batches.append((cluster, np.array(buffer_x), np.array(buffer_y)))

    random_state.shuffle(batches)
    for cluster, batch_x, batch_y in batches:
        models[cluster].train_on_batch(batch_x, batch_y)

def simple_evaluation(model, data, data_x):
    """Run long_evaluation and return only the F-score extracted from its output."""
    return Evaluation.extract_f_score(long_evaluation(model, data, data_x))

def long_evaluation(model, data, data_x):
    """Predict labels for data_x and return the full evaluation output.

    Args:
        model: Keras model used for prediction
        data: Instances belonging to data_x
        data_x: Tensor of embedded words

    Returns:
        The result of Evaluation.evaluate_evaluation_string for the predictions.
    """
    predicted_labels = label_representation.predictions_to_labels(model.predict(data_x))

    # evaluation is done on BIO labels, so IO predictions are converted first
    if SETTINGS["LABEL_FORMAT"] == "io":
        predicted_labels = LabelRepresentation.convert_io_to_bio_labels(predicted_labels)

    evaluator = Evaluation(separator=SETTINGS["DATA_SEPARATOR"])
    evaluation_string = evaluator.create_connl_evaluation_format(data, predicted_labels)
    return evaluator.evaluate_evaluation_string(evaluation_string)

In [None]:
def train_and_evaluate():
    """Build the configured model(s), train them and log the final test evaluation.

    The function reads the module-level datasets (train_clean_*, train_noisy_*,
    dev, test) and SETTINGS. It
      1. samples a clean subset and the identically positioned noisy subset
         (same seed, sequential sampling), which are used as clean/noisy label
         pairs to initialize the noise model,
      2. creates the model for the configured NOISE_METHOD
         ('dynamic', 'channel', 'cleaning'; anything else only uses the base model),
      3. in every epoch samples noisy data, trains on clean data (if enabled),
         evaluates on dev and test, and then trains on the noisy data (if enabled),
      4. logs the detailed test evaluation of the epoch with the best dev F1.

    Returns:
        The trained base model (for NOISE_METHOD 'dynamic' the dynamic noise model).
    """
    logging.info("Create Models")
    noise_method = SETTINGS["NOISE_METHOD"].lower()

    # sample a low-resource setting from the training data
    sampling_state = np.random.RandomState(SETTINGS["SAMPLE_SEED"])
    clean_x, clean_y, clean_c = create_subset(train_clean_x, train_clean_y, train_clean_c,
                                              size=int(len(train_clean_x)*SETTINGS["SAMPLE_PCT_CLEAN"]),
                                              random_state=sampling_state, sequential=True)

    # reset to get same instances of noisy data
    # pairs of clean and noisy labels are used to initalize the noise model
    sampling_state = np.random.RandomState(SETTINGS["SAMPLE_SEED"])
    clean_noisy_x, clean_noisy_y, clean_noisy_c = create_subset(train_noisy_x, train_noisy_y, train_noisy_c,
                                              size=int(len(train_noisy_x)*SETTINGS["SAMPLE_PCT_CLEAN"]),
                                              random_state=sampling_state, sequential=True)

    # create model architecture
    if noise_method == 'dynamic':
        base_model = create_dynamic_noise_model()
    else:
        base_model, feature_input_layer, bi_lstm_layer, softmax_output_layer = create_base_model()

        if noise_method == 'channel':
            global_matrix = compute_noise_matrix(clean_y, clean_noisy_y)
            # The figure is not used below; the call is kept for whatever
            # visualization side effect NoiseMatrix.visualize_matrix has.
            matrix_figure = NoiseMatrix.visualize_matrix(global_matrix, idx_to_label_name_map=label_representation.label_idx_to_label_name_map)

            # *-Freq functionality in the paper
            if SETTINGS["WORD_CLUSTER_SELECTION"] < 1.0:
                selection = SETTINGS["WORD_CLUSTER_SELECTION"]
                logging.info(f'Selecting {int(len(set(clean_c)) * selection)} noise groups ({selection*100} %)')
                freq_groups = Counter(clean_c).most_common(int(len(set(clean_c)) * selection))
                freq_groups = set(x[0] for x in freq_groups)
                clean_c = np.array([c if c in freq_groups else -1 for c in clean_c])
            else:
                freq_groups = set(clean_c)

            # *-IP functionality in the paper
            if SETTINGS["WORD_CLUSTER_INTERPOLATION"] > 0:
                epsilon = SETTINGS["WORD_CLUSTER_INTERPOLATION"]
                logging.info(f'Interpolating with epsilon = {epsilon}')
            else:
                epsilon = 0.0

            # During training, noisy instances whose cluster is not in
            # freq_groups are mapped to cluster -1. A model for -1 must
            # therefore always exist, even if -1 never occurs in the clean
            # sample; otherwise train_on_batches fails with a KeyError.
            channel_models = {}
            for clusterID in set(clean_c) | {-1}:
                if SETTINGS["USE_IDENTITY_MATRIX"]:
                    noise_matrix = np.eye(label_representation.get_num_labels(),
                                          label_representation.get_num_labels())
                else:
                    if clusterID == -1:  # we use all instances for this matrix
                        noise_matrix = global_matrix
                    else:
                        sample = [(y, n) for y, c, n in zip(clean_y, clean_c, clean_noisy_y) if c == clusterID]
                        clean_ys, noisy_ys = [i[0] for i in sample], [i[1] for i in sample]
                        noise_matrix = compute_noise_matrix(clean_ys, noisy_ys)
                    if epsilon > 0:
                        noise_matrix = epsilon * global_matrix + (1-epsilon) * noise_matrix

                # the noise layer is initialized with log-probabilities;
                # the small constant avoids log(0)
                channel_weights = np.log(noise_matrix + 1e-8)
                model, _ = create_global_noise_layer_model(feature_input_layer, softmax_output_layer, channel_weights)
                channel_models[clusterID] = model
            logging.info(f"Created 1 Base and {len(channel_models)} Noise Models")

        elif noise_method == 'cleaning':
            cleaning_model = create_noise_cleaning_model(feature_input_layer, bi_lstm_layer)
            logging.info("Created Noise Cleaning Model")

    # training loop
    best_dev = -1
    # stays None if no epoch is run or no epoch improves on best_dev
    long_test_evaluation = None
    for epoch in range(SETTINGS["EPOCHS"]):
        # a fresh random sample of the noisy data is drawn in every epoch
        noisy_x, noisy_y, noisy_c = create_subset(train_noisy_x, train_noisy_y, train_noisy_c,
                                                  size=int(len(train_noisy_x)*SETTINGS["SAMPLE_PCT_NOISY"]),
                                                  random_state=sampling_state, sequential=False)

        if noise_method == 'cleaning':
            x_cleaning = {'noisy_label_input': clean_noisy_y,
                          'input_text': clean_x}
            y_cleaning = clean_y

            # train cleaning component on clean data
            cleaning_model.fit(x_cleaning, y_cleaning, batch_size=SETTINGS["BATCH_SIZE"],
                               epochs=1, shuffle=True, verbose=0)

            # predict cleaned labels for noisy data
            pred_noisy_x = {'noisy_label_input': noisy_y,
                            'input_text': noisy_x}
            pred_y = cleaning_model.predict(pred_noisy_x)

            # train the base model on clean data plus noisy data with cleaned labels
            cleaned_x = np.concatenate((clean_x, noisy_x), axis=0)
            cleaned_y = np.concatenate((clean_y, pred_y), axis=0)
            train_epoch(base_model, cleaned_x, cleaned_y)

        elif SETTINGS["USE_CLEAN"]:
            train_epoch(base_model, clean_x, clean_y)

        # evaluation happens after the clean step and before the noisy step
        eval_dev = simple_evaluation(base_model, dev, dev_x)
        eval_test = simple_evaluation(base_model, test, test_x)
        logging.info(f'Epoch {epoch+1}\tCurrent F1 for DEV: {eval_dev}\tTEST: {eval_test}')

        # test performance of the model with the best dev performance is used
        if eval_dev > best_dev:
            best_dev = eval_dev
            long_test_evaluation = long_evaluation(base_model, test, test_x)

        if SETTINGS["USE_NOISY"] and noise_method != 'cleaning':
            if noise_method == 'channel':
                # clusters without their own noise model fall back to cluster -1
                noisy_c = np.array([c if c in freq_groups else -1 for c in noisy_c])
                train_on_batches(channel_models, noisy_x, noisy_y, noisy_c)
            elif noise_method in ['none', 'dynamic']:
                train_epoch(base_model, noisy_x, noisy_y)

    if long_test_evaluation is not None:
        logging.info(long_test_evaluation)
    return base_model

# Run the complete experiment with the loaded SETTINGS. The returned model is
# not bound to a name here.
train_and_evaluate()