# Neural Network Model

In [1]:
# Selects which hyperparameter grid is searched further below:
# 'full' or 'restricted' are the two values handled by the grid definition.
execution_mode = 'restricted'

## Table of Contents

- [Data Takeover](#Data-Takeover)
    - [Train/Test Split](#Train/Test-Split)
- [Neural Network Implementation](#Neural-Network-Implementation)
    - [Performance Measurement](#Performance-Measurement)

## Data Takeover

Read in the DataFrames produced in chapter [Feature Matrix Generation](./3_FeatureMatrixGeneration.ipynb) as input for the processing in this chapter.

In [2]:
import os
import pandas as pd

# Directory that holds the pickled results of the preceding chapters
path_goldstandard = './daten_goldstandard'

# Restore the labelled feature matrix (similarity features plus 'duplicates' column)
df_labelled_feature_matrix = pd.read_pickle(
    os.path.join(path_goldstandard, 'labelled_feature_matrix.pkl'),
    compression=None,
)

# Restore the "full" variant of the labelled feature matrix; it is indexed
# with the train/test indices later on to inspect individual records
df_attribute_with_sim_feature = pd.read_pickle(
    os.path.join(path_goldstandard, 'labelled_feature_matrix_full.pkl'),
    compression=None,
)

# Summary statistics of all columns of the feature matrix
df_labelled_feature_matrix.describe()

Unnamed: 0,duplicates,coordinate_E_delta,coordinate_N_delta,corporate_full_delta,doi_delta,edition_delta,exactDate_delta,format_postfix_delta,format_prefix_delta,isbn_delta,...,musicid_delta,part_delta,person_100_delta,person_245c_delta,person_700_delta,pubinit_delta,scale_delta,ttlfull_245_delta,ttlfull_246_delta,volumes_delta
count,260733.0,260733.0,260733.0,260733.0,260733.0,260733.0,260733.0,260733.0,260733.0,260733.0,...,260733.0,260733.0,260733.0,260733.0,260733.0,260733.0,260733.0,260733.0,260733.0,260733.0
mean,0.005649,-0.094006,-0.093732,-0.073601,-0.094386,-0.083392,0.364475,0.430583,0.420806,0.376157,...,-0.074878,-0.01082,0.201556,0.433603,0.175853,0.188908,-0.093757,0.562582,-0.083317,0.207096
std,0.07495,0.034547,0.039364,0.068478,0.020315,0.05923,0.175742,0.330969,0.493689,0.484414,...,0.071869,0.18861,0.361736,0.252787,0.310796,0.293114,0.039281,0.109808,0.069896,0.342763
min,0.0,-0.1,-0.1,-0.1,-0.1,-0.1,0.0,0.0,0.0,0.0,...,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,0.0,-0.1,-0.1
25%,0.0,-0.1,-0.1,-0.1,-0.1,-0.1,0.25,0.111111,0.0,0.0,...,-0.1,-0.1,-0.05,0.438095,-0.05,-0.05,-0.1,0.505947,-0.1,-0.05
50%,0.0,-0.1,-0.1,-0.1,-0.1,-0.1,0.3125,0.428571,0.0,0.0,...,-0.1,-0.05,-0.05,0.523228,-0.05,-0.05,-0.1,0.544974,-0.1,0.0
75%,0.0,-0.1,-0.1,-0.05,-0.1,-0.05,0.5,0.428571,1.0,1.0,...,-0.05,-0.05,0.535227,0.58021,0.52924,0.491522,-0.1,0.599688,-0.1,0.555556
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
# Relative frequency of each value of the 'duplicates' column, in percent
print('Part of duplicates (1) on uniques (2) in units of [%]')
print(100 * df_labelled_feature_matrix['duplicates'].value_counts(normalize=True))

Part of duplicates (1) on uniques (2) in units of [%]
0    99.435054
1     0.564946
Name: duplicates, dtype: float64


### Train/Test Split

The train/test split is implemented as a general function, `split_feature_target` in module `classifier_fitting_funcs`, so that it can be called from each of the model chapters.

In [4]:
import classifier_fitting_funcs as cff

# Split the labelled feature matrix into a training and a test partition.
# The helper returns nine values; the 2nd, 5th and 8th are discarded here
# (presumably a validation partition that mode 'train_test' leaves unused - TODO confirm).
X_tr, _, X_te, y_tr, _, y_te, idx_tr, _, idx_te = cff.split_feature_target(
    df_labelled_feature_matrix, 'train_test')

# Peek at the first five training rows, targets and row indices
X_tr[:5], y_tr[:5], idx_tr[:5]

(array([[-0.1       , -0.1       , -0.1       , -0.1       , -0.1       ,
          0.5       ,  0.42857143,  0.        ,  1.        , -0.1       ,
         -0.05      , -0.05      , -0.05      ,  0.50165426, -0.05      ,
          0.48593074, -0.1       ,  0.60439973, -0.1       ,  0.        ],
        [-0.1       , -0.1       , -0.05      , -0.1       , -0.1       ,
          0.        ,  0.42857143,  0.        ,  0.        , -0.1       ,
         -0.1       , -0.1       , -0.05      ,  0.54435379, -0.1       ,
         -0.05      , -0.1       ,  0.54177001, -0.1       , -0.05      ],
        [-0.1       , -0.1       , -0.05      , -0.1       , -0.1       ,
          0.        ,  1.        ,  1.        ,  0.        , -0.1       ,
         -0.1       , -0.05      , -0.05      ,  0.6020276 ,  0.53663004,
          0.49448622, -0.1       ,  0.57046955, -0.1       , -0.05      ],
        [-0.1       , -0.1       , -0.1       , -0.1       , -0.1       ,
          0.25      ,  1.        , 

In [5]:
# Dimensions of the training matrix; the second entry is used as the
# input width of the neural network below
X_tr.shape

(208586, 20)

## Neural Network Implementation

In [6]:
def build_and_compile_nn(params, n_features=None):
    """Build and compile a feed-forward classifier with one or two hidden layers.

    Architecture:
    Dense(relu) -> Dropout -> [optional Dense(relu)] -> Dense(2, softmax)

    Parameters
    ----------
    params : dict
        Hyperparameters of the network. The following keys are read:

        - 'number_of_hidden1_layers' : number of units of the first hidden
          layer (despite the key name this is a unit count, not a layer count)
        - 'number_of_hidden2_layers' : number of units of the second hidden
          layer; a value of 0 leaves the second hidden layer out
        - 'dropout_rate' : rate of the dropout layer that follows the first
          hidden layer
        - 'l2_alpha' : L2 regularization strength applied to all dense kernels
        - 'sgd_learnrate' : learning rate handed to the Adam optimizer
          (the key name is historical)
    n_features : int, optional
        Number of input features. Defaults to the number of columns of the
        notebook-level training matrix ``X_tr``.

    Returns
    -------
    Sequential
        Model compiled with categorical cross-entropy loss and the
        'accuracy' metric. Targets have to be one-hot encoded.
    """
    if n_features is None:
        # Backward-compatible default: width of the notebook's training data
        n_features = X_tr.shape[1]

    n_hidden1 = params['number_of_hidden1_layers']
    n_hidden2 = params['number_of_hidden2_layers']
    l2_alpha = params['l2_alpha']

    model = Sequential()

    # First hidden layer; it also declares the input shape of the network
    model.add(Dense(
        n_hidden1,
        input_shape=(n_features,),
        activation='relu',
        kernel_initializer=VarianceScaling(scale=2.0, seed=0),
        kernel_regularizer=l2(l2_alpha),
        bias_initializer='zeros'
    ))

    # Dropout is applied to the output of the first hidden layer only
    model.add(Dropout(rate=params['dropout_rate'], seed=0))

    # Optional second hidden layer; its input shape is inferred by Sequential
    if n_hidden2 > 0:
        model.add(Dense(
            n_hidden2,
            activation='relu',
            kernel_initializer=VarianceScaling(scale=2.0, seed=0),
            kernel_regularizer=l2(l2_alpha),
            bias_initializer='zeros'
        ))

    # Output layer: softmax yields one probability per class (2 classes)
    model.add(Dense(
        2,
        activation='softmax',
        kernel_initializer=VarianceScaling(scale=1, seed=0),
        kernel_regularizer=l2(l2_alpha),
        bias_initializer='zeros'
    ))

    # 'learning_rate' is the current keyword; 'lr' is deprecated
    model_optimizer = Adam(learning_rate=params['sgd_learnrate'])

    model.compile(
        loss='categorical_crossentropy',
        optimizer=model_optimizer,
        metrics=['accuracy']
    )

    return model

In [7]:
import matplotlib.pyplot as plt

def plot_result(history, params, ylim=(0.99, 1.0)):
    """Plot training and validation accuracy per epoch of one fitted model.

    Parameters
    ----------
    history : object
        Return value of ``model.fit()``; its ``history`` dict must contain
        the keys 'accuracy' and 'val_accuracy'.
    params : dict
        Must contain 'sgd_learnrate' and 'l2_alpha'; both values are shown
        in the plot title only.
    ylim : tuple of two floats or None, optional
        Limits of the accuracy axis. The default (0.99, 1.0) zooms into the
        top percent; curves below 99% are not visible with it. Pass None to
        let matplotlib choose the limits.

    Returns
    -------
    None
    """
    plt.plot(history.history['accuracy'], label='training accuracy')
    plt.plot(history.history['val_accuracy'], label='validation accuracy')
    plt.legend()
    plt.title(r'Bilayer neural network with lr = {} and $\alpha=${}'.format(
        params['sgd_learnrate'], params['l2_alpha'])
    )
    plt.xlabel('epoch')
    if ylim is not None:
        plt.ylim(*ylim)
    plt.ylabel('accuracy')
    plt.show()

In [12]:
from sklearn.utils import class_weight
import numpy as np
 
# Class weights inversely proportional to the class frequencies of the
# training targets. 'classes' and 'y' are passed by keyword because recent
# scikit-learn releases no longer accept them positionally.
class_weights = class_weight.compute_class_weight(
    'balanced', classes=np.unique(y_tr), y=y_tr)

# Note on naming: the keys 'number_of_hidden1_layers' and
# 'number_of_hidden2_layers' hold the number of UNITS of the first and the
# second hidden layer; 0 units for the second layer means "no second layer".
if execution_mode == 'full' :
    parameter_dictionary = {
        # Values 0.01, 0.05, 0.1, 0.5 have been tried: as soon as
        # l2_alpha > 0 comes in, the network does not converge anymore.
        #  => Go on with l2_alpha = 0, only.
        'l2_alpha' : [0.0],
        # A learning rate of 0.001 and slightly slower gives good results.
        'sgd_learnrate' : [0.001, 0.002, 0.003],
        # A dropout rate of 0.0 has been dropped from the grid.
        'dropout_rate' : [0.1, 0.2],
        # Fitting without class weights (None) has been dropped from the grid.
        'class_weight' : [class_weights],
        # Sizes 2, 8, 15, 20, 25, 40 have been dropped from the grid.
        # A size of 2 units is too small. The bigger the number of units,
        #  the slower the learning rate. There are 20 features.
        'number_of_hidden1_layers' : [45, 50, 55, 60, 65, 70, 75],
        'number_of_hidden2_layers' : [0, 45, 50, 55, 60]
    }
elif execution_mode == 'restricted' :
    parameter_dictionary = {
        'l2_alpha' : [0.0],
        'sgd_learnrate' : [0.002],
        'dropout_rate' : [0.1],
        'class_weight' : [None, class_weights],
        'number_of_hidden1_layers' : [40, 60],
        'number_of_hidden2_layers' : [0, 70]
    }
else :
    # Fail early instead of silently reusing a stale or undefined dictionary
    raise ValueError(
        "execution_mode must be 'full' or 'restricted', got {!r}".format(execution_mode))

# Grid of all parameter combinations for networks with one or two hidden layers
grid = cff.generate_parameter_grid(parameter_dictionary)

The grid parameters are ...
l2_alpha [0.0]
sgd_learnrate [0.001, 0.002, 0.003]
dropout_rate [0.1, 0.2]
class_weight [array([ 0.50283981, 88.53395586])]
number_of_hidden1_layers [45, 50, 55, 60, 65, 70, 75]
number_of_hidden2_layers [0, 45, 50, 55, 60]
 => Number of combinations : 210


In [9]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.initializers import VarianceScaling
from keras.regularizers import l2
from keras.optimizers import SGD, Adam
from keras.utils import to_categorical
import numpy as np

import time
start_time = time.time()

# Scores are averaged over the last epochs of each fit
no_last = 5 # Take the last 5 due to the upwards shape of the curve at its end
test_scores = []

# Parameters for fitting with batches and epochs
epochs = 100
batch_size = 320 # Default batch_size = 32
# Playing around with batch sizes of [3, 30, None=32, 320, 3200] shows the effect
#  that the bigger the size, the faster the calculation performance,
#  ... the worse the convergence. => Batch size has effect of learning rate.
# 320 seems to be the perfect value.

# One-hot targets are identical for every grid point: encode them once
y_tr_onehot = to_categorical(y_tr)
y_te_onehot = to_categorical(y_te)

for params_dict in grid :

    model = build_and_compile_nn(params_dict)

    # Keras documents class_weight as a dict {class index: weight}.
    # The grid stores an array ordered by class label, so convert it here.
    fit_class_weight = params_dict['class_weight']
    if fit_class_weight is not None and not isinstance(fit_class_weight, dict):
        fit_class_weight = dict(enumerate(fit_class_weight))

    print('Fitting with parameters', params_dict)
    # Fit the model; the test partition serves as validation data
    history = model.fit(
        x=X_tr, y=y_tr_onehot,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_te, y_te_onehot),
        class_weight=fit_class_weight,
        verbose=0
    )

    # Mean accuracy over the last epochs on train set and validation set
    accuracy_tr = np.mean(history.history['accuracy'][-no_last:])
    accuracy_val = np.mean(history.history['val_accuracy'][-no_last:])

    # The scores are stored next to the parameters in the same dictionary.
    # The 'log_accuracy_*' entries hold log(1 - accuracy), i.e. the log of the error rate.
    params_dict['accuracy_tr'] = accuracy_tr
    params_dict['accuracy_val'] = accuracy_val
    params_dict['log_accuracy_tr'] = np.log(1-accuracy_tr)
    params_dict['log_accuracy_val'] = np.log(1-accuracy_val)

    print(' => validation score {:.3f}%'.format(100*params_dict['accuracy_val']))
    # Save result
    test_scores.append(params_dict)

    plot_result(history, params_dict)
    # Elapsed time since the start of the whole grid search
    print("--- %s seconds ---" % (time.time() - start_time))

# Save measured accuracies, best validation accuracy first
df_test_scores_nn = pd.DataFrame(test_scores).sort_values('accuracy_val', ascending=False)

Using TensorFlow backend.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Fitting with parameters {'class_weight': None, 'dropout_rate': 0.1, 'l2_alpha': 0.0, 'number_of_hidden1_layers': 45, 'number_of_hidden2_layers': 0, 'sgd_learnrate': 0.001}



KeyboardInterrupt: 

The validation accuracy curves above approach their maximum only very slowly, so a very high number of epochs is needed to reach it. In the course of the project, some simulations were run with 500 epochs. Even with this high number of epochs, the validation accuracy still showed a slight upward slope, indicating that the model was still learning and improving. However, the validation accuracy never surpassed a value of 99.93%. This observation led to the conclusion that increasing the number of epochs even beyond 500 would not increase the accuracy of the model significantly.

In [None]:
# Allow the table below to show one row per grid combination
pd.options.display.max_rows = len(grid)

# Grid search results, best validation accuracy first
pd.DataFrame(test_scores).sort_values(by='accuracy_val', ascending=False)

In [None]:
# Pick the winning parameter combination of the grid search
best_params = cff.get_best_parameters(test_scores, parameter_dictionary)

model_best = build_and_compile_nn(best_params)

# Check Model configuration
# (the return value is not displayed, as this is not the last expression of the cell)
model_best.get_config()

# Parameters for fitting with batches and epochs
epochs = 300
batch_size = 320

# Refit with the class weighting that belongs to the selected parameter set,
# so that the final model matches the configuration chosen by the grid search.
# Keras documents class_weight as a dict {class index: weight}.
best_class_weight = best_params.get('class_weight')
if best_class_weight is not None and not isinstance(best_class_weight, dict):
    best_class_weight = dict(enumerate(best_class_weight))

# Fit the model; the test partition serves as validation data
history_best = model_best.fit(
    x=X_tr, y=to_categorical(y_tr),
    batch_size=batch_size, epochs=epochs,
    validation_data=(X_te, to_categorical(y_te)),
    class_weight=best_class_weight,
    verbose=0
)

# Predicted class = index of the highest softmax output.
# (Sequential.predict_classes() is not available in recent Keras releases.)
y_pred = np.argmax(model_best.predict(X_te), axis=1)

plot_result(history_best, best_params)

In [None]:
from keras.utils import plot_model

# Target location of the rendered architecture diagram
path_model_graphics = './documentation'
model_png = os.path.join(path_model_graphics, 'model.png')

# Draw the layer graph of the best model, including the tensor shapes
plot_model(model_best, to_file=model_png, show_shapes=True, dpi=72)

In [None]:
no_last = 5 # I take the last 5 due to the upwards shape of the curve at its end
print('Mean last {:d} validation accuracy : {:.3f}'.format(
    no_last, np.mean(history_best.history['val_accuracy'][-no_last:])
))

# Evaluate the refitted best model. The variable 'model' would be the last
# model of the grid search loop, not the best one.
print('Neural network accuracy (test set): {:.3f}'.format(
    model_best.evaluate(X_te, to_categorical(y_te),
                        verbose=0)[1] # Loss is at index=0, accuracy at index=1
))

### Performance Measurement

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the best model on the test set
# (scikit-learn convention: rows are true classes, columns are predicted classes)
confusion_matrix(y_te, y_pred)

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from keras.metrics import accuracy

# Classification metrics on the test set, all computed from the predicted class labels.
# NOTE(review): roc_auc_score receives hard labels here rather than class
# probabilities - confirm that this is the intended definition of the AUC.
print('Area under the curve {:.1f}% - accuracy {:.1f}% - precision {:.1f}% - recall {:.1f}%'.format(
    *(100*metric(y_te, y_pred)
      for metric in (roc_auc_score, accuracy_score, precision_score, recall_score))
))

In [None]:
import results_analysis_funcs as raf

In [None]:
# Rows of the full attribute DataFrame that belong to the training and the
# test partition, selected by position
df_feature_base_full_tr = df_attribute_with_sim_feature.iloc[idx_tr]
df_feature_base_full_te = df_attribute_with_sim_feature.iloc[idx_te]

# Extend display to number of columns of DataFrame
pd.set_option('display.max_columns', len(df_feature_base_full_te.columns))

# Random sample of 20 test records for visual inspection
df_feature_base_full_te.sort_index().sample(n=20)

In [None]:
import results_saving_funcs as rsf

# Collect the record indices of the four confusion matrix groups under descriptive keys
idx = {}
idx['true_predicted_uniques'], idx['true_predicted_duplicates'], idx['false_predicted_uniques'], idx['false_predicted_duplicates'] = raf.get_confusion_matrix_indices(y_te, y_pred)

# Only the two groups of misclassified records are handed over
wrong_prediction_groups = ['false_predicted_uniques', 'false_predicted_duplicates']

# NOTE(review): .loc selects by index label - this presumes that the helper
# returns labels of df_feature_base_full_te, not positions; verify against raf.
for i in wrong_prediction_groups :
    rsf.add_wrong_predictions(path_goldstandard, 
                              model_best, i, df_feature_base_full_te.loc[idx[i]])

## Results Handover

In [None]:
# Hand the grid search scores, the best model and its test set predictions
# over to the shared results (presumably persisted below path_goldstandard - TODO confirm)
rsf.add_result_to_results(path_goldstandard, df_test_scores_nn, model_best, X_te, y_te, y_pred)