In [0]:
# Mount Google Drive at /content/drive/ (Colab only); later cells read and write files under that mount.
from google.colab import drive
drive.mount('/content/drive/')

In [0]:
#Import all the libraries

import pandas as pd
import numpy as np
import tensorflow as tf
import xgboost as xgb
from scipy import stats
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline,FeatureUnion
from math import sqrt
import random
import time
import os
import shutil

In [0]:
# Import the training workbook and hold out 20% of the rows as a test set (random_state fixes the split).
# NOTE(review): recent pandas releases may not accept the `encoding` keyword in read_excel -- confirm against the installed version.

df = pd.read_excel('./drive/My Drive/Colab Notebooks/Regression/Train Data.xlsx', encoding='latin-1')
df_train, df_test = train_test_split(df, train_size = 0.8, test_size = 0.2, random_state = 7)

In [0]:
#create a class to replace the 'NA' with NaN

class Replace_NAs(TransformerMixin,BaseEstimator):
    """Pipeline step that turns 'N/A' placeholder cells into missing values (NaN).

    The step is stateless: ``fit`` learns nothing and ``transform`` returns a
    new DataFrame, leaving the input untouched.
    """
    feature_names = []
    def __init__(self):
        pass
    def fit(self, X, y=None):
        """No-op fit; ``y`` is optional so the step also works without a target."""
        return self
    def transform(self, X, y=None):
        """Return a copy of ``X`` in which cells consisting only of 'N/A' are NaN.

        DataFrame.replace returns a new frame; the previous code discarded that
        result, which made the whole step a no-op. The pattern is anchored so
        that only cells that are entirely 'N/A' (ignoring surrounding
        whitespace) are affected, not values that merely contain those
        characters.
        """
        X = X.replace({r'^\s*N/A\s*$': np.nan}, regex=True)
        self.feature_names = X.columns.tolist()
        return X
    def get_feature_names(self):
        """Return the column names seen by the last ``transform`` call."""
        return self.feature_names

In [0]:
#create classes for selecting the various data types for preprocessing

class Date_Select(TransformerMixin,BaseEstimator):
    """Pipeline step that keeps only the datetime columns of a DataFrame."""
    feature_names = []
    def __init__(self):
        pass
    def fit(self, X, y=None):
        """Reset the remembered column names; nothing is learned from the data.

        ``y`` defaults to None so that ``fit_transform(X)`` without a target
        does not fail with a missing-argument error.
        """
        self.feature_names = []
        return self
    def transform(self, X, y=None):
        """Return the datetime columns of ``X`` and remember their names."""
        X = X.select_dtypes(include=['datetime'])
        self.feature_names = X.columns.tolist()
        return X
    def get_feature_names(self):
        """Return the column names selected by the last ``transform`` call."""
        return self.feature_names

class Cat_Select(TransformerMixin,BaseEstimator):
    """Pipeline step that keeps only the object- and bool-typed columns of a DataFrame."""
    feature_names = []
    def __init__(self):
        pass
    def fit(self, X, y=None):
        """Reset the remembered column names; nothing is learned from the data.

        ``y`` defaults to None so that ``fit_transform(X)`` without a target
        does not fail with a missing-argument error.
        """
        self.feature_names = []
        return self
    def transform(self, X, y=None):
        """Return the object/bool columns of ``X`` and remember their names."""
        X = X.select_dtypes(include=['object', 'bool'])
        self.feature_names = X.columns.tolist()
        return X
    def get_feature_names(self):
        """Return the column names selected by the last ``transform`` call."""
        return self.feature_names
    

class Num_Select(TransformerMixin,BaseEstimator):
    """Pipeline step that keeps only the int64 and float64 columns of a DataFrame.

    Other numeric widths (e.g. int32, float32) are not selected by this step.
    """
    feature_names = []
    def __init__(self):
        pass
    def fit(self, X, y=None):
        """Reset the remembered column names; nothing is learned from the data.

        ``y`` defaults to None so that ``fit_transform(X)`` without a target
        does not fail with a missing-argument error.
        """
        self.feature_names = []
        return self
    def transform(self, X, y=None):
        """Return the int64/float64 columns of ``X`` and remember their names."""
        X = X.select_dtypes(include=['int64','float64'])
        self.feature_names = X.columns.tolist()
        return X
    def get_feature_names(self):
        """Return the column names selected by the last ``transform`` call."""
        return self.feature_names

In [0]:
# create a transformer to extract the month and year from the dates

class Date_Extract(TransformerMixin,BaseEstimator):
    """Pipeline step that replaces each datetime column by month, year and weekday columns.

    For a column named ``c`` the output contains ``c_m``, ``c_y`` and ``c_wd``
    (in that order); the original datetime column is not part of the output.
    """
    feature_names = []
    def __init__(self):
        pass
    def fit(self, X, y=None):
        """Reset the remembered column names; nothing is learned from the data."""
        self.feature_names = []
        return self
    def transform(self, X, y=None):
        """Return a new DataFrame with the month/year/weekday parts of every column of ``X``.

        The parts are written into a fresh frame instead of into ``X``.
        Assigning into ``X`` modified the caller's data and, because ``X`` is
        typically the result of a column selection, triggered pandas'
        SettingWithCopyWarning.
        """
        parts = pd.DataFrame(index=X.index)
        for date in X.columns:
            parts[date+'_m'] = X[date].dt.month
            parts[date+'_y'] = X[date].dt.year
            parts[date+'_wd'] = X[date].dt.weekday
        self.feature_names = parts.columns.tolist()
        return parts
    def get_feature_names(self):
        """Return the column names produced by the last ``transform`` call."""
        return self.feature_names

In [0]:
#create a class to fill the missing values in the categorical features (with the name of the feature prefixed by 'other_')

class Fill_CatNaNs(TransformerMixin,BaseEstimator):
    """Pipeline step that fills missing categorical values with a per-column placeholder.

    Every column that contains at least one missing value has those values
    replaced by the string ``'other_' + <column name>``.
    """
    feature_names = []
    def __init__(self):
        pass
    def fit(self, X, y=None):
        """Reset the remembered column names; nothing is learned from the data."""
        self.feature_names = []
        return self
    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced by the column placeholder."""
        # One fill value per column that actually has missing entries.
        columns_with_gaps = X.columns[X.isna().any()]
        fill_values = {column: 'other_' + str(column) for column in columns_with_gaps}
        X = X.fillna(value = fill_values)
        self.feature_names = X.columns.tolist()
        return X
    def get_feature_names(self):
        """Return the column names seen by the last ``transform`` call."""
        return self.feature_names

In [0]:
#create a custom class for the transformer classes, to keep column names

class Custom_Indicator(TransformerMixin,BaseEstimator):
    """DataFrame-preserving wrapper around sklearn's MissingIndicator.

    The output is a DataFrame that keeps the input index; its columns are the
    indicator's positional column numbers prefixed with ``ind_``.
    """

    feature_names = []

    def __init__(self):
        # One indicator per instance: a class-level estimator would be shared
        # (including its fitted state) by every Custom_Indicator object.
        self.Indicator = MissingIndicator(missing_values=np.nan, error_on_new = False)
    def fit(self, X, y = None):
        """Fit the wrapped indicator on ``X``."""
        self.feature_names = []
        self.Indicator.fit(X)
        return self
    def transform(self, X, y=None):
        """Return the missing-value indicator columns for ``X`` as a DataFrame."""
        X = pd.DataFrame(data = self.Indicator.transform(X), index = X.index).add_prefix('ind_')
        self.feature_names = X.columns.tolist()
        return X
    def get_feature_names(self):
        """Return the column names produced by the last ``transform`` call."""
        return self.feature_names
    
class Custom_Imputer(TransformerMixin,BaseEstimator):
    """DataFrame-preserving wrapper around sklearn's SimpleImputer (mean strategy).

    The output keeps the index and column names of the input frame.
    """

    feature_names = []

    def __init__(self):
        # One imputer per instance: a class-level estimator would be shared
        # (including its fitted state) by every Custom_Imputer object.
        self.Imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    def fit(self, X, y = None):
        """Fit the wrapped imputer on ``X``."""
        self.feature_names = []
        self.Imputer.fit(X)
        return self
    def transform(self, X, y=None):
        """Return ``X`` with missing values imputed, as a DataFrame with the same labels."""
        X = pd.DataFrame(data = self.Imputer.transform(X), index = X.index, columns = X.columns)
        self.feature_names = X.columns.tolist()
        return X
    def get_feature_names(self):
        """Return the column names produced by the last ``transform`` call."""
        return self.feature_names

class Custom_Scaler(TransformerMixin,BaseEstimator):
    """DataFrame-preserving wrapper around sklearn's MinMaxScaler.

    The output keeps the index and column names of the input frame.
    """

    feature_names = []

    def __init__(self):
        # One scaler per instance: a class-level estimator would be shared
        # (including its fitted state) by every Custom_Scaler object.
        self.Scaler = MinMaxScaler()
    def fit(self, X, y = None):
        """Fit the wrapped scaler on ``X``."""
        self.feature_names = []
        self.Scaler.fit(X)
        return self
    def transform(self, X, y=None):
        """Return the scaled values of ``X`` as a DataFrame with the same labels."""
        X = pd.DataFrame(data = self.Scaler.transform(X), index = X.index, columns = X.columns)
        self.feature_names = X.columns.tolist()
        return X
    def get_feature_names(self):
        """Return the column names produced by the last ``transform`` call."""
        return self.feature_names
    
class Custom_Encoder(TransformerMixin,BaseEstimator):
    """DataFrame-preserving wrapper around a dense, unknown-tolerant OneHotEncoder.

    The output keeps the input index; its columns are the names reported by
    the fitted encoder.
    """

    ohe_features = []

    def __init__(self):
        # One encoder per instance: a class-level estimator would be shared
        # (including its fitted categories) by every Custom_Encoder object.
        # NOTE(review): `sparse=` and `get_feature_names()` belong to the older
        # scikit-learn API this notebook targets -- confirm before upgrading.
        self.Encoder = OneHotEncoder(sparse=False,handle_unknown='ignore')
    def fit(self, X, y = None):
        """Fit the wrapped encoder on ``X`` and remember the generated column names."""
        self.ohe_features = []
        self.Encoder.fit(X)
        self.ohe_features = self.Encoder.get_feature_names()
        return self
    def transform(self, X, y=None):
        """Return the one-hot encoding of ``X`` as a DataFrame."""
        X = pd.DataFrame(data = self.Encoder.transform(X), index = X.index, columns = self.ohe_features)
        return X
    def get_feature_names(self):
        """Return the one-hot column names learned during ``fit``."""
        return self.ohe_features
    
class Date_Encoder(TransformerMixin,BaseEstimator):
    """DataFrame-preserving one-hot encoder used for the extracted date parts.

    The output keeps the input index; its columns are the names reported by
    the fitted encoder.
    """

    ohe_features = []

    def __init__(self):
        # One encoder per instance: a class-level estimator would be shared
        # (including its fitted categories) by every Date_Encoder object.
        # NOTE(review): `sparse=` and `get_feature_names()` belong to the older
        # scikit-learn API this notebook targets -- confirm before upgrading.
        self.Encoder = OneHotEncoder(sparse=False,handle_unknown='ignore')
    def fit(self, X, y = None):
        """Fit the wrapped encoder on ``X`` and remember the generated column names."""
        self.ohe_features = []
        self.Encoder.fit(X)
        self.ohe_features = self.Encoder.get_feature_names()
        return self
    def transform(self, X, y=None):
        """Return the one-hot encoding of ``X`` as a DataFrame."""
        X = pd.DataFrame(data = self.Encoder.transform(X), index = X.index, columns = self.ohe_features)
        return X
    def get_feature_names(self):
        """Return the one-hot column names learned during ``fit``."""
        return self.ohe_features

class Custom_Selector(TransformerMixin,BaseEstimator):
    """DataFrame-preserving wrapper around SelectKBest with the f_regression score.

    Parameters
    ----------
    k : int, default 64
        Number of features to keep; passed to SelectKBest. The default
        reproduces the previously hard-coded value.
    """

    column_names = []

    def __init__(self, k=64):
        self.k = k
    def fit(self, X, y):
        """Fit a fresh selector on ``X``/``y`` and remember the names of the kept columns."""
        # Built per fit and per instance, so the current value of `k` is used
        # and no fitted state is shared between Custom_Selector objects.
        self.Selector = SelectKBest(f_regression, k=self.k)
        self.Selector.fit(X,y)
        mask = self.Selector.get_support()
        self.column_names = [feature for keep, feature in zip(mask, list(X.columns.values)) if keep]
        return self
    def transform(self, X, y=None):
        """Return the selected columns of ``X`` as a DataFrame with their original names."""
        X = pd.DataFrame(data = self.Selector.transform(X), index = X.index, columns = self.column_names)
        return X
    def get_feature_names(self):
        """Return the names of the columns kept by the fitted selector."""
        return self.column_names

In [0]:
#define the steps in the primary pipelines

# Missing-value indicator columns for the int64/float64 features.
MissingInd_Pipe = Pipeline(steps = [('Num_Select', Num_Select()),
                                    ('Imp_Indicate', Custom_Indicator())])

# Numeric features: impute, then min-max scale.
Numerical_Pipe = Pipeline(steps = [('Num_Select', Num_Select()),
                                   ('Imputer', Custom_Imputer()),
                                   ('Scaler', Custom_Scaler())])

# Datetime features: split into month/year/weekday parts, then one-hot encode.
Date_Pipe = Pipeline(steps = [('Date_Select',Date_Select()),
                              ('Date_Extract',Date_Extract()),
                             ('Date_Encode',Date_Encoder())])

# Categorical features: fill gaps with a per-column placeholder, then one-hot encode.
Cat_Pipe = Pipeline(steps = [('Cat_Select',Cat_Select()),
                             ('Fill_CatNaNs',Fill_CatNaNs()),
                             ('Encoder', Custom_Encoder())])

# Column-wise concatenation of the four branches.
Union = FeatureUnion([('Missing_Pipe',MissingInd_Pipe),
                      ('Numerical_Pipe',Numerical_Pipe),
                      ('Date_Pipe', Date_Pipe),
                      ('Cat_Pipe',Cat_Pipe)])

# Second FeatureUnion over the SAME four pipeline objects as `Union`.
# Feature_Names looks up column names through `features_all`, while the
# pipeline that is actually fitted uses `Union`.
# NOTE(review): this only lines up while both unions hold the very same
# (in-place fitted) pipeline objects -- confirm before cloning or parallelising.
features_all = Pipeline([#('replace_nas', Replace_NAs()),
                         ('all_features',FeatureUnion([('Missing_Pipe',MissingInd_Pipe),
                                                ('Numerical_Pipe',Numerical_Pipe),
                                                ('Date_Pipe', Date_Pipe),
                                                ('Cat_Pipe',Cat_Pipe)]))])

In [0]:
class Feature_Names(TransformerMixin,BaseEstimator):
    """Pipeline step that re-attaches column names to the array produced by the feature union.

    The names are collected from the last step of each branch of the
    module-level ``features_all`` pipeline, in branch order (missing
    indicators, scaled numerics, encoded dates, encoded categoricals).
    """
    feature_names = []

    # Name of the final step of each FeatureUnion branch, in branch order.
    _FINAL_STEPS = ('Imp_Indicate', 'Scaler', 'Date_Encode', 'Encoder')

    def __init__(self):
        pass
    def fit(self, X, y=None):
        """No-op fit; ``y`` is optional so the step also works without a target."""
        return self
    def transform(self, X, y=None):
        """Return ``X`` as a DataFrame labelled with the concatenated branch feature names."""
        branches = features_all.named_steps['all_features'].transformer_list
        names = np.concatenate([branch.named_steps[step].get_feature_names()
                                for (_, branch), step in zip(branches, self._FINAL_STEPS)],
                               axis = None)
        X1 = pd.DataFrame(data = X, columns = names)
        # Remember the names so get_feature_names() no longer always returns [].
        self.feature_names = X1.columns.tolist()
        return X1
    def get_feature_names(self):
        """Return the column names assigned by the last ``transform`` call."""
        return self.feature_names

In [0]:
# Full preprocessing pipeline: feature union -> re-attach column names -> keep the k best features.
named_features = Pipeline([('feats',Union),
                         ('feat_names', Feature_Names()),
                         ('Selector',Custom_Selector())])

In [0]:
#Create the data reader to read and batch the npz files

class SP_Data_Reader():
    """Iterator that serves the arrays of an ``SP_data_<dataset>.npz`` file in batches.

    Parameters
    ----------
    dataset : str
        Suffix of the file to load from the working directory, e.g. ``'train'``
        loads ``SP_data_train.npz``. The archive must contain the arrays
        ``inputs`` and ``targets``.
    batch_size : int, optional
        Number of rows per batch. When omitted, the whole dataset is served as
        a single batch.

    Only complete batches are served: rows beyond ``batch_count * batch_size``
    are never yielded. After exhaustion the reader rewinds, so the same object
    can be iterated once per epoch.

    Raises
    ------
    ValueError
        If ``batch_size`` is given but is not positive.
    """
    def __init__(self, dataset, batch_size = None):

        if batch_size is not None and batch_size <= 0:
            raise ValueError('batch_size must be a positive integer, got {0!r}'.format(batch_size))

        # The context manager closes the archive; astype() copies the data out
        # first. np.float64 replaces the np.float alias that recent NumPy removed.
        with np.load('SP_data_{0}.npz'.format(dataset)) as npz:
            self.inputs = npz['inputs'].astype(np.float64)
            self.targets = npz['targets'].astype(np.float64)

        sample_count = self.inputs.shape[0]
        if batch_size is None:
            self.batch_size = sample_count
        else:
            self.batch_size = batch_size
        self.curr_batch = 0
        # An empty dataset served as "one big batch" has batch_size 0; report
        # zero batches instead of dividing by zero.
        self.batch_count = sample_count // self.batch_size if self.batch_size else 0

    def __next__(self):
        """Return the next ``(inputs_batch, targets_batch)`` pair, rewinding when exhausted."""
        if self.curr_batch >= self.batch_count:
            self.curr_batch = 0
            raise StopIteration()

        batch_slice = slice(self.curr_batch * self.batch_size, (self.curr_batch + 1) * self.batch_size)
        inputs_batch = self.inputs[batch_slice]
        targets_batch = self.targets[batch_slice]
        self.curr_batch += 1

        return inputs_batch, targets_batch

    def __iter__(self):
        return self

In [0]:
#create a class to fit, test and predict with the primary DNN model

class Primary_DNN(TransformerMixin,BaseEstimator):
    """Randomly configured feed-forward regressor built with the TensorFlow 1.x graph API.

    Each ``fit`` draws its hyper-parameters (dropout rate, depth, learning
    rate, funnel factor, first-layer width) with the ``random`` module, trains
    with early stopping and writes the best model seen so far as a SavedModel
    to a per-network directory. ``predict`` and ``score`` reload that
    SavedModel.

    Parameters
    ----------
    name : int or str
        Suffix that distinguishes this network's SavedModel directory and log lines.
    """

    # Prefix of the SavedModel directory; the network name is appended to it.
    # The same absolute path is now used for deleting, saving and loading
    # (previously deletion/loading used './drive/...' and saving used
    # '/content/drive/...', which only agree when the working directory is /content).
    MODEL_DIR_PREFIX = r'/content/drive/My Drive/Colab Notebooks/Regression/DNNs/Primary'

    def __init__(self, name):
        self.name = name

    def _model_dir(self):
        """Return the SavedModel directory of this network."""
        return self.MODEL_DIR_PREFIX + str(self.name)

    def fit(self, X, y):
        """Train the network on ``X``/``y`` and save the best model to disk.

        The data is split 80/20 into training and validation parts, which are
        written to ``SP_data_train.npz`` / ``SP_data_validate.npz`` in the
        working directory and read back through SP_Data_Reader.
        Returns ``self``.
        """

        #split the primary data to train and validation

        tf.reset_default_graph()

        x_train, x_val, y_train, y_val = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 7)

        np.savez('SP_data_train',inputs = x_train,targets = y_train)
        np.savez('SP_data_validate',inputs = x_val,targets = y_val)

        # set the parameters for the primary DNN model

        primary_output_size = 1
        primary_input_size = x_train.shape[1]

        dropout_rate = random.choice([0.05, 0.1, 0.2])
        # NOTE(review): randrange(3, 5) only yields 3 or 4, so the 5-layer
        # branch below is never taken -- confirm whether 5 was meant to be included.
        layers = random.randrange(3, 5)
        lr = random.choice([0.01, 0.001, 0.0001])
        funnel = random.choice([1, 2])
        shape = random.choice([256, 512, 1024, 2048])

        # Layer widths; integer division keeps the variable shapes integral.
        width_2 = shape // funnel
        width_3 = shape // funnel**2
        width_4 = shape // funnel**3

        primary_inputs = tf.placeholder(tf.float32,[None, primary_input_size], name = 'primary_inputs')
        primary_targets = tf.placeholder(tf.float32,[None])

        primary_weights_1 = tf.get_variable("primary_weights_1",[primary_input_size, shape])
        primary_biases_1 = tf.get_variable("primary_biases_1",[shape])

        primary_drop_1 = tf.nn.dropout(primary_inputs, rate = dropout_rate)
        primary_outputs_1 = tf.nn.relu(tf.matmul(primary_drop_1,primary_weights_1)+primary_biases_1)

        primary_weights_2 = tf.get_variable("primary_weights_2",[shape, width_2])
        primary_biases_2 = tf.get_variable("primary_biases_2",[width_2])

        primary_drop_2 = tf.nn.dropout(primary_outputs_1, rate = dropout_rate)
        primary_outputs_2 = tf.nn.relu(tf.matmul(primary_drop_2,primary_weights_2)+primary_biases_2)

        primary_weights_3 = tf.get_variable("primary_weights_3",[width_2, width_3])
        primary_biases_3 = tf.get_variable("primary_biases_3",[width_3])

        primary_drop_3 = tf.nn.dropout(primary_outputs_2, rate = dropout_rate)
        primary_outputs_3 = tf.nn.relu(tf.matmul(primary_drop_3,primary_weights_3)+primary_biases_3)

        primary_weights_4 = tf.get_variable("primary_weights_4",[width_3, width_4])
        primary_biases_4 = tf.get_variable("primary_biases_4",[width_4])

        primary_drop_4 = tf.nn.dropout(primary_outputs_3, rate = dropout_rate)
        primary_outputs_4 = tf.nn.relu(tf.matmul(primary_drop_4,primary_weights_4)+primary_biases_4)

        primary_weights_5 = tf.get_variable("primary_weights_5",[width_4, primary_output_size])
        primary_biases_5 = tf.get_variable("primary_biases_5",[primary_output_size])

        # The output layer is the linear (no relu, no dropout) form of layer `layers`.
        if layers == 3:
            primary_outputs = tf.matmul(primary_outputs_2,primary_weights_3)+primary_biases_3
        elif layers == 4:
            primary_outputs = tf.matmul(primary_outputs_3,primary_weights_4)+primary_biases_4
        elif layers == 5:
            primary_outputs = tf.matmul(primary_outputs_4,primary_weights_5)+primary_biases_5

        #reduce_sum at final output to 'collapse' vector to single value
        primary_predictions = tf.math.reduce_sum(primary_outputs, axis = 1, name = 'primary_predictions')

        primary_loss = tf.losses.mean_squared_error(predictions = primary_predictions, labels = primary_targets)
        primary_mean_loss = tf.reduce_mean(primary_loss)
        primary_optimize = tf.train.AdamOptimizer(learning_rate=lr).minimize(primary_mean_loss)
        primary_accuracy = tf.reduce_mean(abs(primary_predictions - primary_targets), name = 'primary_accruacy')
        sess = tf.InteractiveSession()
        # try/finally so the session is released even when training fails.
        try:
            init_g = tf.global_variables_initializer()
            sess.run(init_g)

            batch_size = 64
            max_epochs = 50

            primary_train_data = SP_Data_Reader('train',batch_size)
            primary_validation_data = SP_Data_Reader('validate')

            agg_validation_loss = [9999999.]
            # Negative so it can be used directly as a "last N epochs" slice bound.
            primary_patience = -4
            model_dir = self._model_dir()

            #train the primary DNN model
            start_time = time.time()
            for epoch_counter in range(max_epochs):
                curr_epoch_loss = 0.
                for inputs_batch, targets_batch in primary_train_data:
                    _, batch_loss = sess.run([primary_optimize, primary_mean_loss],
                                             feed_dict={primary_inputs: inputs_batch,
                                                        primary_targets: targets_batch})
                    curr_epoch_loss = curr_epoch_loss + batch_loss
                curr_epoch_loss = curr_epoch_loss/primary_train_data.batch_count

                primary_validation_loss = 0.
                primary_validation_accuracy = 0.

                for inputs_batch, targets_batch in primary_validation_data:
                    primary_validation_loss, primary_validation_accuracy = sess.run(
                        [primary_mean_loss,primary_accuracy],
                        feed_dict={primary_inputs: inputs_batch,
                                   primary_targets: targets_batch})

                    print('Primary DNN'+str(self.name)+', Epoch: '+str(epoch_counter+1)+
                          '. Training loss: '+'{0:.3f}'.format(curr_epoch_loss)+
                          '. Validation loss: '+'{0:.3f}'.format(primary_validation_loss)+
                          '. Validation accuracy abs: '+'{0:.2f}'.format(primary_validation_accuracy)+'m')

                #save the model whenever the validation loss reaches a new minimum

                if primary_validation_loss < min(agg_validation_loss):
                    # SavedModelBuilder refuses to write into an existing directory.
                    if os.path.isdir(model_dir):
                        shutil.rmtree(model_dir)

                    primary_builder = tf.saved_model.builder.SavedModelBuilder(model_dir)
                    primary_builder.add_meta_graph_and_variables(
                        sess,["tag"],
                        signature_def_map={"model":tf.saved_model.signature_def_utils.predict_signature_def(
                            inputs= {"primary_inputs": primary_inputs},
                            outputs= {"primary_predictions": primary_predictions})})
                    primary_builder.save()

                agg_validation_loss.append(primary_validation_loss)

                #early stop: none of the last |patience| epochs beat the best earlier loss

                if epoch_counter > abs(primary_patience):
                    if min(agg_validation_loss[primary_patience:]) > min(agg_validation_loss[:primary_patience]):
                        break

            print('End of PDNN training.')
            print("Training time: %s seconds" % (time.time() - start_time))
        finally:
            sess.close()
        return self

    def transform(self, X, y=None):
        """Return ``self`` unchanged (kept for interface compatibility)."""
        return self

    def predict(self, X, y=None):
        """Reload the saved model and return its predictions for ``X``."""
        with tf.Session(graph=tf.Graph()) as sess:
            tf.saved_model.loader.load(sess, ["tag"], self._model_dir())
            graph = tf.get_default_graph()
            primary_inputs = graph.get_tensor_by_name("primary_inputs:0")
            primary_predictions = graph.get_tensor_by_name("primary_predictions:0")
            return sess.run([primary_predictions], feed_dict={primary_inputs:X})[0]

    def score(self, X, y):
        """Print and return the R^2 of the saved model's predictions for ``X`` against ``y``.

        Previously the method returned the result of ``print`` (always None);
        the score is now returned as well as printed.
        """
        r2 = r2_score(y, self.predict(X))
        print('R^2: {}'.format(r2))
        return r2

In [0]:
# build a class to combine the base predictions for the meta DNN

class Meta_Features(TransformerMixin,BaseEstimator):
    """Builds the input of the meta DNN: selected features plus base-model predictions.

    Both ``transform`` and ``fit_transform`` return a 3-tuple
    ``(features, target, PDNN_number)`` where ``features`` holds the
    preprocessed columns followed by one prediction column per primary DNN
    (``PDNN1`` ... ``PDNNn``) and the ``LR``, ``RF`` and ``XGB`` predictions.

    The preprocessing pipeline, the three grid-searched base models and the
    list of primary DNNs are class attributes and therefore shared by all
    instances of this class.
    """

    # Columns of the raw frame used as row index and as regression target.
    ID_COLUMN = 'ID Number'
    TARGET_COLUMN = 'Market 1 Sold (vol::date)'

    feats = named_features

    lr_meta = GridSearchCV(Ridge(),
                                param_grid = {'alpha':[1]},
                                cv=2, n_jobs = -1, refit = True)

    rf_meta = GridSearchCV(RandomForestRegressor(),
                                param_grid = {'n_estimators':[2]},
                                cv=2, n_jobs = -1, refit = True)

    xgb_meta = GridSearchCV(xgb.XGBRegressor(),
                                    param_grid = {'n_estimators':[2], 'max_depth': [2], 'alpha': [1], 'objective': ['reg:squarederror']},
                                    cv=2, n_jobs = -1, refit = True)

    PDNN_number = 5
    # Networks are named 1..PDNN_number; the name selects their SavedModel directory.
    PDNN_Networks = [Primary_DNN(name = position + 1) for position in range(PDNN_number)]

    def __init__(self):
        pass

    def _split_features_target(self, X):
        """Index ``X`` by the ID column and split it into (features, target)."""
        indexed = X.set_index(self.ID_COLUMN)
        return indexed.drop([self.TARGET_COLUMN],axis=1), indexed.loc[:,self.TARGET_COLUMN]

    def _stack_base_predictions(self, meta_df):
        """Append the predictions of every fitted base model to ``meta_df``."""
        PDNN_Predictions = pd.concat(
            [pd.Series(network.predict(meta_df), name = 'PDNN'+str(position), index = meta_df.index)
             for position, network in enumerate(self.PDNN_Networks, start = 1)],
            axis = 1)

        return pd.concat([meta_df,
                          PDNN_Predictions,
                          pd.Series(self.lr_meta.predict(meta_df), name = 'LR', index = meta_df.index),
                          pd.Series(self.rf_meta.predict(meta_df), name = 'RF', index = meta_df.index),
                          pd.Series(self.xgb_meta.predict(meta_df), name = 'XGB', index = meta_df.index)],
                         axis = 1, join='inner')

    def transform(self, X):
        """Return ``(meta features, target, PDNN_number)`` for ``X`` using the fitted models.

        The target is returned on its original scale (it is not log-transformed here).
        """
        X, y = self._split_features_target(X)

        meta_df = pd.DataFrame(self.feats.transform(X)).set_index(y.index)

        return self._stack_base_predictions(meta_df), y, self.PDNN_number

    def fit_transform(self, X, y=None):
        """Fit preprocessing and base models, then return meta features for a held-out part.

        ``X`` is split 80/20; the preprocessing pipeline and all base models are
        fitted on the larger part against the log of the target, and the
        returned meta features and log-target come from the smaller part. The
        ``y`` argument is ignored (the target is read from ``X``); it is
        optional so the step can be called without a separate target.
        """
        X, y = self._split_features_target(X)

        x_train_meta, x_test_meta, y_train_meta, y_test_meta = train_test_split(X, y, train_size = 0.8,
                                                                            test_size = 0.2, random_state = 7)

        # log-transform the targets

        y_train_log = np.log(y_train_meta)
        y_test_log = np.log(y_test_meta)

        self.feats.fit(x_train_meta, y_train_log)
        train_df = pd.DataFrame(self.feats.transform(x_train_meta)).set_index(y_train_log.index)

        self.lr_meta.fit(train_df, y_train_log)
        self.rf_meta.fit(train_df, y_train_log)
        self.xgb_meta.fit(train_df, y_train_log)
        for network in self.PDNN_Networks:
            network.fit(train_df, y_train_log)

        meta_df = pd.DataFrame(self.feats.transform(x_test_meta)).set_index(y_test_log.index)

        return self._stack_base_predictions(meta_df), y_test_log, self.PDNN_number

In [0]:
#create a class to fit, test and predict with the meta DNN model

class Meta_DNN(TransformerMixin,BaseEstimator):
    """Meta-level feed-forward regressor (TensorFlow 1.x) trained on stacked base-model output.

    All public methods take as ``X`` the 3-tuple produced by Meta_Features:
    ``(features, target, number_of_primary_DNNs)``. The network is trained on
    the target as given and ``predict``/``score`` apply ``exp`` to its output.
    The best model of a training run is stored as a SavedModel in ``MODEL_DIR``.
    """

    # The same absolute path is now used for deleting, saving and loading
    # (previously deletion/loading used './drive/...' and saving used
    # '/content/drive/...', which only agree when the working directory is /content).
    MODEL_DIR = r'/content/drive/My Drive/Colab Notebooks/Regression/DNNs/Meta'

    _METRIC_NAMES = ['Absolute Deviation','RMSLE','R^2']

    def __init__(self):
        pass

    def fit(self, X, y = None):
        """Train the meta network on ``X[0]`` (features) and ``X[1]`` (target); returns ``self``.

        The data is split 80/20 into training and validation parts, which are
        written to ``SP_data_train_meta.npz`` / ``SP_data_validate_meta.npz``
        in the working directory and read back through SP_Data_Reader.
        """

        X, y = X[0], X[1]

        #split the meta data to train and validation

        x_train_meta, x_val_meta, y_train_meta, y_val_meta = train_test_split(X, y,
                                                                             train_size = 0.8, test_size = 0.2, random_state = 7)

        np.savez('SP_data_train_meta',inputs = x_train_meta,targets = y_train_meta)
        np.savez('SP_data_validate_meta',inputs = x_val_meta,targets = y_val_meta)

        # set the parameters for the meta DNN model

        output_size = 1
        input_size = x_train_meta.shape[1]

        hidden_layer_size = 512

        tf.reset_default_graph()

        inputs = tf.placeholder(tf.float32,[None, input_size], name = 'inputs')
        targets = tf.placeholder(tf.float32,[None])

        # Four hidden layers (dropout -> linear -> relu), each half as wide as
        # the previous one; integer division keeps the variable shapes integral.
        hidden_sizes = [hidden_layer_size, hidden_layer_size // 2, hidden_layer_size // 4, hidden_layer_size // 8]
        activations = inputs
        fan_in = input_size
        for layer_number, fan_out in enumerate(hidden_sizes, start = 1):
            weights = tf.get_variable("weights_{0}".format(layer_number),[fan_in, fan_out])
            biases = tf.get_variable("biases_{0}".format(layer_number),[fan_out])
            dropped = tf.nn.dropout(activations, rate = 0.01)
            activations = tf.nn.relu(tf.matmul(dropped,weights)+biases)
            fan_in = fan_out

        weights_5 = tf.get_variable("weights_5",[fan_in, output_size])
        biases_5 = tf.get_variable("biases_5",[output_size])

        outputs = tf.matmul(activations,weights_5)+biases_5

        #reduce_sum at final output to 'collapse' vector to single value

        predictions = tf.math.reduce_sum(outputs, axis = 1, name = 'predictions')
        loss = tf.losses.mean_squared_error(predictions = predictions, labels = targets)
        mean_loss = tf.reduce_mean(loss)
        optimize = tf.train.AdamOptimizer(learning_rate=0.001).minimize(mean_loss)
        accuracy = tf.reduce_mean(abs(predictions - targets))
        sess_meta = tf.InteractiveSession()
        # try/finally so the session is released even when training fails.
        try:
            initializer = tf.global_variables_initializer()
            sess_meta.run(initializer)

            batch_size = 16
            max_epochs = 50

            train_data = SP_Data_Reader('train_meta',batch_size)
            validation_data = SP_Data_Reader('validate_meta')

            agg_validation_loss = [9999999.]
            # Negative so it can be used directly as a "last N epochs" slice bound.
            meta_patience = -4

            #train the meta DNN model
            start_time = time.time()
            for epoch_counter in range(max_epochs):
                curr_epoch_loss = 0.
                for inputs_batch, targets_batch in train_data:
                    _, batch_loss = sess_meta.run([optimize, mean_loss],
                                                  feed_dict={inputs: inputs_batch,targets: targets_batch})
                    curr_epoch_loss = curr_epoch_loss + batch_loss
                curr_epoch_loss = curr_epoch_loss/train_data.batch_count

                validation_loss = 0.
                validation_accuracy = 0.

                for inputs_batch, targets_batch in validation_data:
                    validation_loss, validation_accuracy = sess_meta.run([mean_loss,accuracy],
                                                                         feed_dict={inputs: inputs_batch,targets: targets_batch})

                print('Meta Epoch '+str(epoch_counter+1)+
                      '. Training loss: '+'{0:.3f}'.format(curr_epoch_loss)+
                      '. Validation loss: '+'{0:.3f}'.format(validation_loss)+
                      '. Validation accuracy abs: '+'{0:.2f}'.format(validation_accuracy)+'m')

                #save the model whenever the validation loss reaches a new minimum

                if validation_loss < min(agg_validation_loss):
                    # SavedModelBuilder refuses to write into an existing directory.
                    if os.path.isdir(self.MODEL_DIR):
                        shutil.rmtree(self.MODEL_DIR)

                    meta_builder = tf.saved_model.builder.SavedModelBuilder(self.MODEL_DIR)
                    meta_builder.add_meta_graph_and_variables(
                        sess_meta,["tag"],
                        signature_def_map={"meta_model": tf.saved_model.signature_def_utils.predict_signature_def(
                            inputs= {"inputs": inputs},
                            outputs= {"predictions": predictions})})
                    meta_builder.save()

                agg_validation_loss.append(validation_loss)

                #early stop: none of the last |patience| epochs beat the best earlier loss

                if epoch_counter > abs(meta_patience):
                    if min(agg_validation_loss[meta_patience:]) > min(agg_validation_loss[:meta_patience]):
                        break

            print('End of MDNN training.')
            print("Training time: %s seconds" % (time.time() - start_time))
        finally:
            sess_meta.close()
        return self

    def _saved_model_output(self, X):
        """Reload the SavedModel and return ``sess.run([predictions])`` for ``X`` (a one-element list)."""
        with tf.Session(graph=tf.Graph()) as sess:
            tf.saved_model.loader.load(sess, ["tag"], self.MODEL_DIR)
            graph = tf.get_default_graph()
            inputs = graph.get_tensor_by_name("inputs:0")
            predictions = graph.get_tensor_by_name("predictions:0")
            return sess.run([predictions], feed_dict={inputs:X})

    @staticmethod
    def _metrics(actual, predicted):
        """Return [mean absolute deviation, RMSLE, R^2] of ``predicted`` against ``actual``, rounded to 2 places."""
        return [round(sum(abs(actual-predicted))/len(actual),2),
                round(sqrt(mean_squared_error(np.log(actual),np.log(predicted))),2),
                round(r2_score(actual,predicted),2)]

    def predict(self, X, y = None):
        """Return ``exp`` of the saved model's output for the features ``X[0]``."""
        X = X[0]
        return np.exp(self._saved_model_output(X)[0])

    def score(self, X, y = None):
        """Score the meta DNN and every base model; returns a metrics-by-model DataFrame.

        ``X`` is the ``(features, target, number_of_primary_DNNs)`` tuple. The
        base-model prediction columns at the end of ``features`` and the meta
        prediction are passed through ``exp``; the target column is used as
        given. The combined table is also written to
        ``Ensemble_Test_Results.csv`` on Drive.
        """
        X, y, z = X[0], X[1], X[2]

        meta_predictions = np.exp(self._saved_model_output(X))
        # Column layout from the right: [-1] meta prediction, [-2] target,
        # [-3] .. [-(z+5)] the z+3 base-model prediction columns.
        result = pd.concat([X.merge(y.to_frame(),how = 'left',left_index = True,
                                    right_index = True),
                            pd.DataFrame(meta_predictions).transpose().set_index(X.index)], axis = 1)

        base_positions = [(model+3)*-1 for model in range(z+3)]

        for position in base_positions:
            result.iloc[:,position] = np.exp(result.iloc[:,position])

        result.to_csv('./drive/My Drive/Colab Notebooks/Regression/Ensemble_Test_Results.csv')

        actual = result.iloc[:,-2]
        Model_name = [result.columns[position] for position in base_positions]
        per_model = [self._metrics(actual, result.iloc[:,position]) for position in base_positions]

        # per_model is one [abs, rmsle, r2] list per model; transpose it to one row per metric.
        scores_data = pd.DataFrame(data = [list(row) for row in zip(*per_model)], columns = Model_name,
                                   index = self._METRIC_NAMES)

        Scores = pd.DataFrame(data = self._metrics(actual, result.iloc[:,-1]),
                              columns = ['Meta DNN'],index = self._METRIC_NAMES)

        Scores = pd.concat([scores_data, Scores],axis = 1)

        return Scores

In [0]:
# Two-step meta pipeline: the 'meta_feats' step (Meta_Features) runs first and
# its output is handed to the final 'MDNN' step (Meta_DNN).

meta_ppl = Pipeline(
    steps=[
        ('meta_feats', Meta_Features()),
        ('MDNN', Meta_DNN()),
    ]
)

In [0]:
# Fit the full meta pipeline on the training split (df_train).
# Per the logged output below, this trains the primary DNNs and then the meta
# DNN, writing a SavedModel for each under .../Regression/DNNs/ on the mounted Drive.
# NOTE(review): no y is passed here, so the target is presumably extracted from
# df_train inside the pipeline steps -- confirm against Meta_Features / Meta_DNN.

meta_ppl.fit(df_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
  if getattr(data, 'base', None) is not None and \


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Primary DNN1, Epoch: 1. Training loss: 1.968. Validation loss: 1.766. Validation accuracy abs: 1.06m
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: /content/drive/My Drive/Colab Notebooks/Regression/DNNs/Primary1/saved_model.pb
Primary DNN1, Epoch: 2. Training loss: 1.713. Validation loss: 1.698. Validation accuracy abs: 1.04m
INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: /content/drive/My Drive/Colab Notebooks/Regression/DNNs/Primary1/saved_model.pb
Primary DNN1, Epoch: 3. Training loss: 1.660. Validation loss: 1.686. Validation accuracy abs: 1.03m
INFO:tensorflow:No asse

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary1/variables/variables
INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary2/variables/variables
INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary3/variables/variables
INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary4/variables/variables
INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary5/variables/variables
Meta Epoch 1. Training loss: 1.064. Validation loss: 1.056. Validation accuracy abs: 0.77m
INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: /content/drive/My Drive/Colab Notebooks/Regression/DNNs/Meta/saved_model.pb
Meta Epoch 2. Training loss: 1.017. Validation loss: 1.043. Validation accuracy abs: 0.77m
INFO:tensorflow:No assets to sav

Pipeline(memory=None,
         steps=[('meta_feats', Meta_Features()), ('MDNN', Meta_DNN())],
         verbose=False)

In [0]:
# Score the fitted pipeline on the held-out test split (df_test).
# The displayed result is a table with rows Absolute Deviation, RMSLE and R^2,
# one column per base model plus a final 'Meta DNN' column.

meta_ppl.score(df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary1/variables/variables
INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary2/variables/variables
INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary3/variables/variables
INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary4/variables/variables
INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary5/variables/variables
INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Meta/variables/variables


Unnamed: 0,XGB,RF,LR,PDNN5,PDNN4,PDNN3,PDNN2,PDNN1,Meta DNN
Absolute Deviation,3.27,2.81,3.03,2.82,2.89,2.88,2.87,2.92,2.61
RMSLE,1.4,1.12,1.24,1.13,1.14,1.15,1.14,1.17,1.01
R^2,-0.05,0.22,0.06,0.17,0.09,0.12,0.13,0.11,0.27


In [0]:
# Load the new, unseen records that will be run through the fitted pipeline.
# No `encoding` argument is passed: an .xlsx workbook carries its own text
# encoding, older pandas silently ignored the keyword, and current pandas
# rejects it in read_excel with a TypeError.

df_new = pd.read_excel('./drive/My Drive/Colab Notebooks/Regression/Pipeline_Test.xlsx')
df_new.head()

Unnamed: 0,ID Number,Market 1 Sold (vol::date),Market 1 Country,Bond Providers,Market 1 Wrapper,Market 1 Currency,Product Type,Strike Levels (underlying,Payoff Type,Payoff Group,Asset Class(omitted),Income Type,Date,Term,Headline Rate,Minimum Return: Lower,Sales Commission,Participation Rate,Barrier 1,Digital Coupon,Base Interest Rate,Deposit Rate,Implied Volatility,Most Recent GDP growth,PMM vs T3MA
0,68,2.86344,asdf,Bank of Icecream,Note,CAD,Growth,Share Basket (Unspecified),"Callable, Uncapped Call",Exotic,Equity (Share Basket),,2015-03-11,9.008219,,100.0,,100.0,,,1.0,1.25,16.870001,-0.544198,0.791069
1,117,2.882887,Canada,gfdh,Note,CAD,Growth,iShares S&P/TSX Capped Energy Index ETF,"Knock Out, Protected Tracker",Participation,Equity (Single Index),,2015-04-03,3.008219,,1.0,,5.0,-30.0,,1.0,1.25,13.67,-0.269524,0.988702
2,132,3.803878,Canada,Bank of Muffins,hgfj,CAD,Growth,,"Capped Call, Putable",Exotic,Equity (Share Basket),,2015-06-24,5.005479,,102.5,,100.0,,,1.0,1.25,13.19,-0.269524,1.130888
3,210,4.418104,Canada,Bank of Chocolate,Note,jhgk,Growth,Eurostoxx 50,"Knock Out, Protected Tracker",Capital Protection,Equity (Single Index),,2015-07-13,5.005479,,1.0,2.5,5.0,-25.0,,0.75,1.25,16.790001,0.350715,0.92768
4,227,4.505167,Canada,Bank of Icecream,Note,CAD,kjhgk,Eurostoxx 50,"Digital, Knock Out, Protected Tracker",Participation,Equity (Single Index),Variable,2015-07-05,6.030137,0.0,1.0,,5.0,-25.0,3.075,0.75,1.25,17.01,0.350715,0.940003


In [0]:
# Predict on the new data with the fitted pipeline, attach the predictions to
# the input rows as a 'meta' column (aligned on the row index), and save the
# combined frame to Drive before previewing it.

ppl_prediction = meta_ppl.predict(df_new)
New_pred = pd.Series(ppl_prediction, index=df_new.index)
New_est = pd.merge(
    df_new,
    New_pred.to_frame(name='meta'),
    how='left',
    left_index=True,
    right_index=True,
)
New_est.to_csv('./drive/My Drive/Colab Notebooks/Regression/New_Preds.csv')
New_est.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary1/variables/variables
INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary2/variables/variables
INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary3/variables/variables
INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary4/variables/variables
INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary5/variables/variables
INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Meta/variables/variables




Unnamed: 0,ID Number,Market 1 Sold (vol::date),Market 1 Country,Bond Providers,Market 1 Wrapper,Market 1 Currency,Product Type,Strike Levels (underlying,Payoff Type,Payoff Group,Asset Class(omitted),Income Type,Date,Term,Headline Rate,Minimum Return: Lower,Sales Commission,Participation Rate,Barrier 1,Digital Coupon,Base Interest Rate,Deposit Rate,Implied Volatility,Most Recent GDP growth,PMM vs T3MA,meta
0,68,2.86344,asdf,Bank of Icecream,Note,CAD,Growth,Share Basket (Unspecified),"Callable, Uncapped Call",Exotic,Equity (Share Basket),,2015-03-11,9.008219,,100.0,,100.0,,,1.0,1.25,16.870001,-0.544198,0.791069,2.067596
1,117,2.882887,Canada,gfdh,Note,CAD,Growth,iShares S&P/TSX Capped Energy Index ETF,"Knock Out, Protected Tracker",Participation,Equity (Single Index),,2015-04-03,3.008219,,1.0,,5.0,-30.0,,1.0,1.25,13.67,-0.269524,0.988702,4.169425
2,132,3.803878,Canada,Bank of Muffins,hgfj,CAD,Growth,,"Capped Call, Putable",Exotic,Equity (Share Basket),,2015-06-24,5.005479,,102.5,,100.0,,,1.0,1.25,13.19,-0.269524,1.130888,9.419992
3,210,4.418104,Canada,Bank of Chocolate,Note,jhgk,Growth,Eurostoxx 50,"Knock Out, Protected Tracker",Capital Protection,Equity (Single Index),,2015-07-13,5.005479,,1.0,2.5,5.0,-25.0,,0.75,1.25,16.790001,0.350715,0.92768,6.02834
4,227,4.505167,Canada,Bank of Icecream,Note,CAD,kjhgk,Eurostoxx 50,"Digital, Knock Out, Protected Tracker",Participation,Equity (Single Index),Variable,2015-07-05,6.030137,0.0,1.0,,5.0,-25.0,3.075,0.75,1.25,17.01,0.350715,0.940003,3.499247


In [0]:
# Reload the saved Primary3 model into a fresh graph and list its trainable
# variables so the layer shapes can be inspected.
with tf.Session(graph=tf.Graph()) as sess:
    tf.saved_model.loader.load(
        sess,
        ["tag"],
        r'./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary3',
    )
    graph = tf.get_default_graph()
    # Look up the tensors named "primary_inputs:0" / "primary_predictions:0".
    primary_inputs = graph.get_tensor_by_name("primary_inputs:0")
    primary_predictions = graph.get_tensor_by_name("primary_predictions:0")
    var = tf.trainable_variables()

var

INFO:tensorflow:Restoring parameters from ./drive/My Drive/Colab Notebooks/Regression/DNNs/Primary3/variables/variables


[<tf.Variable 'primary_weights_1:0' shape=(64, 1024) dtype=float32_ref>,
 <tf.Variable 'primary_biases_1:0' shape=(1024,) dtype=float32_ref>,
 <tf.Variable 'primary_weights_2:0' shape=(1024, 512) dtype=float32_ref>,
 <tf.Variable 'primary_biases_2:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'primary_weights_3:0' shape=(512, 256) dtype=float32_ref>,
 <tf.Variable 'primary_biases_3:0' shape=(256,) dtype=float32_ref>,
 <tf.Variable 'primary_weights_4:0' shape=(256, 128) dtype=float32_ref>,
 <tf.Variable 'primary_biases_4:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'primary_weights_5:0' shape=(128, 1) dtype=float32_ref>,
 <tf.Variable 'primary_biases_5:0' shape=(1,) dtype=float32_ref>]

In [0]:
import tensorflow as tf