In [1]:
class EarlyStopper:
    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [2]:
import math
import numpy as np
import pandas as pd
import torch
import torch.distributions as td
from torch import nn, optim
from torch.nn import functional as F
from tqdm import tqdm
from tqdm.auto import trange
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import mean_squared_error 
from SinkhornDistance import SinkhornDistance

class DisentangledVAE:
    def __init__(
        self, n_epochs, input_dimension, latent_dimension, hidden_layer_width, number_of_labels=3, weight=[1,1], device=None
    ):
        self.n_epochs=n_epochs
        self.hidden_layer_width = hidden_layer_width
        self.input_dimension = input_dimension
        self.latent_dimension = latent_dimension
        self.number_of_labels = number_of_labels #supervised dimension
        self.pred_weight = weight
        self.beta = 1
        self.recon_weight = 0.1
        self.KL_weight=0.1
        self.z_var = 1
        self.reg_weight = 1
        self.wasserstein=1
        self.decoder = nn.Sequential(
            torch.nn.Linear(self.latent_dimension, self.hidden_layer_width),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_layer_width, self.hidden_layer_width),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_layer_width, self.hidden_layer_width),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_layer_width, self.input_dimension),
        ).to(device)
        self.encoder = nn.Sequential(
            torch.nn.Linear(self.input_dimension, self.hidden_layer_width),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_layer_width, self.hidden_layer_width),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_layer_width, self.hidden_layer_width),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_layer_width, (2 * self.latent_dimension)),
        ).to(device)
        self.optimizer = optim.Adam(
            list(self.encoder.parameters()) + list(self.decoder.parameters()), lr=1e-5
        )
        self.mse = nn.MSELoss(reduction='mean')
        self.device=device
        self.batch_size = 64
        self.early_stopper = EarlyStopper(patience=2, min_delta=0.05)

        self.generate_data=False
        self.kl_loss = nn.KLDivLoss(reduction="batchmean", log_target=True)
        self.K=10
        self.sinkhorn = SinkhornDistance(eps=0.1, 
                                         max_iter=100, 
                                         device= device, 
                                         reduction=None)
        self.plot=False
        #deep fake loss
        self.sample_ratio=1
        self.generate_data=False

    def weights_init(self, layer):
        if type(layer) == nn.Linear:
            torch.nn.init.orthogonal_(layer.weight)

    def matrix_log_density_gaussian(self, x, mu, logvar):
        # broadcast to get probability of x given any instance(row) in (mu,logvar)
        # [k,:,:] : probability of kth row in x from all rows in (mu,logvar)
        x = x.view(self.batch_size, 1, self.latent_dimension)
        mu = mu.view(1, self.batch_size, self.latent_dimension)
        logvar = logvar.view(1, self.batch_size, self.latent_dimension)
        return td.Normal(loc=mu, scale=(torch.exp(logvar)) ** 0.5).log_prob(x)

    def log_importance_weight_matrix(self):
        """
        Calculates a log importance weight matrix
        Parameters
        ----------
        batch_size: int
            number of training images in the batch
        dataset_size: int
        number of training images in the dataset
        """
        N = self.n_data
        M = self.batch_size - 1
        strat_weight = (N - M) / (N * M)
        W = torch.Tensor(self.batch_size, self.batch_size).fill_(1 / M)
        W.view(-1)[:: M + 1] = 1 / N
        W.view(-1)[1 :: M + 1] = strat_weight
        W[M - 1, 0] = strat_weight
        return W.log()

    def get_log_qz_prodzi(self, latent_sample, latent_dist, is_mss=True):
        mat_log_qz = self.matrix_log_density_gaussian(
            latent_sample,
            latent_dist[..., : self.latent_dimension],
            latent_dist[..., self.latent_dimension :],
        )
        if is_mss:
            # use stratification
            log_iw_mat = self.log_importance_weight_matrix().to(
                latent_sample.device
            )
            mat_log_qz = mat_log_qz + log_iw_mat.view(self.batch_size, self.batch_size, 1)
            log_qz = torch.logsumexp(
                log_iw_mat + mat_log_qz.sum(2), dim=1, keepdim=False
            )
            log_prod_qzi = torch.logsumexp(
                log_iw_mat.view(self.batch_size, self.batch_size, 1) + mat_log_qz,
                dim=1,
                keepdim=False,
            ).sum(1)

        else:
            log_prod_qzi = (
                torch.logsumexp(
                    mat_log_qz, dim=1, keepdim=False
                )  # sum of probabilities in each latent dimension
                - math.log(self.batch_size * self.n_data)
            ).sum(1)
            log_qz = torch.logsumexp(
                mat_log_qz.sum(2),  # sum of probabilities across all latent dimensions
                dim=1,
                keepdim=False,
            ) - math.log(self.batch_size * self.n_data)

        return log_qz, log_prod_qzi

    def _kl_normal_loss(self, mean, logvar):
        """
        Calculates the KL divergence between a normal distribution
        with diagonal covariance and a unit normal distribution.
        Parameters
        ----------
        mean : torch.Tensor
            Mean of the normal distribution. Shape (batch_size, latent_dim) where
            D is dimension of distribution.
        logvar : torch.Tensor
            Diagonal log variance of the normal distribution. Shape (batch_size,
            latent_dim)
        storer : dict
            Dictionary in which to store important variables for vizualisation.
        """
        latent_dim = mean.size(1)
        # batch mean of kl for each latent dimension
        latent_kl = 0.5 * (-1 - logvar + mean.pow(2) + logvar.exp()).mean(dim=0)
        total_kl = latent_kl.sum()

        return total_kl
    def generate_fake(self, x1):
        out_encoder = self.encoder(x1.to(self.device))
        treatment = td.Independent(td.Normal(loc=out_encoder[:, :self.latent_dimension],
                                            scale=torch.exp(out_encoder[:, self.latent_dimension :]) ** 0.5,
                                            )
                                   ,1)
        x2 = treatment.rsample([self.sample_ratio]).reshape(
            (-1,self.latent_dimension)
        )
        return self.decoder(x2).cpu()

    def new_data(self,xhat_0,yhat_0,ite) :
        if yhat_0[:,0].sum()*2>len(yhat_0):
            minority = xhat_0[yhat_0[:,0]==0]
            minority_label = yhat_0[yhat_0[:,0]==0]
            minority_ite = ite[yhat_0[:,0]==0]
        else:
            minority = xhat_0[yhat_0[:,0]==1]
            minority_label = yhat_0[yhat_0[:,0]==1]
            minority_ite = ite[yhat_0[:,0]==0]
        upsampled_data = [xhat_0]
        upsampled_label = [yhat_0]
        upsampled_ite = [ite]
        for i,sample_ind in enumerate(np.array_split(np.arange(len(minority)),100)):
            upsampled_data.append(self.generate_fake(minority[sample_ind,:]))
            upsampled_label.append(minority_label[sample_ind,:].repeat(self.sample_ratio,1))
            upsampled_ite.append(minority_ite[sample_ind].repeat(self.sample_ratio))
        epoch_data = torch.cat(upsampled_data, dim=0)
        epoch_label = torch.cat(upsampled_label, dim=0)
        epoch_ite = torch.cat(upsampled_ite, dim=0)
        return epoch_data,epoch_label ,epoch_ite    
    def pred_loss(self,targets,out_encoder):
        # when classification: cn_loss = nn.BCEWithLogitsLoss().cuda()
        pred_losses = []       
        if len(targets[targets[:,0]==1,1])>0:
            loc = out_encoder[targets[:,0]==1,1].reshape((-1,1)) *self.std_treat + self.mean_treat
            truth = targets[targets[:,0]==1,1].reshape((-1,1)) *self.std_treat + self.mean_treat
            pred_losses.append(self.mse(loc,truth))
        else:
            pred_losses.append(torch.tensor([float('0')]).to(self.device))
        if len(targets[targets[:,0]==0,2])>0:
            loc = out_encoder[targets[:,0]==0,2].reshape((-1,1))*self.std_control + self.mean_control
            truth = targets[targets[:,0]==0,2].reshape((-1,1))*self.std_control + self.mean_control
            pred_losses.append(self.mse(loc,truth))
        else:
            pred_losses.append(torch.tensor([float('0')]).to(self.device))
        return pred_losses
    def compute_mmd(self,z1, z2, reg_weight):
        prior_z__kernel = self.compute_kernel(z1, z1)
        z__kernel = self.compute_kernel(z2, z2)
        priorz_z__kernel = self.compute_kernel(z1, z2)

        mmd = reg_weight * prior_z__kernel.mean() + \
              reg_weight * z__kernel.mean() - \
              2 * reg_weight * priorz_z__kernel.mean()
        return mmd
    def compute_kernel(self,x1,x2,kernel_type='rbf'):
        # Convert the tensors into row and column vectors
        D = x1.size(1)
        N = x1.size(0)
        x1 = x1.unsqueeze(-2) # Make it into a column tensor
        x2 = x2.unsqueeze(-3) # Make it into a row tensor

        """
        Usually the below lines are not required, especially in our case,
        but this is useful when x1 and x2 have different sizes
        along the 0th dimension.
        """
        x1 = x1.expand(N, N, D)
        x2 = x2.expand(N, N, D)
        if kernel_type == 'rbf':
            result = self.compute_rbf(x1, x2)
        elif kernel_type == 'imq':
            result = self.compute_inv_mult_quad(x1, x2)
        else:
            raise ValueError('Undefined kernel type.')
        return result

    def compute_rbf(self,x1,x2,eps = 1e-7):
        """
        Computes the RBF Kernel between x1 and x2.
        :param x1: (Tensor)
        :param x2: (Tensor)
        :param eps: (Float)
        :return:
        """
        z_dim = x2.size(-1)
        sigma = 2. * z_dim * self.z_var
        result = torch.exp(-((x1 - x2).pow(2).mean(-1) / sigma))
        return result

    def compute_inv_mult_quad(self,x1,x2,eps= 1e-7):
        """
        Computes the Inverse Multi-Quadratics Kernel between x1 and x2,
        given by
                k(x_1, x_2) = \sum \frac{C}{C + \|x_1 - x_2 \|^2}
        :param x1: (Tensor)
        :param x2: (Tensor)
        :param eps: (Float)
        :return:
        """
        z_dim = x2.size(-1)
        C = 2 * z_dim * self.z_var
        kernel = C / (eps + C + (x1 - x2).pow(2).sum(dim = -1))
        # Exclude diagonal elements
        result = kernel.sum() - kernel.diag().sum()
        return result
    def _trainer(self, train_data, targets):
        train_data=train_data.view(-1, self.input_dimension)
        targets = targets.view(-1,self.number_of_labels)
        self.batch_size = train_data.shape[0]
        torch.cuda.empty_cache()
        self.optimizer.zero_grad()
        self.encoder.zero_grad()
        self.decoder.zero_grad()
        [
            loss,
            pred_loss,
            recon_loss,
            original_KL,
            tc_loss,
            mmd_loss,
        ] = self.compute_loss(data=train_data,targets = targets)
        loss.backward()
        self.optimizer.step()
        return (
            loss.item(),
            pred_loss,
            recon_loss,
            original_KL,
            tc_loss,
            mmd_loss
        )
    def trainer(self,train_data,test_data,train_label,eval_data, ite_train, ite_eval,ite_test,
                ):
        self.encoder.apply(self.weights_init)
        self.decoder.apply(self.weights_init)
        train_loss = []
        val_loss = []
        test_score=-1
        
        train_data = torch.Tensor(train_data)
        test_data = torch.Tensor(test_data).to(self.device)
        train_label = torch.Tensor(train_label)
        eval_data = torch.Tensor(eval_data).to(self.device)
        ite_train = torch.Tensor(ite_train)
        ite_eval = torch.Tensor(ite_eval).to(self.device)
        ite_test = torch.Tensor(ite_test).to(self.device)
        
        self.mean_treat  = torch.mean(train_label[:,1])
        self.std_treat  = torch.std(train_label[:,1])
        self.mean_control = torch.mean(train_label[:,2])
        self.std_control = torch.std(train_label[:,2])
        train_label[:,1] = (train_label[:,1]- self.mean_treat) / self.std_treat
        train_label[:,2] = (train_label[:,2] - self.mean_control) / self.std_control
        epoch_label = train_label.detach().clone()
        epoch_data = train_data.detach().clone()
        epoch_ite = ite_train.detach().clone()
        for epoch in range(self.n_epochs):
            self.epoch = epoch
            epoch_train_loss=[]
            train_set = torch.utils.data.TensorDataset(epoch_data, epoch_label,epoch_ite)
            train_loader = DataLoader(train_set, shuffle=True,
                                      num_workers=1, drop_last=True, 
                                      batch_size=self.batch_size)
            self.n_data = len(epoch_data)
            for i, batch_data in enumerate(train_loader,0):
                data,label,ite = batch_data
                loss, pred_loss, recon_loss,original_KL,tc_loss, mmd_loss = self._trainer(data.to(self.device),
                                                                                          label.to(self.device),
                                                                                         )
                epoch_train_loss.append([loss, pred_loss,recon_loss,original_KL,
                                         tc_loss, mmd_loss])
            
            epoch_pred = self.encoder(eval_data)
            epoch_pred[:,1] = epoch_pred[:,1] *self.std_treat + self.mean_treat
            epoch_pred[:,2] = epoch_pred[:,2] *self.std_control + self.mean_control
            preds= epoch_pred[:,1] - epoch_pred[:,2]
            
            epoch_val_loss=self.mse(preds.reshape(-1,1) ,
                                    ite_eval.reshape(-1,1)
                                   ).item()**0.5 
            train_loss.append(np.mean(epoch_train_loss,axis=0))
            val_loss.append(epoch_val_loss)      
            if self.generate_data:
                with torch.no_grad():
                    epoch_data,epoch_label,epoch_ite = self.new_data(train_data.detach().clone(),
                                                           train_label.detach().clone(),
                                                          ite_train.detach().clone())   
            if self.early_stopper.early_stop(epoch_val_loss):
                encoded_test = self.encoder(test_data) 
                encoded_test[:,1] = encoded_test[:,1] *self.std_treat + self.mean_treat
                encoded_test[:,2] = encoded_test[:,2] *self.std_control + self.mean_control
                preds= encoded_test[:,1] - encoded_test[:,2]
                test_score = self.mse(preds.reshape(-1,1),
                                      ite_test.reshape(-1,1)).item()**0.5
                break
        if self.plot:
            f = plt.figure(figsize=(10,5))
            train_losses= np.array(train_loss).reshape(-1,6)
            for i, loss in enumerate(['combined_loss','pred_loss','recon_loss','original KL',
                                      'tc_loss','mmd_loss'+str(self.wasserstein)]):
                train = train_losses[:,i]
                ax = f.add_subplot(1,1,1)
                plt.title(loss)
                plt.plot(train,label='train')
                if i ==1:
                    plt.plot(val_loss,label='eval' )
                plt.legend()
                plt.show()
        return train_loss,val_loss,test_score
    def simple_mmd_loss(self,X_treat, X_control):
        """Calculate Maximum Mean Discrepancy loss."""
        return 2 * torch.norm(X_treat.mean(axis=0) - X_control.mean(axis=0))

    def compute_loss(self, data,targets):
        out_encoder = self.encoder(data)
        #resample latent variables 
        q_zgivenxobs = td.Independent(
            td.Normal(
                loc=out_encoder[..., : self.latent_dimension],
                scale=torch.exp(out_encoder[..., self.latent_dimension :]) ** 0.5,
            ),
            1,
        )  # each row is a latent vector
        zgivenx_flat = q_zgivenxobs.rsample()
        zgivenx = zgivenx_flat.reshape((-1, self.latent_dimension))

        # calculate reconstruction loss
        out_decoder = self.decoder(zgivenx)
        recon_loss = self.mse(out_decoder, data)
        
        #calculate mmd_loss
        if len(targets[targets[:,0]==1,:])==0 or len(targets[targets[:,0]==1,:])==len(targets):
            mmd_loss = torch.tensor([float('0')]).to(self.device)
        else:
            control = td.Independent(td.Normal(loc=out_encoder[targets[:,0]==0, 
                                                               3:self.latent_dimension],
                                                scale=torch.exp(
                                                    out_encoder[targets[:,0]==0,
                                                                self.latent_dimension+3 :]) ** 0.5,
                                                )
                                       ,1)
            #x1 shape: (K*bs, 1, latent_dim)
            x1 = control.rsample([self.K]).view(-1,1,
                                           self.latent_dimension-self.number_of_labels)
            
            target = control.log_prob(x1).mean(axis=1).view(-1,1) 
            #target shape: (K*bs, control.size(0))
            treatment = td.Independent(td.Normal(loc=out_encoder[targets[:,0]==1, 
                                                                 3:self.latent_dimension],
                                                scale=torch.exp(
                                                    out_encoder[targets[:,0]==1, 
                                                                self.latent_dimension+3 :]) ** 0.5,
                                                )
                                       ,1)
            #prob_treatment shape: (K*bs, treatment.size(0))
            prob_treatment = treatment.log_prob(x1).mean(axis=1).view(-1,1)
            if self.wasserstein==1: 
                bias_corr = self.batch_size *  (self.batch_size - 1)
                reg_weight = self.reg_weight / bias_corr
                mmd_loss = 10**6*self.compute_mmd(target, 
                                            prob_treatment,
                                            reg_weight)
            elif self.wasserstein==2: 
                mmd_loss=10*self.simple_mmd_loss(target,
                                              prob_treatment)
                
            elif self.wasserstein==3:
                mmd_loss, _,_ = self.sinkhorn(target, 
                                              prob_treatment)
                mmd_loss = 10*mmd_loss
            else: 
                mmd_loss = 10**8*self.kl_loss(F.log_softmax(torch.exp(prob_treatment),dim=0),
                                        F.log_softmax(torch.exp(target),dim=0)
                                        )
       
        # calculate the original KL in VAE
        original_KL = self._kl_normal_loss(
            out_encoder[..., self.number_of_labels: self.latent_dimension],
            out_encoder[..., self.latent_dimension +self.number_of_labels :],
        )
        # prob of z given observations x
        log_pz = (
            td.Independent(
                td.Normal(
                    loc=torch.zeros_like(zgivenx), scale=torch.ones_like(zgivenx)
                ),
                1,
            )
            .log_prob(zgivenx)
            .mean()
        )
        log_q_zCx = q_zgivenxobs.log_prob(zgivenx).mean()

        log_qz, log_prod_qzi = self.get_log_qz_prodzi(
            zgivenx, out_encoder
        )
        # I[z;x] = KL[q(z,x)||q(x)q(z)] = E_x[KL[q(z|x)||q(z)]]
        mi_loss = (log_q_zCx - log_qz).mean()
        # TC[z] = KL[q(z)||\prod_i z_i]
        tc_loss = (log_qz - log_prod_qzi).mean()
        # dw_kl_loss is KL[q(z)||p(z)] instead of usual KL[q(z|x)||p(z))]
        dw_kl_loss = (log_prod_qzi - log_pz).mean()
        
        prediction_losses = self.pred_loss(targets,out_encoder)
        
        loss_prediction=  (1 + 0.2*self.epoch)*(prediction_losses[1]*self.pred_weight[1] + \
        prediction_losses[0]*self.pred_weight[0])
        neg_bound =  loss_prediction+ recon_loss*self.recon_weight + original_KL*self.KL_weight \
        + tc_loss * self.beta + mmd_loss*self.gamma
        return (
            neg_bound,
            loss_prediction.item(),
            recon_loss.item()*self.recon_weight,
            original_KL.item()*self.KL_weight,
            tc_loss.item()* self.beta,
            mmd_loss.item()*self.gamma
        )

        

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
path = 'csv_files/' #path of IHDP files, downloaded from https://www.fredjo.com/
def processed_data(i):
    train_data = pd.read_csv(path+'1000_train'+str(i)+'.csv').values
    train_data[:,13] = train_data[:,13]-1 #processing according to CEVAE
    test_data = pd.read_csv(path+'1000_test'+str(i)+'.csv').values
    test_data[:,13] = test_data[:,13]-1 #processing according to CEVAE
    train_label = np.zeros((len(train_data),3))
    t= pd.read_csv(path+'1000_train_t'+str(i)+'.csv').values.flatten()
    train_label[:,0] = t
    yf = pd.read_csv(path+'1000_train_yf'+str(i)+'.csv').values.flatten()
    ycf = pd.read_csv(path+'1000_train_ycf'+str(i)+'.csv').values.flatten()
    train_label[:,1] = np.where(t==1,yf, ycf) #treatment effect
    train_label[:,2] = np.where(t==0,yf, ycf) #non-treatment effect
    train_ite = np.zeros((len(train_data),1))
    train_ite = np.where(t==1,yf-ycf, ycf-yf)
    ite_test = pd.read_csv(path+'1000_test_ite'+str(i)+'.csv').values
    train_eval_split = int(0.8*len(train_data))
    indices = np.random.permutation(train_data.shape[0])
    training_idx, eval_idx = indices[:train_eval_split], indices[train_eval_split:]
    return train_data[training_idx,:],test_data,train_label[training_idx,:],\
            train_data[eval_idx,:],train_ite[training_idx],train_ite[eval_idx], \
            ite_test
    

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

VAE = DisentangledVAE(n_epochs=150, number_of_labels=3,input_dimension =25 ,
                      latent_dimension = 10, 
                      hidden_layer_width=500,device=device)
VAE.batch_size=32
VAE.optimizer = optim.Adam(
    list(VAE.encoder.parameters()) + list(VAE.decoder.parameters()), lr=1e-4
    # ,weight_decay=1e-5
)
#prediction loss
VAE.pred_weight = [1,1]
# KL loss
VAE.KL_weight=1
#TC loss
VAE.beta = .1
#mmd loss
VAE.gamma=.1
VAE.plot=False
#recon loss
VAE.recon_weight = .1

VAE.early_stopper = EarlyStopper(patience=4, min_delta=0.05)

In [5]:
train_losses=[]
eval_losses=[]
test_losses=[]

#in this demo, we take all 1000 replications
for i in trange(1000, position=0, desc="replication", leave=True, colour='black',):
    train_data,test_data,train_label,eval_data, ite_train, ite_eval,ite_test = processed_data(i)   
    VAE.wasserstein = 3
    VAE.pred_weight[0] =min(1, 0.1*train_label[train_label[:,0]==1,1].std())
    VAE.pred_weight[1] =min(1, 0.1*train_label[train_label[:,0]==0,2].std())
    VAE.early_stopper = EarlyStopper(patience=3, min_delta=0.05)
    score_train,score_eval,test_score = VAE.trainer(train_data,test_data,train_label,
                                                    eval_data, ite_train, ite_eval,
                                                    ite_test)
    train_losses.append(score_train)
    eval_losses.append(score_eval)
    test_losses.append(test_score)
    if i%20==0:
        print(i, np.mean(test_losses))


replication:   0%|          | 0/1000 [00:00<?, ?it/s]

0 0.7558301579942495
20 1.1997330698872024
40 1.4015571646085876
60 1.2460875413109855
80 1.16341406855023
100 1.1909069341294314
120 1.1736943499885182
140 1.174061009167624
160 1.161895283457155
180 1.1269594246207764
200 1.0835938837892087
220 1.1104491536499124
240 1.0839014553708495
260 1.1039305370824604
280 1.0876450563651754
300 1.0785998109015622
320 1.084301519174837
340 1.109668150221493
360 1.134661119247293
380 1.1261475927454196
400 1.1242566855494145
420 1.1248608941387763
440 1.126160569088464
460 1.114024931802719
480 1.1267691322525049
500 1.1176694250793278
520 1.122576967763496
540 1.1151985180076633
560 1.106914793126009
580 1.098937150577528
600 1.1238941403263083
620 1.1225157900381366
640 1.1235515792763264
660 1.1393948576810426
680 1.1469535552016352
700 1.1457089801705835
720 1.1439502751905228
740 1.140806109263491
760 1.1373349094842602
780 1.162481946892535
800 1.1634805794643193
820 1.1757626732643385
840 1.1769200244935225
860 1.171856044557904
880 1.173

In [6]:
#print the results
from scipy.stats import sem
results = np.array(test_losses)
print(
    '\n The average rpehe of test data are: ',
    np.mean(results),
    'The standard error of epehe in test data are: ',
    sem(results))


 The average rpehe of test data are:  1.2000013613424507 The standard error of epehe in test data are:  0.0409820255155511
