In [1]:
import torch.nn as nn

class Autoencoder(nn.Module):
    def __init__(self, dim,theta):
        super(Autoencoder, self).__init__()
        self.dim = dim

        self.drop_out = nn.Dropout(p=0.1)

        self.encoder = nn.Sequential(
            nn.Linear(dim + theta * 0, dim + theta * 1),
            nn.Tanh(),
            nn.Linear(dim + theta * 1, dim + theta * 2),
            nn.Tanh(),
            nn.Linear(dim + theta * 2, dim + theta * 3)
        )

        self.decoder = nn.Sequential(
            nn.Linear(dim + theta * 3, dim + theta * 2),
            nn.Tanh(),
            nn.Linear(dim + theta * 2, dim + theta * 1),
            nn.Tanh(),
            nn.Linear(dim + theta * 1, dim + theta * 0)
        )

    def forward(self, x):
        x = x.view(-1, self.dim)
        x_missed = self.drop_out(x)

        z = self.encoder(x_missed)
        out = self.decoder(z)

        out = out.view(-1, self.dim)

        return out

In [3]:
#基础包
import os
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.utils.data
from pandas import isnull
from logger import logger
#继承类和model
from utils.tools import Solver
from utils.normalizer import NORMALIZERS,RECOVER

#超参数选择
# from hyperopt import hp
from ray import tune
from ray.tune.suggest.hyperopt import HyperOptSearch
from utils.handle_missingdata import gene_missingdata
# space = {
#     "lr": hp.loguniform("lr", 1e-10, 0.1),
#     "momentum": hp.uniform("momentum", 0.1, 0.9),
# }

#baseline插补方法
from ycimpute.imputer import  mice
from ycimpute.utils import evaluate
from utils.base_impute import random_inpute
from fancyimpute import IterativeImputer, SimpleFill
imputation = {'median':SimpleFill("median").fit_transform,'random':random_inpute,'mice':mice.MICE().complete,'ii':IterativeImputer().fit_transform}

from matplotlib import pyplot as plt
# plotting params
%matplotlib inline
plt.rcParams['font.size'] = 10
plt.rcParams['axes.labelsize'] = 10
plt.rcParams['axes.titlesize'] = 10
plt.rcParams['xtick.labelsize'] = 8
plt.rcParams['ytick.labelsize'] = 8
plt.rcParams['legend.fontsize'] = 10
plt.rcParams['figure.titlesize'] = 12
plt.rcParams['figure.figsize'] = (8.0, 8.0)
def draw_train_loss(Batchs, train_loss):
    title="training loss"
    plt.title(title, fontsize=24)
    plt.xlabel("batch", fontsize=14)
    plt.ylabel("loss", fontsize=14)
    plt.plot(Batchs, train_loss, color='red', label='training loss')
    plt.legend()
    plt.grid()
    plt.show()


class TAI(Solver):
    def __init__(
            self,
            theta=5,
            epochs=50,
            use_cuda=False,
            batch_size=64,
            early_stop=1e-06,
            normalizer='min_max',
            iterations=10,
            first_imputation_method='ii',
            learning_rate=0.0001,
            verbose=True):

        Solver.__init__(
            self,
            normalizer=normalizer)

        self.theta = theta
        self.epochs = epochs
        self.use_cuda = use_cuda
        self.batch_size = batch_size
        self.verbose = verbose
        self.iterations=iterations
        self.early_stop = early_stop
        self.learning_rate=learning_rate
        self.first_imputation_method=first_imputation_method

        self.device = torch.device("cuda:0" if self.use_cuda else "cpu")

    def training(self, training_data,missing_mask):
        n_features = training_data.shape[1]

        training_mask=1-missing_mask
        training_data = torch.from_numpy(training_data).float()
        train_loader = torch.utils.data.DataLoader(dataset=training_data,
                                                   batch_size=self.batch_size,
                                                   shuffle=True)
        model = Autoencoder(dim=n_features,
                            theta=self.theta).to(self.device)
        criterion = torch.nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(),lr=self.learning_rate)
        
        Batch=0
        Batchs=[]
        cost_list = []
        early_stop = False
        for epoch in range(self.epochs):
            for batch_idx, batch_data in enumerate(train_loader):
                inputs = batch_data.to(self.device)
                inputs = inputs.resize_(self.batch_size, n_features)
                dec = model(inputs)
                loss = criterion(dec[training_mask], inputs[training_mask])
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if batch_idx!=0 and batch_idx%50==0:
                    Batch = Batch+50 
                    Batchs.append(Batch)
                    all_train_loss.append(loss.item())
                # early stopping rule 1 : MSE < 1e-06
                if loss.item() < 1e-06:
                    early_stop = True
                    break
                if batch_idx % 50 == 0:
                    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        epoch, batch_idx * len(batch_data), len(train_loader.dataset),
                               100. * batch_idx / len(train_loader), loss.item()))
                cost_list.append(loss.item())

            if early_stop:
                break
        print(all_train_loss)
        draw_train_loss(Batchs,all_train_loss)
        return model
    def get_missing_loc(self,missdata):
        miss_mark = []  # 记录缺失位置
        for i in missdata:
            temp = []
            for index, j in enumerate(i):
                if np.isnan(j):
                    temp.append(index)
            miss_mark.append(temp)
        return miss_mark
    def revise(self,data,model,missing_mask):
        criterion = torch.nn.MSELoss()

        n_feature=data.shape[1]
        inputs = torch.ones(self.batch_size,n_feature, requires_grad=True, device=self.device)
        for num in range(len(data)):
            for ind in range(len(data[num])):
                inputs.data[num,ind] = data[num][ind]

        for t in range(self.iterations):
            outputs = model(inputs)
            loss = criterion(outputs, inputs)
            loss.backward()
            with torch.no_grad():
                inputs.data -= self.learning_rate * inputs.grad.data
                # for mark in miss_loaction[num]:
                #     inputs.data -= self.learning_rate * inputs.grad.data
                inputs.grad.zero_()
        final_num = data*(1-missing_mask)+inputs.data.cpu().numpy()*missing_mask
        return final_num
    def solve(self, X,missing_mask):
        model = self.training(self.fill(X.copy(),missing_mask,'zero'),missing_mask)
        model.eval()
        pred_data = imputation[self.first_imputation_method](X)
        filled_data =self.revise(data=pred_data.copy(),model=model,missing_mask=missing_mask)
        return filled_data, pred_data

    def complete(self, x):
        """
        Expects 2d float matrix with NaN entries signifying missing values

        Returns completed matrix without any NaNs.
        """
        self._check_input(x)
        self._check_missing_value_mask(isnull(x))
        x, missing_mask = self.prepare_input_data(x)

        x_zero_replaced = self.fill(x.copy(),missing_mask,'zero')
        if self.normalizer is not None:
            normalizer = NORMALIZERS[self.normalizer]
            x_zero_replaced, min_record, max_record = normalizer(x_zero_replaced)
        x_zero_replaced[missing_mask]=np.nan
        x_filled,first_filled = self.solve(x_zero_replaced,missing_mask)
        revocer = RECOVER[self.normalizer]
        x_filled = revocer(x_filled, min_record, max_record)
        first_filled= revocer(first_filled, min_record, max_record)
        return x_filled,first_filled

In [7]:
import os
import impyute
import numpy as np
import pandas as pd

from logger import logger
from ycimpute.utils import evaluate
from matplotlib import pyplot as plt
from utils.handle_missingdata import gene_missingdata,gene_missingdata_taxa_bias,gene_missingdata_chara_bias,gene_missingdata_block_bias
from dnn.mida import MIDA
from dnn.gain import GAIN
from dnn.tai import TAI
from ycimpute.imputer import knnimput, mice, EM
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, IterativeImputer, BiScaler, SimpleFill

path = r'public_data/'
pciturePath = r'G:\labWork\imputation_plt\pub_my_method'

file='1_Iris.xlsx'
logger.info("**********************{}********************".format(file))
data = pd.read_excel(os.path.join(path, file), sheet_name="dataset")
dt = np.array(data.values)
data = dt.astype('float')
origin_data = data[:, :-1]
target = data[:, -1]



miss_data=gene_missingdata_block_bias(rate=0.4, data=origin_data)
imputed_data , first_imputed_data= TAI(first_imputation_method='mice',batch_size=len(miss_data),epochs=10000,theta=int(len(miss_data[0])/2),iterations=100).complete(miss_data)
score = evaluate.RMSE(origin_data, imputed_data)
score1 = evaluate.RMSE(origin_data, first_imputed_data)
score2 = evaluate.RMSE(imputed_data, first_imputed_data)
logger.info("TAI mice first missing rate:{},RMSE:{}".format(i, score1))
logger.info("TAI mice missing rate:{},RMSE:{}".format(i, score))
logger.info("TAI mice missing rate:{},changed RMSE:{}".format(i, score))
tai_mice_rmse.append(score)

2020-08-05 11:21:07.096 | INFO     | __main__:<module>:20 - **********************1_Iris.xlsx********************




















































































2020-08-05 11:21:25.638 | INFO     | __main__:<module>:37 - TAI mice first missing rate:0.05,RMSE:0.48356885238815756
2020-08-05 11:21:25.639 | INFO     | __main__:<module>:38 - TAI mice missing rate:0.05,RMSE:0.39639964997312016
2020-08-05 11:21:25.639 | INFO     | __main__:<module>:39 - TAI mice missing rate:0.05,changed RMSE:0.39639964997312016
