In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from sklearn.decomposition import PCA
import math
import random
import numpy as np
import pandas as pd
import scipy.io as sio
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.utils.np_utils import to_categorical
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score,roc_auc_score,matthews_corrcoef
import warnings
warnings.filterwarnings("ignore")

In [2]:
#data1 = pd.read_csv("../dataset/NASA/CM1.csv")
#data1.head()

In [3]:
import numpy as np
from sklearn import preprocessing
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import datetime

def show_accuracy(predictLabel,Label):
    Label = np.ravel(Label).tolist()
    predictLabel = predictLabel.tolist()
    count = 0
    for i in range(len(Label)):
        if Label[i] == predictLabel[i]:
            count += 1
    return (round(count/len(Label),5))

class node_generator(object):
    def __init__(self, whiten = False):
        self.Wlist = []
        self.blist = []
        self.function_num = 0
        self.whiten = whiten

    def sigmoid(self, x):
        return 1.0/(1 + np.exp(-x))

    def relu(self, x):
        return np.maximum(x, 0)

    def tanh(self, x):
        return (np.exp(x) - np.exp(-x))/(np.exp(x) + np.exp(-x))

    def linear(self, x):
        return x

    def orth(self, W):
        """
        目前看来，这个函数应该配合下一个generator函数是生成权重的
        """
        for i in range(0, W.shape[1]):
            w = np.mat(W[:,i].copy()).T
            w_sum = 0
            for j in range(i):
                wj = np.mat(W[:,j].copy()).T
                w_sum += (w.T.dot(wj))[0,0]*wj
            w -= w_sum
            w = w/np.sqrt(w.T.dot(w))
            W[:,i] = np.ravel(w)

        return W

    def generator(self, shape, times):
        for i in range(times):
            random.seed(i)
            W = 2*np.random.random(size=shape)-1
            if self.whiten == True:
                W = self.orth(W)   # 只在增强层使用
            b = 2*np.random.random() -1
            yield (W, b)

    def generator_nodes(self, data, times, batchsize, function_num):
        # 按照bls的理论，mapping layer是输入乘以不同的权重加上不同的偏差之后得到的
        # 若干组，所以，权重是一个列表，每一个元素可作为权重与输入相乘
        self.Wlist = [elem[0] for elem in self.generator((data.shape[1], batchsize), times)]
        self.blist = [elem[1] for elem in self.generator((data.shape[1], batchsize), times)]

        self.function_num = {'linear':self.linear,
                        'sigmoid': self.sigmoid,
                        'tanh':self.tanh,
                        'relu':self.relu }[function_num]  # 激活函数供不同的层选择
        # 下面就是先得到一组mapping nodes，再不断叠加，得到len(Wlist)组mapping nodes
        nodes = self.function_num(data.dot(self.Wlist[0]) + self.blist[0])
        for i in range(1, len(self.Wlist)):
            nodes = np.column_stack((nodes, self.function_num(data.dot(self.Wlist[i])+self.blist[i])))
        return nodes

    def transform(self,testdata):
        testnodes = self.function_num(testdata.dot(self.Wlist[0])+self.blist[0])
        for i in range(1,len(self.Wlist)):
            testnodes = np.column_stack((testnodes, self.function_num(testdata.dot(self.Wlist[i])+self.blist[i])))
        return testnodes

class scaler:
    def __init__(self):
        self._mean = 0
        self._std = 0
    
    def fit_transform(self,traindata):
        self._mean = traindata.mean(axis = 0)
        self._std = traindata.std(axis = 0)
        return (traindata-self._mean)/(self._std+0.001)
    
    def transform(self,testdata):
        return (testdata-self._mean)/(self._std+0.001)

class broadnet:
    def __init__(self, 
                 maptimes = 10, 
                 enhencetimes = 10,
                 map_function = 'linear',
                 enhence_function = 'linear',
                 batchsize = 'auto', 
                 reg = 0.001):
        
        self._maptimes = maptimes
        self._enhencetimes = enhencetimes
        self._batchsize = batchsize
        self._reg = reg
        self._map_function = map_function
        self._enhence_function = enhence_function
        
        self.W = 0
        self.pesuedoinverse = 0
        self.normalscaler = scaler()
        self.onehotencoder = preprocessing.OneHotEncoder(sparse = False)
        self.mapping_generator = node_generator()
        self.enhence_generator = node_generator(whiten = True)

    def fit(self,data,label,c):
        if self._batchsize == 'auto':
            self._batchsize = data.shape[1]
        data = self.normalscaler.fit_transform(data)
        label1=label
        label = self.onehotencoder.fit_transform(np.mat(label).T)
        
        mappingdata = self.mapping_generator.generator_nodes(data,self._maptimes,self._batchsize,self._map_function)
        enhencedata = self.enhence_generator.generator_nodes(mappingdata,self._enhencetimes,self._batchsize,self._enhence_function)
        
        #print('number of mapping nodes {0}, number of enhence nodes {1}'.format(mappingdata.shape[1],enhencedata.shape[1]))
        #print('mapping nodes maxvalue {0} minvalue {1} '.format(round(np.max(mappingdata),5),round(np.min(mappingdata),5)))
        #print('enhence nodes maxvalue {0} minvalue {1} '.format(round(np.max(enhencedata),5),round(np.min(enhencedata),5)))
        
        inputdata = np.column_stack((mappingdata,enhencedata))
        S = self.LDA_dimensionality(inputdata,label1)
        pesuedoinverse = self.pinv2(inputdata,self._reg,S,c)
        self.W =  pesuedoinverse.dot(label)
        
        #print('W:', self.W)
        #print('W:', self.W.shape)  
    
    #改写伪逆矩阵算法，将权重输入
    def pinv(self,A,reg,weight):
        return np.mat(reg*np.eye(A.shape[1])+A.T.dot(weight).dot(A)).I.dot(A.T).dot(weight)
    
    def pinv2(self,A,reg,S,c):
        return np.mat(reg*np.eye(A.shape[1])+A.T.dot(A)+c*S).I.dot(A.T)
    
    def LDA_dimensionality(self,X, y):
        '''
        X为数据集，y为label，k为目标维数
        '''
        y=y.tolist()
        label_ = list(set(y))

        X_classify = {}

        for label in label_:
            X1 = np.array([X[i] for i in range(len(X)) if y[i] == label])
            X_classify[label] = X1

            mju = np.mean(X, axis=0)
            mju_classify = {}

        for label in label_:
            mju1 = np.mean(X_classify[label], axis=0)
            mju_classify[label] = mju1

            #St = np.dot((X - mju).T, X - mju)

            Sw = np.zeros((len(mju), len(mju)))  # 计算类内散度矩阵

        for i in label_:
            Sw += np.dot((X_classify[i] - mju_classify[i]).T,
                         X_classify[i] - mju_classify[i])

        # Sb=St-Sw

        Sb = np.zeros((len(mju), len(mju)))  # 计算类间散度矩阵
        for i in label_:
            Sb += len(X_classify[i]) * np.dot((mju_classify[i] - mju).reshape(
                (len(mju), 1)), (mju_classify[i] - mju).reshape((1, len(mju))))

        return (Sw-Sb)
    
    def decode(self,Y_onehot):
        Y = []
        for i in range(Y_onehot.shape[0]):
            lis = np.ravel(Y_onehot[i,:]).tolist()
            Y.append(lis.index(max(lis)))
        return np.array(Y)
    
    def accuracy(self,predictlabel,label):
        label = np.ravel(label).tolist()
        predictlabel = predictlabel.tolist()
        count = 0
        for i in range(len(label)):
            if label[i] == predictlabel[i]:
                count += 1
        return (round(count/len(label),5))
        
    def predict(self,testdata):
        testdata = self.normalscaler.transform(testdata)
        test_mappingdata = self.mapping_generator.transform(testdata)
        test_enhencedata = self.enhence_generator.transform(test_mappingdata)
        
        test_inputdata = np.column_stack((test_mappingdata,test_enhencedata)) 
        #print('*predictlabel shape:',self.decode(test_inputdata.dot(self.W)).shape)
        #print('*predictlabel:', self.decode(test_inputdata.dot(self.W)))
        #print('*accuracy:',show_accuracy(self.decode(test_inputdata.dot(self.W)),testlabel))
        return self.decode(test_inputdata.dot(self.W))      

In [4]:
if __name__ == '__main__':
    data = pd.read_csv("../../dataset/NASA/KC3.csv")  
  
    le = preprocessing.LabelEncoder()
    for item in data.columns:
        data[item] = le.fit_transform(data[item])
    #print(data)
    
    label = data[' Defective'].values
    
    data = data.drop(' Defective',axis=1)
    data = data.values
    print(data.shape,max(label)+1)

    traindata,testdata,trainlabel,testlabel = train_test_split(data,label,test_size=0.4,random_state = 0)
    print(traindata.shape,trainlabel.shape,testdata.shape,testlabel.shape)
    
    

    k_acc_list, k_f1_list, k_auc_list, k_recall_list, k_mcc_list, k_Gm_list=[],[],[],[],[],[]
    #这里设置shuffle设置为ture就是打乱顺序在分配
    kf = KFold(n_splits=10,shuffle=True,random_state=42)
    for map_times in np.arange(15,36,5):
        acc_list_tmp, f1_list_tmp, auc_list_tmp, recall_list_tmp, mcc_list_tmp,Gm_list_tmp=[],[],[],[],[],[]
        for enhance_times in np.arange(15, 36, 5):
            for k, (train, test) in enumerate(kf.split(traindata, trainlabel)):
                # kf.split输出的是索引，所以由索引获取交叉后的训练集和测试集及标签
                k_train_data,k_train_label = traindata[train], trainlabel[train]
                k_test_data,k_test_label = traindata[test], trainlabel[test]
                
                #k_test_label=np.transpose(k_test_label)
                #k_train_label=np.transpose(k_train_label)
                #k_test_label=k_test_label[0]
                #k_train_label=k_train_label[0]

                bls = broadnet(maptimes = map_times, 
                           enhencetimes = enhance_times,
                           map_function = 'relu',
                           enhence_function = 'relu',
                           batchsize =100,
                           reg = 0.001)
                
                
                
                #训练
                starttime = datetime.datetime.now()
                bls.fit(k_train_data,k_train_label,1)
                endtime = datetime.datetime.now()
                #print('the training time of BLS is {0} seconds'.format((endtime - starttime).total_seconds()))

                #print('k_test_label:', k_test_label)
                #预测
                k_predict_label = bls.predict(k_test_data)
                #print('k_predict_label:', k_predict_label)

                #评价指标计算
                acc=accuracy_score(k_test_label,k_predict_label, normalize=True)
                fmeasure=f1_score(k_test_label,k_predict_label, average='weighted', labels=np.unique(k_test_label))
                try:
                    auc=roc_auc_score(k_test_label,k_predict_label, average='weighted', sample_weight=None)
                except ValueError:
                    pass
                recall=recall_score(k_test_label, k_predict_label, average='weighted')
                MCC=matthews_corrcoef(k_test_label,k_predict_label)
                Gmeasure=geometric_mean_score(k_test_label,k_predict_label, average='weighted')

                #将此次的十折交叉验证的结果 (10个)保存到pi_list_tmp中
                acc_list_tmp.append(acc)
                f1_list_tmp.append(fmeasure)
                auc_list_tmp.append(auc)
                recall_list_tmp.append(recall)
                mcc_list_tmp.append(MCC)
                Gm_list_tmp.append(Gmeasure)
                
            #求平均保存到k_acc_list中   
            k_average_acc=np.mean(acc_list_tmp)
            k_average_acc=round(k_average_acc,5)
            k_acc_list.append(k_average_acc)
            
            #求平均保存到k_f1_list中   
            k_average_f1=np.mean(f1_list_tmp)
            k_average_f1=round(k_average_f1,5)
            k_f1_list.append(k_average_f1)
            
            #求平均保存到k_auc_list中   
            k_average_auc=np.mean(auc_list_tmp)
            k_average_auc=round(k_average_auc,5)
            k_auc_list.append(k_average_auc)
            
            #求平均保存到k_recall_list中   
            k_average_recall=np.mean(recall_list_tmp)
            k_average_recall=round(k_average_recall,5)
            k_recall_list.append(k_average_recall)
            
            #求平均保存到k_mcc_list中   
            k_average_mcc=np.mean(mcc_list_tmp)
            k_average_mcc=round(k_average_mcc,5)
            k_mcc_list.append(k_average_mcc)
            
            #求平均保存到k_Gm_list中   
            k_average_Gm=np.mean(Gm_list_tmp)
            k_average_Gm=round(k_average_Gm,5)
            k_Gm_list.append(k_average_Gm)
            print(f'maptiems:{map_times}\tenhancetimes:{enhance_times}\tk_average_acc:{k_average_acc}\tk_average_f1:{k_average_f1}\tk_average_auc:{k_average_auc}\tk_average_recall:{k_average_recall}\tk_average_mcc:{k_average_mcc}\tk_average_Gm:{k_average_Gm}\t')

    k_acc_array=np.array(k_acc_list)
    k_acc_array=k_acc_array.reshape(5,5)
    # 一维最大值索引
    #idx_max_ravel = np.argmax(k_acc_array)
    # true索引
    #idx_max = np.unravel_index(idx_max_ravel, k_acc_array.shape)
    #print('max times:',idx_max)
    
    k_f1_array=np.array(k_f1_list)
    k_f1_array=k_f1_array.reshape(5,5)
    
    k_auc_array=np.array(k_auc_list)
    k_auc_array=k_auc_array.reshape(5,5)
    
    k_recall_array=np.array(k_recall_list)
    k_recall_array=k_recall_array.reshape(5,5)
    
    k_mcc_array=np.array(k_mcc_list)
    k_mcc_array=k_mcc_array.reshape(5,5)
    
    k_Gm_array=np.array(k_Gm_list)
    k_Gm_array=k_Gm_array.reshape(5,5)
    
    sio.savemat('./data_remember/0.75LPBLS_KC3_MEbest.mat',{'acc':k_acc_array,'f1':k_f1_array,'auc':k_auc_array,'recall':k_recall_array,'mcc':k_mcc_array,'Gm':k_Gm_array})
    

(194, 39) 2
(116, 39) (116,) (78, 39) (78,)
maptiems:15	enhancetimes:15	k_average_acc:0.53106	k_average_f1:0.57755	k_average_auc:0.46035	k_average_recall:0.53106	k_average_mcc:-0.07575	k_average_Gm:0.42253	
maptiems:15	enhancetimes:20	k_average_acc:0.53977	k_average_f1:0.58471	k_average_auc:0.46859	k_average_recall:0.53977	k_average_mcc:-0.05318	k_average_Gm:0.4275	
maptiems:15	enhancetimes:25	k_average_acc:0.54596	k_average_f1:0.59023	k_average_auc:0.46878	k_average_recall:0.54596	k_average_mcc:-0.05584	k_average_Gm:0.42648	
maptiems:15	enhancetimes:30	k_average_acc:0.5447	k_average_f1:0.58837	k_average_auc:0.46609	k_average_recall:0.5447	k_average_mcc:-0.06214	k_average_Gm:0.42285	
maptiems:15	enhancetimes:35	k_average_acc:0.54409	k_average_f1:0.58749	k_average_auc:0.46257	k_average_recall:0.54409	k_average_mcc:-0.0655	k_average_Gm:0.41951	
maptiems:20	enhancetimes:15	k_average_acc:0.53409	k_average_f1:0.57797	k_average_auc:0.49737	k_average_recall:0.53409	k_average_mcc:-0.04564	k_av

In [None]:
# acc=accuracy_score(predictlabel, testlabel, normalize=True)
# precision=precision_score(predictlabel, testlabel, average='weighted',zero_division=1)
# recall=recall_score(predictlabel, testlabel, average='weighted')
# fmeasure=f1_score(predictlabel, testlabel, average='weighted', labels=np.unique(testlabel))
# auc=roc_auc_score(predictlabel, testlabel, average='weighted', sample_weight=None)
# MCC=matthews_corrcoef(predictlabel, testlabel)
# Gmeasure=geometric_mean_score(predictlabel, testlabel, average='weighted')

In [None]:
#print('acc：%f,precision：%f,recall：%f,fmeasure：%f,auc：%f,,MCC：%f,Gmeasure：%f'%(acc,precision,recall,fmeasure,auc,MCC,Gmeasure))