In [38]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
import math
import random
import numpy as np
import pandas as pd
import scipy.io as sio
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.utils.np_utils import to_categorical
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score,roc_auc_score,matthews_corrcoef
import warnings
warnings.filterwarnings("ignore")

In [39]:
#data1 = pd.read_csv("../dataset/NASA/CM1.csv")
#data1.head()

In [40]:
import numpy as np
from sklearn import preprocessing
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import datetime
import csv

def show_accuracy(predictLabel,Label):
    """Return the fraction of matching labels, rounded to 5 decimals.

    Args:
        predictLabel: array-like exposing ``.tolist()`` (e.g. a NumPy array)
            holding the predicted labels.
        Label: ground-truth labels; flattened with ``np.ravel`` before use.

    Returns:
        float: matches divided by ``len(Label)``, rounded to 5 decimals.
    """
    truth = np.ravel(Label).tolist()
    guesses = predictLabel.tolist()
    # Index over the ground truth so its length drives the comparison.
    hits = sum(1 for idx in range(len(truth)) if truth[idx] == guesses[idx])
    return round(hits/len(truth), 5)

class node_generator(object):
    """Generate random feature nodes (``activation(X @ W + b)``) for one layer.

    ``generator_nodes`` draws ``times`` random ``(W, b)`` pairs, stores them in
    ``Wlist`` / ``blist`` and returns the activated, column-stacked features.
    ``transform`` re-applies the stored pairs to new data.

    Attributes:
        Wlist: list of weight matrices drawn by the last ``generator_nodes`` call.
        blist: list of scalar biases matching ``Wlist``.
        function_num: the activation callable selected by ``generator_nodes``
            (0 until then).
        whiten: when True, every weight matrix is column-orthonormalised.
    """

    def __init__(self, whiten = False):
        self.Wlist = []
        self.blist = []
        self.function_num = 0
        self.whiten = whiten

    def sigmoid(self, x):
        """Logistic function 1 / (1 + exp(-x))."""
        return 1.0/(1 + np.exp(-x))

    def relu(self, x):
        """Element-wise max(x, 0)."""
        return np.maximum(x, 0)

    def tanh(self, x):
        """Hyperbolic tangent.

        Uses ``np.tanh`` because the explicit exp-based quotient evaluates to
        inf/inf = nan once ``exp(x)`` overflows for large ``|x|``.
        """
        return np.tanh(x)

    def linear(self, x):
        """Identity activation."""
        return x

    def orth(self, W):
        """Orthonormalise the columns of ``W`` in place (Gram-Schmidt).

        Each column has its projections onto the already processed columns
        removed and is then scaled to unit length. Used together with
        ``generator`` to produce the weights.

        Args:
            W: 2-D ndarray; modified in place.

        Returns:
            The same array ``W``.
        """
        for i in range(W.shape[1]):
            w = W[:, i].copy()
            if i:
                # Columns 0..i-1 are already orthonormal, so the projection of
                # w onto their span is basis @ (basis.T @ w).
                basis = W[:, :i]
                w = w - basis.dot(basis.T.dot(w))
            W[:, i] = w / np.sqrt(w.dot(w))
        return W

    def generator(self, shape, times):
        """Yield ``times`` random ``(W, b)`` pairs.

        ``W`` has the requested ``shape`` with entries uniform in [-1, 1);
        ``b`` is a scalar uniform in [-1, 1).
        """
        for i in range(times):
            # NOTE(review): this seeds Python's `random` module, while the
            # draws below come from `np.random`, so it does not make the
            # weights reproducible. Kept as-is to preserve existing behaviour.
            random.seed(i)
            W = 2*np.random.random(size=shape)-1
            if self.whiten == True:
                W = self.orth(W)   # only used for the enhancement layer
            b = 2*np.random.random() -1
            yield (W, b)

    def generator_nodes(self, data, times, batchsize, function_num):
        """Draw fresh weights and return the feature nodes for ``data``.

        Following BLS theory, the layer is the input multiplied by several
        different weight matrices plus different biases, so the weights are
        kept as a list whose elements are each applied to the input.

        Args:
            data: 2-D array of shape (n_samples, n_features).
            times: number of ``(W, b)`` groups to draw.
            batchsize: number of nodes (columns of ``W``) per group.
            function_num: activation name: 'linear', 'sigmoid', 'tanh' or 'relu'.

        Returns:
            Array with the ``times`` groups of nodes stacked column-wise.

        Raises:
            KeyError: if ``function_num`` is not a known activation name.
            IndexError: if ``times`` is 0 (no weights to apply).
        """
        # Resolve the activation first so an unknown name fails before any
        # weights are generated.
        activation = {'linear':self.linear,
                      'sigmoid': self.sigmoid,
                      'tanh':self.tanh,
                      'relu':self.relu }[function_num]
        # Draw every (W, b) pair in a single pass so each weight matrix is
        # generated (and, when whitening, orthonormalised) only once.
        pairs = list(self.generator((data.shape[1], batchsize), times))
        self.Wlist = [W for W, _ in pairs]
        self.blist = [b for _, b in pairs]
        self.function_num = activation
        return self.transform(data)

    def transform(self,testdata):
        """Apply the stored weights, biases and activation to ``testdata``.

        Returns:
            Array with one group of nodes per stored ``(W, b)`` pair, stacked
            column-wise in the order the pairs were generated.

        Raises:
            IndexError: if no weights have been generated yet.
        """
        if not self.Wlist:
            raise IndexError('node_generator has no weights; call generator_nodes first')
        groups = [self.function_num(testdata.dot(W) + b)
                  for W, b in zip(self.Wlist, self.blist)]
        if len(groups) == 1:
            return groups[0]
        # Stack once instead of re-copying the growing array for every group.
        return np.column_stack(groups)

class scaler:
    """Column-wise standardiser with a small constant added to the std."""

    def __init__(self):
        self._mean = 0
        self._std = 0

    def fit_transform(self,traindata):
        """Store the per-column mean/std of ``traindata`` and return it scaled."""
        self._mean, self._std = traindata.mean(axis = 0), traindata.std(axis = 0)
        return self.transform(traindata)

    def transform(self,testdata):
        """Scale ``testdata`` with the statistics stored by ``fit_transform``."""
        centered = testdata-self._mean
        # 0.001 keeps the division finite for constant (zero-std) columns.
        return centered/(self._std+0.001)

class broadnet:
    """Weighted broad-learning-system classifier.

    ``fit`` standardises the data, builds mapping nodes and enhancement nodes
    from random weights, and solves a weighted ridge regression from the
    stacked nodes to one-hot labels. ``predict`` applies the same pipeline and
    returns the index of the largest output per sample.

    Args:
        maptimes: number of mapping-node groups.
        enhencetimes: number of enhancement-node groups.
        map_function: activation name for the mapping layer.
        enhence_function: activation name for the enhancement layer.
        batchsize: nodes per group, or 'auto' to use the number of input
            features of the data passed to ``fit``.
        reg: ridge regularisation strength.
    """

    def __init__(self, 
                 maptimes = 10, 
                 enhencetimes = 10,
                 map_function = 'linear',
                 enhence_function = 'linear',
                 batchsize = 'auto', 
                 reg = 0.001):
        
        self._maptimes = maptimes
        self._enhencetimes = enhencetimes
        self._batchsize = batchsize
        self._reg = reg
        self._map_function = map_function
        self._enhence_function = enhence_function
        
        self.W = 0
        self.pesuedoinverse = 0
        self.normalscaler = scaler()
        # scikit-learn renamed `sparse` to `sparse_output` (1.2) and later
        # removed the old name; try the new spelling first, fall back for
        # older releases that reject the unknown keyword.
        try:
            self.onehotencoder = preprocessing.OneHotEncoder(sparse_output = False)
        except TypeError:
            self.onehotencoder = preprocessing.OneHotEncoder(sparse = False)
        self.mapping_generator = node_generator()
        self.enhence_generator = node_generator(whiten = True)

    def fit(self,data,label,weight=None):
        """Fit the output weights ``self.W``.

        Args:
            data: 2-D array of shape (n_samples, n_features).
            label: class labels, one per sample.
            weight: per-sample weights, given either as an
                (n_samples, n_samples) matrix or as a 1-D vector of its
                diagonal; ``None`` means unweighted.
        """
        # Resolve 'auto' per call instead of overwriting the setting, so a
        # later fit on data with a different width is sized correctly.
        batchsize = data.shape[1] if self._batchsize == 'auto' else self._batchsize
        data = self.normalscaler.fit_transform(data)
        # One column of labels as a plain ndarray (np.matrix input is rejected
        # by recent scikit-learn releases).
        label = self.onehotencoder.fit_transform(np.asarray(label).reshape(-1, 1))
        
        mappingdata = self.mapping_generator.generator_nodes(data,self._maptimes,batchsize,self._map_function)
        enhencedata = self.enhence_generator.generator_nodes(mappingdata,self._enhencetimes,batchsize,self._enhence_function)
        
        inputdata = np.column_stack((mappingdata,enhencedata))
        pesuedoinverse = self.pinv(inputdata,self._reg,weight)
        self.W =  pesuedoinverse.dot(label)
    
    # Pseudo-inverse rewritten to take the sample weights as input.
    def pinv(self,A,reg,weight=None):
        """Weighted ridge pseudo-inverse ``(reg*I + A^T W A)^-1 A^T W``.

        Args:
            A: 2-D array of shape (n_samples, n_nodes).
            reg: ridge regularisation strength.
            weight: (n_samples, n_samples) weight matrix, a 1-D vector taken
                as the diagonal of that matrix, or ``None`` for identity.

        Returns:
            np.matrix of shape (n_nodes, n_samples).

        Raises:
            numpy.linalg.LinAlgError: if the regularised system is singular.
        """
        A = np.asarray(A)
        if weight is None:
            weighted_At = A.T
        else:
            weight = np.asarray(weight)
            if weight.ndim == 1:
                # Scaling column j of A.T by weight[j] equals A.T @ diag(weight)
                # without materialising the n x n matrix.
                weighted_At = A.T * weight
            else:
                weighted_At = A.T.dot(weight)
        gram = reg*np.eye(A.shape[1]) + weighted_At.dot(A)
        # Solve the linear system rather than forming the explicit inverse.
        return np.asmatrix(np.linalg.solve(gram, weighted_At))
    
    def decode(self,Y_onehot):
        """Return, per row, the index of the largest entry (first one on ties)."""
        return np.asarray(Y_onehot).argmax(axis=1)
    
    def accuracy(self,predictlabel,label):
        """Fraction of positions where ``predictlabel`` equals ``label``, rounded to 5 decimals."""
        label = np.ravel(label).tolist()
        predictlabel = predictlabel.tolist()
        hits = sum(1 for i in range(len(label)) if label[i] == predictlabel[i])
        return (round(hits/len(label),5))
        
    def predict(self,testdata):
        """Predict class indices for ``testdata`` using the fitted pipeline."""
        testdata = self.normalscaler.transform(testdata)
        test_mappingdata = self.mapping_generator.transform(testdata)
        test_enhencedata = self.enhence_generator.transform(test_mappingdata)
        
        test_inputdata = np.column_stack((test_mappingdata,test_enhencedata)) 
        return self.decode(test_inputdata.dot(self.W))      

In [42]:
if __name__ == '__main__':
    import os  # local import: only needed to create the results directory

    data = pd.read_csv("../../dataset/NASA/MC1.csv")  
  
    # Encode every column (features and label) as integer codes.
    # NOTE(review): numeric feature columns are re-coded as well, which
    # replaces their values with category indices — confirm this is intended.
    le = preprocessing.LabelEncoder()
    for item in data.columns:
        data[item] = le.fit_transform(data[item])
    
    label = data[' Defective'].values
    
    data = data.drop(' Defective',axis=1)
    data = data.values
    print(data.shape,max(label)+1)

    # Create the output folder up front so a missing directory cannot discard
    # the results after the long experiment has finished.
    os.makedirs('./data_remember', exist_ok=True)

    # Split the dataset into training and test sets. random_state is fixed, so
    # the split is identical for every run and is computed only once.
    traindata,testdata,trainlabel,testlabel = train_test_split(data,label,test_size=0.25,random_state =1)

    # Sample weights for the BLS, set from the label proportions in the
    # training set: samples with label 1 get 2*(n - count)/n, all others get
    # 2*count/n, where count is the number of label-1 training samples.
    n_train = len(trainlabel)
    count = int(np.sum(trainlabel == 1))
    weight = np.diag(np.where(trainlabel == 1,
                              2*(n_train-count)/n_train,
                              2*count/n_train))

    Evaluat_list=[]
    for run in range(10):
        trainingTime2,acc2,fmeasure2,auc2,MCC2,Gmeasure2,recall2=0,0,0,0,0,0,0
        # Sweep the regularisation strength and keep the metrics of the
        # setting with the highest AUC.
        # NOTE(review): the selection uses the test-set AUC, so the reported
        # numbers are optimistic — consider a separate validation split.
        for k in range(-5,5):
            bls = broadnet(maptimes = 35, 
                       enhencetimes = 35,
                       map_function = 'relu',
                       enhence_function = 'relu',
                       batchsize =100,
                       reg = 2**k)

            # Training
            starttime = datetime.datetime.now()
            bls.fit(traindata,trainlabel,weight)
            endtime = datetime.datetime.now()
            trainingTime=(endtime - starttime).total_seconds()

            # Prediction
            predictlabel = bls.predict(testdata)

            # Evaluation metrics
            acc=accuracy_score(testlabel,predictlabel,normalize=True)
            fmeasure=f1_score(testlabel,predictlabel, average='weighted', labels=np.unique(testlabel))
            try:
                auc=roc_auc_score(testlabel,predictlabel, average='macro', sample_weight=None)
            except ValueError:
                # AUC could not be computed for this setting. Use NaN so the
                # comparison below is False and the setting is never selected,
                # instead of reusing a stale (or undefined) value.
                auc=float('nan')
            MCC=matthews_corrcoef(testlabel,predictlabel)
            Gmeasure=geometric_mean_score(testlabel,predictlabel, average='weighted')
            recall=recall_score(testlabel, predictlabel, average='weighted')

            if auc>auc2:
                trainingTime2=trainingTime
                acc2=acc
                fmeasure2=fmeasure
                auc2=auc
                MCC2=MCC
                Gmeasure2=Gmeasure
                recall2=recall
        print('*trainingTime：%f,acc：%f,fmeasure：%f,auc：%f,recall：%f,MCC：%f,Gmeasure：%f'%(trainingTime2,acc2,fmeasure2,auc2,recall2,MCC2,Gmeasure2))
        Evaluat_tuple=(trainingTime2,acc2,fmeasure2,auc2,recall2,MCC2,Gmeasure2)    
        Evaluat_list.append(Evaluat_tuple)

    # Header row
    header = ['trainTime', 'acc', 'fmeasure', 'auc','recall', 'MCC', 'Gmeasure']
    with open('./data_remember/0.75WBLS_MC1_Evaluat.csv', 'w', encoding='utf-8', newline='') as file_obj:
        # Create the writer
        writer = csv.writer(file_obj)
        # Write the header
        writer.writerow(header)
        # Write the data (all rows at once)
        writer.writerows(Evaluat_list)


        #sio.savemat('./data_remember/WBLS_Smote_CM1_test.mat',{'time':format((endtime - starttime).total_seconds()),'acc':acc,'f1':fmeasure,'auc':auc,'mcc':MCC,'Gm':Gmeasure})

(1988, 38) 2
*trainingTime：9.943948,acc：0.951710,fmeasure：0.959930,auc：0.708754,recall：0.951710,MCC：0.292366,Gmeasure：0.665811
*trainingTime：10.250180,acc：0.965795,fmeasure：0.967794,auc：0.671530,recall：0.965795,MCC：0.305033,Gmeasure：0.603623
*trainingTime：10.671374,acc：0.957746,fmeasure：0.962693,auc：0.667415,recall：0.957746,MCC：0.263654,Gmeasure：0.600958
*trainingTime：10.604918,acc：0.953722,fmeasure：0.961170,auc：0.709783,recall：0.953722,MCC：0.300094,Gmeasure：0.666547


KeyboardInterrupt: 

In [5]:
# acc=accuracy_score(predictlabel, testlabel, normalize=True)
# precision=precision_score(predictlabel, testlabel, average='weighted',zero_division=1)
# recall=recall_score(predictlabel, testlabel, average='weighted')
# fmeasure=f1_score(predictlabel, testlabel, average='weighted', labels=np.unique(testlabel))
# auc=roc_auc_score(predictlabel, testlabel, average='weighted', sample_weight=None)
# MCC=matthews_corrcoef(predictlabel, testlabel)
# Gmeasure=geometric_mean_score(predictlabel, testlabel, average='weighted')

In [None]:
#print('acc：%f,precision：%f,recall：%f,fmeasure：%f,auc：%f,,MCC：%f,Gmeasure：%f'%(acc,precision,recall,fmeasure,auc,MCC,Gmeasure))