# 重复100次实验  取平均值

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE                # 过抽样处理库SMOTE
from imblearn.under_sampling import RandomUnderSampler  # 欠抽样处理库RandomUnderSample
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA           #加载PCA算法包
from sklearn.metrics import mean_squared_error as mse
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Excel workbooks used by the experiments, addressed by position (data_id).
data_names = [
    'data.xlsx',
    'AES_T400.xlsx',
    'AES_T700.xlsx',
    'AES_T800.xlsx',
    'AES_T900.xlsx',
    'AES_T1200.xlsx',
    'AES_T1300.xlsx',
    'AES_T1400.xlsx',
    'AES_T2000.xlsx',
]

def create_numerics(data):
    """Label-encode every object-dtype column of ``data`` in place.

    Each object column is replaced by the integer codes produced by a
    freshly fitted ``LabelEncoder``; other columns are left untouched.

    Args:
        data: DataFrame to convert. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    object_columns = list(data.select_dtypes(include='object').columns)

    # One encoder per column so the codes of different columns are independent.
    for column in object_columns:
        data[column] = LabelEncoder().fit_transform(data[column])
    return data
    
def prepare_data(case=0, feature_choice=0, data_id=0, ispca=0):
    """Load one workbook and turn it into a scaled feature matrix and label vector.

    The workbook ``data_names[data_id]`` is read, rows containing missing
    values are dropped, the frame is passed through ``create_numerics`` and
    its rows are shuffled. The last column is used as the label ``y``; the
    features ``x`` are taken from the remaining columns.

    Args:
        case: Resampling strategy, applied only when ``data_id == 0``:
            1 SMOTE, 2 RandomUnderSampler, 3 BorderlineSMOTE, 4 SMOTETomek,
            5 SMOTEENN. Any other value (including 0) leaves the data as is.
        feature_choice: 1, 2 or 3 keep a fixed subset of column indices (see
            ``feature_columns`` below); any other value keeps every column
            except the last.
        data_id: Index into ``data_names`` of the workbook to read.
        ispca: When 1, the selected features are projected with
            ``PCA(n_components=0.9)``.

    Returns:
        Tuple ``(x, y, normal, trojan)`` where ``x`` is min-max scaled to
        [0, 1], ``trojan`` is ``y.sum()`` and ``normal`` is
        ``len(y) - y.sum()``.
        NOTE(review): the counts assume labels are 0/1 with 1 meaning
        trojan - confirm against the data files.
    """
    data = pd.read_excel(data_names[data_id])
    data = data.dropna()

    data = create_numerics(data)
    data = shuffle(data)

    data = data.values
    y = data[:, -1]

    # Fixed column subsets, indexed from 0 into the full row.
    feature_columns = {
        1: [0, 4, 8, 12],           # chosen from our prior knowledge
        2: [0, 1, 3, 4, 8, 12],     # SHAP result on imbalanced data; PI result in every setting
        3: [0, 4, 8, 10, 12, 15],   # SHAP result on balanced data
    }
    if feature_choice in feature_columns:
        x = data[:, feature_columns[feature_choice]]
    else:
        x = data[:, 0:-1]

    # PCA runs after feature selection: when it ran first, the subset
    # selection overwrote its output, so ispca=1 had no effect whenever
    # feature_choice was 1, 2 or 3.
    if ispca == 1:
        x = PCA(n_components=0.9).fit_transform(x)

    # Only dataset 0 is resampled; the other datasets are kept as loaded.
    if data_id == 0:
        print("Original samples:", len(y))
        samplers = {
            1: SMOTE,               # over-sampling
            2: RandomUnderSampler,  # under-sampling
            3: BorderlineSMOTE,
            4: SMOTETomek,
            5: SMOTEENN,
        }
        sampler_cls = samplers.get(case)
        if sampler_cls is not None:
            # fit_resample is the supported imbalanced-learn entry point;
            # the older fit_sample alias is gone from newer releases.
            x, y = sampler_cls(random_state=42).fit_resample(x, y)
        print("New samples:", len(y))
        print("Tro samples:", y.sum())

    trojan = y.sum()
    normal = len(y) - trojan
    scaler = MinMaxScaler(feature_range=(0, 1))
    x = scaler.fit_transform(x)
    return (x, y, normal, trojan)
#     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=ratio, random_state=1)
#     return(x_train, x_test, y_train, y_test)

# 重复实验

In [3]:
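# Sampling cases: 0 = no resampling, 1 = SMOTE over-sampling, 2 = random under-sampling,
#                 3 = Borderline-SMOTE, 4 = SMOTE + Tomek links (SMOTETomek), 5 = SMOTEENN

# Feature selection (0-based column indices, as used in prepare_data):
#   0 = all features, 1 = [0, 4, 8, 12], 2 = [0, 1, 3, 4, 8, 12], 3 = [0, 4, 8, 10, 12, 15]

# Dataset selection: data_names = ['data.xlsx','AES_T400.xlsx','AES_T700.xlsx','AES_T800.xlsx','AES_T900.xlsx',
# 'AES_T1200.xlsx','AES_T1300.xlsx','AES_T1400.xlsx','AES_T2000.xlsx',]
# Index 0 is the training set; indices 1-8 are the test sets.

# PCA: 0 = no PCA, 1 = apply PCA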


if __name__ == '__main__':
    # Experiment settings; the trailing notes record the suggested values.
    sampling = 5     # resampling technique, suggested: 5
    id_features = 1  # feature-subset choice, suggested: 2
    is_pca = 0       # whether to apply PCA, suggested: 0

    # Per-run class counts returned by prepare_data for the training set.
    normal = []
    trojan = []

    # Repeat the preparation 100 times so the counts can be averaged afterwards.
    for i in range(100):
        x_train, y_train, n, t = prepare_data(
            case=sampling,
            feature_choice=id_features,
            data_id=0,
            ispca=is_pca,
        )
        normal.append(n)
        trojan.append(t)

Original samples: 6915
New samples: 11898
Tro samples: 5914
Original samples: 6915
New samples: 11915
Tro samples: 5931
Original samples: 6915
New samples: 11930
Tro samples: 5946
Original samples: 6915
New samples: 11918
Tro samples: 5934
Original samples: 6915
New samples: 11910
Tro samples: 5926
Original samples: 6915
New samples: 11919
Tro samples: 5935
Original samples: 6915
New samples: 11927
Tro samples: 5943
Original samples: 6915
New samples: 11928
Tro samples: 5944
Original samples: 6915
New samples: 11928
Tro samples: 5944
Original samples: 6915
New samples: 11927
Tro samples: 5943
Original samples: 6915
New samples: 11916
Tro samples: 5932
Original samples: 6915
New samples: 11919
Tro samples: 5935
Original samples: 6915
New samples: 11909
Tro samples: 5925
Original samples: 6915
New samples: 11919
Tro samples: 5935
Original samples: 6915
New samples: 11915
Tro samples: 5931
Original samples: 6915
New samples: 11921
Tro samples: 5937
Original samples: 6915
New samples: 1192

In [4]:
    # Report the mean normal-class and trojan-class counts over all runs.
    for label, counts in (("平均正常数据规模", normal), ("平均木马数据规模", trojan)):
        print(label, np.mean(counts))

平均正常数据规模 5924.16
平均木马数据规模 5933.62
