In [1]:
import csv
import datetime
import os
import pickle
import time

import numpy as np
import pandas as pd
from gplearn.genetic import SymbolicTransformer, SymbolicRegressor
from joblib import dump, load
from sklearn import preprocessing
from sklearn import utils
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

KeyboardInterrupt: 

# normalization 

In [None]:
def scaling(x):
    """Standardize ``x`` with a freshly fitted ``StandardScaler``.

    Parameters
    ----------
    x : array-like or DataFrame
        Data handed to ``StandardScaler.fit_transform``.

    Returns
    -------
    The value returned by ``fit_transform``. The fitted scaler itself is
    discarded, so the same scaling cannot be re-applied to other data later.
    """
    return StandardScaler().fit_transform(x)

# read data

In [None]:
# Load the raw spreadsheet.
rawData = pd.read_excel('list-FUC.xlsx')
# Split the raw data into two parts: the target and the model inputs.
# Target: the column at position 1 of the sheet.
# NOTE(review): the target is picked by position while the column removed
# from the inputs below is picked by name ('GH') -- presumably these are the
# same column; confirm against the spreadsheet layout.
outputData = rawData.iloc[:, 1]
# Inputs: every column from position 1 onward except 'GH', standardized.
inputData = scaling(rawData.iloc[:, 1:].drop(columns=['GH']))

# initial parameter setting

In [None]:
# --- Search space -----------------------------------------------------------
# Primitive functions made available to the genetic program.
function_set = ['add', 'sub', 'mul', 'div', 'abs', 'sqrt']
# Candidate crossover probabilities and parsimony coefficients.
pc = np.arange(0.1, 0.51, 0.1)
pcoef = np.arange(0.001, 0.006, 0.001)
# Flattened Cartesian product of the two axes: ``pc`` cycles fastest while
# each ``pcoef`` value is held for one full cycle of ``pc``.
pc_grid = np.tile(pc, len(pcoef))
pcoef_grid = np.repeat(pcoef, len(pc))
dimNum = len(pc_grid)

# --- Running "best so far" state, updated by the search loop ----------------
best_fitness = float('-inf')
best_mean = float('inf')
best_pc = 0
best_pcoef = 0
best_ps = 0
best_features = []
best_gp = []
# Number of search rounds completed so far.
count = 0

# hyperparameter optimization

In [None]:
# Grid search over (crossover probability, parsimony coefficient) pairs; for
# each pair, sweep the subtree-mutation probability. A candidate replaces the
# current best only if it has BOTH a higher final fitness and a lower summed
# mean absolute Spearman correlation between its generated features.
# NOTE(review): no random_state is passed to SymbolicTransformer, so results
# differ from run to run.
for i in range(dimNum):
    pc0 = pc_grid[i]
    pcoef0 = pcoef_grid[i]
    # Subtree-mutation candidates: [0.92 - pc0, 1 - pc0) in steps of 0.01,
    # divided by three.
    ps_max = 1-pc0; ps_min = 0.92-pc0
    ps = np.arange(ps_min, ps_max, 0.01)
    ps = ps/3
    dimNum_ps = len(ps)
    for j in range(dimNum_ps):
        ps0 = ps[j]
        # Hoist mutation uses the same probability as subtree mutation.
        ph0 = ps0
        # Point mutation takes the remainder; 0.999999999 rather than 1 is
        # presumably there to keep the four probabilities' sum at or below 1
        # despite floating-point rounding.
        pp0 = 0.999999999-pc0-ph0-ps0
        gp = SymbolicTransformer(population_size=5000,  function_set=function_set, generations=20, n_components=5,
                        p_crossover=pc0, p_subtree_mutation=ps0,parsimony_coefficient=pcoef0, p_hoist_mutation=ph0, p_point_mutation=pp0,
                         const_range=(-1.0, 1.0), metric='pearson',tournament_size=20, verbose=1)
        print('********************************Round %d********************************' %(count+1))
        print('----------The round of pc=%.6f ps=%.6f pcoef=%.6f pp=%.6f ph=%.6f-----------'
              %(pc0, ps0, pcoef0, pp0, ph0))
        gp.fit(inputData, outputData)
        fitness = gp.run_details_['best_fitness'][-1]
        print('The round of fitness=%.6f\n'%fitness)
        # Score the features of THIS round's transformer. The earlier code
        # correlated `best_features` (the previous best, initially an empty
        # list), which raised IndexError on the first round and otherwise
        # judged the wrong model.
        features = gp.transform(inputData)
        features_pd = pd.DataFrame(features)
        # Absolute Spearman correlation between generated features, computed
        # once. The diagonal (self-correlation) is masked out, then each row
        # is averaged over its remaining entries and the row means are summed.
        corr_abs = features_pd.corr('spearman').abs()
        off_diagonal = corr_abs.mask(np.eye(len(corr_abs), dtype=bool))
        # skipna=False: a row with no valid correlation makes the score NaN,
        # which can never satisfy `PCC_GH_mean < best_mean` below.
        PCC_GH_mean = off_diagonal.mean(axis=1).sum(skipna=False)
        count = count+1
        if fitness > best_fitness and PCC_GH_mean < best_mean:
            best_pc = pc0
            best_pcoef = pcoef0
            best_ps = ps0
            best_features = features
            best_fitness = fitness
            best_gp = gp
            best_mean = PCC_GH_mean

# output result

In [None]:
# Report the winning hyperparameters, then the transformed features and the
# fitted transformer selected by the search.
best_summary = (best_fitness, best_pc, best_pcoef, best_ps)
print('The best fitness=%.3f;The best pc=%.6f;The best pcoef=%.6f;The best ps=%.6f' % best_summary)
print('The best features: %s' % (best_features,))
print('The best formula: %s' % (best_gp,))

# saving model and data

In [None]:
# Persist the selected transformer and its generated features.
path = "_5D"
# Sub-directory name derived from the suffix: "_5D" -> "5D".
demension = path.split('_')[1]
# Create the output directory up front so the pickle write cannot fail on a
# fresh checkout.
os.makedirs(demension, exist_ok=True)

data = pd.DataFrame(best_features)
data.to_csv("list_Gpfeature"+path+".csv",index=False,sep=',')

# os.path.join replaces the hard-coded "\\" separator, which only formed a
# directory path on Windows; the context manager guarantees the file handle
# is flushed and closed (the earlier bare open() was never closed).
model_path = os.path.join(demension, "gp"+path+".pickle.dat")
with open(model_path, "wb") as model_file:
    pickle.dump(best_gp, model_file)

# NOTE(review): this second copy has no separator after `demension`, so it is
# written next to the notebook as "5Dlist_Gpfeature_5D.csv" rather than inside
# the "5D" directory. Kept unchanged for compatibility -- confirm the intent.
data.to_csv(demension+"list_Gpfeature"+path+".csv",index=False,sep=',')

# To reload the model later (only unpickle files you trust):
# with open(model_path, "rb") as model_file:
#     best_gp2 = pickle.load(model_file)
# Optional joblib persistence of the individual results (Windows-style paths):
# dump(best_fitness, demension+"\\best_fitness"+path+".pkl")
# dump(best_pc, demension+"\\best_pc"+path+".pkl")
# dump(best_pcoef, demension+"\\best_pcoef"+path+".pkl")
# dump(best_ps, demension+"\\best_ps"+path+".pkl")
# dump(best_features, demension+"\\best_features"+path+".pkl")
# best_fitness2 = load(demension+"\\best_fitness"+path+".pkl")
# best_pc2 = load(demension+"\\best_pc"+path+".pkl")
# best_pcoef2 = load(demension+"\\best_pcoef"+path+".pkl")
# best_ps2 = load(demension+"\\best_ps"+path+".pkl")
# best_features2 = load(demension+"\\best_features"+path+".pkl")