In [None]:
#Import the adapted code base
import pandas as pd
import csv
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Load the two sample tables from CSV into `z` and `f`
# (a later cell labels `z` rows as Class '1' and `f` rows as Class '0').
z, f = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'D:/LSM/1JieXi2/1true/z.csv',
        r'D:/LSM/1JieXi2/1true/f.csv',
    )
)

In [None]:
# Add the target column: every row of `f` gets Class '0', every row of `z` gets Class '1'.
# The labels are assigned as strings at this point.
f['Class'] = '0'
z['Class'] = '1'

In [None]:
# Load the table stored as zon.csv under the 1test directory.
# NOTE(review): `zon` is rebound by the later concat cell before it is used anywhere
# visible in this notebook - confirm whether this read is still needed.
zon = pd.read_csv(filepath_or_buffer=r'D:/LSM/1JieXi2/1test/zon.csv')

In [None]:
# Map the raw column names of the sample table to readable feature names,
# then rename the columns of `z` in place.
column_mapping = dict(
    DEM='Elevation',
    Slope='Slope',
    Aspect='Aspect',
    JXtwi='TWI',
    RDLS='Topographic relief',
    ys='Strata',
    TPI='Slope position',
    curvature='Slope curvature',
    landcover='Land cover',
    NDVI2='NDVI',
    Driver='Distance to river',
    rainfallzh='Accumulated rainfall',
)

z.rename(columns=column_mapping, inplace=True)

In [None]:
# Same raw-name -> readable-name mapping as used for `z`, applied in place to `f`
# so both tables end up with identical column names.
column_mapping = dict([
    ('DEM', 'Elevation'),
    ('Slope', 'Slope'),
    ('Aspect', 'Aspect'),
    ('JXtwi', 'TWI'),
    ('RDLS', 'Topographic relief'),
    ('ys', 'Strata'),
    ('TPI', 'Slope position'),
    ('curvature', 'Slope curvature'),
    ('landcover', 'Land cover'),
    ('NDVI2', 'NDVI'),
    ('Driver', 'Distance to river'),
    ('rainfallzh', 'Accumulated rainfall'),
])

f.rename(columns=column_mapping, inplace=True)

In [None]:
# Report occurrences of the -9999 sentinel in `f`.
# This cell only inspects the data; nothing is removed here.
is_minus_9999 = f.eq(-9999)

# Per-column number of cells equal to -9999.
minus_9999_count = is_minus_9999.sum()

print(minus_9999_count)

# For every column that contains the sentinel, print the count and the affected rows.
for column, n_hits in minus_9999_count.items():
    if n_hits <= 0:
        continue
    print(f"Column: {column} has {n_hits} occurrences of -9999")
    print(f[is_minus_9999[column]])

In [None]:
# Report occurrences of the -9999 sentinel in `z`.
# This cell only inspects the data; nothing is removed here.
is_minus_9999 = z.eq(-9999)

# Per-column number of cells equal to -9999.
minus_9999_count = is_minus_9999.sum()

print(minus_9999_count)

# Visit only the columns whose sentinel count is positive, in column order.
for column in minus_9999_count.index[minus_9999_count > 0]:
    print(f"Column: {column} has {minus_9999_count[column]} occurrences of -9999")
    print(z[is_minus_9999[column]])

In [None]:
# Turn the -9999 sentinel in `f` into missing values and drop every row that has any.
# NOTE(review): only `f` is cleaned here; `z` is concatenated as-is - confirm that
# `z` contains no -9999 values (see the report cell for `z`).
f = f.replace(to_replace=-9999, value=pd.NA).dropna()

# Stack cleaned `f` on top of `z` with a fresh 0..n-1 index.
zon = pd.concat(objs=[f, z], ignore_index=True)

# Write the combined table. The index is written too (no index=False), so the file's
# first column is the row index; the later cell drops it via `train.columns[0]`.
zon.to_csv(path_or_buf=r'D:/LSM/1JieXi2/1true/train.csv')

In [None]:
# Load the combined training table written above and the table stored as test.csv.
train, test = map(
    pd.read_csv,
    (
        r'D:/LSM/1JieXi2/1true/train.csv',
        r'D:/LSM/1JieXi2/1true/test.csv',
    ),
)

In [None]:
# Drop the first column of each table (for train.csv this is the index column written
# by to_csv), then keep at most the first 20,000,000 rows and first 12 columns of `test`.
# NOTE: re-running this cell drops one more column each time - run it once per load.
train = train.drop(columns=train.columns[0])
test = test.drop(columns=test.columns[0])
test = test.iloc[:20_000_000, :12]

In [None]:
# Split the labelled samples 70/30 with a fixed random_state so the split is repeatable.
# NOTE: this rebinds both names - `test` as loaded from test.csv is replaced by the
# 30% hold-out, and re-running the cell splits the already reduced `train` again.
train, test = train_test_split(
    train,
    test_size=0.3,
    random_state=5,
)

In [None]:
# Separate the 'Class' target from the feature columns for both splits.
y_train = train['Class']
X_train = train.drop(columns=['Class'])

y_test = test['Class']
X_test = test.drop(columns=['Class'])

In [None]:
# Run the FLAML search with a time budget of 300, restricted to the 'extra_tree'
# estimator and optimising 'roc_auc'; trials are logged to 'class.log'.
from flaml import AutoML

automl = AutoML()
settings = dict(
    time_budget=300,
    metric='roc_auc',
    task='classification',
    estimator_list=['extra_tree'],
    log_file_name='class.log',
)

# Fit on the labelled training split.
automl.fit(X_train=X_train, y_train=y_train, **settings)

# Shape of the class-probability matrix predicted for the training rows.
print(automl.predict_proba(X_train).shape)

# Show the best model found.
print(automl.model)

In [None]:
# Display the `config_history` attribute of the fitted AutoML object
# (the configuration history recorded during the search above).
automl.config_history

In [None]:
# Summarise the best trial of the search: estimator name, its configuration,
# the validation score, the training time of that trial, and the fitted estimator.
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
# The search metric is 'roc_auc', so 1 - best_loss is the validation ROC AUC
# (the previous label called it "accuracy", which mislabelled the number).
print('Best roc_auc on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))
print(automl.model.estimator)

In [None]:
# Predict on the held-out split.

# Hard class predictions, printed next to the true labels.
y_pred5 = automl.predict(X_test)
print('Predicted labels', y_pred5)
print('True labels', y_test)

# Compute the class-probability matrix once and reuse it (it was previously
# computed twice, doubling inference time on X_test).
proba5 = automl.predict_proba(X_test)
# Column 1 of the probability matrix is used as the score for ROC / log-loss below.
y_pred_proba5 = proba5[:, 1]
gl5 = pd.DataFrame(proba5)
gl5

In [None]:
# Print three evaluation figures on the held-out split: accuracy, ROC AUC and log loss.
from flaml.ml import sklearn_metric_loss_score

# Each entry: (metric name, predictions passed to the scorer, print 1 - loss instead of loss).
# accuracy and roc_auc are printed as 1 - loss; log_loss is printed as returned.
for metric_name, predictions, as_score in (
    ('accuracy', y_pred5, True),
    ('roc_auc', y_pred_proba5, True),
    ('log_loss', y_pred_proba5, False),
):
    loss = sklearn_metric_loss_score(metric_name, predictions, y_test)
    print(metric_name, '=', 1 - loss if as_score else loss)

In [None]:
# Per-class precision / recall / F1 table for the held-out predictions.
from sklearn.metrics import (
    classification_report,
    recall_score,
    accuracy_score,
    f1_score,
)

print(classification_report(y_true=y_test, y_pred=y_pred5))

In [None]:
# Plot the ROC curve of the current model from (y_test, y_pred_proba5).
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import roc_curve

# Use Times New Roman for sans-serif text and set axes.unicode_minus to False.
plt.rcParams.update({
    'font.sans-serif': ['Times New Roman'],
    'axes.unicode_minus': False,
})

# ROC points and the legend text carrying the AUC to four decimals.
fpr, tpr, th = roc_curve(y_test, y_pred_proba5)
roc_label = 'ET_Best_AUC={:.4f}'.format(metrics.roc_auc_score(y_test, y_pred_proba5))

plt.figure(figsize=(5, 5), dpi=300)
plt.plot(fpr, tpr, color='blue', label=roc_label)
plt.legend()

# Dashed diagonal from (0, 0) to (1, 1) as the reference line.
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')

In [None]:
# Read the trial history of the search back from its log file.
from flaml.data import get_output_from_log

# Take both the log file name and the time budget from `settings`, so the history
# window always matches the run that produced the log instead of a hard-coded 300.
(time_history, best_valid_loss_history, valid_loss_history,
 config_history, metric_history) = get_output_from_log(
    filename=settings['log_file_name'],
    time_budget=settings['time_budget'],
)

# Print every configuration recorded in the log.
for config in config_history:
    print(config)

In [None]:
# Plot the optimisation curve of the search: validation score against wall-clock time.
import matplotlib.pyplot as plt
import numpy as np

# Use Times New Roman for sans-serif text and set axes.unicode_minus to False.
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['axes.unicode_minus'] = False

plt.figure(figsize=(5, 5), dpi=300)
plt.title('Learning Curve')
plt.xlabel('Wall Clock Time (s)')
# The search optimises 'roc_auc', so 1 - loss is the validation ROC AUC; the axis
# was previously labelled "Validation Accuracy", which mislabelled the quantity.
plt.ylabel('Validation ROC AUC')
# Every logged trial as a point, and the best-so-far score as a step line.
plt.scatter(time_history, 1 - np.array(valid_loss_history))
plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')
plt.show()

In [None]:
# Keep a reference to the best model's underlying estimator in `extra_tree`.
# This is a plain variable assignment; nothing is written to disk here.
extra_tree=automl.model.estimator

In [None]:
# Run the FLAML search again with a time budget of 600, restricted to the 'extra_tree'
# estimator and optimising 'roc_auc'; trials are logged to 'class.log'.
from flaml import AutoML
automl = AutoML()
## Parameter settings
settings = {
    "time_budget": 600,
    "metric":  'roc_auc',
    # State the task explicitly, matching the 300 s run, instead of relying on the default.
    "task": 'classification',
    "estimator_list": ['extra_tree'],
    "log_file_name": 'class.log',
}
# Fit on the labelled training split.
automl.fit(X_train=X_train, y_train=y_train,**settings)
# Shape of the class-probability matrix predicted for the training rows.
print(automl.predict_proba(X_train).shape)
# Show the best model found.
print(automl.model)

In [None]:
# Display the `config_history` attribute of the fitted AutoML object
# (the configuration history recorded during the search above).
automl.config_history

In [None]:
# Summarise the best trial of the search: estimator name, its configuration,
# the validation score, the training time of that trial, and the fitted estimator.
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
# The search metric is 'roc_auc', so 1 - best_loss is the validation ROC AUC
# (the previous label called it "accuracy", which mislabelled the number).
print('Best roc_auc on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))
print(automl.model.estimator)

In [None]:
# Predict on the held-out split.

# Hard class predictions, printed next to the true labels.
y_pred5 = automl.predict(X_test)
print('Predicted labels', y_pred5)
print('True labels', y_test)

# Compute the class-probability matrix once and reuse it (it was previously
# computed twice, doubling inference time on X_test).
proba5 = automl.predict_proba(X_test)
# Column 1 of the probability matrix is used as the score for ROC / log-loss below.
y_pred_proba5 = proba5[:, 1]
gl5 = pd.DataFrame(proba5)
gl5

In [None]:
# Print three evaluation figures on the held-out split: accuracy, ROC AUC and log loss.
from flaml.ml import sklearn_metric_loss_score

# Each entry: (metric name, predictions passed to the scorer, print 1 - loss instead of loss).
# accuracy and roc_auc are printed as 1 - loss; log_loss is printed as returned.
for metric_name, predictions, as_score in (
    ('accuracy', y_pred5, True),
    ('roc_auc', y_pred_proba5, True),
    ('log_loss', y_pred_proba5, False),
):
    loss = sklearn_metric_loss_score(metric_name, predictions, y_test)
    print(metric_name, '=', 1 - loss if as_score else loss)

In [None]:
# Per-class precision / recall / F1 table for the held-out predictions.
from sklearn.metrics import (
    classification_report,
    recall_score,
    accuracy_score,
    f1_score,
)

print(classification_report(y_true=y_test, y_pred=y_pred5))

In [None]:
# Plot the ROC curve of the current model from (y_test, y_pred_proba5).
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import roc_curve

# Use Times New Roman for sans-serif text and set axes.unicode_minus to False.
plt.rcParams.update({
    'font.sans-serif': ['Times New Roman'],
    'axes.unicode_minus': False,
})

# ROC points and the legend text carrying the AUC to four decimals.
fpr, tpr, th = roc_curve(y_test, y_pred_proba5)
roc_label = 'ET_Best_AUC={:.4f}'.format(metrics.roc_auc_score(y_test, y_pred_proba5))

plt.figure(figsize=(5, 5), dpi=300)
plt.plot(fpr, tpr, color='orange', label=roc_label)
plt.legend()

# Dashed diagonal from (0, 0) to (1, 1) as the reference line.
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')

In [None]:
# Read the trial history of the search back from its log file.
from flaml.data import get_output_from_log

# Use the budget of the run that produced the log. The previous hard-coded
# time_budget=300 was copied from the 300 s run and cut off the history of this
# longer search after the first 300 seconds.
(time_history, best_valid_loss_history, valid_loss_history,
 config_history, metric_history) = get_output_from_log(
    filename=settings['log_file_name'],
    time_budget=settings['time_budget'],
)

# Print every configuration recorded in the log.
for config in config_history:
    print(config)

In [None]:
# Plot the optimisation curve of the search: validation score against wall-clock time.
import matplotlib.pyplot as plt
import numpy as np

# Use Times New Roman for sans-serif text and set axes.unicode_minus to False.
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['axes.unicode_minus'] = False

plt.figure(figsize=(5, 5), dpi=300)
plt.title('Learning Curve')
plt.xlabel('Wall Clock Time (s)')
# The search optimises 'roc_auc', so 1 - loss is the validation ROC AUC; the axis
# was previously labelled "Validation Accuracy", which mislabelled the quantity.
plt.ylabel('Validation ROC AUC')
# Every logged trial as a point, and the best-so-far score as a step line.
plt.scatter(time_history, 1 - np.array(valid_loss_history))
plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')
plt.show()

In [None]:
# Keep a reference to the best model's underlying estimator in `extra_tree`.
# This is a plain variable assignment; nothing is written to disk here.
extra_tree=automl.model.estimator

In [None]:
# Run the FLAML search again with a time budget of 1800, restricted to the 'extra_tree'
# estimator and optimising 'roc_auc'; trials are logged to 'class.log'.
from flaml import AutoML
automl = AutoML()

settings = {
    "time_budget": 1800,
    "metric":  'roc_auc',
    # State the task explicitly, matching the 300 s run, instead of relying on the default.
    "task": 'classification',
    "estimator_list": ['extra_tree'],
    "log_file_name": 'class.log',
}
# Fit on the labelled training split.
automl.fit(X_train=X_train, y_train=y_train,**settings)
# Shape of the class-probability matrix predicted for the training rows.
print(automl.predict_proba(X_train).shape)
# Show the best model found.
print(automl.model)

In [None]:
# Display the `config_history` attribute of the fitted AutoML object
# (the configuration history recorded during the search above).
automl.config_history

In [None]:
# Summarise the best trial of the search: estimator name, its configuration,
# the validation score, the training time of that trial, and the fitted estimator.
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
# The search metric is 'roc_auc', so 1 - best_loss is the validation ROC AUC
# (the previous label called it "accuracy", which mislabelled the number).
print('Best roc_auc on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))
print(automl.model.estimator)

In [None]:
# Predict on the held-out split.

# Hard class predictions, printed next to the true labels.
y_pred5 = automl.predict(X_test)
print('Predicted labels', y_pred5)
print('True labels', y_test)

# Compute the class-probability matrix once and reuse it (it was previously
# computed twice, doubling inference time on X_test).
proba5 = automl.predict_proba(X_test)
# Column 1 of the probability matrix is used as the score for ROC / log-loss below.
y_pred_proba5 = proba5[:, 1]
gl5 = pd.DataFrame(proba5)
gl5

In [None]:
# Print three evaluation figures on the held-out split: accuracy, ROC AUC and log loss.
from flaml.ml import sklearn_metric_loss_score

# Each entry: (metric name, predictions passed to the scorer, print 1 - loss instead of loss).
# accuracy and roc_auc are printed as 1 - loss; log_loss is printed as returned.
for metric_name, predictions, as_score in (
    ('accuracy', y_pred5, True),
    ('roc_auc', y_pred_proba5, True),
    ('log_loss', y_pred_proba5, False),
):
    loss = sklearn_metric_loss_score(metric_name, predictions, y_test)
    print(metric_name, '=', 1 - loss if as_score else loss)

In [None]:
# Per-class precision / recall / F1 table for the held-out predictions.
from sklearn.metrics import (
    classification_report,
    recall_score,
    accuracy_score,
    f1_score,
)

print(classification_report(y_true=y_test, y_pred=y_pred5))

In [None]:
# Plot the ROC curve of the current model from (y_test, y_pred_proba5).
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import roc_curve

# Use Times New Roman for sans-serif text and set axes.unicode_minus to False.
plt.rcParams.update({
    'font.sans-serif': ['Times New Roman'],
    'axes.unicode_minus': False,
})

# ROC points and the legend text carrying the AUC to four decimals.
fpr, tpr, th = roc_curve(y_test, y_pred_proba5)
roc_label = 'ET_Best_AUC={:.4f}'.format(metrics.roc_auc_score(y_test, y_pred_proba5))

plt.figure(figsize=(5, 5), dpi=300)
plt.plot(fpr, tpr, color='orange', label=roc_label)
plt.legend()

# Dashed diagonal from (0, 0) to (1, 1) as the reference line.
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')

In [None]:
# Read the trial history of the search back from its log file.
from flaml.data import get_output_from_log

# Use the budget of the run that produced the log. The previous hard-coded
# time_budget=300 was copied from the 300 s run and cut off the history of this
# longer search after the first 300 seconds.
(time_history, best_valid_loss_history, valid_loss_history,
 config_history, metric_history) = get_output_from_log(
    filename=settings['log_file_name'],
    time_budget=settings['time_budget'],
)

# Print every configuration recorded in the log.
for config in config_history:
    print(config)

In [None]:
# Plot the optimisation curve of the search: validation score against wall-clock time.
import matplotlib.pyplot as plt
import numpy as np

# Use Times New Roman for sans-serif text and set axes.unicode_minus to False.
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['axes.unicode_minus'] = False

plt.figure(figsize=(5, 5), dpi=300)
plt.title('Learning Curve')
plt.xlabel('Wall Clock Time (s)')
# The search optimises 'roc_auc', so 1 - loss is the validation ROC AUC; the axis
# was previously labelled "Validation Accuracy", which mislabelled the quantity.
plt.ylabel('Validation ROC AUC')
# Every logged trial as a point, and the best-so-far score as a step line.
plt.scatter(time_history, 1 - np.array(valid_loss_history))
plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')
plt.show()