#### Accuracy vs. Temp (and spread) for the perturbed data by individual gas abundances (CO, CH4, NH3, H2O)

In [1]:
import os
import numpy as np
import pandas as pd
import glob
import json
import sklearn.model_selection as ms
from sklearn import metrics
import xgboost as xgb
import matplotlib.pyplot as plt
import time

In [2]:
def XGB_accuracy(X, Y):
    """Train an XGBoost multi-class classifier and return its hold-out accuracy.

    The data are split 80/20 (fixed ``random_state=0``), a boosted-tree
    classifier is fitted on the 80% portion with early stopping, and the
    accuracy of its predictions on the remaining 20% is returned.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` / ``XGBClassifier.fit``
        (the notebook passes a pandas DataFrame).
    Y : class labels aligned with the rows of ``X``.

    Returns
    -------
    Accuracy score computed by ``sklearn.metrics.accuracy_score`` on the
    held-out 20% split.
    """
    # Split for training and testing
    x_train, x_test, y_train, y_test = ms.train_test_split(X, Y, test_size=0.2, random_state=0)
    # Both splits are monitored during boosting; the test split is listed last.
    # NOTE(review): early stopping is therefore tuned on the same split that is
    # later used to report accuracy -- consider a separate validation split.
    eval_set = [(x_train, y_train), (x_test, y_test)]

    # Fit the boosted trees.  The keyword was previously misspelled as
    # ``min_child_wight``, so the intended min_child_weight=10 was never applied.
    classifier = xgb.XGBClassifier(objective="multi:softprob", min_child_weight=10, max_depth=5, n_estimators=1000)
    # NOTE(review): recent xgboost releases expect early_stopping_rounds and
    # eval_metric on the constructor rather than fit() -- confirm against the
    # installed version before upgrading.
    classifier = classifier.fit(x_train, y_train, early_stopping_rounds=100, eval_set=eval_set,
                                eval_metric=["merror", "mlogloss"], verbose=False)

    # Predictions
    y_pred = classifier.predict(x_test)
    return metrics.accuracy_score(y_test, y_pred)

In [3]:
# Names of all observables referenced by this notebook
# (presumably mirrors the columns of the observables CSV files -- confirm).
header = [
    'Metallicity',
    'Altitude',
    'Mean Degree',
    'CO Degree',
    'CH4 Degree',
    'NH3 Degree',
    'H2O Degree',
    'Average shortest path length',
    'Average clustering coefficient',
    'CO clustering coefficient',
    'CH4 clustering coefficient',
    'NH3 clustering coefficient',
    'H2O clustering coefficient',
    'CO node betweenness centrality',
    'CH4 node betweenness centrality',
    'NH3 node betweenness centrality',
    'H2O node betweenness centrality',
    'Edge betweenness centrality',
    'Average neighbor degree',
    'CO neighbor degree',
    'CH4 neighbor degree',
    'NH3 neighbor degree',
    'H2O neighbor degree',
    'CO abundance',
    'CH4 abundance',
    'NH3 abundance',
    'H2O abundance',
    'Delta G distribution',
    'Phi distribution',
    'Average node betweenness centrality',
    'Temperature',
    'Kzz',
]

# Network-wide (not per-species) topology measures.
header_average = [
    'Mean Degree',
    'Average shortest path length',
    'Average clustering coefficient',
    'Average neighbor degree',
    'Average node betweenness centrality',
    'Edge betweenness centrality',
]

# Every column whose name mentions an abundance, in header order.
header_abundance = [name for name in header if 'abundance' in name]

In [4]:
### Feature sets built from Delta G distribution, gas abundances and the
### network-average topology measures.  Each list ends with the 'kzz' label
### column, which the training cell takes from the last position.

f1 = ['Delta G distribution', 'kzz']
f2 = [*header_abundance, 'kzz']
f3 = [*header_average, 'kzz']
f4 = ['Delta G distribution', *header_average, 'kzz']
f5 = ['Delta G distribution', *header_abundance, 'kzz']
f6 = [*header_average, *header_abundance, 'kzz']
f7 = ['Delta G distribution', *header_abundance, *header_average, 'kzz']

In [14]:
# Train one classifier per (temperature, feature set) on the perturbed data of
# a given removed species, collect the hold-out accuracies, and dump them to
# one JSON file per species.
st = time.time()
data_dir = "/Users/hkim78/work/2020-hotJupiter/data/perturbed-data/2021/"
result_dir = "/Users/hkim78/work/2020-hotJupiter/ML/results/perturbed_data/2021/"

# Feature-set name -> list of columns (label column 'kzz' last).
feature_sets = {"f1": f1, "f2": f2, "f3": f3, "f4": f4, "f5": f5, "f6": f6, "f7": f7}

# File-name prefix of each Kzz run; the position in this list is the class
# label written to the 'kzz' column (0.00 -> 0, 1e06 -> 1, 1e08 -> 2, 1e10 -> 3).
kzz_tags = ["0.00", "1e06", "1e08", "1e10"]

# dict_accuracy[species][feature set] -> list of accuracies, one per temperature.
dict_accuracy = {
    species: {set_name: [] for set_name in feature_sets}
    for species in ["CH4", "CO", "H2O", "NH3"]
}

# Only NH3 is processed in this run; the other species keep empty lists.
for removed_species in ["NH3"]:
    for t in np.arange(400, 2100, 100):
        frames = []
        for kzz_label, kzz_tag in enumerate(kzz_tags):
            frame = pd.read_csv(
                data_dir
                + '%s_removed/%s kzz analytical 50k spread observables %s_removed %dK.csv'
                % (removed_species, kzz_tag, removed_species, t)
            )
            frame["kzz"] = kzz_label
            frames.append(frame)

        allData = pd.concat(frames, ignore_index=True)

        # Looping over the feature sets replaces seven copy-pasted stanzas; the
        # f7 stanza used to select allData5, so its results duplicated f5.
        for set_name, columns in feature_sets.items():
            subset = allData[columns]
            # Split into independent variables (all but last column) and label.
            X = subset.iloc[:, :-1]
            Y = subset.iloc[:, -1].values
            dict_accuracy[removed_species][set_name].append(XGB_accuracy(X, Y))

    output_path = result_dir + "accuracy_with_perturbation_%s.json"%removed_species
    with open(output_path, 'w') as outfile:
        json.dump(dict_accuracy[removed_species], outfile)

et = time.time()

# Total wall-clock time in seconds.
print(et - st)

NH3 400 71.02150392532349
NH3 500 117.06728172302246
NH3 600 87.44561100006104
NH3 700 209.64267301559448
NH3 800 143.2902750968933
NH3 900 127.08848786354065
NH3 1000 239.13792514801025
NH3 1100 206.0747950077057
NH3 1200 421.25581073760986
NH3 1300 184.75969099998474
NH3 1400 153.74903988838196
NH3 1500 112.12838983535767
NH3 1600 121.28483700752258
NH3 1700 166.38211679458618
NH3 1800 211.32112789154053
NH3 1900 263.65814113616943
NH3 2000 251.83801293373108
251.84034872055054
