#### Accuracy vs. Temp (and spread) for individual gas abundances (CO, CH4, NH3, H2O)
#### Accuracy vs. Temp (and spread) for CO/CH4 (or "cross-mix" systems, CO/NH3 or CH4/NH3)

In [1]:
import os
import numpy as np
import pandas as pd
import json
import glob
import sklearn.model_selection as ms
from sklearn import metrics
import xgboost as xgb
from itertools import combinations
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings(action="ignore", category=UserWarning)

In [2]:
def XGB_accuracy(X, Y):
    
    # split datasets for training and testing
    x_train, x_test, y_train, y_test = ms.train_test_split(X, Y, test_size=0.2, random_state=0)
    eval_set = [(x_train, y_train), (x_test, y_test)]

    # fit the decision tree
    classifier = xgb.XGBClassifier(objective="multi:softprob", min_child_wight=10, max_depth=5, n_estimators=1000)
    classifier = classifier.fit(x_train, y_train, early_stopping_rounds=100, eval_set=eval_set,
                                eval_metric=["merror", "mlogloss"], verbose=False)
    # predictions
    y_pred = classifier.predict(x_test)
    return metrics.accuracy_score(y_test, y_pred)

#### combinations of abundance of 4 compounds for feature and variables

In [3]:
def select_abundance(list_var, nbr_selection):
    var_comb = dict()
    length = 0  
    for n in combinations(list_var, nbr_selection):
        var_comb[length] = list()
        for c in n:
            var_comb[length].append(c)  
        length += 1
    return var_comb

def convert_listVar_to_str(listVar, cutout='', connection='_'):
    var_name = ""
    for c in listVar:
        var_name += c.replace(cutout, connection)
    return var_name

#### Individual plots with different combination of abundance.

In [4]:
dir_plot = "/Users/hkim78/work/HotJupiter/plot/atmosphere-uncertainty/machine_learning/"
if not os.path.exists(dir_plot):
    os.mkdir(dir_plot)

result_dir = "/Users/hkim78/work/HotJupiter/ML/results/accuracy/"

In [5]:
header_abundance = ['CH4 abundance', 'CO abundance', 'H2O abundance', 'NH3 abundance'] 
header_comb1 = select_abundance(header_abundance, 1)
header_comb2 = select_abundance(header_abundance, 2)
header_comb3 = select_abundance(header_abundance, 3)

In [None]:
for h in [header_comb1, header_comb2, header_comb3]: 
    for n in h:
        dict_accuracy = dict()
        st = time.time()
        for spread in ["50", "250", "1000"]: #spread
            data_dir = "/Users/hkim78/work/HotJupiter/data/atmosphere-uncertainty/parsed_data/%sk_spread/"%spread
            plot_dir = "/Users/hkim78/work/HotJupiter/plot/atmosphere-uncertainty/%sk_spread/"%spread
            dict_accuracy[spread] = list()

            for t in np.arange(400, 2100, 100):
                data0 = pd.read_csv(data_dir + 'kzz0_temp%d_spread%s.csv'%(t, spread))
                data1 = pd.read_csv(data_dir + 'kzz1_temp%d_spread%s.csv'%(t, spread))
                data2 = pd.read_csv(data_dir + 'kzz2_temp%d_spread%s.csv'%(t, spread))
                data3 = pd.read_csv(data_dir + 'kzz3_temp%d_spread%s.csv'%(t, spread))

                frames = [data0, data1, data2, data3]
                features = h[n] + ['kzz']
                allData = pd.concat(frames, ignore_index=True)

                allData = allData[features]

                # split into dependent and independent variables
                X = allData.iloc[:, :-1]
                Y = allData.iloc[:, -1].values
                a = XGB_accuracy(X, Y)
                dict_accuracy[spread].append(a)

        var = convert_listVar_to_str(h[n], " abundance")
        output_path = result_dir + var + "accuracy.json"

        # with open(output_path, 'w') as outfile:
        #     json.dump(dict_accuracy, outfile)
        et = time.time()
        print(n, h[n], var, et-st)

Parameters: { "min_child_wight" } are not used.

Parameters: { "min_child_wight" } are not used.



In [8]:
result_dir = "/Users/hkim78/work/2020-hotJupiter/ML/results/accuracy/abundance/"

input_paths = dict()

list_com_var = list()

for file_path in glob.glob(result_dir +"*.json"):
    
    var = file_path.split("abundance/")[1].split("_accuracy")[0]
    list_com_var.append(var)
    
    with open(file_path) as infile:
        a = json.load(infile)
    
    list_cc = ['#1E88E5', '#BB5566', '#228833']

    plt.figure(figsize=(3.3, 2.7))
    i = 0
    for spread in ["50", "250", "1000"]:
        plt.plot(a[spread], label=spread, linewidth=2, color=list_cc[i])

        plt.xticks(np.arange(0, 17, 2), np.arange(400, 2100, 200), fontsize=7.5, rotation=30)
        plt.yticks(fontsize=7.5)
        plt.xlabel("Mean Temperature (K)", fontsize=8)
        plt.ylabel("Accuracy", fontsize=8)
        i += 1

    plt.title("%s"%var, fontsize=9)
    plt.ylim([0.0,1.05])
    legend_spread = plt.legend(prop={"size":7.5}, title='Spread')
    plt.setp(legend_spread.get_title(), fontsize= 7.5)
    plt.tight_layout()
    # plt.savefig(dir_plot + "accuracy_%s.png"%var, dpi=90)
    plt.show()