In [1]:
import os
import shap
import smogn
import imblearn
import numpy as np
import pandas as pd
from math import sqrt
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.svm import SVR, SVC
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import SGDRegressor
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report, mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.feature_selection import f_regression, f_classif, RFE, VarianceThreshold, chi2, SelectKBest, SelectFromModel

## **Data preparation**

In [2]:
# import data and remove unwanted columns
data = pd.read_excel('augmented sample.xlsx').drop(['index',
                                                    'Formation_energy'],
                                                    axis=1)
data = data.set_index(['Material Composition'])

In [3]:
# split the dataset into X and y(Energy Above Hull)
X = data.drop(['EnergyAboveHull'], axis=1)
y = data['EnergyAboveHull']

In [4]:
# preview the shape of the dataframe
sample_size = X.shape[0]
feature_size = X.shape[1]
print("Number of samples", sample_size,
      "\nNumber of features:", feature_size)

Number of samples 2138 
Number of features: 962


In [5]:
# preprocess the y for classification
y_clf = np.zeros_like(y)

# samples with EnergyAboveHull larger than 40 will be marked as unstable
# unstable = 0, stable = 1
y_clf = [1*(EAH<=40) for EAH in y]

## **Regression**

**best parameters for XGB regressor**

{'kbest__k': 250, 'model__colsample_bytree': 0.6, 'model__max_depth': 5, 'model__n_estimators': 150, 'pca__n_components': 25}

In [6]:
xgb_regressor = xgb.XGBRegressor(colsample_bytree=0.6, 
                                 max_depth=5, 
                                 n_estimators=150)

In [7]:
# best pipeline construction
xgb_reg_pipeline = Pipeline([
    ('variance threshold', VarianceThreshold()),
    ('kbest', SelectKBest(f_regression, k=250)),
    ('standard_scaler', StandardScaler()), 
    ('pca', PCA(n_components=25)), 
    ('model', xgb_regressor)
])

**best parameters for SVM regressor**

{'svr__C': 2260}

In [8]:
svm_regressor = SVR(C=2260)

In [9]:
# best pipeline construction
svm_reg_pipeline = Pipeline([
    ('variance threshold', VarianceThreshold()),
    ('kbest', SelectKBest(f_regression, k=250)),
    ('standard_scaler', StandardScaler()), 
    ('pca', PCA(n_components=25)), 
    ('model', svm_regressor)
])

**best parameters for GBDT regressor**

{'gradientboostingregressor__alpha': 0.01}

In [10]:
gbdt_regressor = GradientBoostingRegressor(alpha=0.01)

In [11]:
# best pipeline construction
gbdt_reg_pipeline = Pipeline([
    ('variance threshold', VarianceThreshold()),
    ('kbest', SelectKBest(f_regression, k=250)),
    ('standard_scaler', StandardScaler()), 
    ('pca', PCA(n_components=25)), 
    ('model', gbdt_regressor)
])

**10-fold verification**

In [12]:
fold = 10

xgb_reg_avg_mae = []
xgb_reg_avg_rmse = []

svm_reg_avg_mae = []
svm_reg_avg_rmse = []

gbdt_reg_avg_mae = []
gbdt_reg_avg_rmse = []

for i in range(fold):
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=(1/fold))
    
    xgb_reg_pipeline.fit(X_trainval, y_trainval)
    svm_reg_pipeline.fit(X_trainval, y_trainval)
    gbdt_reg_pipeline.fit(X_trainval, y_trainval)
    
    # for xgb reg
    y_pred = xgb_reg_pipeline.predict(X_test)
    y_true = y_test
    
    xgb_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    xgb_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))
    
    # for svm reg
    y_pred = svm_reg_pipeline.predict(X_test)
    y_true = y_test
    
    svm_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    svm_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))
    
    # for gbdt reg
    y_pred = gbdt_reg_pipeline.predict(X_test)
    y_true = y_test
    
    gbdt_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    gbdt_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))



**xgb regression results**

In [13]:
print('average mae of xgb reg', 
      '{:.3f}'.format(np.mean(xgb_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_reg_avg_mae)))
print('average rmse of xgb reg', 
      '{:.3f}'.format(np.mean(xgb_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_reg_avg_rmse)))

average mae of xgb reg 27.377 ± 6.226
average rmse of xgb reg 48.451 ± 23.966


**svm regression results**

In [14]:
print('average mae of svm reg', 
      '{:.3f}'.format(np.mean(svm_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_reg_avg_mae)))
print('average rmse of svm reg', 
      '{:.3f}'.format(np.mean(svm_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_reg_avg_rmse)))

average mae of svm reg 18.940 ± 5.122
average rmse of svm reg 44.319 ± 28.163


**gbdt regression results**

In [15]:
print('average mae of gbdt reg', 
      '{:.3f}'.format(np.mean(gbdt_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_reg_avg_mae)))
print('average rmse of gbdt reg', 
      '{:.3f}'.format(np.mean(gbdt_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_reg_avg_rmse)))

average mae of gbdt reg 32.443 ± 6.260
average rmse of gbdt reg 53.768 ± 23.394


## **Classification**

**best parameters for XGB classifier**

{'kbest__k': 450, 'model__colsample_bytree': 0.6, 'model__max_depth': 5, 'model__n_estimators': 650, 'pca__n_components': 50}

{'sampling__k_neighbors': 1, 'sampling__m_neighbors': 1}

In [16]:
xgb_classifier = xgb.XGBClassifier(max_depth=5, 
                                   n_estimators=650, 
                                   colsample_bytree=0.6)

In [17]:
# best pipeline construction
xgb_clf_pipeline = Pipeline([
    ('sampling', BorderlineSMOTE(k_neighbors=1, 
                                 m_neighbors=1, 
                                 sampling_strategy='minority')),
    ('variance threshold', VarianceThreshold()),
    ('kbest', SelectKBest(f_classif, k=450)),
    ('standard_scaler', StandardScaler()), 
    ('pca', PCA(n_components=50)), 
    ('model', xgb_classifier)
])

**best parameters for SVM classifier**

{'svc__C': 81}

{'sampling__k_neighbors': 1, 'sampling__m_neighbors': 1}

In [18]:
svm_classifier = SVC(C=81)

In [19]:
# best pipeline construction
svm_clf_pipeline = Pipeline([
    ('sampling', BorderlineSMOTE(k_neighbors=1, 
                                 m_neighbors=1, 
                                 sampling_strategy='minority')),
    ('variance threshold', VarianceThreshold()),
    ('kbest', SelectKBest(f_classif, k=450)),
    ('standard_scaler', StandardScaler()), 
    ('pca', PCA(n_components=50)), 
    ('model', svm_classifier)
])

**best parameters for GBDT classifier**

{'gradientboostingclassifier__ccp_alpha': 0.0}

{'sampling__k_neighbors': 1, 'sampling__m_neighbors': 1}

In [20]:
gbdt_classifier = GradientBoostingClassifier(ccp_alpha=0)

In [21]:
# best pipeline construction
gbdt_clf_pipeline = Pipeline([
    ('sampling', BorderlineSMOTE(k_neighbors=1, 
                                 m_neighbors=1, 
                                 sampling_strategy='minority')),
    ('variance threshold', VarianceThreshold()),
    ('kbest', SelectKBest(f_classif, k=450)),
    ('standard_scaler', StandardScaler()), 
    ('pca', PCA(n_components=50)), 
    ('model', gbdt_classifier)
])

**10-fold verification**

In [22]:
fold = 10

xgb_clf_weighted_avg_f1 = []
svm_clf_weighted_avg_f1 = []
gbdt_clf_weighted_avg_f1 = []

for i in range(fold):
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y_clf, test_size=(1/fold))
    
    xgb_clf_pipeline.fit(X_trainval, y_trainval)
    svm_clf_pipeline.fit(X_trainval, y_trainval)
    gbdt_clf_pipeline.fit(X_trainval, y_trainval)
    
    # for xgb clf
    y_pred = xgb_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    xgb_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])
    
    # for svm clf
    y_pred = svm_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    svm_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])
    
    # for gbdt clf
    y_pred = gbdt_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    gbdt_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])

**xgb classification results**

In [23]:
print('average weighted f1 score for xgb clf', 
      '{:.3f}'.format(np.mean(xgb_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_clf_weighted_avg_f1)))

average weighted f1 score for xgb clf 0.909 ± 0.049


**svm classification results**

In [24]:
print('average weighted f1 score for svm clf', 
      '{:.3f}'.format(np.mean(svm_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_clf_weighted_avg_f1)))

average weighted f1 score for svm clf 0.921 ± 0.027


**gbdt classification results**

In [25]:
print('average weighted f1 score for gbdt clf', 
      '{:.3f}'.format(np.mean(gbdt_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_clf_weighted_avg_f1)))

average weighted f1 score for gbdt clf 0.872 ± 0.031


## **Evaluation on subgroups**

**Ba, Sr, Fe, Co subset creation**

In [26]:
Ba_list = []
Sr_list = []
Fe_list = []
Co_list = []
for i in range(len(data.index)):
    if 'Ba' in data.index[i]:
        Ba_list.append(i)
    if 'Sr' in data.index[i]:
        Sr_list.append(i)
    if 'Fe' in data.index[i]:
        Fe_list.append(i)
    if 'Co' in data.index[i]:
        Co_list.append(i)
data_Ba = data.iloc[Ba_list]
data_Sr = data.iloc[Sr_list]
data_Fe = data.iloc[Fe_list]
data_Co = data.iloc[Co_list]

**modeling on Ba**

In [27]:
# split the dataset into X and y(Energy Above Hull)
X_ba = data_Ba.drop(['EnergyAboveHull'], axis=1)
y_ba = data_Ba['EnergyAboveHull']

y_ba_clf = np.zeros_like(y_ba)
y_ba_clf = [1*(EAH<=40) for EAH in y_ba]

**regression on Ba**

In [28]:
fold = 10

xgb_reg_avg_mae = []
xgb_reg_avg_rmse = []

svm_reg_avg_mae = []
svm_reg_avg_rmse = []

gbdt_reg_avg_mae = []
gbdt_reg_avg_rmse = []

for i in range(fold):
    X_trainval, X_test, y_trainval, y_test = train_test_split(X_ba, y_ba, test_size=(1/fold))
    
    xgb_reg_pipeline.fit(X_trainval, y_trainval)
    svm_reg_pipeline.fit(X_trainval, y_trainval)
    gbdt_reg_pipeline.fit(X_trainval, y_trainval)
    
    # for xgb reg
    y_pred = xgb_reg_pipeline.predict(X_test)
    y_true = y_test
    
    xgb_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    xgb_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))
    
    # for svm reg
    y_pred = svm_reg_pipeline.predict(X_test)
    y_true = y_test
    
    svm_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    svm_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))
    
    # for gbdt reg
    y_pred = gbdt_reg_pipeline.predict(X_test)
    y_true = y_test
    
    gbdt_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    gbdt_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))



**xgb regression results**

In [29]:
print('average mae of xgb reg', 
      '{:.3f}'.format(np.mean(xgb_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_reg_avg_mae)))
print('average rmse of xgb reg', 
      '{:.3f}'.format(np.mean(xgb_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_reg_avg_rmse)))

average mae of xgb reg 33.743 ± 7.160
average rmse of xgb reg 66.046 ± 47.597


**svm regression results**

In [30]:
print('average mae of svm reg', 
      '{:.3f}'.format(np.mean(svm_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_reg_avg_mae)))
print('average rmse of svm reg', 
      '{:.3f}'.format(np.mean(svm_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_reg_avg_rmse)))

average mae of svm reg 23.625 ± 7.366
average rmse of svm reg 60.654 ± 54.178


**gbdt regression results**

In [31]:
print('average mae of gbdt reg', 
      '{:.3f}'.format(np.mean(gbdt_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_reg_avg_mae)))
print('average rmse of gbdt reg', 
      '{:.3f}'.format(np.mean(gbdt_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_reg_avg_rmse)))

average mae of gbdt reg 39.047 ± 10.114
average rmse of gbdt reg 72.620 ± 42.463


**classification on Ba**

In [32]:
fold = 10

xgb_clf_weighted_avg_f1 = []
svm_clf_weighted_avg_f1 = []
gbdt_clf_weighted_avg_f1 = []

for i in range(fold):
    X_trainval, X_test, y_trainval, y_test = train_test_split(X_ba, y_ba_clf, test_size=(1/fold))
    
    xgb_clf_pipeline.fit(X_trainval, y_trainval)
    svm_clf_pipeline.fit(X_trainval, y_trainval)
    gbdt_clf_pipeline.fit(X_trainval, y_trainval)
    
    # for xgb clf
    y_pred = xgb_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    xgb_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])
    
    # for svm clf
    y_pred = svm_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    svm_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])
    
    # for gbdt clf
    y_pred = gbdt_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    gbdt_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])

**xgb classification results**

In [33]:
print('average weighted f1 score for xgb clf', 
      '{:.3f}'.format(np.mean(xgb_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_clf_weighted_avg_f1)))

average weighted f1 score for xgb clf 0.908 ± 0.048


**svm classification results**

In [34]:
print('average weighted f1 score for svm clf', 
      '{:.3f}'.format(np.mean(svm_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_clf_weighted_avg_f1)))

average weighted f1 score for svm clf 0.923 ± 0.042


**gbdt classification results**

In [35]:
print('average weighted f1 score for gbdt clf', 
      '{:.3f}'.format(np.mean(gbdt_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_clf_weighted_avg_f1)))

average weighted f1 score for gbdt clf 0.895 ± 0.068


**modeling on Sr**

In [36]:
# split the dataset into X and y(Energy Above Hull)
X_sr = data_Sr.drop(['EnergyAboveHull'], axis=1)
y_sr = data_Sr['EnergyAboveHull']

y_sr_clf = np.zeros_like(y_sr)
y_sr_clf = [1*(EAH<=40) for EAH in y_sr]

**regression on Sr**

In [37]:
fold = 10

xgb_reg_avg_mae = []
xgb_reg_avg_rmse = []

svm_reg_avg_mae = []
svm_reg_avg_rmse = []

gbdt_reg_avg_mae = []
gbdt_reg_avg_rmse = []

for i in range(fold):
    X_trainval, X_test, y_trainval, y_test = train_test_split(X_sr, y_sr, test_size=(1/fold))
    
    xgb_reg_pipeline.fit(X_trainval, y_trainval)
    svm_reg_pipeline.fit(X_trainval, y_trainval)
    gbdt_reg_pipeline.fit(X_trainval, y_trainval)
    
    # for xgb reg
    y_pred = xgb_reg_pipeline.predict(X_test)
    y_true = y_test
    
    xgb_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    xgb_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))
    
    # for svm reg
    y_pred = svm_reg_pipeline.predict(X_test)
    y_true = y_test
    
    svm_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    svm_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))
    
    # for gbdt reg
    y_pred = gbdt_reg_pipeline.predict(X_test)
    y_true = y_test
    
    gbdt_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    gbdt_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))



**xgb regression results**

In [38]:
print('average mae of xgb reg', 
      '{:.3f}'.format(np.mean(xgb_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_reg_avg_mae)))
print('average rmse of xgb reg', 
      '{:.3f}'.format(np.mean(xgb_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_reg_avg_rmse)))

average mae of xgb reg 21.825 ± 3.378
average rmse of xgb reg 31.385 ± 4.426


**svm regression results**

In [39]:
print('average mae of svm reg', 
      '{:.3f}'.format(np.mean(svm_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_reg_avg_mae)))
print('average rmse of svm reg', 
      '{:.3f}'.format(np.mean(svm_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_reg_avg_rmse)))

average mae of svm reg 13.437 ± 4.570
average rmse of svm reg 23.101 ± 8.466


**gbdt regression results**

In [40]:
print('average mae of gbdt reg', 
      '{:.3f}'.format(np.mean(gbdt_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_reg_avg_mae)))
print('average rmse of gbdt reg', 
      '{:.3f}'.format(np.mean(gbdt_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_reg_avg_rmse)))

average mae of gbdt reg 24.526 ± 3.511
average rmse of gbdt reg 33.836 ± 6.283


**classification on Sr**

In [41]:
fold = 10

xgb_clf_weighted_avg_f1 = []
svm_clf_weighted_avg_f1 = []
gbdt_clf_weighted_avg_f1 = []

for i in range(fold):
    X_trainval, X_test, y_trainval, y_test = train_test_split(X_sr, y_sr_clf, test_size=(1/fold))
    
    xgb_clf_pipeline.fit(X_trainval, y_trainval)
    svm_clf_pipeline.fit(X_trainval, y_trainval)
    gbdt_clf_pipeline.fit(X_trainval, y_trainval)
    
    # for xgb clf
    y_pred = xgb_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    xgb_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])
    
    # for svm clf
    y_pred = svm_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    svm_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])
    
    # for gbdt clf
    y_pred = gbdt_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    gbdt_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])

**xgb classification results**

In [42]:
print('average weighted f1 score for xgb clf', 
      '{:.3f}'.format(np.mean(xgb_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_clf_weighted_avg_f1)))

average weighted f1 score for xgb clf 0.935 ± 0.086


**svm classification results**

In [43]:
print('average weighted f1 score for svm clf', 
      '{:.3f}'.format(np.mean(svm_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_clf_weighted_avg_f1)))

average weighted f1 score for svm clf 0.933 ± 0.065


**gbdt classification results**

In [44]:
print('average weighted f1 score for gbdt clf', 
      '{:.3f}'.format(np.mean(gbdt_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_clf_weighted_avg_f1)))

average weighted f1 score for gbdt clf 0.932 ± 0.067


**modeling on Fe**

In [45]:
# split the dataset into X and y(Energy Above Hull)
X_fe = data_Fe.drop(['EnergyAboveHull'], axis=1)
y_fe = data_Fe['EnergyAboveHull']

y_fe_clf = np.zeros_like(y_fe)
y_fe_clf = [1*(EAH<=40) for EAH in y_fe]

**regression on Fe**

In [46]:
fold = 10

xgb_reg_avg_mae = []
xgb_reg_avg_rmse = []

svm_reg_avg_mae = []
svm_reg_avg_rmse = []

gbdt_reg_avg_mae = []
gbdt_reg_avg_rmse = []

for i in range(fold):
    X_trainval, X_test, y_trainval, y_test = train_test_split(X_fe, y_fe, test_size=(1/fold))
    
    xgb_reg_pipeline.fit(X_trainval, y_trainval)
    svm_reg_pipeline.fit(X_trainval, y_trainval)
    gbdt_reg_pipeline.fit(X_trainval, y_trainval)
    
    # for xgb reg
    y_pred = xgb_reg_pipeline.predict(X_test)
    y_true = y_test
    
    xgb_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    xgb_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))
    
    # for svm reg
    y_pred = svm_reg_pipeline.predict(X_test)
    y_true = y_test
    
    svm_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    svm_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))
    
    # for gbdt reg
    y_pred = gbdt_reg_pipeline.predict(X_test)
    y_true = y_test
    
    gbdt_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    gbdt_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))



**xgb regression results**

In [47]:
print('average mae of xgb reg', 
      '{:.3f}'.format(np.mean(xgb_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_reg_avg_mae)))
print('average rmse of xgb reg', 
      '{:.3f}'.format(np.mean(xgb_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_reg_avg_rmse)))

average mae of xgb reg 29.623 ± 8.216
average rmse of xgb reg 53.328 ± 40.789


**svm regression results**

In [48]:
print('average mae of svm reg', 
      '{:.3f}'.format(np.mean(svm_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_reg_avg_mae)))
print('average rmse of svm reg', 
      '{:.3f}'.format(np.mean(svm_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_reg_avg_rmse)))

average mae of svm reg 21.700 ± 8.574
average rmse of svm reg 49.154 ± 53.715


**gbdt regression results**

In [49]:
print('average mae of gbdt reg', 
      '{:.3f}'.format(np.mean(gbdt_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_reg_avg_mae)))
print('average rmse of gbdt reg', 
      '{:.3f}'.format(np.mean(gbdt_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_reg_avg_rmse)))

average mae of gbdt reg 36.139 ± 10.627
average rmse of gbdt reg 59.467 ± 37.883


**classification on Fe**

In [50]:
fold = 10

xgb_clf_weighted_avg_f1 = []
svm_clf_weighted_avg_f1 = []
gbdt_clf_weighted_avg_f1 = []

for i in range(fold):
    X_trainval, X_test, y_trainval, y_test = train_test_split(X_fe, y_fe_clf, test_size=(1/fold))
    
    xgb_clf_pipeline.fit(X_trainval, y_trainval)
    svm_clf_pipeline.fit(X_trainval, y_trainval)
    gbdt_clf_pipeline.fit(X_trainval, y_trainval)
    
    # for xgb clf
    y_pred = xgb_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    xgb_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])
    
    # for svm clf
    y_pred = svm_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    svm_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])
    
    # for gbdt clf
    y_pred = gbdt_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    gbdt_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])

**xgb classifcation results**

In [51]:
print('average weighted f1 score for xgb clf', 
      '{:.3f}'.format(np.mean(xgb_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_clf_weighted_avg_f1)))

average weighted f1 score for xgb clf 0.894 ± 0.077


**svm classifcation results**

In [52]:
print('average weighted f1 score for svm clf', 
      '{:.3f}'.format(np.mean(svm_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_clf_weighted_avg_f1)))

average weighted f1 score for svm clf 0.927 ± 0.051


**gbdt classifcation results**

In [53]:
print('average weighted f1 score for gbdt clf', 
      '{:.3f}'.format(np.mean(gbdt_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_clf_weighted_avg_f1)))

average weighted f1 score for gbdt clf 0.887 ± 0.097


**modeling on Co**

In [54]:
# split the dataset into X and y(Energy Above Hull)
X_co = data_Co.drop(['EnergyAboveHull'], axis=1)
y_co = data_Co['EnergyAboveHull']

y_co_clf = np.zeros_like(y_co)
y_co_clf = [1*(EAH<=40) for EAH in y_co]

**regression on Co**

In [55]:
fold = 10

xgb_reg_avg_mae = []
xgb_reg_avg_rmse = []

svm_reg_avg_mae = []
svm_reg_avg_rmse = []

gbdt_reg_avg_mae = []
gbdt_reg_avg_rmse = []

for i in range(fold):
    X_trainval, X_test, y_trainval, y_test = train_test_split(X_co, y_co, test_size=(1/fold))
    
    xgb_reg_pipeline.fit(X_trainval, y_trainval)
    svm_reg_pipeline.fit(X_trainval, y_trainval)
    gbdt_reg_pipeline.fit(X_trainval, y_trainval)
    
    # for xgb reg
    y_pred = xgb_reg_pipeline.predict(X_test)
    y_true = y_test
    
    xgb_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    xgb_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))
    
    # for svm reg
    y_pred = svm_reg_pipeline.predict(X_test)
    y_true = y_test
    
    svm_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    svm_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))
    
    # for gbdt reg
    y_pred = gbdt_reg_pipeline.predict(X_test)
    y_true = y_test
    
    gbdt_reg_avg_mae.append(mean_absolute_error(y_true, y_pred))
    gbdt_reg_avg_rmse.append(sqrt(mean_squared_error(y_true, y_pred)))



**xgb regression results**

In [56]:
print('average mae of xgb reg', 
      '{:.3f}'.format(np.mean(xgb_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_reg_avg_mae)))
print('average rmse of xgb reg', 
      '{:.3f}'.format(np.mean(xgb_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_reg_avg_rmse)))

average mae of xgb reg 20.983 ± 7.081
average rmse of xgb reg 30.771 ± 13.017


**svm regression results**

In [57]:
print('average mae of svm reg', 
      '{:.3f}'.format(np.mean(svm_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_reg_avg_mae)))
print('average rmse of svm reg', 
      '{:.3f}'.format(np.mean(svm_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_reg_avg_rmse)))

average mae of svm reg 17.530 ± 7.541
average rmse of svm reg 28.730 ± 14.677


**gbdt regression results**

In [58]:
print('average mae of gbdt reg', 
      '{:.3f}'.format(np.mean(gbdt_reg_avg_mae)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_reg_avg_mae)))
print('average rmse of gbdt reg', 
      '{:.3f}'.format(np.mean(gbdt_reg_avg_rmse)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_reg_avg_rmse)))

average mae of gbdt reg 22.271 ± 7.975
average rmse of gbdt reg 31.126 ± 14.163


**classification on Co**

In [59]:
fold = 10

xgb_clf_weighted_avg_f1 = []
svm_clf_weighted_avg_f1 = []
gbdt_clf_weighted_avg_f1 = []

for i in range(fold):
    X_trainval, X_test, y_trainval, y_test = train_test_split(X_co, y_co_clf, test_size=(1/fold))
    
    xgb_clf_pipeline.fit(X_trainval, y_trainval)
    svm_clf_pipeline.fit(X_trainval, y_trainval)
    gbdt_clf_pipeline.fit(X_trainval, y_trainval)
    
    # for xgb clf
    y_pred = xgb_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    xgb_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])
    
    # for svm clf
    y_pred = svm_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    svm_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])
    
    # for gbdt clf
    y_pred = gbdt_clf_pipeline.predict(X_test)
    y_true = y_test
    
    target_names = ['unstale', 'stable']
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    gbdt_clf_weighted_avg_f1.append(report['weighted avg']['f1-score'])

**xgb classification results**

In [60]:
print('average weighted f1 score for xgb clf', 
      '{:.3f}'.format(np.mean(xgb_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(xgb_clf_weighted_avg_f1)))

average weighted f1 score for xgb clf 0.915 ± 0.109


**svm classification results**

In [61]:
print('average weighted f1 score for svm clf', 
      '{:.3f}'.format(np.mean(svm_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(svm_clf_weighted_avg_f1)))

average weighted f1 score for svm clf 0.921 ± 0.063


**gbdt classification results**

In [62]:
print('average weighted f1 score for gbdt clf', 
      '{:.3f}'.format(np.mean(gbdt_clf_weighted_avg_f1)), 
      '±', 
      '{:.3f}'.format(2*np.std(gbdt_clf_weighted_avg_f1)))

average weighted f1 score for gbdt clf 0.896 ± 0.077
