1. Import Libraries

In [None]:
# Import path setup: make the directory containing this script importable
# (presumably so the local `function` module imported below can be found).
import sys
import os

# `__file__` is not defined when this code runs as a notebook cell, so fall
# back to the current working directory instead of failing with NameError.
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()
sys.path.append(script_dir)

In [None]:
#import 
from function import yyplot_k, boruta, search_highly_correlated_variables

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import ARDRegression
from sklearn.model_selection import cross_val_predict, GridSearchCV, KFold
from sklearn.metrics import pairwise_distances, r2_score, mean_squared_error, mean_absolute_error
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import itertools

In [None]:
# Random seed.
# Passed to boruta() as its `rseed_boruta` argument in the main script below;
# presumably it fixes the randomness of the Boruta feature selection so runs
# are repeatable -- confirm against function.boruta.
rseed_boruta = 1

In [None]:
if __name__ == "__main__":
    # Script overview:
    #   1. load the descriptor table and build the target log(PDI - 1)
    #   2. reduce the features (correlation filter, then Boruta)
    #   3. fit an ARD regression on standardized features
    #   4. save the fitted values, a yy-plot and a coefficient bar chart
    import inspect  # local import: only used for the scikit-learn version check below

    # file & preprocessing
    df = pd.read_excel("./0_doctor_reserch/1_nitroxide/paper/sentence/論文用_github/data/XY/JACS/1999_JACS_St_mechanism_oriented_descriptors.xlsx", index_col = 0)

    # Sample labels are handled as strings so they can be dropped/annotated by name.
    df.index = df.index.astype("str")
    df = df.drop(index=["41"])  # Select the sample you want to use

    # Explanatory variables: every column from the first one up to and including "ΔH".
    X = df.loc[:, :"ΔH"]

    # Target is log(PDI - 1). Fail early with a clear message instead of
    # letting -inf/NaN targets reach the feature selection and the model.
    pdi_excess = df["PDI"] - 1
    if (pdi_excess <= 0).any():
        raise ValueError("PDI must be greater than 1 for every sample to compute log(PDI - 1)")
    y = np.log(pdi_excess)

    # Newer scikit-learn releases renamed `n_iter` to `max_iter` and later
    # removed the old name; use whichever keyword the installed version accepts.
    ard_params = inspect.signature(ARDRegression).parameters
    iter_kwarg = "max_iter" if "max_iter" in ard_params else "n_iter"
    model = ARDRegression(**{iter_kwarg: 1000})

    # Feature selection: correlation filter (threshold 0.8), then Boruta.
    X = search_highly_correlated_variables(X, 0.8)
    X = boruta(X, y, perc = 90, rseed_boruta = rseed_boruta)

    # Standardize the selected features before fitting.
    scaler_X = StandardScaler()
    X_sc = pd.DataFrame(scaler_X.fit_transform(X), index = X.index, columns = X.columns)

    model.fit(X_sc, y)
    y_pred = model.predict(X_sc)

    coefficients = model.coef_

    # Back-transform from log(PDI - 1) to PDI. Building the frame directly from
    # the computed values gives float columns; pre-allocating an empty frame and
    # filling it afterwards starts from object-dtype columns.
    fitting_results = pd.DataFrame(
        {"pred_y": np.exp(y_pred) + 1, "observed_y": np.exp(y) + 1},
        index = X.index,
        columns = ["pred_y", "observed_y"],
    )

    dirname = "./0_doctor_reserch/1_nitroxide/paper/sentence/論文用_github/result"
    os.makedirs(dirname, exist_ok = True)

    fitting_results.to_excel(os.path.join(dirname, "ARD_fitting.xlsx"))

    # evaluation metrics (computed on the training data itself)
    r2 = r2_score(fitting_results["observed_y"], fitting_results["pred_y"])
    MAE = mean_absolute_error(fitting_results["observed_y"], fitting_results["pred_y"])

    # yy-plot
    yyplot_k(fitting_results["observed_y"], fitting_results["pred_y"])

    # Label every point with its sample name, shifted 5 points upwards.
    for label, observed, predicted in zip(
        fitting_results.index, fitting_results["observed_y"], fitting_results["pred_y"]
    ):
        plt.annotate(label, xy = (observed, predicted), xytext = (0, 5),  # Adjust these values as needed
                     textcoords = "offset points", size = 8, color = "steelblue")

    plt.text(0.05, 0.95, r"$R^2$={}, MAE={}".format(round(r2, 2), round(MAE, 2)), transform=plt.gca().transAxes,
             verticalalignment='top', horizontalalignment='left',
             bbox=dict(facecolor='white', edgecolor='none', alpha=0.5))

    # Output file names are kept exactly as before (including their spelling)
    # so existing references to the result files keep working.
    plt.savefig(os.path.join(dirname, "ARD_fitiing.jpg"))

    # feature importance
    plt.figure(figsize = [15, 8])
    plt.rcParams["font.size"] = 30
    # Keep only the features whose coefficient is not (effectively) zero,
    # i.e. whose absolute value exceeds the cut-off.
    coefficient_cutoff = 0.002
    nonzero_coefficients = [i for i, coef in enumerate(coefficients) if abs(coef) > coefficient_cutoff]
    X_selected = X.iloc[:, nonzero_coefficients]
    # The column labels and coef_ are already one-dimensional, so no ravel() is needed.
    plt.barh(list(X_selected.columns), coefficients[nonzero_coefficients])
    plt.grid()
    plt.title("feature importance")
    plt.tight_layout()

    plt.savefig(os.path.join(dirname, "ARD_fitiing_feature_importance.jpg"))