In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import VarianceThreshold, RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import shap
from xgboost import XGBRegressor
import time
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from mpl_toolkits.axes_grid1 import make_axes_locatable
import warnings
warnings.filterwarnings('ignore') 

In [2]:
def del_constant(df_train):
    """Drop the descriptor columns that VarianceThreshold rejects.

    Every column other than 'Repeat_Unit' and 'Tg' is treated as a
    descriptor and screened with a default-constructed VarianceThreshold.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Table containing 'Repeat_Unit', 'Tg' and the descriptor columns.

    Returns
    -------
    pandas.DataFrame
        'Repeat_Unit', the surviving descriptors in their original order,
        then 'Tg'.

    Raises
    ------
    ValueError
        If 'Repeat_Unit' or 'Tg' is not among the columns.
    """
    descriptor_cols = list(df_train.columns)
    for reserved in ('Repeat_Unit', 'Tg'):
        descriptor_cols.remove(reserved)

    # Only the fitted support mask is needed, so the transformed matrix is
    # never materialised.
    variance_filter = VarianceThreshold()
    variance_filter.fit(df_train[descriptor_cols].values)

    surviving = [descriptor_cols[pos]
                 for pos in variance_filter.get_support(indices=True)]
    return df_train[['Repeat_Unit', *surviving, 'Tg']]

In [3]:
def del_Colinear(train_df, threshold=0.95, target='Tg'):
    """Drop one descriptor out of every highly collinear descriptor pair.

    Pearson correlations are computed over every column except
    'Repeat_Unit'. Descriptors are scanned in column order; whenever two
    descriptors that are *both still retained* have |r| > ``threshold``, the
    one whose absolute correlation with ``target`` is smaller is dropped
    (on a tie the earlier column is kept).

    The target is located by name, so it may sit anywhere in the frame and is
    never itself a candidate for removal. A descriptor is only ever compared
    with descriptors that are still retained, so it cannot be discarded in
    favour of a column that has already been discarded.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain 'Repeat_Unit' and ``target``; the remaining columns are
        handed to ``DataFrame.corr``.
    threshold : float, default 0.95
        Absolute Pearson correlation above which two descriptors count as
        collinear.
    target : str, default 'Tg'
        Name of the response column used to decide which descriptor to keep.

    Returns
    -------
    pandas.DataFrame
        ``train_df`` without the dropped descriptors. The input frame is not
        modified.

    Raises
    ------
    KeyError
        If 'Repeat_Unit' is missing, or ``target`` is absent from the
        correlation matrix.
    """
    corr = train_df.drop(columns=['Repeat_Unit']).corr(method='pearson')
    names = list(corr.columns)
    if target not in names:
        raise KeyError(
            f"target column {target!r} is missing from the correlation matrix"
        )

    abs_corr = corr.abs().to_numpy()
    target_pos = names.index(target)
    # |r(descriptor, target)| for every column, indexed by position.
    target_strength = abs_corr[:, target_pos]
    feature_pos = [pos for pos in range(len(names)) if pos != target_pos]

    dropped = set()
    for rank, i in enumerate(feature_pos):
        if i in dropped:
            continue
        for j in feature_pos[rank + 1:]:
            if j in dropped:
                # Already removed: it can no longer make column i redundant.
                continue
            # Written as `not (... > ...)` so that a NaN correlation is
            # treated as "not collinear".
            if not abs_corr[i, j] > threshold:
                continue
            if target_strength[i] >= target_strength[j]:
                dropped.add(j)
            else:
                # Column i lost; nothing further to compare it against.
                dropped.add(i)
                break

    del_name = [names[pos] for pos in sorted(dropped)]
    return train_df.drop(columns=del_name)

In [4]:
# Load the descriptor table, then prune it in two passes: del_constant first,
# del_Colinear second.
df0 = pd.read_csv('./polymer_content_molecule_descriptors_final.csv')
df_t1 = del_constant(df0)
# Only the first 40 rows of the filtered table are carried forward.
df = del_Colinear(df_t1).head(40)

In [None]:
# Run the two local helper scripts with IPython's %run magic.
# NOTE(review): `kNNMTD` and `PCD`, used in later cells, are not imported
# anywhere in the visible notebook - presumably these scripts define them; confirm.
%run utils.py
%run kNNMTD.py

In [47]:
# The generator receives every column except 'Repeat_Unit'.
real = df.drop(columns=['Repeat_Unit'])

# NOTE(review): kNNMTD and PCD are not defined in the visible cells -
# presumably they come from the scripts executed with %run; confirm the
# meaning of n_obs / k / sigma_factor there.
generator = kNNMTD(n_obs=200, k=5, sigma_factor=0.25, random_state=2042)
synthetic = generator.fit(real, class_col='Tg')

# A bare `synthetic` expression used to sit here. Only a cell's last
# expression is displayed, so it had no effect and has been removed.
pcd = PCD(real, synthetic)

In [None]:
# Round the synthetic target to two decimals (written back into `synthetic`),
# then stack the real rows on top of the synthetic rows under a fresh index.
synthetic['Tg'] = synthetic['Tg'].round(decimals=2)
combined_data = pd.concat(objs=[real, synthetic], axis=0, ignore_index=True)

In [None]:
# Features and target are taken from the synthetic samples.
X = synthetic.drop(columns=['Tg'])
# StandardScaler expects 2-D input, so the target is kept as a single column.
y = synthetic['Tg'].to_numpy().reshape(-1, 1)

# Split BEFORE scaling: fitting the target scaler on all rows would let the
# mean/variance of the hold-out rows leak into the training targets.
X_train, X_test, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=66
)

# Target statistics are learned from the training rows only and then applied
# unchanged to the hold-out rows.
scaler_y = StandardScaler()
y_train = scaler_y.fit_transform(y_train_raw)
y_test = scaler_y.transform(y_test_raw)

# Full target on the same (train-fitted) scale, kept under its original name
# in case other cells refer to it.
y_scaled = scaler_y.transform(y)

In [None]:
# ml model
# One fixed seed for every learner so that scores and figures are identical
# on each Restart & Run All.
MODEL_SEED = 66

models = {
    "Decision Tree": DecisionTreeRegressor(random_state=MODEL_SEED),
    "Random Forest": RandomForestRegressor(random_state=MODEL_SEED),
    "Gradient Boosting": GradientBoostingRegressor(random_state=MODEL_SEED),
    "XGBoost": XGBRegressor(random_state=MODEL_SEED),
    "LightGBM": LGBMRegressor(random_state=MODEL_SEED),
}

sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.5)
colors = sns.color_palette("BrBG_r")

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Times New Roman'
plt.rcParams['font.size'] = 20

# The feature scaler is fitted on the training split only and re-used as-is
# for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The regressors expect a 1-D target. Flattening here also keeps
# `y_true - y_pred` element-wise instead of broadcasting (n, 1) against (n,).
y_train_1d = np.ravel(y_train)
y_test_1d = np.ravel(y_test)


def _plot_predictions(name, index, y_true, y_pred, half_width, textstr, palette):
    """Draw, save (SVG) and show the true-vs-predicted chart for one model.

    `half_width` is the half-width of the shaded band drawn around `y_pred`;
    `textstr` is the metrics summary placed in the upper-left corner.
    """
    fig, ax = plt.subplots(figsize=(14, 9))
    ax.plot(index, y_true,
            label='True values',
            marker='o',
            linestyle='-',
            color=palette[4],
            linewidth=2.5,
            markersize=8)
    ax.plot(index, y_pred,
            label='Predicted values',
            marker='s',
            linestyle='--',
            color=palette[1],
            linewidth=2.5,
            markersize=8)
    ax.fill_between(index,
                    y_pred - half_width,
                    y_pred + half_width,
                    color=palette[1],
                    alpha=0.2,
                    label='Confidence Interval')

    ax.text(0.02, 0.9, textstr,
            transform=ax.transAxes,
            bbox=dict(boxstyle="round", fc="white", ec="0.5", alpha=0.9),
            fontsize=14)
    ax.grid(False)
    ax.set_title(f'{name}: Predicted vs True values with Confidence Interval',
                 fontsize=20, fontweight='bold')
    ax.set_xlabel('Sample index', fontsize=16)
    ax.set_ylabel('Value', fontsize=16)
    ax.tick_params(axis='both', labelsize=14)
    ax.legend(loc='upper right', fontsize=14, framealpha=0.9)
    fig.tight_layout()
    fig.savefig(f'{name}_Predicted vs True values with Confidence Interval.svg', format="svg", dpi=1200, bbox_inches="tight",  transparent=True)
    plt.show()
    # Release the figure explicitly so the loop does not accumulate open figures.
    plt.close(fig)


for name, model in models.items():
    model.fit(X_train_scaled, y_train_1d)
    y_pred = np.ravel(model.predict(X_test_scaled))

    # Metrics are in the units of the standardised target.
    rmse = np.sqrt(mean_squared_error(y_test_1d, y_pred))
    r2 = r2_score(y_test_1d, y_pred)
    mae = mean_absolute_error(y_test_1d, y_pred)

    index = range(len(y_pred))
    # Band half-width = 1.96 x std of the test residuals, i.e. an approximate
    # 95 % band if the residuals are roughly normal. The spread of the
    # predictions themselves (used before) reflects how much the targets vary,
    # not how far the model is from the truth.
    confidence_interval = 1.96 * np.std(y_test_1d - y_pred)

    textstr = f'RMSE = {rmse:.4f}\nR² = {r2:.4f}\nMAE = {mae:.4f}'
    _plot_predictions(name, index, y_test_1d, y_pred,
                      confidence_interval, textstr, colors)

