In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import MinMaxScaler

# ---------------------------------------------------------------------------
# Feature selection pipeline:
#   1. drop zero-variance descriptors,
#   2. drop one descriptor of every highly correlated pair,
#   3. keep descriptors whose random-forest importance is at least the median.
# The result is written to an Excel file together with the names and labels.
#
# NOTE(review): every step here is fitted on the complete table, i.e. before
# the train/validation/test split made later in the notebook. That can make the
# held-out metrics optimistic — consider fitting the selection on training rows
# only.
# ---------------------------------------------------------------------------
VARIANCE_THRESHOLD = 0.0   # only constant columns are removed
CORR_THRESHOLD = 0.65      # absolute Pearson correlation above which a column is dropped

data = pd.read_excel(r"E:\LCMs-descriptors.xlsx")

names = data["Name"]

if "label" in data.columns:
    y = data["label"].values
else:
    category_to_label = {"acceptable": 0, "potential": 1, "unacceptable": 2}
    mapped = data["Category"].map(category_to_label)
    # ``Series.map`` turns every unmapped value into NaN; fail loudly here
    # instead of handing NaN labels to the classifier.
    if mapped.isna().any():
        unknown = sorted(data.loc[mapped.isna(), "Category"].astype(str).unique())
        raise ValueError(f"Unrecognised values in 'Category' column: {unknown}")
    y = mapped.astype("int64").values

# Descriptors start at the fifth column. The target columns are removed
# defensively in case they are stored among the descriptors (no-op otherwise).
X = data.iloc[:, 4:].drop(columns=["label", "Category"], errors="ignore")
feature_names = X.columns

# Step 1: variance filter.
vt = VarianceThreshold(threshold=VARIANCE_THRESHOLD)
X_var = vt.fit_transform(X)
features_var = feature_names[vt.get_support()]

# Step 2: correlation filter. Only the strict upper triangle is inspected, so
# for each correlated pair the column that appears later is the one dropped.
corr_matrix = X[features_var].corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

to_drop = upper.columns[(upper > CORR_THRESHOLD).any()].tolist()
X_uncorr = X[features_var].drop(columns=to_drop)

# Step 3: model-based filter using random-forest feature importances.
rf = RandomForestClassifier(n_estimators=500, random_state=42)
rf.fit(X_uncorr, y)

sfm = SelectFromModel(rf, threshold="median", prefit=True)
X_selected = sfm.transform(X_uncorr)
features_selected = list(X_uncorr.columns[sfm.get_support()])

# Keep the selected names in the column order of the descriptor table.
features_selected = [f for f in X.columns if f in features_selected]

final_df = pd.concat([names, pd.Series(y, name="label"), X[features_selected]], axis=1)

output_path = r"E:\LCMs-Features-Selected(final).xlsx"
final_df.to_excel(output_path, index=False)


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
import random
import math

# Build the model input matrix from the source table instead of rebinding ``X``
# from itself: the previous form turned ``X`` into an ndarray, so running this
# cell a second time failed on ``X[features_selected]``.
X = data.iloc[:, 4:][features_selected].to_numpy()
y = np.asarray(y)

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
import random

# --- Reproducibility -------------------------------------------------------
# Seed every random number generator touched by this cell and ask cuDNN for
# deterministic kernels.
seed = 40

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# --- Stratified train / validation / test split ----------------------------
# 15 % of all rows are held out for testing; 17.6 % of the remaining 85 %
# (about 15 % of all rows) become the validation set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.176,
    stratify=y_temp,
    random_state=42,
)

# --- Min-max scaling --------------------------------------------------------
# The scaler is fitted on the training rows only and then applied to all
# three splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Per-feature training range, reused later to clip unseen data.
train_min, train_max = scaler.data_min_, scaler.data_max_

# --- Tensors and data loaders ----------------------------------------------
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train, X_val, X_test)
)
y_train_tensor, y_val_tensor, y_test_tensor = (
    torch.tensor(labels, dtype=torch.long)
    for labels in (y_train, y_val, y_test)
)

train_loader = DataLoader(
    TensorDataset(X_train_tensor, y_train_tensor),
    batch_size=16,
    shuffle=True,
)
val_loader = DataLoader(
    TensorDataset(X_val_tensor, y_val_tensor),
    batch_size=16,
    shuffle=False,
)

class RealFormerClassifier(nn.Module):
    """Transformer-encoder classifier for fixed-length feature vectors.

    Every input row is linearly projected to ``d_model`` dimensions, treated
    as a sequence of length one, passed through an ``nn.TransformerEncoder``
    and classified by a small MLP head.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of output classes (size of the logit vector).
        d_model: Width of the projected representation fed to the encoder.
        nhead: Number of attention heads in each encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability used in the encoder and the MLP head.
    """

    def __init__(self, input_dim, num_classes, d_model=32, nhead=4, num_layers=1, dropout=0.3):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Sequential(
            nn.Linear(d_model, 32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(32, num_classes)
        )

    def forward(self, x):
        """Return raw class logits of shape ``(batch, num_classes)``.

        Args:
            x: Float tensor of shape ``(batch, input_dim)``.
        """
        x = self.input_proj(x)
        # The encoder layer is built with its default sequence-first layout,
        # i.e. it expects (seq_len, batch, d_model). Insert the length-1
        # sequence axis in front so each sample is its own sequence.
        # Putting it at dim=1 made the batch axis act as the sequence axis,
        # which let samples of one batch attend to each other.
        x = x.unsqueeze(0)
        x = self.transformer_encoder(x)
        # Pool over the (length-1) sequence axis -> (batch, d_model).
        x = x.mean(dim=0)
        return self.classifier(x)

def report_metrics(y_true, y_pred, y_proba):
    """Print classification metrics for one data split.

    Prints accuracy, macro precision/recall/F1, the macro one-vs-rest ROC AUC
    and the confusion matrix.

    Args:
        y_true: True integer class labels.
        y_pred: Predicted integer class labels.
        y_proba: Predicted class probabilities, one column per class.

    Returns:
        The macro one-vs-rest ROC AUC, or ``None`` when it cannot be computed.
    """
    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, average="macro"))
    print("Recall   :", recall_score(y_true, y_pred, average="macro"))
    print("F1 Score :", f1_score(y_true, y_pred, average="macro"))
    try:
        auc = roc_auc_score(y_true, y_proba, multi_class="ovr", average="macro")
    except ValueError as exc:
        # Report the actual reason instead of swallowing every exception.
        auc = None
        print(f"AUC      : not available ({exc})")
    else:
        print("AUC      :", auc)

    print("confusion matrix: \n", confusion_matrix(y_true, y_pred))
    return auc


num_classes = len(np.unique(y))
model = RealFormerClassifier(input_dim=X_train.shape[1], num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

epochs = 500
# Start below any attainable accuracy so the first epoch is always
# checkpointed; otherwise a run whose validation accuracy never exceeds 0
# would leave no checkpoint file and no ``best_val_pred``.
best_val_acc = -1.0
patience = 100   # epochs without validation improvement before stopping
counter = 0
checkpoint_path = f"best_model_seed{seed}.pt"

for epoch in range(epochs):
    # ---- one training pass over the mini-batches ----
    model.train()
    epoch_loss = 0   # summed batch losses for this epoch (not used further here)
    for batch_X, batch_y in train_loader:
        preds = model(batch_X)
        loss = criterion(preds, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # ---- validation on the full validation split ----
    model.eval()
    with torch.no_grad():
        val_logits = model(X_val_tensor)
        val_pred = val_logits.argmax(dim=1).cpu().numpy()
        val_proba = torch.softmax(val_logits, dim=1).cpu().numpy()
        val_acc = accuracy_score(y_val, val_pred)

    # ---- early stopping on validation accuracy ----
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        counter = 0
        torch.save(model.state_dict(), checkpoint_path)
        best_val_pred = val_pred
        best_val_proba = val_proba
    else:
        counter += 1

    if counter >= patience:
        break

# Restore the weights of the best validation epoch.
model.load_state_dict(torch.load(checkpoint_path))
model.eval()

y_val_pred = best_val_pred
y_val_proba = best_val_proba

auc_val = report_metrics(y_val, y_val_pred, y_val_proba)

with torch.no_grad():
    y_test_logits = model(X_test_tensor)
    y_test_pred = y_test_logits.argmax(dim=1).cpu().numpy()
    y_test_proba = torch.softmax(y_test_logits, dim=1).cpu().numpy()

auc_test = report_metrics(y_test, y_test_pred, y_test_proba)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Use Times New Roman as the default font family for the figures below.
plt.rcParams.update({"font.family": "Times New Roman"})

def set_black_lines(disp):
    """Give a confusion-matrix display black, 1.2-wide borders and no grid.

    Args:
        disp: Display object exposing ``ax_`` (the axes) and, optionally,
            ``im_`` whose ``colorbar`` may be ``None``.
    """
    line_color = 'black'
    line_width = 1.2

    axes = disp.ax_
    for border in axes.spines.values():
        border.set_color(line_color)
        border.set_linewidth(line_width)

    # The colorbar outline is styled only when the display carries one.
    has_colorbar = hasattr(disp, 'im_') and disp.im_.colorbar is not None
    if has_colorbar:
        outline = disp.im_.colorbar.outline
        outline.set_edgecolor(line_color)
        outline.set_linewidth(line_width)

    axes.grid(False)

CLASS_DISPLAY_LABELS = ["acceptable", "potential", "unacceptable"]


def plot_confusion_matrix(y_true, y_pred, title, save_path):
    """Draw, style, save and show one confusion matrix.

    Args:
        y_true: True integer class labels.
        y_pred: Predicted integer class labels.
        title: Figure title.
        save_path: Destination of the PNG file (written at 900 dpi).

    Returns:
        The ``ConfusionMatrixDisplay`` that was drawn.
    """
    disp = ConfusionMatrixDisplay.from_predictions(
        y_true, y_pred,
        display_labels=CLASS_DISPLAY_LABELS,
        cmap="Blues"
    )
    set_black_lines(disp)
    plt.title(title, fontsize=14)
    plt.tight_layout()
    plt.savefig(save_path, dpi=900, bbox_inches="tight")
    plt.show()
    return disp


# Use ``y_val_pred`` (predictions of the restored best checkpoint, the ones the
# printed validation metrics describe). ``val_pred`` holds the predictions of
# the last training epoch and can disagree with the reported numbers.
disp_val = plot_confusion_matrix(
    y_val, y_val_pred,
    "Confusion Matrix (Validation Set)",
    r"E:\Confusion_Matrix_Validation.png",
)

disp_test = plot_confusion_matrix(
    y_test, y_test_pred,
    "Confusion Matrix (Test Set)",
    r"E:\Confusion_Matrix_Test.png",
)

In [None]:
import pandas as pd
import torch
import numpy as np

# ---------------------------------------------------------------------------
# Score a new descriptor table with the trained model and write the predicted
# category plus per-class probabilities to a CSV file.
# ---------------------------------------------------------------------------
new_data_path = r"E:\LCMs_1431_desc.csv"
new_data = pd.read_csv(new_data_path)

# NOTE(review): this hard-coded list must name the same columns, in the same
# order, as the features the scaler and model were fitted on — confirm it
# matches the list produced by the feature-selection step.
selected_features = [
    "ALogP", "AMR", "apol",  "ATSC2c",
    "ATSC6m", "ATSC7m", "ATSC6i",
    "GATS3c", "VE1_DzZ", "VR1_DzZ", "VE1_Dzp", "VE3_Dt", "MDEC-22",
    "PetitjeanNumber", "JGI10", "VR1_D"
]

missing_cols = [col for col in selected_features if col not in new_data.columns]
if missing_cols:
    raise ValueError(f"The new data is missing these characteristic columns:{missing_cols}")

if len(selected_features) != len(scaler.data_min_):
    raise ValueError(
        f"selected_features has {len(selected_features)} columns but the scaler "
        f"was fitted on {len(scaler.data_min_)} features"
    )

new_features = new_data[selected_features]

arr = new_features.to_numpy(dtype=float)

# NaN/inf descriptors would propagate to NaN probabilities, and argmax over an
# all-NaN row returns index 0, silently labelling the row "acceptable".
bad_rows = ~np.isfinite(arr).all(axis=1)
if bad_rows.any():
    raise ValueError(
        f"{int(bad_rows.sum())} row(s) of the new data contain NaN or infinite "
        f"descriptor values; clean or drop them before predicting"
    )

# Clip to the per-feature range seen in training before scaling, so scaled
# values stay inside [0, 1].
arr_clipped = np.clip(arr, train_min, train_max)
new_features_scaled = scaler.transform(arr_clipped)

X_new = new_features_scaled
df = new_data.copy()

new_features_tensor = torch.tensor(new_features_scaled, dtype=torch.float32)

model.eval()
with torch.no_grad():
    logits = model(new_features_tensor)
    probs = torch.softmax(logits, dim=1).numpy()
    preds = probs.argmax(axis=1)

label_map = {0: "acceptable", 1: "potential", 2: "unacceptable"}
pred_labels = [label_map[i] for i in preds]

df["Predicted_Category"] = pred_labels
for i, cls in label_map.items():
    df[f"Prob_{cls}"] = probs[:, i]

output_path = r"E:\LCMs_1431_Predicted_Category(final).csv"
df.to_csv(output_path, index=False, encoding="utf-8-sig")

# Preview the predictions; the "Name" column is shown only when present.
preview_cols = [c for c in ["Name"] if c in df.columns]
preview_cols += ["Predicted_Category"] + [f"Prob_{c}" for c in label_map.values()]
print(df[preview_cols].head())

In [None]:
import os
import numpy as np
import torch
import shap
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the seaborn theme first: it rewrites matplotlib's rcParams, including
# ``font.family``, so setting the font before it had no lasting effect.
sns.set(style="white", font="Times New Roman")
plt.rcParams["font.family"] = "Times New Roman"

def model_predict(x_numpy):
    """Prediction wrapper: array-like features in, softmax probabilities out.

    Args:
        x_numpy: Array-like of feature rows; converted to ``float32``.

    Returns:
        NumPy array of class probabilities (softmax over dimension 1 of the
        model output).
    """
    features = np.asarray(x_numpy, dtype=np.float32)
    with torch.no_grad():
        logits = model(torch.tensor(features))
        probabilities = torch.softmax(logits, dim=1)
    return probabilities.cpu().numpy()

# ---------------------------------------------------------------------------
# SHAP analysis: explain the predicted class probabilities on the new data and
# save one beeswarm plot per class.
# ---------------------------------------------------------------------------
rng = np.random.default_rng(42)
N_SHAP = len(X_new)   # explain every row; lower this to subsample
sample_idx = rng.choice(len(X_new), size=N_SHAP, replace=False)
X_shap = X_new[sample_idx]

explainer = shap.Explainer(model_predict, X_new, algorithm="permutation")
shap_values = explainer(X_shap)

# Use the ``values`` attribute when the result exposes one; otherwise fall
# back to converting the result itself to an array.
vals = getattr(shap_values, "values", None)
if vals is None:
    vals = np.array(shap_values)
n_samples, n_features, n_classes = vals.shape

save_dir = r"E:\classification"
os.makedirs(save_dir, exist_ok=True)

# Never index past the number of classes actually present in ``vals``.
max_classes_to_plot = min(3, n_classes)
for cls_idx, cls_name in list(label_map.items())[:max_classes_to_plot]:
    shap_vals_one = vals[:, :, cls_idx]
    shap.summary_plot(
        shap_vals_one,
        X_shap,
        feature_names=selected_features,
        show=False,
        plot_type="dot",
        max_display=20
    )

    # The title previously contained a mis-encoded en dash ("â€“").
    plt.title(f"SHAP Beeswarm \u2013 Class: {cls_name}", fontname="Times New Roman")
    plt.grid(False)
    plt.tight_layout()

    save_path = os.path.join(save_dir, f"SHAP_Beeswarm_{cls_name}.png")
    plt.savefig(save_path, dpi=900, bbox_inches='tight')
    plt.close()   # free the figure before drawing the next class

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# ---------------------------------------------------------------------------
# Global feature importance: mean absolute SHAP value per feature and class,
# drawn as a horizontal bar chart stacked by class.
# ---------------------------------------------------------------------------
mean_abs_per_class = np.mean(np.abs(vals), axis=0)   # (n_features, n_classes)

feat_names = np.array(selected_features)
df_importance = pd.DataFrame(
    mean_abs_per_class,
    index=feat_names,
    columns=[label_map[c] for c in range(n_classes)]
)

# Cap at the number of available features so the title states how many bars
# are really shown.
topk = min(20, len(df_importance))
order = df_importance.sum(axis=1).sort_values(ascending=False).index[:topk]
df_top = df_importance.loc[order]

plt.rcParams["font.family"] = "Times New Roman"
fig, ax = plt.subplots(figsize=(9, 6))
colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]

# Running left edge of each stacked segment.
bottom = np.zeros(len(df_top))
for i, cls in enumerate(df_top.columns):
    # Cycle through the palette so more than three classes cannot overflow it.
    ax.barh(df_top.index, df_top[cls], left=bottom, color=colors[i % len(colors)], label=cls)
    bottom += df_top[cls].values

ax.grid(False)

ax.set_xlabel("Mean(|SHAP|)", fontname="Times New Roman")
ax.set_title(f"Top-{topk} Feature Importance (stacked by class)", fontname="Times New Roman")

ax.legend(title="Class", prop={"family": "Times New Roman"})

# Most important feature at the top.
ax.invert_yaxis()

plt.tight_layout()

plt.savefig(r"E:\importance.png", dpi=900, bbox_inches='tight')

plt.show()