# Group validation on the 160 dataset

In [1]:
import os
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import pandas as pd

# Import relevant scikit-learn modules
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_validate

In [25]:
# Locations of the 160-sample dataset and of the precomputed representations.
Data_path = '../Data/data_160_ori.csv'
REPR_DIR = '../Reprs'

# Gather every entry of REPR_DIR whose name contains '.csv', in listing order.
# NOTE(review): the positional lookups below rely on the order returned by
# os.listdir, which may differ between platforms — confirm which file each
# index is meant to select.
repr_path = []
for file in os.listdir(REPR_DIR):
    if '.csv' not in file:
        continue
    path = f"{REPR_DIR}/{file}"
    repr_path.append(path)

# Every table uses its first CSV column as the row index.
df = pd.read_csv(Data_path, index_col=0)
unimol_repr = pd.read_csv(repr_path[-2], index_col=0)
sol_repr = pd.read_csv(repr_path[1], index_col=0)
sol_oh = pd.read_csv(repr_path[3], index_col=0)
# sol_oh = sol_oh[0:160]

# Feature matrix: the two representation tables joined column-wise.
fea_df = pd.concat([unimol_repr, sol_repr], axis=1)

# Target column with the yield class of each row.
label_df = df['yields_label']

In [26]:
# test yields
# Integer-encode the yield labels. LabelEncoder numbers classes in sorted
# order, so with the labels high/low/medium this gives 0: high, 1: low, 2: medium.
le = LabelEncoder()
label_le = le.fit_transform(label_df)

# Bin the ee values into three classes:
#   ee >= 80 -> 'high',  ee <= 20 -> 'low',  anything else -> 'medium'.
# The comparison runs on the raw array so every row is addressed by position.
# The previous loop read by index label (ee[i]) but wrote by position
# (ee.iloc[i]), and it wrote strings into a numeric Series one cell at a time,
# which is an incompatible-dtype assignment in pandas.
ee_values = df['ee'].to_numpy()
is_high = ee_values >= 80
is_low = ee_values <= 20
is_med = ~(is_high | is_low)

# Positional row numbers belonging to each class.
low = np.flatnonzero(is_low).tolist()
med = np.flatnonzero(is_med).tolist()
high = np.flatnonzero(is_high).tolist()

# String class label per row, carrying the same index and name as df['ee'].
ee = pd.Series(
    np.select([is_high, is_low], ['high', 'low'], default='medium'),
    index=df.index,
    name='ee',
)
label_df2 = ee

le2 = LabelEncoder()
label_le2 = le2.fit_transform(label_df2)  # 0: high, 1: low, 2: medium

  ee.iloc[i] = 'medium'


In [27]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

X = fea_df
y = label_le

# Every 5 consecutive rows form one group (160 rows -> 32 groups). The group
# ids are derived from the row count so they always have the same length as X
# instead of relying on a hard-coded group total.
group_size = 5
groups = np.arange(len(X)) // group_size

# Fixed label set, so each fold's confusion matrix keeps the same shape and
# row/column order even when a class is absent from that fold.
class_labels = np.unique(y)

# Random forest classifier
model = RandomForestClassifier(
    n_estimators=93,          # number of trees
    max_depth=19,             # maximum tree depth, limits overfitting
    min_samples_split=4,      # minimum samples needed to split an internal node
    min_samples_leaf=4,       # minimum samples required at a leaf node
    class_weight={0: 4, 1: 1, 2: 2},  # compensates for class imbalance
    max_features='sqrt',
    random_state=42)

# Grouped K-fold splitter: rows sharing a group id never straddle train and test.
gkf = GroupKFold(n_splits=5)

# Per-fold scores on the held-out groups
accuracies = []
precisions = []
recalls = []
f1_scores = []
confusion_matrices = []

# Grouped cross-validation
for train_idx, test_idx in gkf.split(X, y, groups=groups):
    # X is a DataFrame, so rows are selected by integer position with .iloc
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit on the training groups and predict the held-out groups
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Macro-averaged metrics; zero_division=0 scores a class with no
    # predictions as 0 instead of emitting a warning
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    confusion = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Record this fold's scores
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    confusion_matrices.append(confusion)

# Report the per-fold values and their means
print(f"Accuracies: {accuracies}")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")

print(f"Precisions: {precisions}")
print(f"Mean Precision: {np.mean(precisions):.4f}")

print(f"Recalls: {recalls}")
print(f"Mean Recall: {np.mean(recalls):.4f}")

print(f"F1 Scores: {f1_scores}")
print(f"Mean F1 Score: {np.mean(f1_scores):.4f}")

# One confusion matrix per fold
print(f"Confusion Matrices: {confusion_matrices}")



Accuracies: [0.5714285714285714, 0.5428571428571428, 0.8333333333333334, 0.8, 0.6]
Mean Accuracy: 0.6695
Precisions: [0.19047619047619047, 0.2933333333333334, 0.4188034188034188, 0.26666666666666666, 0.3717948717948718]
Mean Precision: 0.3082
Recalls: [0.3333333333333333, 0.4488888888888889, 0.6533333333333333, 0.3333333333333333, 0.3803921568627451]
Mean Recall: 0.4299
F1 Scores: [0.24242424242424243, 0.3292307692307692, 0.4803921568627451, 0.29629629629629634, 0.34330011074197125]
Mean F1 Score: 0.3383
Confusion Matrices: [array([[ 0,  5,  0],
       [ 0, 20,  0],
       [ 0, 10,  0]], dtype=int64), array([[ 0,  7,  0],
       [ 0, 17,  8],
       [ 0,  1,  2]], dtype=int64), array([[ 1,  0,  0],
       [ 0, 24,  1],
       [ 2,  2,  0]], dtype=int64), array([[ 0,  1,  0],
       [ 0, 24,  0],
       [ 0,  5,  0]], dtype=int64), array([[ 0,  2,  1],
       [ 0, 16,  1],
       [ 0,  8,  2]], dtype=int64)]


# Group validation on the 180 dataset

In [19]:
# Locations of the 180-sample dataset and of the precomputed representations.
Data_path = '../Data/data_180_ori.csv'
REPR_DIR = '../Reprs'

# Gather every entry of REPR_DIR whose name contains '.csv', in listing order.
# NOTE(review): the positional lookups below rely on the order returned by
# os.listdir, which may differ between platforms — confirm which file each
# index is meant to select.
repr_path = []
for file in os.listdir(REPR_DIR):
    if '.csv' not in file:
        continue
    path = f"{REPR_DIR}/{file}"
    repr_path.append(path)

# Every table uses its first CSV column as the row index.
df = pd.read_csv(Data_path, index_col=0)
unimol_repr = pd.read_csv(repr_path[-1], index_col=0)
sol_repr = pd.read_csv(repr_path[2], index_col=0)

# Feature matrix: the two representation tables joined column-wise.
fea_df = pd.concat([unimol_repr, sol_repr], axis=1)

# Target column with the yield class of each row.
label_df = df['yields_label']

In [21]:
# test yields
# Integer-encode the yield labels. LabelEncoder numbers classes in sorted
# order, so with the labels high/low/medium this gives 0: high, 1: low, 2: medium.
le = LabelEncoder()
label_le = le.fit_transform(label_df)

# Bin the ee values into three classes:
#   ee >= 80 -> 'high',  ee <= 20 -> 'low',  anything else -> 'medium'.
# The comparison runs on the raw array so every row is addressed by position.
# The previous loop read by index label (ee[i]) but wrote by position
# (ee.iloc[i]), and it wrote strings into a numeric Series one cell at a time,
# which is an incompatible-dtype assignment in pandas.
ee_values = df['ee'].to_numpy()
is_high = ee_values >= 80
is_low = ee_values <= 20
is_med = ~(is_high | is_low)

# Positional row numbers belonging to each class.
low = np.flatnonzero(is_low).tolist()
med = np.flatnonzero(is_med).tolist()
high = np.flatnonzero(is_high).tolist()

# String class label per row, carrying the same index and name as df['ee'].
ee = pd.Series(
    np.select([is_high, is_low], ['high', 'low'], default='medium'),
    index=df.index,
    name='ee',
)
label_df2 = ee

le2 = LabelEncoder()
label_le2 = le2.fit_transform(label_df2)  # 0: high, 1: low, 2: medium

  ee.iloc[i] = 'medium'


In [24]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

X = fea_df
y = label_le

# Every 5 consecutive rows form one group (180 rows -> 36 groups). The group
# ids are derived from the row count so they always have the same length as X
# instead of relying on a hard-coded group total.
group_size = 5
groups = np.arange(len(X)) // group_size

# Fixed label set, so each fold's confusion matrix keeps the same shape and
# row/column order even when a class is absent from that fold.
class_labels = np.unique(y)

# Random forest classifier
model = RandomForestClassifier(
    n_estimators=93,          # number of trees
    max_depth=19,             # maximum tree depth, limits overfitting
    min_samples_split=4,      # minimum samples needed to split an internal node
    min_samples_leaf=4,       # minimum samples required at a leaf node
    class_weight={0: 4, 1: 1, 2: 2},  # compensates for class imbalance
    max_features='sqrt',
    random_state=42)

# Grouped K-fold splitter: rows sharing a group id never straddle train and test.
gkf = GroupKFold(n_splits=5)

# Per-fold scores on the held-out groups
accuracies = []
precisions = []
recalls = []
f1_scores = []
confusion_matrices = []

# Grouped cross-validation
for train_idx, test_idx in gkf.split(X, y, groups=groups):
    # Show which rows are held out in this fold
    print("Test indices:", test_idx)

    # X is a DataFrame, so rows are selected by integer position with .iloc
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit on the training groups and predict the held-out groups
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Macro-averaged metrics; zero_division=0 scores a class with no
    # predictions as 0 instead of emitting a warning
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    confusion = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Record this fold's scores
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    confusion_matrices.append(confusion)

# Report the per-fold values and their means
print(f"Accuracies: {accuracies}")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")

print(f"Precisions: {precisions}")
print(f"Mean Precision: {np.mean(precisions):.4f}")

print(f"Recalls: {recalls}")
print(f"Mean Recall: {np.mean(recalls):.4f}")

print(f"F1 Scores: {f1_scores}")
print(f"Mean F1 Score: {np.mean(f1_scores):.4f}")

# One confusion matrix per fold
print(f"Confusion Matrices: {confusion_matrices}")



Test indices: [  0   1   2   3   4  10  11  12  13  14  35  36  37  38  39  60  61  62
  63  64 120 121 122 123 124 135 136 137 138 139 145 146 147 148 149 175
 176 177 178 179]
Test indices: [  5   6   7   8   9  30  31  32  33  34  55  56  57  58  59 115 116 117
 118 119 140 141 142 143 144 165 166 167 168 169 170 171 172 173 174]
Test indices: [ 25  26  27  28  29  50  51  52  53  54  75  76  77  78  79  80  81  82
  83  84 110 111 112 113 114 130 131 132 133 134 160 161 162 163 164]
Test indices: [ 20  21  22  23  24  45  46  47  48  49  70  71  72  73  74  85  86  87
  88  89  95  96  97  98  99 105 106 107 108 109 155 156 157 158 159]
Test indices: [ 15  16  17  18  19  40  41  42  43  44  65  66  67  68  69  90  91  92
  93  94 100 101 102 103 104 125 126 127 128 129 150 151 152 153 154]
Accuracies: [0.725, 0.7142857142857143, 0.6285714285714286, 0.7142857142857143, 0.6857142857142857]
Mean Accuracy: 0.6936
Precisions: [0.41228070175438597, 0.7924297924297924, 0.7133333333333334