In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

In [None]:
# Load the dataset.
data = pd.read_csv('pro_stru_features.csv')

# Separate the label column from the feature columns.
labels = data['p_interface']

# The target must not remain in the feature matrix: leaving 'p_interface' in
# would let the classifier read the answer straight from its inputs and make
# every reported score meaningless.
# NOTE(review): the six structural columns below are still excluded, exactly
# as before -- confirm that dropping them (rather than keeping them as
# predictors) is really intended.
features = data.drop(
    columns=[
        'normalized_length',
        'normalized_abs_surf_acc',
        'rel_surf_acc',
        'prob_helix',
        'prob_sheet',
        'prob_coil',
        'p_interface',
    ]
)

In [None]:
# One-hot encode the protein 'sequence' column.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so try the new name first and fall back for
# older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
sequence_encoded = encoder.fit_transform(features['sequence'].to_numpy().reshape(-1, 1))

# Wrap the encoded array in a DataFrame that
#   * reuses the index of `features`, so the column-wise concat below aligns
#     row-for-row instead of by accidental index overlap, and
#   * has string column names -- integer names mixed with the string names of
#     the remaining columns are rejected by scikit-learn estimators that
#     validate DataFrame feature names.
sequence_columns = [f'sequence_{category}' for category in encoder.categories_[0]]
sequence_frame = pd.DataFrame(sequence_encoded, index=features.index, columns=sequence_columns)

# Append the one-hot columns to the remaining (non-sequence) features.
features_encoded = pd.concat([features.drop(columns='sequence'), sequence_frame], axis=1)

In [None]:
# Label-encode the target column: learn the set of classes first, then map
# every label to its integer code.
label_encoder = LabelEncoder().fit(labels)
labels_encoded = label_encoder.transform(labels)

In [None]:
# Standardise the feature columns with StandardScaler.
# NOTE(review): the scaler is fitted on every row here, before the
# train/validation/test split performed afterwards, so held-out rows
# contribute to the fitted statistics; consider fitting on the training
# split only.
scaler = StandardScaler().fit(features_encoded)
scaled_features = scaler.transform(features_encoded)

In [None]:
# Split the data into training, validation and test sets.
# Step 1: hold out 20% of all rows as the test set.
trainval_features, test_features, trainval_labels, test_labels = train_test_split(
    scaled_features,
    labels_encoded,
    test_size=0.2,
    random_state=42,
)
# Step 2: hold out 20% of the remaining rows as the validation set
# (roughly 64% / 16% / 20% of the data overall).
train_features, val_features, train_labels, val_labels = train_test_split(
    trainval_features,
    trainval_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train the model (fixed seed so the forest is reproducible).
rfc = RandomForestClassifier(random_state=42).fit(train_features, train_labels)

# Mean accuracy on the validation and test splits.
val_acc = rfc.score(val_features, val_labels)
test_acc = rfc.score(test_features, test_labels)

# Report validation first, then test.
print('Validation accuracy:', val_acc)
print('Test accuracy:', test_acc)