In [7]:
import pandas as pd
from sklearn.utils import resample

# Load the raw dataset from CSV.
df = pd.read_csv('ppi.csv')

# Partition the rows by label: rows with p_interface == 1 are handled as the
# minority class, rows with p_interface == 0 as the majority class.
df_minority = df.loc[df['p_interface'].eq(1)]
df_majority = df.loc[df['p_interface'].eq(0)]

# Draw, without replacement, as many majority rows as there are minority rows.
# The fixed random_state makes the draw repeatable.
minority_size = len(df_minority)
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=minority_size,
    random_state=123,
)

# Stack the minority rows first, followed by the sampled majority rows.
balanced_parts = [df_minority, df_majority_downsampled]
df_balanced = pd.concat(balanced_parts)

# Write the balanced dataset to CSV, leaving out the index column.
df_balanced.to_csv('balanced_ppi.csv', index=False)

import pandas as pd

# Reload the balanced dataset from disk so the check reflects the saved file.
df_balanced = pd.read_csv('balanced_ppi.csv')

# Print a header line followed by the number of rows per p_interface label.
class_counts = df_balanced['p_interface'].value_counts()
print("下采样后的数据集信息：", class_counts, sep="\n")

下采样后的数据集信息：
1    7845
0    7845
Name: p_interface, dtype: int64


In [14]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np

import warnings

# Load the balanced dataset written to 'balanced_ppi.csv'.
ppi_balanced_data = pd.read_csv('balanced_ppi.csv')

# PSSM features: one 'pssm_<letter>' column per amino-acid letter, in this order.
pssm_columns = ['pssm_' + aa for aa in 'ARNDCQEGHILKMFPSTWYV']
pssm_features = ppi_balanced_data[pssm_columns]

# Convert every PSSM column to numeric. errors='coerce' replaces values that
# cannot be parsed with NaN, so report how many NaNs the conversion introduced
# instead of letting them flow silently into the saved feature matrix.
pssm_encoded = pssm_features.apply(pd.to_numeric, errors='coerce')
n_coerced = int(pssm_encoded.isna().sum().sum() - pssm_features.isna().sum().sum())
if n_coerced > 0:
    warnings.warn(
        f"{n_coerced} PSSM value(s) could not be parsed as numbers and were set to NaN"
    )

# One-hot encode the 'sequence' column (a single categorical feature).
sequence_features = ppi_balanced_data['sequence']
onehot_encoder = OneHotEncoder()
sequence_encoded = onehot_encoder.fit_transform(sequence_features.to_numpy().reshape(-1, 1))

# Select the amino-acid composition columns by regex.
# NOTE(review): the pattern contains a literal '{n}' (it is not an f-string and
# is never formatted), so it only matches column names that literally start
# with '{n}wm'. This most likely selects no columns at all - confirm the
# intended column prefix and fix the pattern. The pattern is left unchanged
# here so the saved features stay the same; an empty selection is reported.
amino_acid_composition = ppi_balanced_data.filter(regex=("^{n}wm.*$"))
if amino_acid_composition.shape[1] == 0:
    warnings.warn(
        "No columns matched the regex '^{n}wm.*$'; "
        "amino_acid_composition contributes no features"
    )

# Standardize each scalar feature on its own. fit_transform re-fits the scaler
# on every call, so after this section `scaler` holds the statistics of
# 'prob_coil' only.
scaler = StandardScaler()
Rlength_scaled = scaler.fit_transform(ppi_balanced_data[['Rlength']])
normalized_abs_surf_acc_scaled = scaler.fit_transform(ppi_balanced_data[['normalized_abs_surf_acc']])
rel_surf_acc_scaled = scaler.fit_transform(ppi_balanced_data[['rel_surf_acc']])
prob_helix_scaled = scaler.fit_transform(ppi_balanced_data[['prob_helix']])
prob_sheet_scaled = scaler.fit_transform(ppi_balanced_data[['prob_sheet']])
prob_coil_scaled = scaler.fit_transform(ppi_balanced_data[['prob_coil']])

# Assemble the feature matrix column-wise; the order below defines feat_0..feat_k.
features = np.column_stack([
    Rlength_scaled,
    pssm_encoded,
    normalized_abs_surf_acc_scaled,
    rel_surf_acc_scaled,
    prob_helix_scaled,
    prob_sheet_scaled,
    prob_coil_scaled,
    amino_acid_composition,
    sequence_encoded.toarray(),
])

# Class labels, taken from the same frame (and therefore the same row order)
# as the features.
labels = ppi_balanced_data['p_interface'].values

# Sanity check: one label per feature row. This compares row counts only.
assert features.shape[0] == labels.shape[0], "features and labels differ in row count"

# Expose the feature matrix and label vector as X and y.
X = features
y = labels

# Wrap the features in a DataFrame with generic feat_<i> column names and
# append the label column.
df = pd.DataFrame(data=X, columns=[f'feat_{i}' for i in range(X.shape[1])])
df['p_interface'] = y

# Save the processed dataset without the index column.
df.to_csv('processed_ppi.csv', index=False)


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Split the dataset into training, validation and test sets and save each to CSV.

# Load the structural-feature table.
# NOTE(review): `data` is not used by the split below. It is still loaded so
# that any other cell referring to `data` keeps working. Confirm whether the
# split was meant to run on this table instead of the balanced dataset.
data = pd.read_csv('pro_stru_features.csv')

# Read the balanced dataset from disk instead of relying on the
# `ppi_balanced_data` variable, which is only defined in a different cell and
# is absent on a fresh kernel. That variable is an unmodified read of this
# same file, so the rows being split are unchanged.
split_source = pd.read_csv('balanced_ppi.csv')

# Hold out 15% for testing, then take 17.65% of the remaining 85%
# (0.1765 * 0.85 ~= 0.15) for validation, giving roughly 70% / 15% / 15%.
# NOTE(review): the splits are not stratified on 'p_interface'; consider
# passing stratify= if exact class balance per split matters.
train_val, test = train_test_split(split_source, test_size=0.15, random_state=42)
train, val = train_test_split(train_val, test_size=0.1765, random_state=42)

# Every row must land in exactly one of the three splits.
assert len(train) + len(val) + len(test) == len(split_source)

# Save the three splits as CSV files without the index column.
train.to_csv('train.csv', index=False)
val.to_csv('val.csv', index=False)
test.to_csv('test.csv', index=False)