In [7]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 데이터 로드
train_data = pd.read_csv('train.csv')

# 누락된 AlogP 값이 있는 행 제거
# train = train_data.dropna(subset=['AlogP'])
median_value = train_data['AlogP'].median()
train_data['AlogP'].fillna(median_value, inplace=True)
train_data_cleaned = train_data
# SMILES 문자열에서 fingerprint를 생성하는 함수
def generate_fingerprint(smiles_string):
    mol = Chem.MolFromSmiles(smiles_string)
    if mol is not None:
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        return list(fingerprint)
    else:
        return [0]*2048

# 모든 SMILES 문자열에 대해 fingerprint를 생성
fingerprint_data = train_data_cleaned['SMILES'].apply(generate_fingerprint)
fingerprint_df = pd.DataFrame(fingerprint_data.tolist(), columns=[f'bit_{i}' for i in range(2048)])

# 인덱스 재설정
train_data_cleaned = train_data_cleaned.reset_index(drop=True)

# 특성 및 대상 변수 준비
features = pd.concat([train_data_cleaned.drop(columns=['id', 'SMILES', 'MLM', 'HLM']), fingerprint_df], axis=1)
features2 = pd.concat([train_data_cleaned.drop(columns=['id', 'SMILES', 'MLM']), fingerprint_df], axis=1)

target_mlm = train_data_cleaned['MLM']

# 데이터 분할
X_train, X_test, y_train_mlm, y_test_mlm = train_test_split(features, target_mlm, test_size=0.2, random_state=42)

# 랜덤 포레스트 모델로 MLM 훈련
rf_mlm = RandomForestRegressor(n_estimators=100, random_state=42)
rf_mlm.fit(X_train, y_train_mlm)

# 테스트 세트에서 예측
y_pred_mlm = rf_mlm.predict(X_test)

# MLM의 RMSE 계산
rmse_mlm = mean_squared_error(y_test_mlm, y_pred_mlm, squared=False)
print(f"HLM의 RMSE: {rmse_mlm}")


HLM의 RMSE: 32.958613048586436


In [8]:
# HLM에 대한 타겟 변수 준비
target_hlm = train_data_cleaned['MLM']

# 데이터 분할 (HLM)
X_train_hlm, X_test_hlm, y_train_hlm, y_test_hlm = train_test_split(features2, target_hlm, test_size=0.2, random_state=42)

# 랜덤 포레스트 모델로 HLM 훈련
rf_hlm = RandomForestRegressor(n_estimators=100, random_state=42)
rf_hlm.fit(X_train_hlm, y_train_hlm)

# 테스트 세트에서 HLM 예측
y_pred_hlm = rf_hlm.predict(X_test_hlm)

# HLM의 RMSE 계산
rmse_hlm = mean_squared_error(y_test_hlm, y_pred_hlm, squared=False)
print(f"MLM의 RMSE: {rmse_hlm}")


MLM의 RMSE: 25.707476107643505


In [9]:
features2

Unnamed: 0,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,bit_0,bit_1,...,bit_2038,bit_2039,bit_2040,bit_2041,bit_2042,bit_2043,bit_2044,bit_2045,bit_2046,bit_2047
0,50.680,3.259,400.495,5,2,8,3.259,117.37,0,0,...,0,0,0,0,0,0,0,0,0,0
1,50.590,2.169,301.407,2,1,2,2.172,73.47,0,0,...,0,0,0,0,0,0,0,0,0,0
2,80.892,1.593,297.358,5,0,3,1.585,62.45,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2.000,4.771,494.652,6,0,5,3.475,92.60,0,0,...,0,0,0,0,0,0,0,0,0,0
4,99.990,2.335,268.310,3,0,1,2.337,42.43,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3493,3.079,3.409,396.195,3,1,5,3.409,64.74,0,0,...,0,0,0,0,0,0,0,0,0,0
3494,47.630,1.912,359.381,4,1,3,1.844,77.37,0,0,...,0,0,0,0,0,0,0,0,0,0
3495,1.790,1.941,261.320,3,1,6,2.124,70.14,1,0,...,0,0,0,0,0,0,0,0,0,0
3496,2.770,0.989,284.696,5,1,5,0.989,91.51,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
test_features

Unnamed: 0,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,bit_0,bit_1,...,bit_2038,bit_2039,bit_2040,bit_2041,bit_2042,bit_2043,bit_2044,bit_2045,bit_2046,bit_2047
0,22.350210,2.641,361.505,4,2,7,2.635,92.76,0,1,...,0,0,0,0,0,0,0,0,0,0
1,72.308430,0.585,370.399,5,0,3,0.585,68.31,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28.694860,4.276,347.414,4,4,5,4.290,92.86,0,0,...,0,0,0,0,0,0,0,0,0,0
3,53.365427,1.795,345.358,5,0,2,1.795,81.21,0,0,...,0,0,0,0,0,0,0,0,0,0
4,60.092860,1.219,353.418,4,0,2,0.169,61.15,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,19.524792,4.207,306.443,2,1,7,4.207,55.13,0,0,...,0,0,0,0,0,0,0,0,0,0
479,77.516680,-0.608,335.398,5,0,1,-1.736,70.16,0,0,...,0,0,0,0,0,0,0,0,0,0
480,27.726853,1.792,349.383,3,1,3,1.792,69.72,0,1,...,0,0,0,0,0,0,0,0,0,0
481,61.024540,0.790,341.132,3,2,2,0.423,69.64,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# test.csv 로드
test_data = pd.read_csv('test.csv')

median_value = test_data['AlogP'].median()
test_data['AlogP'].fillna(median_value, inplace=True)
# fingerprint 생성
fingerprint_data_test = test_data['SMILES'].apply(generate_fingerprint)
fingerprint_df_test = pd.DataFrame(fingerprint_data_test.tolist(), columns=[f'bit_{i}' for i in range(2048)])

# 테스트 데이터의 특성 준비
id_col = test_data['id']
test_features = pd.concat([test_data.drop(columns=['id', 'SMILES']), fingerprint_df_test], axis=1)

# MLM 및 HLM 예측
mlm_predictions = rf_mlm.predict(test_features)
test_features['HLM']=mlm_predictions
test_features = test_features[features2.columns]

hlm_predictions = rf_hlm.predict(test_features)

# 예측 값을 DataFrame으로 변환
predictions_df = pd.DataFrame({
    'id': id_col,
    'MLM': hlm_predictions,
    'HLM': mlm_predictions
    
})

predictions_df

Unnamed: 0,id,MLM,HLM
0,TEST_000,8.87055,22.350210
1,TEST_001,67.98025,72.308430
2,TEST_002,19.72842,28.694860
3,TEST_003,25.56523,53.365427
4,TEST_004,40.66024,60.092860
...,...,...,...
478,TEST_478,15.42366,19.524792
479,TEST_479,76.33455,77.516680
480,TEST_480,11.20531,27.726853
481,TEST_481,49.58306,61.024540


In [16]:
# 예측 값을 CSV 파일로 저장
predictions_df.to_csv('RF_AllChem_submission3.csv', index=False)