## 결측치 처리: Random Forest, Linear Regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


# Load the training set.
train_data = pd.read_csv('train.csv')

# Keep only the rows where AlogP is present; they serve as the training
# data for the models that will later impute the missing AlogP values.
data_no_missing = train_data.dropna(subset=['AlogP'])

# Target is AlogP itself; the features are whatever columns remain after
# removing the columns listed below.
y_no_missing = data_no_missing['AlogP']
non_feature_columns = ['id', 'SMILES', 'MLM', 'HLM', 'AlogP']
X_no_missing = data_no_missing.drop(columns=non_feature_columns)

# Candidate imputation models.
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
linear_regressor = LinearRegression()

# Cross-validation settings shared by both models. The scorer returns the
# MSE negated, so the sign is flipped before taking the square root.
cv_settings = {'cv': 5, 'scoring': 'neg_mean_squared_error', 'n_jobs': -1}

# Per-fold RMSE for the random forest.
rf_scores = -cross_val_score(rf_regressor, X_no_missing, y_no_missing, **cv_settings)
rf_rmse_scores = np.sqrt(rf_scores)

# Per-fold RMSE for the linear regression.
linear_scores = -cross_val_score(linear_regressor, X_no_missing, y_no_missing, **cv_settings)
linear_rmse_scores = np.sqrt(linear_scores)

# Mean RMSE across folds: (random forest, linear regression).
rf_rmse_scores.mean(), linear_rmse_scores.mean()

(0.45651716642254064, 0.44337098782625894)

In [5]:
# Show every training row that has a missing value in at least one column.
train_data.loc[train_data.isna().any(axis='columns')]

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
2796,TRAIN_2796,[H][C@]1(CC[C@@]2([H])[C@@H](C)C=CC3=C[C@H](C)...,0.549,0.2,,418.566,5,1,7,4.634,72.83
3387,TRAIN_3387,COc1cc2c(cc1OC)/C(=N\c1ccccc1)N(Cc1ccccc1F)CC2,0.24,3.2,,390.45,3,0,5,4.911,34.06


In [6]:
# Fit both models on all rows that have an AlogP value.
for _model in (rf_regressor, linear_regressor):
    _model.fit(X_no_missing, y_no_missing)

# Rows whose AlogP is missing, reduced to the same feature columns the
# models were fitted on.
alogp_missing_mask = train_data['AlogP'].isna()
data_missing = train_data.loc[alogp_missing_mask]
X_missing = data_missing.drop(columns=['id', 'SMILES', 'MLM', 'HLM', 'AlogP'])

# Predict the missing AlogP values with each model.
predicted_AlogP_rf = rf_regressor.predict(X_missing)
predicted_AlogP_linear = linear_regressor.predict(X_missing)

# Build one filled copy of the training data per model, leaving the
# original frame untouched.
train_data_rf_filled = train_data.copy()
train_data_rf_filled.loc[alogp_missing_mask, 'AlogP'] = predicted_AlogP_rf

train_data_linear_filled = train_data.copy()
train_data_linear_filled.loc[alogp_missing_mask, 'AlogP'] = predicted_AlogP_linear

In [7]:
# Display the AlogP values the random forest predicted for the training rows
# whose AlogP was missing.
predicted_AlogP_rf

array([4.63343, 4.7763 ])

In [9]:
# Write each imputed training set to its own CSV file, without the index.
for _frame, _path in (
    (train_data_rf_filled, 'train_rf.csv'),
    (train_data_linear_filled, 'train_linear.csv'),
):
    _frame.to_csv(_path, index=False)

In [10]:
# Load the test set.
test_data = pd.read_csv('test.csv')

# Rows whose AlogP is missing, reduced to the feature columns by dropping
# the non-feature columns and the target itself.
test_missing_mask = test_data['AlogP'].isnull()
test_data_missing = test_data[test_missing_mask]
X_test_missing = test_data_missing.drop(columns=['id', 'SMILES', 'AlogP'])

# Start from untouched copies so the original frame is never modified.
test_data_rf_filled = test_data.copy()
test_data_linear_filled = test_data.copy()

if test_missing_mask.any():
    # Predict the missing AlogP values with the models fitted on the
    # training data, then write them into the matching rows.
    predicted_AlogP_test_rf = rf_regressor.predict(X_test_missing)
    predicted_AlogP_test_linear = linear_regressor.predict(X_test_missing)

    test_data_rf_filled.loc[test_missing_mask, 'AlogP'] = predicted_AlogP_test_rf
    test_data_linear_filled.loc[test_missing_mask, 'AlogP'] = predicted_AlogP_test_linear
else:
    # Nothing to impute. scikit-learn estimators reject zero-row input with
    # a ValueError, so skip predict() and keep the copies unchanged.
    predicted_AlogP_test_rf = np.array([])
    predicted_AlogP_test_linear = np.array([])

# Preview the first rows of both filled test sets.
test_data_rf_filled.head(), test_data_linear_filled.head()

(         id                                             SMILES  AlogP  \
 0  TEST_000            CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1  2.641   
 1  TEST_001     COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2  0.585   
 2  TEST_002  Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1  4.276   
 3  TEST_003         O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21  1.795   
 4  TEST_004       CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1  1.219   
 
    Molecular_Weight  Num_H_Acceptors  Num_H_Donors  Num_RotatableBonds   LogD  \
 0           361.505                4             2                   7  2.635   
 1           370.399                5             0                   3  0.585   
 2           347.414                4             4                   5  4.290   
 3           345.358                5             0                   2  1.795   
 4           353.418                4             0                   2  0.169   
 
    Molecular_PolarSurfaceArea  
 0                       92

In [11]:
# Write each imputed test set to its own CSV file, without the index.
for _frame, _path in (
    (test_data_rf_filled, 'test_rf.csv'),
    (test_data_linear_filled, 'test_linear.csv'),
):
    _frame.to_csv(_path, index=False)