## 결측치 처리: Random Forest, Linear Regression

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split


# 데이터 로드
train_data = pd.read_csv('train.csv')

# AlogP 결측치가 없는 데이터 선택
data_no_missing = train_data[train_data['AlogP'].notnull()]

# 특성과 타겟 변수 분리
X_no_missing = data_no_missing.drop(columns=['id', 'SMILES', 'MLM', 'HLM', 'AlogP'])
y_no_missing = data_no_missing['AlogP']

# 모델 초기화
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
linear_regressor = LinearRegression()

# 교차 검증을 사용하여 RMSE 계산 (Random Forest)
rf_scores = -cross_val_score(rf_regressor, X_no_missing, y_no_missing, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
rf_rmse_scores = np.sqrt(rf_scores)

# 교차 검증을 사용하여 RMSE 계산 (Linear Regression)
linear_scores = -cross_val_score(linear_regressor, X_no_missing, y_no_missing, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
linear_rmse_scores = np.sqrt(linear_scores)

rf_rmse_scores.mean(), linear_rmse_scores.mean()

ImportError: cannot import name 'RandomForestRegressor' from 'sklearn.model_selection' (/Users/syshin/miniforge3/envs/tensorflow/lib/python3.9/site-packages/sklearn/model_selection/__init__.py)

In [5]:
# 모델 학습
rf_regressor.fit(X_no_missing, y_no_missing)
linear_regressor.fit(X_no_missing, y_no_missing)

# 결측치 있는 데이터 선택
data_missing = train_data[train_data['AlogP'].isnull()]
X_missing = data_missing.drop(columns=['id', 'SMILES', 'MLM', 'HLM', 'AlogP'])

# 결측치 예측
predicted_AlogP_rf = rf_regressor.predict(X_missing)
predicted_AlogP_linear = linear_regressor.predict(X_missing)

# 예측된 값을 원래 데이터의 결측치 부분에 대체
train_data_rf_filled = train_data.copy()
train_data_rf_filled.loc[train_data['AlogP'].isnull(), 'AlogP'] = predicted_AlogP_rf

train_data_linear_filled = train_data.copy()
train_data_linear_filled.loc[train_data['AlogP'].isnull(), 'AlogP'] = predicted_AlogP_linear

NameError: name 'rf_regressor' is not defined

In [6]:
train_data_rf_filled.to_csv('train_rf.csv', index=False)
train_data_linear_filled.to_csv('train_linear.csv', index=False)

NameError: name 'train_data_rf_filled' is not defined

In [None]:
# test_data 로드
test_data = pd.read_csv('test.csv')

# 결측치 있는 데이터 선택
test_data_missing = test_data[test_data['AlogP'].isnull()]
X_test_missing = test_data_missing.drop(columns=['id', 'SMILES', 'AlogP'])

# 결측치 예측
predicted_AlogP_test_rf = rf_regressor.predict(X_test_missing)
predicted_AlogP_test_linear = linear_regressor.predict(X_test_missing)

# 예측된 값을 원래 데이터의 결측치 부분에 대체
test_data_rf_filled = test_data.copy()
test_data_rf_filled.loc[test_data['AlogP'].isnull(), 'AlogP'] = predicted_AlogP_test_rf

test_data_linear_filled = test_data.copy()
test_data_linear_filled.loc[test_data['AlogP'].isnull(), 'AlogP'] = predicted_AlogP_test_linear

test_data_rf_filled.head(), test_data_linear_filled.head()

In [None]:
test_data_rf_filled.to_csv('testn_rf.csv', index=False)
test_data_linear_filled.to_csv('test_linear.csv', index=False)