In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 데이터 로드
train_data = pd.read_csv('train.csv')

# 누락된 AlogP 값이 있는 행 제거
train_data_cleaned = train_data.dropna(subset=['AlogP'])

# 특성 및 대상 변수 준비
features = train_data_cleaned.drop(columns=['id', 'SMILES', 'MLM', 'HLM'])
target_mlm = train_data_cleaned['MLM']

# 데이터 분할
X_train, X_test, y_train_mlm, y_test_mlm = train_test_split(features, target_mlm, test_size=0.2, random_state=42)

# 랜덤 포레스트 모델로 MLM 훈련
rf_mlm = RandomForestRegressor(n_estimators=100, random_state=42)
rf_mlm.fit(X_train, y_train_mlm)

# 테스트 세트에서 예측
y_pred_mlm = rf_mlm.predict(X_test)

# MLM의 RMSE 계산
rmse_mlm = mean_squared_error(y_test_mlm, y_pred_mlm, squared=False)
print(f"MLM의 RMSE: {rmse_mlm}")


MLM의 RMSE: 33.86057394701357


In [9]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 데이터 로드
train_data = pd.read_csv('train.csv')

# 누락된 AlogP 값이 있는 행 제거
train_data_cleaned = train_data.dropna(subset=['AlogP'])

# SMILES 문자열에서 fingerprint를 생성하는 함수
def generate_fingerprint(smiles_string):
    mol = Chem.MolFromSmiles(smiles_string)
    if mol is not None:
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        return list(fingerprint)
    else:
        return [0]*2048

# 모든 SMILES 문자열에 대해 fingerprint를 생성
fingerprint_data = train_data_cleaned['SMILES'].apply(generate_fingerprint)
fingerprint_df = pd.DataFrame(fingerprint_data.tolist(), columns=[f'bit_{i}' for i in range(2048)])

# 인덱스 재설정
train_data_cleaned = train_data_cleaned.reset_index(drop=True)

# 특성 및 대상 변수 준비
features = pd.concat([train_data_cleaned.drop(columns=['id', 'SMILES', 'MLM', 'HLM']), fingerprint_df], axis=1)
target_mlm = train_data_cleaned['MLM']

# 데이터 분할
X_train, X_test, y_train_mlm, y_test_mlm = train_test_split(features, target_mlm, test_size=0.2, random_state=42)

# 랜덤 포레스트 모델로 MLM 훈련
rf_mlm = RandomForestRegressor(n_estimators=100, random_state=42)
rf_mlm.fit(X_train, y_train_mlm)

# 테스트 세트에서 예측
y_pred_mlm = rf_mlm.predict(X_test)

# MLM의 RMSE 계산
rmse_mlm = mean_squared_error(y_test_mlm, y_pred_mlm, squared=False)
print(f"MLM의 RMSE: {rmse_mlm}")


MLM의 RMSE: 32.22867271161713


In [10]:
# 테스트 데이터 로드
test_data = pd.read_csv('test.csv')

# SMILES 문자열에서 fingerprint를 생성
test_fingerprint_data = test_data['SMILES'].apply(generate_fingerprint)
test_fingerprint_df = pd.DataFrame(test_fingerprint_data.tolist(), columns=[f'bit_{i}' for i in range(2048)])

# 특성 준비
test_features = pd.concat([test_data.drop(columns=['id', 'SMILES']), test_fingerprint_df], axis=1)

# 훈련된 rf_mlm 모델을 사용하여 예측 수행
test_predictions = rf_mlm.predict(test_features)

# 예측 결과를 원본 test_data에 추가 (필요한 경우)
test_data['MLM_Predicted'] = test_predictions


ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values