In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

# Load the train / test tables that the rest of the notebook works on
train_data_linear_filled = pd.read_csv(filepath_or_buffer='train_linear.csv')
test_data_linear_filled = pd.read_csv(filepath_or_buffer='test_linear.csv')

# Build a Morgan fingerprint (as a plain list of bits) from a SMILES string
def generate_fingerprint(smiles_string, radius=2, n_bits=2048):
    """Return the Morgan fingerprint of a molecule as a list of 0/1 values.

    Parameters
    ----------
    smiles_string : str
        SMILES representation of the molecule. Any non-string value
        (e.g. NaN or None coming from an empty CSV cell) is treated like an
        unparseable SMILES.
    radius : int, default 2
        Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.
    n_bits : int, default 2048
        Length of the fingerprint bit vector (passed as ``nBits``).

    Returns
    -------
    list of int
        ``n_bits`` values. An all-zero list is returned when the molecule
        cannot be built from ``smiles_string``.
    """
    # Only hand real strings to RDKit: a float NaN / None is not valid input for
    # the SMILES parser, so route it to the same fallback as an unparseable SMILES
    # instead of letting the whole `.apply` fail on one bad row.
    mol = Chem.MolFromSmiles(smiles_string) if isinstance(smiles_string, str) else None
    if mol is None:
        return [0] * n_bits

    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return list(fingerprint)

# Compute a fingerprint for every SMILES string in the training table,
# then spread the per-molecule bit lists into one column per bit.
fingerprint_data = train_data_linear_filled['SMILES'].apply(generate_fingerprint)
fingerprint_df = pd.DataFrame(
    fingerprint_data.tolist(),
    columns=[f'bit_{i}' for i in range(2048)],
)

In [2]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Base estimator handed to every search below
rf_regressor = RandomForestRegressor(random_state=42, n_estimators=100)

# Assemble the training matrix: the table columns left after removing the
# identifier / SMILES / target columns, followed by the fingerprint bits.
descriptor_features = train_data_linear_filled.drop(columns=['id', 'SMILES', 'MLM', 'HLM'])
X_train = pd.concat([descriptor_features, fingerprint_df], axis=1)
y_train_MLM, y_train_HLM = train_data_linear_filled['MLM'], train_data_linear_filled['HLM']

# Hyperparameter grid for RandomForestRegressor
# (6 * 10 * 5 * 5 * 2 = 3000 combinations)
param_grid = dict(
    n_estimators=[10, 30, 50, 100, 150, 200],
    max_depth=[None, 1, 3, 5, 7, 10, 15, 20, 25, 30],
    min_samples_split=[2, 5, 10, 15, 20],
    min_samples_leaf=[1, 2, 4, 6, 8],
    bootstrap=[True, False],
)

# Options shared by all four searches
search_options = dict(cv=3, n_jobs=-1, verbose=2, scoring='neg_root_mean_squared_error')

# Exhaustive grid search, one per target
grid_search_MLM = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, **search_options)
grid_search_HLM = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, **search_options)

# Randomized search (100 sampled candidates from the same grid), one per target
random_search_MLM = RandomizedSearchCV(
    estimator=rf_regressor,
    param_distributions=param_grid,
    n_iter=100,
    random_state=42,
    **search_options,
)
random_search_HLM = RandomizedSearchCV(
    estimator=rf_regressor,
    param_distributions=param_grid,
    n_iter=100,
    random_state=42,
    **search_options,
)

# Run the grid searches first ...
grid_search_MLM.fit(X_train, y_train_MLM)
grid_search_HLM.fit(X_train, y_train_HLM)

# ... then the randomized searches
random_search_MLM.fit(X_train, y_train_MLM)
random_search_HLM.fit(X_train, y_train_HLM)

Fitting 3 folds for each of 3000 candidates, totalling 9000 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=30; total time=  13.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=150; total time= 1.1min
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=150; total time= 1.1min
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  40.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=15, n_estimators=10; total time=   4.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=15, n_estimators=10; total time=   4.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=15, n_estimators=30; total time=  12.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=15, n_estimators=100; t

Traceback (most recent call last):
  File "/Users/syshin/miniforge3/envs/tensorflow/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/3m/knb96f9s4xg2t2gcrl96r6jh0000gn/T/ipykernel_3251/189648626.py", line 34, in <module>
    grid_search_HLM.fit(X_train, y_train_HLM)
  File "/Users/syshin/miniforge3/envs/tensorflow/lib/python3.9/site-packages/sklearn/base.py", line 1151, in wrapper
  File "/Users/syshin/miniforge3/envs/tensorflow/lib/python3.9/site-packages/sklearn/model_selection/_search.py", line 898, in fit
    ]
  File "/Users/syshin/miniforge3/envs/tensorflow/lib/python3.9/site-packages/sklearn/model_selection/_search.py", line 1419, in _run_search
    estimator : estimator object
  File "/Users/syshin/miniforge3/envs/tensorflow/lib/python3.9/site-packages/sklearn/model_selection/_search.py", line 845, in evaluate_candidates
    raise ValueError(
  File "/Users/syshin/mini

In [None]:
# Show the best parameters and best score of each search, grouped by target
# (grid search first, randomized search second).
print('===MLM===')
for _search in (grid_search_MLM, random_search_MLM):
    print(_search.best_params_, _search.best_score_)
print()
print('===HLM===')
for _search in (grid_search_HLM, random_search_HLM):
    print(_search.best_params_, _search.best_score_)

In [None]:
# Keep the best parameter set per target.
# Start from empty placeholders so both names exist before the comparisons run.
best_params_MLM, best_params_HLM = {}, {}

# The grid-search result is used only when its score is strictly higher;
# on a tie the randomized-search result is taken.
best_params_MLM = (
    grid_search_MLM.best_params_
    if grid_search_MLM.best_score_ > random_search_MLM.best_score_
    else random_search_MLM.best_params_
)

best_params_HLM = (
    grid_search_HLM.best_params_
    if grid_search_HLM.best_score_ > random_search_HLM.best_score_
    else random_search_HLM.best_params_
)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


# 2. Train one RandomForestRegressor per target with the selected hyperparameters
# Example parameter sets, kept for reference:
# rf_params_MLM = {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 150}
# rf_params_HLM = {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
# Use the same seed as the estimator that was tuned (random_state=42) so the
# final fit is reproducible; an explicit 'random_state' in best_params_* wins.
rf_regressor_MLM = RandomForestRegressor(**{'random_state': 42, **best_params_MLM})
rf_regressor_HLM = RandomForestRegressor(**{'random_state': 42, **best_params_HLM})
rf_regressor_MLM.fit(X_train, y_train_MLM)
rf_regressor_HLM.fit(X_train, y_train_HLM)

# 3. Load test_linear.csv
test_data = pd.read_csv('test_linear.csv')

# Build the fingerprints for the test molecules
fingerprint_data_test = test_data['SMILES'].apply(generate_fingerprint)
fingerprint_df_test = pd.DataFrame(fingerprint_data_test.tolist(), columns=[f'bit_{i}' for i in range(2048)])

# Prepare the test features
id_col = test_data['id']
# Mirror the training matrix: X_train excludes 'MLM' and 'HLM', so drop them
# here too when the test file carries them (errors='ignore' keeps this working
# for a test file without label columns). 'id' and 'SMILES' were already
# accessed above, so a missing one has raised before this point.
X_test = pd.concat(
    [
        test_data.drop(columns=['id', 'SMILES', 'MLM', 'HLM'], errors='ignore'),
        fingerprint_df_test,
    ],
    axis=1,
)

# 4. Predict both targets for the test set with the trained models
predictions_MLM = rf_regressor_MLM.predict(X_test)
predictions_HLM = rf_regressor_HLM.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate against the ground-truth MLM / HLM values, which are only available
# when the test file actually contains those columns.
if {'MLM', 'HLM'}.issubset(test_data.columns):
    y_test_MLM = test_data['MLM']
    y_test_HLM = test_data['HLM']

    # RMSE per target. The square root is taken explicitly instead of passing
    # `squared=False`, which newer scikit-learn releases no longer accept.
    rmse_MLM = mean_squared_error(y_test_MLM, predictions_MLM) ** 0.5
    rmse_HLM = mean_squared_error(y_test_HLM, predictions_HLM) ** 0.5

    print(rmse_MLM)
    print(rmse_HLM)
else:
    print('test_linear.csv has no MLM/HLM columns; skipping RMSE evaluation')

In [None]:
id_col = test_data['id']

# Collect the ids and both prediction arrays into one table
# (column order: id, MLM, HLM)
predictions_df = pd.DataFrame({'id': id_col}).assign(
    MLM=predictions_MLM,
    HLM=predictions_HLM,
)

predictions_df

In [None]:
# Write the predictions table to disk without the row index
predictions_df.to_csv(path_or_buf='rf_grid_search_submission.csv', index=False)