In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

### Train

In [5]:
# seed: shared by the KFold shuffle and by every forest so reruns are reproducible
seed = 42


# define rmse
def rmse(x, y):
    """Return the root-mean-square error between `x` and `y`.

    Computes the element-wise difference, squares it, takes the mean and
    then the square root.
    """
    return np.mean((x - y) ** 2) ** 0.5


def _fit_predict(x_fit, y_fit, x_eval):
    """Fit a seeded RandomForestRegressor on (x_fit, y_fit).

    Returns a tuple `(model, predictions)` where `predictions` is the fitted
    model's output on `x_eval`.
    """
    reg = RandomForestRegressor(random_state = seed)
    reg.fit(x_fit, y_fit)
    return reg, reg.predict(x_eval)


# features and targets
features = ["AlogP", "Molecular_Weight", "Num_H_Acceptors", "Num_H_Donors", "Num_RotatableBonds", "LogD", "Molecular_PolarSurfaceArea"]
mlm_target = "MLM"
hlm_target = "HLM"

# load data
df = pd.read_csv("./origin_data/train_clean.csv")
# rows with a missing AlogP take that row's LogD value instead
df["AlogP"] = np.where(pd.isna(df["AlogP"]), df["LogD"], df["AlogP"])

# train: 10-fold shuffled cross-validation, one forest per target per fold
scores = []
# fitted per-fold models are kept so later cells can reuse them
reg_mlms = []
reg_hlms = []
kf = KFold(n_splits = 10, random_state = seed, shuffle = True)
for i, (train_index, valid_index) in enumerate(kf.split(df)):
    df_train = df.iloc[train_index]
    df_valid = df.iloc[valid_index]

    # both targets are trained on the same feature matrix
    x_train = df_train[features].values
    y_mlm_train = df_train[mlm_target].values
    y_hlm_train = df_train[hlm_target].values

    x_valid = df_valid[features].values
    y_mlm_valid = df_valid[mlm_target].values
    y_hlm_valid = df_valid[hlm_target].values

    reg_mlm, p_mlm = _fit_predict(x_train, y_mlm_train, x_valid)
    reg_hlm, p_hlm = _fit_predict(x_train, y_hlm_train, x_valid)

    # fold score: equal-weight average of the two targets' validation RMSE
    score = 0.5 * rmse(y_mlm_valid, p_mlm) + 0.5 * rmse(y_hlm_valid, p_hlm)

    reg_mlms.append(reg_mlm)
    reg_hlms.append(reg_hlm)
    scores.append(score)
    print(f"Fold {i+1:2d}: {score:.5f}")

# overall CV score is the plain mean of the per-fold scores
print(f"CV score: {np.mean(scores):.5f}")

Fold  1: 32.75035
Fold  2: 33.46783
Fold  3: 32.84392
Fold  4: 33.25096
Fold  5: 33.35341
Fold  6: 32.00341
Fold  7: 32.15827
Fold  8: 33.54772
Fold  9: 31.35760
Fold 10: 33.55962
CV score: 32.82931


### Submission

In [6]:
# load data
df = pd.read_csv("./origin_data/test.csv")
# same imputation as applied to the training data: missing AlogP takes the row's LogD
df["AlogP"] = np.where(pd.isna(df["AlogP"]), df["LogD"], df["AlogP"])

# build the test feature matrix once; it is identical for every model below
x_test = df[features].values

# predict: each target is the mean over the per-fold models' predictions
# NOTE(review): assumes sample_submission.csv rows are in the same order as test.csv — confirm
df_submission = pd.read_csv("./origin_data/sample_submission.csv")
df_submission["MLM"] = np.mean([reg_mlm.predict(x_test) for reg_mlm in reg_mlms], axis = 0)
df_submission["HLM"] = np.mean([reg_hlm.predict(x_test) for reg_hlm in reg_hlms], axis = 0)
df_submission.to_csv("submission_RF2_LBW.csv", index = False, encoding = "utf-8-sig")