In [1]:
import pandas as pd
# 저희는 Tabular Data를 다루기 때문에 아래 라이브러리를 호출합니다.
from autogluon.tabular import TabularDataset, TabularPredictor
import numpy as np
from sktime.utils.plotting import plot_series
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Seed value for the notebook (defined here; not referenced within this cell).
seed = 42


def rmse(x, y):
    """Return the root-mean-squared error between `x` and `y`."""
    return np.mean((x - y) ** 2) ** 0.5


# Descriptor columns and the names of the two regression targets.
features = [
    "AlogP",
    "Molecular_Weight",
    "Num_H_Acceptors",
    "Num_H_Donors",
    "Num_RotatableBonds",
    "LogD",
    "Molecular_PolarSurfaceArea",
]
mlm_target = "MLM"
hlm_target = "HLM"


def _read_with_alogp_fallback(csv_path):
    """Read a CSV and fill missing `AlogP` entries with the same row's `LogD`."""
    frame = pd.read_csv(csv_path)
    frame["AlogP"] = frame["AlogP"].fillna(frame["LogD"])
    return frame


# Load both splits, applying the identical AlogP imputation to each.
train_df = _read_with_alogp_fallback("./origin_data/train.csv")
test_df = _read_with_alogp_fallback("./origin_data/test.csv")

In [8]:
# One training table per target: the other target column is dropped so it cannot
# be picked up as an input feature.
# NOTE(review): every remaining column of train_df is kept (the `features` list is
# not used to select columns here) — confirm that this is intended.
train_MLM = TabularDataset(train_df).drop(columns=["HLM"])
train_HLM = TabularDataset(train_df).drop(columns=["MLM"])

# The test split is wrapped once per target; both wrap the same test_df.
test_MLM, test_HLM = TabularDataset(test_df), TabularDataset(test_df)

In [9]:
# Fit an AutoGluon predictor for the MLM target, ranked by mean squared error.
# `fit` returns the predictor, so `predictor` ends up bound to the fitted object.
predictor = TabularPredictor(label="MLM", eval_metric="mean_squared_error")
predictor = predictor.fit(train_MLM)

No path specified. Models will be saved in: "AutogluonModels\ag-20230907_094350\"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20230907_094350\"
AutoGluon Version:  0.8.2
Python Version:     3.8.17
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
Disk Space Avail:   39.60 GB / 499.43 GB (7.9%)
Train Data Rows:    3498
Train Data Columns: 9
Label Column: MLM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (131.72, 0.0, 37.38474, 35.69599)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                 

In [10]:
# Per-model leaderboard for the MLM predictor.
# The data passed in is the training table itself, so the `score_test` column
# reflects fit on training rows rather than a held-out estimate.
ld_board = predictor.leaderboard(data=train_MLM, silent=True)

ld_board

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,KNeighborsDist,-200.491457,-1361.446073,0.03198,0.026726,0.008974,0.03198,0.026726,0.008974,1,True,2
1,RandomForestMSE,-286.878221,-1118.531646,0.143616,0.045877,1.35876,0.143616,0.045877,1.35876,1,True,5
2,ExtraTreesMSE,-288.48391,-1132.244105,0.163155,0.045878,0.521112,0.163155,0.045878,0.521112,1,True,7
3,LightGBMLarge,-644.711674,-1100.599422,0.014959,0.002992,0.849755,0.014959,0.002992,0.849755,1,True,11
4,XGBoost,-767.665678,-1068.353067,0.028923,0.004987,0.410913,0.028923,0.004987,0.410913,1,True,9
5,LightGBM,-853.070599,-1055.986554,0.011968,0.002992,0.386526,0.011968,0.002992,0.386526,1,True,4
6,CatBoost,-867.351189,-1024.524165,0.016954,0.002992,9.342354,0.016954,0.002992,9.342354,1,True,6
7,LightGBMXT,-881.975093,-1034.24312,0.022938,0.003989,0.4584,0.022938,0.003989,0.4584,1,True,3
8,WeightedEnsemble_L2,-896.437849,-1002.884542,0.120184,0.01496,12.964446,0.006981,0.0,0.315135,2,True,12
9,NeuralNetFastAI,-921.62379,-1006.578735,0.096248,0.011968,3.306957,0.096248,0.011968,3.306957,1,True,8


In [11]:
# Ask the predictor for the name of the model it reports as best.
# NOTE: `predictor` must still be the MLM predictor at this point; a later cell
# rebinds the same name to the HLM predictor.
best_model_name = predictor.get_model_best()

# Predict MLM for the test rows with that model.
predictions_MLM = predictor.predict(data=test_MLM, model=best_model_name)

# Show the predicted values.
print(predictions_MLM)

0      25.684069
1      68.091270
2      25.828449
3      49.869854
4      66.121323
         ...    
478     5.665041
479    91.703972
480    48.803364
481    75.945435
482    26.867268
Name: MLM, Length: 483, dtype: float32


In [12]:
# Fit an AutoGluon predictor for the HLM target, ranked by mean squared error.
# This rebinds `predictor`, replacing the MLM predictor bound to the same name.
predictor = TabularPredictor(label="HLM", eval_metric="mean_squared_error")
predictor = predictor.fit(train_HLM)

No path specified. Models will be saved in: "AutogluonModels\ag-20230907_094417\"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20230907_094417\"
AutoGluon Version:  0.8.2
Python Version:     3.8.17
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
Disk Space Avail:   39.45 GB / 499.43 GB (7.9%)
Train Data Rows:    3498
Train Data Columns: 9
Label Column: HLM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (135.336, 0.0, 53.09021, 36.08008)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                

In [13]:
# Per-model leaderboard for the HLM predictor.
# The data passed in is the training table itself, so the `score_test` column
# reflects fit on training rows rather than a held-out estimate.
ld_board = predictor.leaderboard(data=train_HLM, silent=True)

ld_board

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,KNeighborsDist,-213.630582,-1458.132744,0.030851,0.01675,0.007978,0.030851,0.01675,0.007978,1,True,2
1,ExtraTreesMSE,-293.25896,-1132.827526,0.149438,0.045877,0.598399,0.149438,0.045877,0.598399,1,True,7
2,RandomForestMSE,-294.989624,-1145.45963,0.148603,0.045877,1.629678,0.148603,0.045877,1.629678,1,True,5
3,LightGBMLarge,-688.756475,-1147.203067,0.019947,0.002992,0.843275,0.019947,0.002992,0.843275,1,True,11
4,XGBoost,-836.934085,-1079.71084,0.026928,0.004986,0.465276,0.026928,0.004986,0.465276,1,True,9
5,LightGBM,-853.121513,-1076.375947,0.050864,0.002991,0.413402,0.050864,0.002991,0.413402,1,True,4
6,WeightedEnsemble_L2,-870.027327,-1049.086784,0.178476,0.024959,14.739529,0.008976,0.000998,0.306183,2,True,12
7,CatBoost,-897.857793,-1071.357649,0.016954,0.003019,10.258826,0.016954,0.003019,10.258826,1,True,6
8,LightGBMXT,-901.971875,-1081.782104,0.023935,0.004987,0.50186,0.023935,0.004987,0.50186,1,True,3
9,NeuralNetFastAI,-956.172422,-1075.964564,0.074755,0.012965,3.295841,0.074755,0.012965,3.295841,1,True,8


In [14]:
# 가장 우수한 모델의 이름을 가져옵니다.
best_model_name = predictor.get_model_best()

# 가장 우수한 모델로 예측 수행
predictions_HLM = predictor.predict(test_HLM, model=best_model_name)

# 예측 결과 출력
print(predictions_HLM)

0      45.860165
1      83.559540
2      41.642948
3      69.140381
4      79.184532
         ...    
478    21.077591
479    94.191612
480    68.198441
481    77.650368
482    66.206009
Name: HLM, Length: 483, dtype: float32


In [20]:
# Start from the sample submission so its columns and row order are preserved.
# The path uses forward slashes like the other data paths in this notebook: the
# previous "\s" was an invalid string escape and a Windows-only separator, so the
# file could not be found on other platforms.
subm = pd.read_csv("./origin_data/sample_submission.csv")

# Copy each target's predictions into its submission column in one assignment
# instead of writing cell by cell. Assignment aligns on the row index, so check
# first that the prediction index matches the submission index; otherwise rows
# would silently be left empty or misplaced.
for target, preds in (("MLM", predictions_MLM), ("HLM", predictions_HLM)):
    assert subm.index.equals(preds.index), (
        f"{target} predictions do not line up with the sample submission rows"
    )
    subm[target] = preds

# Write the final submission file without the DataFrame index column.
subm.to_csv("./autog.csv", index=False)
