In [2]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor

# Autogluon 기본 사용법
1. 데이터 로딩
2. 타겟변수 지정
3. TabularPredictor 설정(타겟변수, 모델 성능 지표) - 모델 설정
4. 훈련(데이터, 제한시간설정, 분석사전 설정 지정) - .fit()
5. 데이터에서 일부 데이터를 테스트 데이터로 추출 - .sample()
6. 분석이 끝난 모델로 테스트 데이터에서 추론 - .predict()
7. 평가

## 1. 데이터 로딩

In [3]:
data = pd.read_csv("https://raw.githubusercontent.com/haram4th/ADsP/main/salary2.csv")
data.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
train_data, test_data = train_test_split(data, stratify = data['class'], test_size = 0.4, random_state = 7)

## 2. 타겟변수 지정

In [6]:
target_column = 'class'

## 3. 제한시간, 검정지표(accuracy, rmse, roc_auc) 지정

In [12]:
time_limit = 300 # 300초 설정하기
metric = 'accuracy' # 성능지표(f1, recall, roc_auc_score 등등. 적을 땐 풀네임으로)

## 4. 모델정의 TablularPredictor()

In [8]:
model = TabularPredictor(label = target_column, eval_metric = metric)

No path specified. Models will be saved in: "AutogluonModels/ag-20241017_024017"


## 5. 모델 훈련

In [13]:
model.fit(train_data, time_limit = time_limit, presets = ['medium_quality'])

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.15
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Mar 29 23:14:13 UTC 2024
CPU Count:          12
Memory Avail:       13.57 GB / 15.32 GB (88.5%)
Disk Space Avail:   94.86 GB / 237.85 GB (39.9%)
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 300s
AutoGluon will save models to "AutogluonModels/ag-20241017_024017"
Train Data Rows:    29305
Train Data Columns: 13
Label Column:       class
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [' <=50K', ' >50K']
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 

	Ran out of time, stopping training early. (Stopping on epoch 1)
  self.model = torch.load(net_filename)
	0.8324	 = Validation score   (accuracy)
	30.08s	 = Training   runtime
	1.29s	 = Validation runtime
Fitting model: LightGBMLarge ... Training model for up to 9.37s of the 9.37s of remaining time.
	0.8732	 = Validation score   (accuracy)
	0.98s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 299.78s of the 8.29s of remaining time.
	Ensemble Weights: {'XGBoost': 0.87, 'LightGBM': 0.087, 'NeuralNetFastAI': 0.043}
	0.8784	 = Validation score   (accuracy)
	0.12s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 291.97s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 2057.5 rows/s (2500 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20241017_024017")


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f5d399f85e0>

## 6. 생성된 모델에 테스트 데이터 넣어 예측하기

In [14]:
pred = model.predict(test_data)

In [15]:
pred

14235     <=50K
28608      >50K
16506     <=50K
7311      <=50K
23094     <=50K
          ...  
9697       >50K
20656     <=50K
39937     <=50K
2502      <=50K
18868      >50K
Name: class, Length: 19537, dtype: object

## 7. 모델 성능 평가하기

In [16]:
result = model.evaluate(test_data)
result_df = pd.DataFrame([result], index = [0])
# 여러 모델 성능 비교
leader_board = model.leaderboard(test_data)
# 중요 변수 출력
feature_importance = model.feature_importance(test_data)
best_model_name = model.model_best
# best 모델 로딩
best_model = model._trainer.load_model(best_model_name)
# 파라미터 확인
best_model_params = best_model.params

Computing feature importance via permutation shuffling for 13 features using 5000 rows with 5 shuffle sets...
	148.87s	= Expected runtime (29.77s per shuffle set)
	145.73s	= Actual runtime (Completed 5 of 5 shuffle sets)


## 8. 결과 출력

In [18]:
print('=' * 20, 'result_df', '=' * 20)
display(result_df)
print()
print('=' * 20, 'leader_board', '=' * 20)
display(leader_board)
print()
print('=' * 20, 'feature_importance', '=' * 20)
display(feature_importance)
print()
print('=' * 20, 'best_model_name', '=' * 20)
display(best_model_name)
print()
print('=' * 20, 'best_model', '=' * 20)
display(best_model)
print()
print('=' * 20, 'best_model_params', '=' * 20)
display(best_model_params)
print()



Unnamed: 0,accuracy,balanced_accuracy,mcc,roc_auc,f1,precision,recall
0,0.87516,0.798891,0.640167,0.930933,0.714436,0.789188,0.65262





Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.87516,0.8784,accuracy,8.066276,1.215091,239.038045,0.005139,0.000671,0.119031,2,True,14
1,XGBoost,0.875006,0.8776,accuracy,0.082606,0.010931,1.117751,0.082606,0.010931,1.117751,1,True,11
2,CatBoost,0.874187,0.872,accuracy,0.037307,0.005273,4.417121,0.037307,0.005273,4.417121,1,True,7
3,LightGBM,0.873932,0.8728,accuracy,0.034364,0.005462,0.654504,0.034364,0.005462,0.654504,1,True,4
4,LightGBMLarge,0.872242,0.8732,accuracy,0.047844,0.007256,0.977769,0.047844,0.007256,0.977769,1,True,13
5,LightGBMXT,0.86436,0.8652,accuracy,0.024359,0.005821,1.323138,0.024359,0.005821,1.323138,1,True,3
6,NeuralNetFastAI,0.854532,0.8584,accuracy,7.944167,1.198027,237.146759,7.944167,1.198027,237.146759,1,True,10
7,RandomForestGini,0.851359,0.846,accuracy,1.138604,0.11476,1.453418,1.138604,0.11476,1.453418,1,True,5
8,RandomForestEntr,0.851308,0.8424,accuracy,1.069191,0.112392,1.314699,1.069191,0.112392,1.314699,1,True,6
9,ExtraTreesEntr,0.847418,0.838,accuracy,1.782945,0.122434,1.279069,1.782945,0.122434,1.279069,1,True,9





Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
capital-gain,0.053,0.005079,1e-05,5,0.063458,0.042542
marital-status,0.03912,0.004154,1.5e-05,5,0.047672,0.030568
education-num,0.02088,0.002741,3.5e-05,5,0.026523,0.015237
occupation,0.01536,0.001424,9e-06,5,0.018292,0.012428
capital-loss,0.0146,0.002953,0.00019,5,0.02068,0.00852
age,0.0126,0.002672,0.000229,5,0.018102,0.007098
hours-per-week,0.00576,0.002406,0.002936,5,0.010714,0.000806
workclass,0.00384,0.000767,0.000181,5,0.005419,0.002261
sex,0.0016,0.001175,0.019097,5,0.004019,-0.000819
relationship,0.00092,0.001869,0.166374,5,0.004768,-0.002928





'WeightedEnsemble_L2'




<autogluon.core.models.ensemble.weighted_ensemble_model.WeightedEnsembleModel at 0x7f5cca252980>




{'use_orig_features': False,
 'max_base_models': 25,
 'max_base_models_per_type': 5,
 'save_bag_folds': True}




# 함수화하고 분석 간단히 하기

In [29]:
def automl(data, target, time = 300, metric = None) :
    if metric in ('accuracy', 'roc_auc', 'recall', 'precision', 'f1') :
        train_data, test_data = train_test_split(data, stratify = data[target], test_size = 0.4, random_state = 7)
    else :
        train_data, test_data = train_test_split(data, test_size = 0.4, random_state = 7)
    
    target_column = target
    time_limit = time
    
    model = TabularPredictor(label = target_column, eval_metric = metric)
    model.fit(train_data, time_limit = time_limit, presets = ['medium_quality'])
    
    pred = model.predict(test_data)
    result = model.evaluate(test_data)
    result_df = pd.DataFrame([result], index = [0])
    

    leader_board = model.leaderboard(test_data)

    feature_importance = model.feature_importance(test_data)
    
    best_model_name = model.model_best

    best_model = model._trainer.load_model(best_model_name)

    best_model_params = best_model.params
    
    print('=' * 20, 'result_df', '=' * 20)
    display(result_df)
    print()
    print('=' * 20, 'leader_board', '=' * 20)
    display(leader_board)
    print()
    print('=' * 20, 'feature_importance', '=' * 20)
    display(feature_importance)
    print()
    print('=' * 20, 'best_model_name', '=' * 20)
    display(best_model_name)
    print()
    print('=' * 20, 'best_model', '=' * 20)
    display(best_model)
    print()
    print('=' * 20, 'best_model_params', '=' * 20)
    display(best_model_params)
    print()
    return best_model_name, best_model, result_df, leader_board, feature_importance

In [30]:
data = pd.read_csv("https://raw.githubusercontent.com/haram4th/ablearn/main/Taitanic_train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [31]:
automl(data, 'Survived')

No path specified. Models will be saved in: "AutogluonModels/ag-20241017_033735"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.15
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Mar 29 23:14:13 UTC 2024
CPU Count:          12
Memory Avail:       12.78 GB / 15.32 GB (83.4%)
Disk Space Avail:   93.94 GB / 237.85 GB (39.5%)
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 300s
AutoGluon will save models to "AutogluonModels/ag-20241017_033735"
Train Data Rows:    534
Train Data Columns: 11
Label Column:       Survived
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])
Problem Type:       binar

  self.model = torch.load(net_filename)
	0.8037	 = Validation score   (accuracy)
	1.08s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: LightGBMLarge ... Training model for up to 282.28s of the 282.28s of remaining time.
	0.8411	 = Validation score   (accuracy)
	0.62s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 299.7s of the 281.58s of remaining time.
	Ensemble Weights: {'LightGBM': 0.5, 'XGBoost': 0.5}
	0.8692	 = Validation score   (accuracy)
	0.09s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 18.65s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 12927.5 rows/s (107 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20241017_033735")
Computing feature importance via permutation shuffling for 11 features using 357 rows with 5 shuffle sets...
	3.19s	= Expected runtime (0.64s p



Unnamed: 0,accuracy,balanced_accuracy,mcc,roc_auc,f1,precision,recall
0,0.787115,0.764966,0.555635,0.843537,0.712121,0.803419,0.639456





Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBM,0.803922,0.859813,accuracy,0.00698,0.002656,0.410363,0.00698,0.002656,0.410363,1,True,4
1,LightGBMXT,0.789916,0.850467,accuracy,0.00726,0.003144,0.410979,0.00726,0.003144,0.410979,1,True,3
2,XGBoost,0.789916,0.859813,accuracy,0.03281,0.005087,0.18298,0.03281,0.005087,0.18298,1,True,11
3,WeightedEnsemble_L2,0.787115,0.869159,accuracy,0.044133,0.008277,0.679038,0.004344,0.000535,0.085695,2,True,14
4,CatBoost,0.781513,0.850467,accuracy,0.008972,0.002418,0.831892,0.008972,0.002418,0.831892,1,True,7
5,RandomForestGini,0.781513,0.831776,accuracy,0.138137,0.051502,0.731441,0.138137,0.051502,0.731441,1,True,5
6,RandomForestEntr,0.77591,0.841121,accuracy,0.137138,0.050545,0.746582,0.137138,0.050545,0.746582,1,True,6
7,NeuralNetFastAI,0.773109,0.850467,accuracy,0.158462,0.05812,10.869005,0.158462,0.05812,10.869005,1,True,10
8,ExtraTreesGini,0.770308,0.803738,accuracy,0.150839,0.060558,0.631722,0.150839,0.060558,0.631722,1,True,8
9,NeuralNetTorch,0.767507,0.803738,accuracy,0.216488,0.009766,1.081756,0.216488,0.009766,1.081756,1,True,12





Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
Sex,0.126611,0.008724,3e-06,5,0.144574,0.108648
Pclass,0.107003,0.009784,8e-06,5,0.127148,0.086858
Age,0.012325,0.008309,0.014736,5,0.029434,-0.004784
Name,0.007843,0.013921,0.13812,5,0.036507,-0.020821
SibSp,0.005042,0.002344,0.00429,5,0.009867,0.000217
Fare,0.005042,0.008724,0.132924,5,0.023005,-0.012921
Embarked,0.002241,0.002344,0.04965,5,0.007066,-0.002585
Ticket,0.00112,0.001534,0.088904,5,0.004279,-0.002039
PassengerId,0.0,0.00524,0.5,5,0.01079,-0.01079
Cabin,-0.00112,0.005809,0.655771,5,0.010839,-0.01308





'WeightedEnsemble_L2'




<autogluon.core.models.ensemble.weighted_ensemble_model.WeightedEnsembleModel at 0x7f5cca1bc2e0>




{'use_orig_features': False,
 'max_base_models': 25,
 'max_base_models_per_type': 5,
 'save_bag_folds': True}




('WeightedEnsemble_L2',
 <autogluon.core.models.ensemble.weighted_ensemble_model.WeightedEnsembleModel at 0x7f5cca1bc2e0>,
    accuracy  balanced_accuracy       mcc  ...        f1  precision    recall
 0  0.787115           0.764966  0.555635  ...  0.712121   0.803419  0.639456
 
 [1 rows x 7 columns],
                   model  score_test  ...  can_infer fit_order
 0              LightGBM    0.803922  ...       True         4
 1            LightGBMXT    0.789916  ...       True         3
 2               XGBoost    0.789916  ...       True        11
 3   WeightedEnsemble_L2    0.787115  ...       True        14
 4              CatBoost    0.781513  ...       True         7
 5      RandomForestGini    0.781513  ...       True         5
 6      RandomForestEntr    0.775910  ...       True         6
 7       NeuralNetFastAI    0.773109  ...       True        10
 8        ExtraTreesGini    0.770308  ...       True         8
 9        NeuralNetTorch    0.767507  ...       True        12
 10