In [1]:
import os
import pandas as pd
import random
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
import autogluon.core as ag
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train = pd.read_csv('../new_open/train_merge.csv')
test = pd.read_csv('../new_open/test_merge.csv')
submission = pd.read_csv('../new_open/sample_submission.csv')
train

Unnamed: 0,ARI_CO,ARI_PO,SHIP_TYPE_CATEGORY,DIST,ID,BREADTH,BUILT,DEADWEIGHT,DEPTH,DRAUGHT,...,month_sin,month_cos,day_sin,day_cos,weekday_sin,weekday_cos,rounded_hour_sin,rounded_hour_cos,ship_cluster,CI_HOUR
0,17,21,2,30.881018,24710,30.0,24,24300,10.0,10.0,...,-2.449294e-16,1.000000e+00,-0.299363,-0.954139,0.000000,1.000000,-0.707107,0.707107,1,3.450000
1,7,81,0,0.000000,23140,30.0,13,35900,10.0,10.0,...,-1.000000e+00,-1.836970e-16,-0.998717,-0.050649,0.781831,0.623490,0.965926,-0.258819,2,0.000000
2,4,14,2,0.000000,19009,50.0,12,146000,30.0,20.0,...,8.660254e-01,5.000000e-01,0.571268,0.820763,0.781831,0.623490,-0.500000,0.866025,3,0.000000
3,8,101,2,0.000000,24048,20.0,18,6910,10.0,10.0,...,5.000000e-01,8.660254e-01,-0.299363,-0.954139,-0.433884,-0.900969,0.866025,0.500000,1,0.000000
4,17,21,2,27.037650,911,50.0,10,116000,20.0,10.0,...,5.000000e-01,8.660254e-01,-0.848644,0.528964,-0.781831,0.623490,0.866025,-0.500000,0,253.554444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391934,8,66,2,0.000000,23490,20.0,27,6820,10.0,10.0,...,1.224647e-16,-1.000000e+00,0.937752,0.347305,0.781831,0.623490,0.965926,0.258819,1,0.000000
391935,17,21,0,5.884603,10196,10.0,12,3160,10.0,10.0,...,-8.660254e-01,5.000000e-01,-0.101168,-0.994869,0.974928,-0.222521,0.258819,0.965926,2,144.061389
391936,21,61,0,70.660241,8823,30.0,8,60300,20.0,10.0,...,1.000000e+00,6.123234e-17,-0.998717,-0.050649,0.781831,0.623490,-0.258819,0.965926,2,41.482222
391937,19,35,2,9.448179,9246,30.0,29,23800,10.0,10.0,...,5.000000e-01,8.660254e-01,0.998717,-0.050649,0.433884,-0.900969,0.965926,-0.258819,1,7.485278


In [3]:
# 각 ARI_CO 별로 데이터 분할 후 학습 및 예측
unique_ari_co = train['year'].unique()
label = 'CI_HOUR'
eval_metric = 'mean_absolute_error'

In [4]:
for ari_co in unique_ari_co:
    # ARI_CO 별 데이터 분할
    train_subset = train[train['year'] == ari_co].copy()
    test_subset = test[test['year'] == ari_co].copy()
    train_subset.drop(['year'],axis=1,inplace=True)
    test_subset.drop(['year'],axis=1,inplace=True)    

    # 데이터셋 변환
    train_data = TabularDataset(train_subset)
    test_data = TabularDataset(test_subset)
    
    # 모델 학습
    predictor = TabularPredictor(label=label, problem_type='regression', eval_metric=eval_metric).fit(train_data, 
                      presets='best_quality', 
                      num_stack_levels=3,
                      #excluded_model_types = excluded_model_types,
                      num_gpus=0)
    
    # 예측 및 결과 저장
    y_pred = predictor.predict(test_data)
    submission.loc[test_subset.index, 'CI_HOUR'] = y_pred.values
    print(ari_co)
    print(len(train_subset))
    predictor.leaderboard(train, silent=True)    

# 최종 결과 확인
print(submission)
submission

No path specified. Models will be saved in: "AutogluonModels\ag-20231029_000421\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231029_000421\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   80.62 GB / 999.46 GB (8.1%)
Train Data Rows:    43255
Train Data Columns: 38
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    59419.95 MB
	Train Data (Original)  Memory Usage: 13.15 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator..

2018
43255


No path specified. Models will be saved in: "AutogluonModels\ag-20231029_002451\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231029_002451\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   75.00 GB / 999.46 GB (7.5%)
Train Data Rows:    4771
Train Data Columns: 38
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    57616.68 MB
	Train Data (Original)  Memory Usage: 1.45 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...


2014
4771


No path specified. Models will be saved in: "AutogluonModels\ag-20231029_003313\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231029_003313\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   74.14 GB / 999.46 GB (7.4%)
Train Data Rows:    29137
Train Data Columns: 38
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    57043.77 MB
	Train Data (Original)  Memory Usage: 8.86 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...

2015
29137


No path specified. Models will be saved in: "AutogluonModels\ag-20231029_004732\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231029_004732\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   70.85 GB / 999.46 GB (7.1%)
Train Data Rows:    44123
Train Data Columns: 38
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    53734.81 MB
	Train Data (Original)  Memory Usage: 13.41 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator..

2020
44123


No path specified. Models will be saved in: "AutogluonModels\ag-20231029_010837\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231029_010837\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   65.40 GB / 999.46 GB (6.5%)
Train Data Rows:    61179
Train Data Columns: 38
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    52198.35 MB
	Train Data (Original)  Memory Usage: 18.6 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...

2021
61179


No path specified. Models will be saved in: "AutogluonModels\ag-20231029_013441\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231029_013441\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   59.76 GB / 999.46 GB (6.0%)
Train Data Rows:    41311
Train Data Columns: 38
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    57127.96 MB
	Train Data (Original)  Memory Usage: 12.56 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator..

2016
41311


No path specified. Models will be saved in: "AutogluonModels\ag-20231029_015309\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231029_015309\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   54.83 GB / 999.46 GB (5.5%)
Train Data Rows:    73469
Train Data Columns: 38
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    55739.32 MB
	Train Data (Original)  Memory Usage: 22.33 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator..

2022
73469


No path specified. Models will be saved in: "AutogluonModels\ag-20231029_022352\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231029_022352\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   71.04 GB / 999.46 GB (7.1%)
Train Data Rows:    43709
Train Data Columns: 38
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    53926.4 MB
	Train Data (Original)  Memory Usage: 13.29 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...

2017
43709


No path specified. Models will be saved in: "AutogluonModels\ag-20231029_024434\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231029_024434\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   76.18 GB / 999.46 GB (7.6%)
Train Data Rows:    40785
Train Data Columns: 38
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    54811.69 MB
	Train Data (Original)  Memory Usage: 12.4 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...

2019
40785


No path specified. Models will be saved in: "AutogluonModels\ag-20231029_030542\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231029_030542\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   70.85 GB / 999.46 GB (7.1%)
Train Data Rows:    10200
Train Data Columns: 38
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    53876.45 MB
	Train Data (Original)  Memory Usage: 3.1 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...


2023
10200
          SAMPLE_ID    CI_HOUR
0       TEST_000000  26.615799
1       TEST_000001  22.190899
2       TEST_000002  26.190678
3       TEST_000003  62.119541
4       TEST_000004   0.013697
...             ...        ...
220486  TEST_220486  61.366642
220487  TEST_220487  52.535957
220488  TEST_220488  31.153988
220489  TEST_220489   0.040449
220490  TEST_220490  25.380772

[220491 rows x 2 columns]


Unnamed: 0,SAMPLE_ID,CI_HOUR
0,TEST_000000,26.615799
1,TEST_000001,22.190899
2,TEST_000002,26.190678
3,TEST_000003,62.119541
4,TEST_000004,0.013697
...,...,...
220486,TEST_220486,61.366642
220487,TEST_220487,52.535957
220488,TEST_220488,31.153988
220489,TEST_220489,0.040449


In [6]:
submission.isna().sum()

SAMPLE_ID    0
CI_HOUR      0
dtype: int64

In [7]:
submit = submission.copy()
submit

Unnamed: 0,SAMPLE_ID,CI_HOUR
0,TEST_000000,26.615799
1,TEST_000001,22.190899
2,TEST_000002,26.190678
3,TEST_000003,62.119541
4,TEST_000004,0.013697
...,...,...
220486,TEST_220486,61.366642
220487,TEST_220487,52.535957
220488,TEST_220488,31.153988
220489,TEST_220489,0.040449


In [8]:
submit['DIST'] = test['DIST'].apply(lambda x: 0 if x==0 else 1)
submit['CI_HOUR'] = submit['CI_HOUR']*submit['DIST']
submit.drop(['DIST'],axis=1,inplace=True)
submit

Unnamed: 0,SAMPLE_ID,CI_HOUR
0,TEST_000000,26.615799
1,TEST_000001,22.190899
2,TEST_000002,26.190678
3,TEST_000003,62.119541
4,TEST_000004,0.000000
...,...,...
220486,TEST_220486,61.366642
220487,TEST_220487,52.535957
220488,TEST_220488,31.153988
220489,TEST_220489,0.000000


In [9]:
submit.to_csv(f'../Sub/autogluon_merge_year.csv', index=False)