# Import

In [1]:
import os
import pandas as pd
import random
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
import autogluon.core as ag
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) 

  from .autonotebook import tqdm as notebook_tqdm


# Data Load

In [2]:
train = pd.read_csv('../new_open/train_merge_new_fillna.csv')#.drop(columns=['SAMPLE_ID'])
test = pd.read_csv('../new_open/test_merge_new_fillna.csv')#.drop(columns=['SAMPLE_ID'])

# Data Pre-processing

In [3]:
train_data = TabularDataset(train)
test_data = TabularDataset(test)

# Autogluon

In [4]:
label = 'CI_HOUR'
eval_metric = 'mean_absolute_error'

In [5]:
predictor = TabularPredictor(
    label=label, problem_type='regression', eval_metric=eval_metric
).fit(train_data, 
      presets='best_quality', 
      num_stack_levels=3,
      #excluded_model_types = excluded_model_types,
      num_gpus=1)

No path specified. Models will be saved in: "AutogluonModels\ag-20231027_154220\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=1
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231027_154220\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   205.90 GB / 999.46 GB (20.6%)
Train Data Rows:    391939
Train Data Columns: 39
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    56576.16 MB
	Train Data (Original)  Memory Usage: 122.29 MB (0.2% of 

In [6]:
predictor.leaderboard(silent = True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-43.312421,260.235794,6139.154505,0.006046,5.937312,3,True,22
1,NeuralNetTorch_BAG_L2,-43.338322,259.165604,6110.817958,1.735779,1889.701726,2,True,20
2,WeightedEnsemble_L4,-43.35803,304.809967,9021.343034,0.006015,5.860852,4,True,32
3,NeuralNetTorch_BAG_L3,-43.36949,301.210653,8942.357446,1.642129,1288.974524,3,True,30
4,WeightedEnsemble_L5,-43.483443,343.554647,12073.502057,0.007003,5.864294,5,True,42
5,NeuralNetTorch_BAG_L4,-43.500454,340.937408,12033.703365,1.601252,1550.170796,4,True,40
6,XGBoost_BAG_L4,-45.647607,340.369805,10505.996299,1.033649,22.46373,4,True,39
7,XGBoost_BAG_L3,-45.693977,300.567465,7675.800641,0.998941,22.417719,3,True,29
8,WeightedEnsemble_L2,-45.697413,41.746309,2971.969173,0.007016,6.576867,2,True,12
9,NeuralNetTorch_BAG_L1,-46.051779,1.147125,2347.781247,1.147125,2347.781247,1,True,10


In [7]:
predictor.feature_importance(train_data) 

Computing feature importance via permutation shuffling for 39 features using 5000 rows with 5 shuffle sets...
	4701.92s	= Expected runtime (940.38s per shuffle set)
	841.94s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
DIST,33.718465,1.443961,4.025977e-07,5,36.691601,30.745328
AIR_TEMPERATURE,20.535054,2.34633,2.010168e-05,5,25.366181,15.703927
PORT_SIZE,13.383557,0.433339,1.317043e-07,5,14.275809,12.491306
month_sin,10.448891,0.869073,5.690231e-06,5,12.238324,8.659458
DEADWEIGHT,8.050769,0.303832,2.429625e-07,5,8.676363,7.425176
SHIP_TYPE_CATEGORY,5.5224,0.605824,1.710481e-05,5,6.769799,4.275
ARI_CO,5.450869,0.344041,1.894331e-06,5,6.159254,4.742484
GT,4.916247,0.041069,5.843499e-10,5,5.00081,4.831685
V_WIND,4.194757,0.513256,2.636758e-05,5,5.251558,3.137956
year,3.800577,0.532664,4.51134e-05,5,4.897339,2.703815


In [8]:
model_to_use = predictor.get_model_best()
model_pred = predictor.predict(test_data, model=model_to_use)

In [9]:
pred_y = np.where(model_pred < 0, 0, model_pred)
pred_y

array([4.0620503e+00, 2.6316820e+01, 2.2498863e+01, ..., 8.4081512e+01,
       7.5976555e+01, 1.3036174e-02], dtype=float32)

# Submission

In [11]:
submit = pd.read_csv('../new_open/sample_submission.csv')
submit['CI_HOUR'] = pred_y

In [12]:
submit['DIST'] = test['DIST'].apply(lambda x: 0 if x==0 else 1)
submit['CI_HOUR'] = submit['CI_HOUR']*submit['DIST']
submit.drop(['DIST'],axis=1,inplace=True)
submit

Unnamed: 0,SAMPLE_ID,CI_HOUR
0,TEST_000000,4.062050
1,TEST_000001,26.316820
2,TEST_000002,22.498863
3,TEST_000003,38.439987
4,TEST_000004,0.000000
...,...,...
220486,TEST_220486,0.000000
220487,TEST_220487,0.000000
220488,TEST_220488,84.081512
220489,TEST_220489,75.976555


In [13]:
submit.to_csv('../Sub/autogluon_stack3_new_fill.csv', index=False)