# Import

In [1]:
import os
import pandas as pd
import random
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
import autogluon.core as ag
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Seeds the standard-library ``random`` module, stores ``str(seed)`` in the
    ``PYTHONHASHSEED`` environment variable, and seeds NumPy's global
    generator, in that order.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    hash_seed = str(seed)
    os.environ['PYTHONHASHSEED'] = hash_seed
    np.random.seed(seed)


seed_everything(42)

  from .autonotebook import tqdm as notebook_tqdm


# Data Load

In [2]:
# Read the merged train and test tables from the data folder next to this notebook.
train, test = (
    pd.read_csv('../new_open/train_merge.csv'),
    pd.read_csv('../new_open/test_merge.csv'),
)

# Data Pre-processing

In [3]:
# Predictor columns kept for modelling; the test set uses exactly these.
test_importance_features = [
    'DIST', 'PORT_SIZE', 'AIR_TEMPERATURE', 'month_sin', 'DEADWEIGHT',
    'year', 'SHIP_TYPE_CATEGORY', 'GT', 'ARI_CO', 'ARI_PO',
]

# The training set needs the same predictors plus the 'CI_HOUR' column as its
# last entry. Deriving it from the list above keeps the two in sync instead of
# maintaining two hand-copied lists.
train_importance_features = test_importance_features + ['CI_HOUR']

In [4]:
# Keep only the chosen columns, in the order given by the feature lists.
train = train.loc[:, train_importance_features]
test = test.loc[:, test_importance_features]

In [5]:
# Wrap both frames in AutoGluon's TabularDataset before handing them to the predictor.
train_data, test_data = TabularDataset(train), TabularDataset(test)

# AutoGluon

In [6]:
# Target column to predict and the metric name passed to the predictor.
label, eval_metric = 'CI_HOUR', 'mean_absolute_error'

In [7]:
# Build a regression predictor for `label` scored with `eval_metric`, then
# train it on `train_data` with the 'best_quality' preset and num_gpus=1.
# Earlier experiments (num_stack_levels=3, excluded_model_types) stay disabled.
predictor = TabularPredictor(
    label=label,
    problem_type='regression',
    eval_metric=eval_metric,
)
predictor = predictor.fit(
    train_data,
    presets='best_quality',
    num_gpus=1,
)

No path specified. Models will be saved in: "AutogluonModels\ag-20231017_153547\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231017_153547\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   225.15 GB / 999.46 GB (22.5%)
Train Data Rows:    391939
Train Data Columns: 10
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    53846.47 MB
	Train Data (Original)  Memory Usage: 31.36 MB (0.1% of a

In [8]:
# Display the table of trained models as this cell's output.
# silent=True presumably stops the call from also printing the table — confirm
# against the AutoGluon version in use.
predictor.leaderboard(silent = True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-43.889767,313.114894,9808.823813,0.006018,5.758163,3,True,22
1,NeuralNetTorch_BAG_L2,-43.896655,312.678795,9784.030322,1.551738,2692.451726,2,True,20
2,WeightedEnsemble_L2,-45.421039,9.956121,5393.717547,0.007554,6.824445,2,True,12
3,NeuralNetTorch_BAG_L1,-45.620447,0.921176,5326.004818,0.921176,5326.004818,1,True,10
4,XGBoost_BAG_L2,-46.933744,311.557139,7110.613924,0.430081,19.035327,2,True,19
5,NeuralNetFastAI_BAG_L2,-49.943908,313.48237,7796.646273,2.355312,705.067677,2,True,18
6,LightGBMLarge_BAG_L2,-50.832766,314.112571,7142.438267,2.985513,50.85967,2,True,21
7,LightGBM_BAG_L2,-50.844334,313.334266,7106.418397,2.207208,14.8398,2,True,14
8,LightGBMXT_BAG_L2,-50.888068,319.187536,7140.024059,8.060478,48.445462,2,True,13
9,CatBoost_BAG_L2,-51.133146,311.186835,7161.434779,0.059777,69.856183,2,True,16


In [9]:
# Permutation feature importance for the fitted predictor.
# NOTE(review): this passes the same `train_data` the predictor was fitted on.
# AutoGluon's documentation advises against using training data here because the
# scores are biased by overfitting; prefer a held-out labelled set if one exists.
predictor.feature_importance(train_data) 

Computing feature importance via permutation shuffling for 10 features using 5000 rows with 5 shuffle sets...
	537.75s	= Expected runtime (107.55s per shuffle set)
	182.22s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
AIR_TEMPERATURE,70.003117,3.250183,5.56027e-07,5,76.69529,63.310945
year,68.92026,2.87335,3.616946e-07,5,74.836527,63.003992
DIST,38.51209,0.860028,2.982319e-08,5,40.282899,36.741281
GT,16.609616,1.11987,2.464823e-06,5,18.915444,14.303788
DEADWEIGHT,14.880555,0.959635,2.064074e-06,5,16.856457,12.904652
month_sin,13.239382,0.813221,1.699662e-06,5,14.913815,11.564949
SHIP_TYPE_CATEGORY,9.382925,0.610877,2.143841e-06,5,10.640729,8.125121
PORT_SIZE,9.166551,0.950402,1.367062e-05,5,11.123442,7.209659
ARI_PO,8.639412,0.751797,6.811987e-06,5,10.187372,7.091452
ARI_CO,8.555548,0.478783,1.172017e-06,5,9.541369,7.569727


In [10]:
# Ask the predictor which model it considers best, then predict the test set
# with that model named explicitly.
model_to_use = predictor.get_model_best()
model_pred = predictor.predict(test_data, model=model_to_use)

In [11]:
# Replace every negative prediction with 0; all other values pass through unchanged.
is_negative = model_pred < 0
pred_y = np.where(is_negative, 0, model_pred)
pred_y

array([3.4718659e+00, 3.8846161e+01, 2.6964029e+01, ..., 1.8417068e+01,
       5.6248903e-03, 2.2183777e+01], dtype=float32)

In [13]:
# Load the submission template and fill its CI_HOUR column with the clamped predictions.
submit = pd.read_csv('../new_open/sample_submission.csv').assign(CI_HOUR=pred_y)

In [14]:
# Rows whose test DIST equals 0 are forced to a CI_HOUR of 0; every other row
# keeps its prediction. The 0/1 mask is built with a vectorised comparison
# rather than a per-row Python lambda, and is applied directly, so no temporary
# 'DIST' column has to be added to `submit` and dropped again in place.
dist_mask = test['DIST'].ne(0).astype('int64')
submit['CI_HOUR'] = submit['CI_HOUR'] * dist_mask
submit

Unnamed: 0,SAMPLE_ID,CI_HOUR
0,TEST_000000,3.471866
1,TEST_000001,38.846161
2,TEST_000002,26.964029
3,TEST_000003,112.199425
4,TEST_000004,0.000000
...,...,...
220486,TEST_220486,78.987816
220487,TEST_220487,80.308891
220488,TEST_220488,18.417068
220489,TEST_220489,0.000000


In [15]:
# Create the output folder first: to_csv raises if the target directory is
# missing, which would otherwise only show up after the long training run.
os.makedirs('../Sub', exist_ok=True)
submit.to_csv('../Sub/autogluon_new_2.csv', index=False)