# Import 

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
from xgboost  import XGBClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
import catboost as cb

# Load data

In [6]:
# Load the prepared train/test feature tables from parquet.
train, test = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('data/final_train.parquet', 'data/final_test.parquet')
)

# Encoder

- 삭제 : id, base_date, day_of_week, end_latitude, end_longitude, start_latitude, start_longitude, start_node_name, end_node_name, road_name, day, month
  

- One-Hot Encoder : rain_snow, start_turn_restricted, end_turn_restricted, vacation

In [7]:
# Columns removed from both splits before modelling.
del_col = [
    'id',
    'base_date',
    'day_of_week',
    'end_latitude',
    'end_longitude',
    'start_latitude',
    'start_longitude',
    'start_node_name',
    'end_node_name',
    'road_name',
    'day',
    'month',
]

# Categorical columns that are one-hot encoded.
cat_col = [
    'rain_snow',
    'start_turn_restricted',
    'end_turn_restricted',
    'vacation',
]

In [8]:
# Remove every column listed in del_col from the train and test frames.
train_drop = train.drop(del_col, axis=1)
test_drop = test.drop(del_col, axis=1)

In [9]:
# One-hot encode the categorical columns listed in cat_col.
# sparse_threshold=0 makes the transformer always return a dense array, so the
# DataFrame construction below does not depend on how sparse the encoded
# output happens to be.
transformer = make_column_transformer((OneHotEncoder(), cat_col), sparse_threshold=0)

# Fit the categories on the training split only; the test split reuses them.
train_transformed = transformer.fit_transform(train_drop)
test_transformed = transformer.transform(test_drop)

# Carry over the source row index: the encoded frames are later joined to
# train_drop / test_drop with a column-wise pd.concat, which aligns on the
# index. A fresh RangeIndex would misalign rows (and introduce NaNs) whenever
# the loaded frames do not have a default 0..n-1 index.
encoded_names = transformer.get_feature_names_out()
train_tra = pd.DataFrame(train_transformed, columns=encoded_names, index=train_drop.index)
test_tra = pd.DataFrame(test_transformed, columns=encoded_names, index=test_drop.index)

In [10]:
# Drop the raw categorical columns; their one-hot encoded versions are
# concatenated back in the next cell.
train_drop = train_drop.drop(cat_col, axis=1)
test_drop = test_drop.drop(cat_col, axis=1)

In [11]:
# Place the remaining features and the one-hot columns side by side.
train_data = pd.concat([train_drop, train_tra], axis='columns')
test_data = pd.concat([test_drop, test_tra], axis='columns')

In [12]:
# Preview the first five rows of the assembled training frame.
train_data.head(n=5)

Unnamed: 0,base_hour,lane_count,road_rating,multi_linked,connect_code,maximum_speed_limit,weight_restricted,road_type,target,week,...,center_start,center_end,onehotencoder__rain_snow_눈비,onehotencoder__rain_snow_없음,onehotencoder__start_turn_restricted_없음,onehotencoder__start_turn_restricted_있음,onehotencoder__end_turn_restricted_없음,onehotencoder__end_turn_restricted_있음,onehotencoder__vacation_방학,onehotencoder__vacation_학기
0,17,1,106,0,0,60.0,32400.0,3,52.0,0.0,...,0,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,21,2,103,0,0,60.0,0.0,0,30.0,0.0,...,1,1,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,7,2,103,0,0,80.0,0.0,0,61.0,2.0,...,0,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,13,2,107,0,0,50.0,0.0,0,20.0,1.0,...,2,2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
4,8,2,103,0,0,80.0,0.0,0,38.0,0.0,...,0,0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0


# Model

In [13]:
# Separate the feature matrix from the 'target' column to be predicted.
X_train = train_data.drop('target', axis=1)
y_train = train_data['target']

In [14]:
# Hold out 20% of the rows as a validation set; random_state fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Print the feature-matrix shape next to the target shape for each split.
print(*(part.shape for part in (X_train, y_train)))
print(*(part.shape for part in (X_val, y_val)))

(3760973, 22) (3760973,)
(940244, 22) (940244,)


## LGBMRegressor

In [12]:
# LightGBM regressor: only the seed and thread count are set explicitly.
LR = lgb.LGBMRegressor(
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

In [13]:
# LightGBM predictions on the validation split.
pred = LR.predict(X_val)

In [14]:
# Validation MAE of the LightGBM model.
mean_absolute_error(y_true=y_val, y_pred=pred)

5.227474509459903

In [15]:
# LightGBM predictions on the test frame, used for the submission file.
pred2 = LR.predict(
    test_data
)

In [22]:
# Load the submission template whose 'target' column is filled in below.
sample_submission = pd.read_csv(
    'data/sample_submission.csv'
)

In [17]:
# Write the LightGBM test predictions into the template and save it
# without the row index.
sample_submission['target'] = pred2
sample_submission.to_csv(path_or_buf="submit.csv", index=False)

## XGBoost

In [19]:
# XGBoost regressor: only the seed and thread count are set explicitly.
XGB = xgb.XGBRegressor(
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

In [23]:
# XGBoost predictions on the validation split.
pred_xg = XGB.predict(X_val)

In [24]:
# Validation MAE of the XGBoost model.
mean_absolute_error(y_true=y_val, y_pred=pred_xg)

4.663731015551993

## RandomForestRegressor

In [25]:
# Random forest regressor with a fixed seed.
# n_jobs=-1 builds the trees on all available cores, matching the LightGBM and
# XGBoost cells; the scikit-learn default (n_jobs=None) trains on a single
# core. The seed is unchanged, so this should not alter which trees are grown.
Random_model = RandomForestRegressor(random_state=42, n_jobs=-1).fit(X_train, y_train)

In [26]:
# Random forest predictions on the validation split.
pred_random = Random_model.predict(X_val)

In [27]:
# Validation MAE of the random forest model.
mean_absolute_error(y_true=y_val, y_pred=pred_random)

4.537415594989837

## CatBoost

In [17]:
# CatBoost regressor with a fixed seed; verbose=0 is passed to turn off
# per-iteration logging.
cat = cb.CatBoostRegressor(
    verbose=0,
    random_state=42,
).fit(X_train, y_train)

In [19]:
# CatBoost predictions on the validation split.
pred_cat = cat.predict(X_val)

In [20]:
# Validation MAE of the CatBoost model.
mean_absolute_error(y_true=y_val, y_pred=pred_cat)

4.560542796092552

## 모델 결과 (검증 세트 MAE, 낮을수록 좋음)
- `LGBMRegressor` -> 5.227474509459903
- `XGBoost` -> 4.663731015551993
- `RandomForestRegressor` -> 4.537415594989837
- `CatBoost` -> 4.560542796092552  
  
`RandomForestRegressor`와 `CatBoost`의 결과가 가장 좋게 나왔다.   
RandomForestRegressor보다 CatBoost의 학습이 더 빠르므로, 다음 단계로 `CatBoost Hyperparameter tuning`을 진행할 계획이다.

In [21]:
# CatBoost predictions on the test frame. This rebinds pred_cat, which
# previously held the validation predictions.
pred_cat = cat.predict(
    test_data
)

In [23]:
# Write the CatBoost test predictions into the template and save it
# without the row index.
sample_submission['target'] = pred_cat
sample_submission.to_csv(path_or_buf="submit_cat.csv", index=False)