# Import 

In [199]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

# Load data

In [200]:
# Read the pre-built train / test feature tables from parquet.
train = pd.read_parquet(path='data/final_train.parquet')
test = pd.read_parquet(path='data/final_test.parquet')

# Encoder

- 삭제 (dropped columns): id, base_date, day_of_week, end_latitude, end_longitude, start_latitude, start_longitude, start_node_name, end_node_name, road_name, day, vacation, month (base_hour is kept as a feature)
  

- One-Hot Encoder: rain_snow, start_turn_restricted, end_turn_restricted (day_of_week is dropped above; week and time are not one-hot encoded in the code below)

In [201]:
# Columns removed from both train and test before modelling.
del_col = [
    'id',
    'base_date',
    'day_of_week',
    'end_latitude',
    'end_longitude',
    'start_latitude',
    'start_longitude',
    'start_node_name',
    'end_node_name',
    'road_name',
    'day',
    'vacation',
    'month',
]

# String-valued categorical columns that get one-hot encoded.
cat_col = [
    'rain_snow',
    'start_turn_restricted',
    'end_turn_restricted',
]

In [202]:
# Remove the columns listed in del_col; the loaded frames themselves are left untouched.
train2 = train.drop(del_col, axis='columns')
test2 = test.drop(del_col, axis='columns')

In [203]:
# Preview the training frame after the del_col columns were dropped.
train2

Unnamed: 0,base_hour,lane_count,road_rating,multi_linked,connect_code,maximum_speed_limit,weight_restricted,road_type,start_turn_restricted,end_turn_restricted,target,rain_snow,week,year,distance,center_start,center_end
0,17,1,106,0,0,60.0,32400.0,3,없음,없음,52.0,눈비,0.0,2022,0.025694,0,0
1,21,2,103,0,0,60.0,0.0,0,있음,없음,30.0,눈비,0.0,2022,0.525560,1,1
2,7,2,103,0,0,80.0,0.0,0,없음,없음,61.0,눈비,2.0,2021,0.608016,0,0
3,13,2,107,0,0,50.0,0.0,0,없음,없음,20.0,없음,1.0,2022,0.107285,2,2
4,8,2,103,0,0,80.0,0.0,0,없음,없음,38.0,없음,0.0,2021,0.337736,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4701212,16,1,107,0,0,50.0,0.0,0,없음,없음,20.0,없음,0.0,2021,0.426736,0,0
4701213,2,2,107,0,0,80.0,43200.0,3,없음,없음,65.0,눈비,0.0,2022,0.048433,0,0
4701214,22,2,103,0,0,60.0,0.0,0,없음,없음,30.0,눈비,0.0,2022,0.341969,0,0
4701215,2,2,103,0,0,80.0,0.0,0,없음,없음,73.0,없음,0.0,2021,0.209584,0,0


In [204]:
# One-hot encode the columns in cat_col; every other column is dropped by the
# transformer (ColumnTransformer's default remainder) and re-attached later.
# sparse_threshold=0 makes the transformer always return a dense array, so the
# DataFrame construction below does not depend on the density heuristic.
transformer = make_column_transformer((OneHotEncoder(), cat_col), sparse_threshold=0)

# Fit the category vocabulary on train only, then apply the same mapping to test.
train_transformed = transformer.fit_transform(train2)
test_transformed = transformer.transform(test2)

# Reuse the source frames' indexes: the encoded frames are later concatenated
# column-wise with train2 / test2, and pd.concat(axis=1) aligns rows by index
# label. A fresh RangeIndex would misalign rows (and introduce NaNs) whenever
# the source frames do not carry a default 0..n-1 index.
encoded_cols = transformer.get_feature_names_out()
train3 = pd.DataFrame(train_transformed, columns=encoded_cols, index=train2.index)
test3 = pd.DataFrame(test_transformed, columns=encoded_cols, index=test2.index)

In [205]:
# Inspect the first rows of the one-hot encoded columns.
train3.head()

Unnamed: 0,onehotencoder__rain_snow_눈비,onehotencoder__rain_snow_없음,onehotencoder__start_turn_restricted_없음,onehotencoder__start_turn_restricted_있음,onehotencoder__end_turn_restricted_없음,onehotencoder__end_turn_restricted_있음
0,1.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,1.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,1.0,0.0
4,0.0,1.0,1.0,0.0,1.0,0.0


In [206]:
# The raw categorical columns are superseded by their one-hot encodings, so remove them.
# NOTE: this rebinds train2 / test2; re-running the cell raises KeyError because
# the columns are already gone.
train2 = train2.drop(cat_col, axis='columns')
test2 = test2.drop(cat_col, axis='columns')

In [207]:
# Stitch the remaining columns and the one-hot columns together side by side
# (rows are matched on index labels).
train_data = pd.concat((train2, train3), axis='columns')
test_data = pd.concat((test2, test3), axis='columns')

In [208]:
# Preview the assembled training table (remaining columns + one-hot columns).
train_data

Unnamed: 0,base_hour,lane_count,road_rating,multi_linked,connect_code,maximum_speed_limit,weight_restricted,road_type,target,week,year,distance,center_start,center_end,onehotencoder__rain_snow_눈비,onehotencoder__rain_snow_없음,onehotencoder__start_turn_restricted_없음,onehotencoder__start_turn_restricted_있음,onehotencoder__end_turn_restricted_없음,onehotencoder__end_turn_restricted_있음
0,17,1,106,0,0,60.0,32400.0,3,52.0,0.0,2022,0.025694,0,0,1.0,0.0,1.0,0.0,1.0,0.0
1,21,2,103,0,0,60.0,0.0,0,30.0,0.0,2022,0.525560,1,1,1.0,0.0,0.0,1.0,1.0,0.0
2,7,2,103,0,0,80.0,0.0,0,61.0,2.0,2021,0.608016,0,0,1.0,0.0,1.0,0.0,1.0,0.0
3,13,2,107,0,0,50.0,0.0,0,20.0,1.0,2022,0.107285,2,2,0.0,1.0,1.0,0.0,1.0,0.0
4,8,2,103,0,0,80.0,0.0,0,38.0,0.0,2021,0.337736,0,0,0.0,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4701212,16,1,107,0,0,50.0,0.0,0,20.0,0.0,2021,0.426736,0,0,0.0,1.0,1.0,0.0,1.0,0.0
4701213,2,2,107,0,0,80.0,43200.0,3,65.0,0.0,2022,0.048433,0,0,1.0,0.0,1.0,0.0,1.0,0.0
4701214,22,2,103,0,0,60.0,0.0,0,30.0,0.0,2022,0.341969,0,0,1.0,0.0,1.0,0.0,1.0,0.0
4701215,2,2,103,0,0,80.0,0.0,0,73.0,0.0,2021,0.209584,0,0,0.0,1.0,1.0,0.0,1.0,0.0


# Model

In [209]:
# Separate the feature matrix from the regression target column.
X_train = train_data.drop('target', axis='columns')
y_train = train_data.loc[:, 'target']

In [210]:
# Hold out 20% of the rows for validation, with a fixed seed for a reproducible split.
# The inputs are taken from train_data rather than from the X_train / y_train
# names this cell overwrites, so re-running the cell always reproduces the same
# split instead of re-splitting an already-split training set.
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop(columns=['target']),
    train_data['target'],
    test_size=0.2,
    random_state=42,
)

In [211]:
# Sanity-check the split sizes: one line per (features, target) pair.
for features, labels in ((X_train, y_train), (X_val, y_val)):
    print(features.shape, labels.shape)

(3760973, 19) (3760973,)
(940244, 19) (940244,)


In [212]:
# LightGBM regressor with library-default hyperparameters, a fixed seed and all cores.
# (The name LR is kept because later cells refer to it.)
LR = lgb.LGBMRegressor(random_state=42, n_jobs=-1)
LR.fit(X_train, y_train)

In [213]:
# Predictions on the held-out validation rows.
pred = LR.predict(X_val)

In [214]:
# Mean absolute error of the validation predictions.
mean_absolute_error(y_true=y_val, y_pred=pred)

5.591138455995262

In [215]:
# Predict on the test set. LightGBM's sklearn wrapper does not check DataFrame
# column names or order by default, so select the training feature columns
# explicitly: a differently ordered test frame would otherwise be scored
# silently with the wrong features, and a missing column now fails loudly
# with a KeyError.
pred2 = LR.predict(test_data[X_train.columns])

In [216]:
# Load the submission template; its 'target' column is filled with predictions afterwards.
sample_submission = pd.read_csv(filepath_or_buffer='data/sample_submission.csv')

In [217]:
# Overwrite the template's target column with the test predictions and
# write the result without the index column.
sample_submission['target'] = pred2
sample_submission.to_csv(
    "submit.csv",
    index=False,
)