# Import 

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

# Load data

In [2]:
# Read the train and test parquet files into DataFrames.
train = pd.read_parquet(path='data/final_train.parquet')
test = pd.read_parquet(path='data/final_test.parquet')

# Encoder

- 삭제 : id, base_date, day_of_week, base_hour, end_latitude, end_longitude, start_latitude, start_longitude, start_node_name, end_node_name, road_name
  

- One-Hot Encoder : rain_snow, start_turn_restricted, end_turn_restricted, week, time

In [3]:
# Columns removed from both frames before modelling.
del_col = [
    'id',
    'base_date',
    'day_of_week',
    'base_hour',
    'end_latitude',
    'end_longitude',
    'start_latitude',
    'start_longitude',
    'start_node_name',
    'end_node_name',
    'road_name',
]

# Categorical columns that are one-hot encoded further down.
cat_col = [
    'rain_snow',
    'start_turn_restricted',
    'end_turn_restricted',
    'week',
    'time',
]

In [4]:
# Drop the columns listed in del_col; `train` and `test` themselves are left
# unchanged because drop() returns a new frame.
train2 = train.drop(labels=del_col, axis='columns')
test2 = test.drop(labels=del_col, axis='columns')

In [5]:
# Display train2, i.e. the training frame after the del_col columns were dropped.
train2

Unnamed: 0,lane_count,road_rating,multi_linked,connect_code,maximum_speed_limit,weight_restricted,road_type,start_turn_restricted,end_turn_restricted,target,rain_snow,week,year,month,day,time,center_start,center_end
0,1,106,0,0,60.0,32400.0,3,없음,없음,52.0,눈비,평일,2022,6,23,저녁,0,0
1,2,103,0,0,60.0,0.0,0,있음,없음,30.0,눈비,평일,2022,7,28,저녁,1,1
2,2,103,0,0,80.0,0.0,0,없음,없음,61.0,눈비,휴일,2021,10,10,아침,0,0
3,2,107,0,0,50.0,0.0,0,없음,없음,20.0,없음,평일,2022,3,11,점심,2,2
4,2,103,0,0,80.0,0.0,0,없음,없음,38.0,없음,평일,2021,10,5,아침,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4701212,1,107,0,0,50.0,0.0,0,없음,없음,20.0,없음,평일,2021,11,4,저녁,0,0
4701213,2,107,0,0,80.0,43200.0,3,없음,없음,65.0,눈비,평일,2022,3,31,밤,0,0
4701214,2,103,0,0,60.0,0.0,0,없음,없음,30.0,눈비,평일,2022,6,13,밤,0,0
4701215,2,103,0,0,80.0,0.0,0,없음,없음,73.0,없음,평일,2021,10,20,밤,0,0


In [6]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# frame only and then applied unchanged to the test frame.
# sparse_threshold=0 makes the transformer always return a dense array, so the
# pd.DataFrame(...) wrapping below does not depend on how sparse the encoded
# block happens to be.
transformer = make_column_transformer((OneHotEncoder(), cat_col), sparse_threshold=0)

train_transformed = transformer.fit_transform(train2)
test_transformed = transformer.transform(test2)

# Only the encoded columns come back: no remainder is configured, so every
# column outside cat_col is dropped by the transformer.
encoded_cols = transformer.get_feature_names_out()

# Carry over the source frames' indexes. These frames are later combined with
# train2 / test2 via pd.concat(axis=1), which aligns rows on the index; a fresh
# RangeIndex here would misalign rows (and introduce NaNs) whenever the source
# frame's index is anything other than 0..n-1.
train3 = pd.DataFrame(train_transformed, columns=encoded_cols, index=train2.index)
test3 = pd.DataFrame(test_transformed, columns=encoded_cols, index=test2.index)

In [7]:
# Preview the first rows of the one-hot encoded training columns.
train3.head()

Unnamed: 0,onehotencoder__rain_snow_눈비,onehotencoder__rain_snow_없음,onehotencoder__start_turn_restricted_없음,onehotencoder__start_turn_restricted_있음,onehotencoder__end_turn_restricted_없음,onehotencoder__end_turn_restricted_있음,onehotencoder__week_평일,onehotencoder__week_휴일,onehotencoder__time_밤,onehotencoder__time_아침,onehotencoder__time_저녁,onehotencoder__time_점심
0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [8]:
# Strip the raw categorical columns from the working frames; their one-hot
# encoded counterparts are held in train3 / test3.
train2 = train2.drop(cat_col, axis=1)
test2 = test2.drop(cat_col, axis=1)

In [9]:
# Place the one-hot columns next to the remaining features (column-wise concat,
# rows matched on the index).
train_data = pd.concat((train2, train3), axis='columns')
test_data = pd.concat((test2, test3), axis='columns')

In [10]:
# Display the combined training frame (remaining features + one-hot columns).
train_data

Unnamed: 0,lane_count,road_rating,multi_linked,connect_code,maximum_speed_limit,weight_restricted,road_type,target,year,month,...,onehotencoder__start_turn_restricted_없음,onehotencoder__start_turn_restricted_있음,onehotencoder__end_turn_restricted_없음,onehotencoder__end_turn_restricted_있음,onehotencoder__week_평일,onehotencoder__week_휴일,onehotencoder__time_밤,onehotencoder__time_아침,onehotencoder__time_저녁,onehotencoder__time_점심
0,1,106,0,0,60.0,32400.0,3,52.0,2022,6,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,2,103,0,0,60.0,0.0,0,30.0,2022,7,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,2,103,0,0,80.0,0.0,0,61.0,2021,10,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,2,107,0,0,50.0,0.0,0,20.0,2022,3,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,2,103,0,0,80.0,0.0,0,38.0,2021,10,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4701212,1,107,0,0,50.0,0.0,0,20.0,2021,11,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4701213,2,107,0,0,80.0,43200.0,3,65.0,2022,3,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4701214,2,103,0,0,60.0,0.0,0,30.0,2022,6,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4701215,2,103,0,0,80.0,0.0,0,73.0,2021,10,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


# Model

In [11]:
# Separate the model inputs from the label: every column except 'target' is a
# feature, and 'target' itself is the value to predict.
X_train = train_data.drop('target', axis=1)
y_train = train_data['target']

In [12]:
# Hold out 20% of the rows as a validation set, with a fixed seed so the
# partition is reproducible.
# The split is taken from train_data rather than from X_train / y_train: this
# cell overwrites those two names, so feeding them back in would re-split an
# already-split training set every time the cell is re-run. Sourcing from
# train_data keeps the cell idempotent and yields the same result on a fresh run.
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop(columns=['target']),
    train_data['target'],
    test_size=0.2,
    random_state=42,
)

In [13]:
# Print the feature and label shapes of the training part, then of the
# validation part.
for features, labels in ((X_train, y_train), (X_val, y_val)):
    print(features.shape, labels.shape)

(3760973, 24) (3760973,)
(940244, 24) (940244,)


In [14]:
# LightGBM regressor; only the seed and the thread count (-1) are set
# explicitly, everything else is left at the library defaults.
# fit() returns the fitted estimator, which is what LR is bound to.
regressor = lgb.LGBMRegressor(n_jobs=-1, random_state=42)
LR = regressor.fit(X=X_train, y=y_train)

In [15]:
# Predict the target for the held-out validation rows.
pred = LR.predict(X_val)

In [16]:
# Mean absolute error of the validation predictions (displayed as cell output).
mean_absolute_error(y_true=y_val, y_pred=pred)

7.202347649650799

In [17]:
# Predict on the test set.
# Columns are selected by name in the exact order used for training. The test
# frame is built through a separate drop/concat path, and the model is not
# guaranteed to realign columns by name (NOTE(review): confirm for the
# installed LightGBM version). Selecting explicitly avoids silently feeding
# features in the wrong positions, and raises a KeyError if a training column
# is missing from the test data.
pred2 = LR.predict(test_data[X_train.columns])

In [18]:
# Load the submission template that the predictions are written into.
sample_submission = pd.read_csv(filepath_or_buffer='data/sample_submission.csv')

In [19]:
# Put the test predictions into the template's 'target' column and save the
# result as CSV without the index column.
sample_submission['target'] = pred2
sample_submission.to_csv(path_or_buf="submit.csv", index=False)