# Imports

In [18]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

# Load data

In [2]:
# Read the train and test tables from their parquet files.
train = pd.read_parquet(path='data/final_train.parquet')
test = pd.read_parquet(path='data/final_test.parquet')

# Encoder

- id, end_latitude, end_longitude, start_latitude, start_longitude, start_node_name, end_node_name, road_name 삭제  
  

- rain_snow, start_turn_restricted, end_turn_restricted, day_of_week  : One-Hot Encoder  

In [3]:
# Columns removed from both frames before modelling.
del_col = [
    'id',
    'end_latitude',
    'end_longitude',
    'start_latitude',
    'start_longitude',
    'start_node_name',
    'end_node_name',
    'road_name',
]

# Categorical columns that are one-hot encoded.
cat_col = [
    'rain_snow',
    'start_turn_restricted',
    'end_turn_restricted',
    'day_of_week',
]

In [4]:
# Drop the columns listed in del_col from both the train and the test frame.
train2, test2 = (frame.drop(columns=del_col) for frame in (train, test))

In [5]:
# One-hot encode the categorical columns listed in cat_col.
#
# sparse_threshold=0 forces a dense array. With the default threshold (0.3) the
# output type depends on the data density, and a sparse result cannot be wrapped
# by the pd.DataFrame(...) calls below.
transformer = make_column_transformer(
    (OneHotEncoder(), cat_col),
    sparse_threshold=0,
)

# Fit on the training categories only, then apply the same mapping to test.
# The encoder reads from the loaded frames (train/test) rather than train2/test2
# so this cell still works after the categorical columns have been dropped from
# train2/test2 further down.
train_transformed = transformer.fit_transform(train[cat_col])
test_transformed = transformer.transform(test[cat_col])

# Reuse the source index so a later column-wise concat lines up row for row,
# even if the loaded frames do not carry a default RangeIndex.
encoded_names = transformer.get_feature_names_out()
train3 = pd.DataFrame(train_transformed, columns=encoded_names, index=train.index)
test3 = pd.DataFrame(test_transformed, columns=encoded_names, index=test.index)

In [6]:
# Preview the first five rows of the one-hot encoded training columns.
train3.head(n=5)

Unnamed: 0,onehotencoder__rain_snow_눈비,onehotencoder__rain_snow_없음,onehotencoder__start_turn_restricted_없음,onehotencoder__start_turn_restricted_있음,onehotencoder__end_turn_restricted_없음,onehotencoder__end_turn_restricted_있음,onehotencoder__day_of_week_금,onehotencoder__day_of_week_목,onehotencoder__day_of_week_수,onehotencoder__day_of_week_월,onehotencoder__day_of_week_일,onehotencoder__day_of_week_토,onehotencoder__day_of_week_화
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
# Keep only the columns that are neither dropped (del_col) nor one-hot encoded
# (cat_col). The frames are rebuilt from the loaded tables instead of dropping
# from train2/test2 themselves, so re-running this cell does not raise a
# KeyError for columns that were already removed.
train2 = train.drop(columns=del_col + cat_col)
test2 = test.drop(columns=del_col + cat_col)

In [8]:
# Place the remaining original columns and the one-hot columns side by side.
train_data = pd.concat(objs=[train2, train3], axis='columns')
test_data = pd.concat(objs=[test2, test3], axis='columns')

In [9]:
# Display the combined training frame (the notebook shows a truncated view).
train_data

Unnamed: 0,base_hour,lane_count,road_rating,multi_linked,connect_code,maximum_speed_limit,weight_restricted,road_type,target,year,...,onehotencoder__start_turn_restricted_있음,onehotencoder__end_turn_restricted_없음,onehotencoder__end_turn_restricted_있음,onehotencoder__day_of_week_금,onehotencoder__day_of_week_목,onehotencoder__day_of_week_수,onehotencoder__day_of_week_월,onehotencoder__day_of_week_일,onehotencoder__day_of_week_토,onehotencoder__day_of_week_화
0,17,1,106,0,0,60.0,32400.0,3,52.0,2022,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,21,2,103,0,0,60.0,0.0,0,30.0,2022,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,7,2,103,0,0,80.0,0.0,0,61.0,2021,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,13,2,107,0,0,50.0,0.0,0,20.0,2022,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,2,103,0,0,80.0,0.0,0,38.0,2021,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4701212,16,1,107,0,0,50.0,0.0,0,20.0,2021,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4701213,2,2,107,0,0,80.0,43200.0,3,65.0,2022,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4701214,22,2,103,0,0,60.0,0.0,0,30.0,2022,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4701215,2,2,103,0,0,80.0,0.0,0,73.0,2021,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Model

In [10]:
# Separate the feature columns from the 'target' label column.
X_train = train_data.drop(columns='target')
y_train = train_data['target']

In [11]:
# Hold out 20% of the rows as a validation set (fixed seed for repeatability).
# The split is taken from train_data rather than from X_train/y_train, because
# this cell overwrites X_train/y_train: splitting from them would shrink the
# training set again every time the cell is re-run.
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop(columns=['target']),
    train_data['target'],
    test_size=0.2,
    random_state=42,
)

In [12]:
# Print the (features, labels) shapes of the training and validation splits.
for features, labels in ((X_train, y_train), (X_val, y_val)):
    print(features.shape, labels.shape)

(3760973, 24) (3760973,)
(940244, 24) (940244,)


In [14]:
# Fit a LightGBM regressor on the training split.
# Note: despite its name, LR holds an LGBMRegressor.
LR = lgb.LGBMRegressor(
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

In [16]:
# Predict the target for the held-out validation rows.
pred = LR.predict(X_val)

In [19]:
# Mean absolute error of the validation predictions.
mean_absolute_error(y_true=y_val, y_pred=pred)

7.541587989460597

In [20]:
# Predict on the test set.
# Select the columns by name in the order the model was trained on, so that a
# missing column raises a KeyError and a different column order in test_data
# cannot silently feed features into the wrong positions.
pred2 = LR.predict(test_data[X_train.columns])

In [21]:
# Load the submission template.
sample_submission = pd.read_csv(filepath_or_buffer='data/sample_submission.csv')

In [22]:
# Store the test predictions in the template's 'target' column and write the
# result to submit.csv without the index column.
sample_submission['target'] = pred2
sample_submission.to_csv(path_or_buf="submit.csv", index=False)