In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib.font_manager as fm
import seaborn as sns
import warnings
from scipy.special import boxcox1p, inv_boxcox1p

pd.options.display.float_format = '{:.5f}'.format
fontpath = 'C:/Users/TaeSoo/AppData/Local/Microsoft/Windows/Fonts/NanumGothic.ttf'
%matplotlib inline
warnings.filterwarnings(action='ignore')

plt.rcParams['figure.dpi'] = 140
plt.rcParams['font.family'] = 'NanumGothic'

In [2]:
train = pd.read_csv('../원본데이터/train.csv')
test = pd.read_csv('../원본데이터/test.csv')

In [3]:
train = train.drop(['id', 'vehicle_restricted', 'height_restricted'], axis = 1)
test = test.drop(['id', 'vehicle_restricted', 'height_restricted'], axis = 1)

In [4]:
train_df = train.copy()
test_df = test.copy()

### 전처리

road_name에서 '-'로 표기된 null값들이 있음

In [5]:
dataset = pd.concat([train_df, test_df], axis = 0)

In [6]:
road_name_df = pd.read_csv('../preprocessing/road_naming.csv', index_col = 0)

In [7]:
dataset = pd.merge(dataset, road_name_df, how = 'left', on = ['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude'])

In [8]:
dataset['pre_road_name'] = dataset['pre_road_name'].fillna(dataset['road_name'])
dataset['road_name'] = dataset['pre_road_name']

In [9]:
train_df = dataset[:train.shape[0]].drop(['pre_road_name'], axis = 1)
test_df = dataset[train.shape[0]:].drop(['target', 'pre_road_name'], axis = 1)

### 파생변수 생성

1. 휴일 변수 (외부)
2. 년 / 분기 / 일별 변수 (내부)
3. 계절 변수 (내부)
4. 방학 변수 (내부)
5. 관광객 수 변수 (외부)
6. 날씨 변수 (외부)

7. 도로 길이 변수 (내부)
8. 제주공항과의 거리 변수 (내부)
9. 권역별 정보 변수 (geo api)

In [10]:
holiday_df = pd.read_csv('../외부데이터/holiday.csv')
tourist_df = pd.read_csv('../외부데이터/tourist.csv', index_col = 0)
weather_df = pd.read_csv('../외부데이터/weather_last3.csv', index_col = 0)

In [11]:
dataset = pd.concat([train_df, test_df], axis = 0)

##### 1. 휴일 변수

In [12]:
holiday_df.columns = ['base_date', 'holiday']
dataset = pd.merge(dataset, holiday_df, how = 'left')
dataset['holiday'] = dataset['holiday'].fillna(0)
dataset['holiday'] = dataset['holiday'].apply(lambda x : 1 if x != 0 else 0)
dataset['weekend'] = dataset['day_of_week'].apply(lambda x : 1 if (x == '토' or x == '일') else 0)
dataset['day_off'] = dataset['holiday'] + dataset['weekend']
dataset['day_off'] = dataset['day_off'].apply(lambda x : 0 if x >= 1 else 1)

In [13]:
dataset['diff_day_off'] = 3
day_off_index = dataset[dataset['day_off'] == 0].index
pre1_day_off_index = day_off_index - 1
pre2_day_off_index = day_off_index - 2
post1_day_off_index = day_off_index + 1
post2_day_off_index = day_off_index + 2

In [14]:
dataset.loc[pre2_day_off_index, 'diff_day_off'] = 2
dataset.loc[post2_day_off_index, 'diff_day_off'] = 2
dataset.loc[pre1_day_off_index, 'diff_day_off'] = 1
dataset.loc[post1_day_off_index, 'diff_day_off'] = 1
dataset.loc[day_off_index, 'diff_day_off'] = 0

##### 2. 년 / 분기 / 달 / 날짜별 변수

In [15]:
dataset['base_date'] = dataset['base_date'].astype('str')
dataset['base_date'] = pd.to_datetime(dataset['base_date'])

dataset['year'] = dataset['base_date'].dt.year
dataset['quarter'] = dataset['base_date'].dt.quarter
dataset['month'] = dataset['base_date'].dt.month
dataset['day'] = dataset['base_date'].dt.day

##### 3. 계절 변수

In [16]:
dataset['season'] = dataset['month'].apply(lambda x : 1 if (x == 3 or x == 4 or x == 5) else 2 \
    if (x == 6 or x == 8) else 3 if (x == 9 or x == 10 or x == 11) else 4 if (x == 7) else 5)

##### 4. 방학 변수

In [17]:
dataset['vacation'] = dataset['month'].apply(lambda x : 1 if (x == 7 or x == 8 or x == 1 or x == 2) else 0)

##### 5. 관광객 수 변수

In [18]:
tourist_df['month'] = tourist_df['base_date'].astype('str').str[4:].astype('int')
dataset = pd.merge(dataset, tourist_df, how = 'left', on = 'month')
dataset = dataset.drop(['base_date_y'], axis = 1)

##### 6. 날씨 변수

In [19]:
dataset = pd.merge(dataset, weather_df, how = 'left', on = ['month', 'day'])

##### 7. 도로의 길이

In [20]:
def cal_dist(x1, y1, x2, y2):
    distance = ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5
    return distance

In [21]:
dataset['road_dist'] = cal_dist(dataset['start_latitude'], dataset['start_longitude'], dataset['end_latitude'], dataset['end_longitude'])

##### 8. 제주공항과의 거리

제주공항 gps : 33.5101562, 126.4861157

In [22]:
dataset['airport_dist'] = cal_dist((dataset['start_latitude'] + dataset['end_latitude'])/2, (dataset['start_longitude'] + dataset['end_longitude'])/2, 33.5101562, 126.4861157)

##### 9. 권역별 정보 추가

In [23]:
suburb_name_df = pd.read_csv('../외부데이터/suburb_naming.csv', index_col = 0)

In [24]:
dataset = pd.merge(dataset, suburb_name_df, how = 'left', on = ['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude'])

In [25]:
train_df = dataset[:train.shape[0]]
test_df = dataset[train.shape[0]:].drop(['target'], axis = 1)

In [28]:
train_df.to_csv('pre_train.csv', encoding = 'utf-8-sig')
test_df.to_csv('pre_test.csv', encoding = 'utf-8-sig')

In [26]:
y_train = train_df['target']
X_train = train_df.drop(['target'], axis = 1)

In [27]:
from supervised.automl import AutoML

In [28]:
automl = AutoML(algorithms = ['LightGBM', 'Xgboost', 'CatBoost'], mode = 'Compete', total_time_limit = 5*3600, eval_metric = 'mae')

In [29]:
automl.fit(X_train, y_train)

AutoML directory: AutoML_3
The task is regression with evaluation metric mae
AutoML will use algorithms: ['LightGBM', 'Xgboost', 'CatBoost']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'mix_encoding', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree mae 9.243501 trained in 114.6 seconds
Disable stacking for split validation
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
2_Default_LightGBM mae 3.012099 trained in 2049.96 seconds
3_Default_Xgboost mae 3.027972 trained in 3453.96 seconds
* Step not_so_random will try to check up to 27 models
13_LightGBM mae 3.209315 trained in 1750.01 seconds
4_Xgboost

In [30]:
automl.report()

Best model,name,model_type,metric_type,metric_value,train_time
,1_DecisionTree,Decision Tree,mae,9.2435,116.16
,2_Default_LightGBM,LightGBM,mae,3.0121,2051.65
,3_Default_Xgboost,Xgboost,mae,3.02797,3455.41
,13_LightGBM,LightGBM,mae,3.20931,1751.7
,4_Xgboost,Xgboost,mae,2.93115,4619.02
,13_LightGBM_GoldenFeatures,LightGBM,mae,3.17236,1993.11
,15_LightGBM,LightGBM,mae,3.01224,2324.14
,23_DecisionTree,Decision Tree,mae,9.2435,123.94
the best,Ensemble,Ensemble,mae,2.92144,1.66

Metric,Score
MAE,3.20931
MSE,20.8201
RMSE,4.5629
R2,0.91807
MAPE,0.0994031

Metric,Score
MAE,3.17236
MSE,20.422
RMSE,4.51907
R2,0.919636
MAPE,0.0984268

Metric,Score
MAE,3.01224
MSE,18.6943
RMSE,4.32369
R2,0.926435
MAPE,0.0927278

Metric,Score
MAE,9.2435
MSE,136.995
RMSE,11.7045
R2,0.460903
MAPE,0.298544

Metric,Score
MAE,9.2435
MSE,136.995
RMSE,11.7045
R2,0.460903
MAPE,0.298544

Metric,Score
MAE,3.0121
MSE,18.701
RMSE,4.32446
R2,0.926409
MAPE,0.0927638

Metric,Score
MAE,3.02797
MSE,19.0516
RMSE,4.36481
R2,0.925029
MAPE,0.0931593

Metric,Score
MAE,2.93115
MSE,18.016
RMSE,4.24453
R2,0.929104
MAPE,0.0896268

Model,Weight
15_LightGBM,1
2_Default_LightGBM,1
4_Xgboost,6

Metric,Score
MAE,2.92144
MSE,17.8587
RMSE,4.22595
R2,0.929724
MAPE,0.089612


In [31]:
pred = automl.predict(test_df)

In [34]:
sample = pd.read_csv('../원본데이터/sample_submission.csv')

In [35]:
sub = pd.DataFrame()
sub['id'] = sample['id']
sub['target'] = pred
sub.to_csv('submission_1.csv',index = False)