In [1]:
import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns; sns.set()

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

#import pyresample
import numpy as np

print(tf.__version__)

2.0.0


In [2]:
train_file_path = 'train.csv'
test_file_path = 'test.csv'

column_names = ['date', 'in_out', 'latitude', 'longitude',
                '6~7_ride', '7~8_ride', '8~9_ride',
                '9~10_ride', '10~11_ride', '11~12_ride',
                '6~7_takeoff', '7~8_takeoff', '8~9_takeoff',
                '9~10_takeoff', '10~11_takeoff', '18~20_ride']
column_names_test = ['date', 'in_out', 'latitude', 'longitude',
                '6~7_ride', '7~8_ride', '8~9_ride',
                '9~10_ride', '10~11_ride', '11~12_ride',
                '6~7_takeoff', '7~8_takeoff', '8~9_takeoff',
                '9~10_takeoff', '10~11_takeoff']

train_raw = pd.read_csv(train_file_path, usecols=column_names,
                        na_values='?', skipinitialspace=True)
test_raw = pd.read_csv(test_file_path, usecols=column_names_test,
                       na_values='?', skipinitialspace=True)

In [3]:
train_raw.head()

Unnamed: 0,date,in_out,latitude,longitude,6~7_ride,7~8_ride,8~9_ride,9~10_ride,10~11_ride,11~12_ride,6~7_takeoff,7~8_takeoff,8~9_takeoff,9~10_takeoff,10~11_takeoff,18~20_ride
0,2019-09-01,시외,33.4899,126.49373,0.0,1.0,2.0,5.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019-09-01,시외,33.48944,126.48508,1.0,4.0,4.0,2.0,5.0,6.0,0.0,0.0,0.0,0.0,0.0,5.0
2,2019-09-01,시외,33.48181,126.47352,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,2019-09-01,시내,33.50577,126.49252,0.0,17.0,6.0,26.0,14.0,16.0,0.0,0.0,0.0,0.0,0.0,53.0
4,2019-09-01,시내,33.25579,126.4126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [4]:
LON = np.linspace(train_raw['longitude'].min(), train_raw['longitude'].max(), 100)
LAT = np.linspace(train_raw['latitude'].min(), train_raw['latitude'].max(), 100)

# 제일 거리가 가까운 grid 인덱스 찾기
def geo_idx(dd, dd_array):
    geo_idx = (np.abs(dd_array - dd)).argmin()
    return geo_idx

def make_dataset(df):
    dataset = df.copy()
    dataset['date'] = pd.to_datetime(dataset['date'])
    dataset['weekday'] = dataset['date'].dt.weekday
    
    # 주말 데이터
    dataset['weekend'] = (dataset['weekday'] >= 5).astype(float)
    
    # 휴일 데이터
    holidays = pd.to_datetime(['2019-09-12', '2019-09-13', '2019-09-14', 
                           '2019-10-03', '2019-10-09'])
    dataset['holidays'] = dataset['date'].isin(holidays)*1.0
    
    # 요일 one hot encoding
    dataset['mon'] = (dataset['weekday'] == 0)*1.0
    dataset['tue'] = (dataset['weekday'] == 1)*1.0
    dataset['wed'] = (dataset['weekday'] == 2)*1.0
    dataset['thu'] = (dataset['weekday'] == 3)*1.0
    dataset['fri'] = (dataset['weekday'] == 4)*1.0
    dataset['sat'] = (dataset['weekday'] == 5)*1.0
    dataset['sun'] = (dataset['weekday'] == 6)*1.0
    
    # 필요 없는 칼럼 지우기
    dataset.drop(['date', 'weekday'], 1, inplace=True)
    
    in_out = dataset.pop('in_out')
    dataset['in'] = (in_out == '시내')*1.0
    dataset['out'] = (in_out == '시외')*1.0
    
    dataset['grid_lon'] = dataset['longitude'].map(lambda x: geo_idx(x, LON))
    dataset['grid_lat'] = dataset['latitude'].map(lambda x: geo_idx(x, LAT))
    
    dataset = pd.get_dummies(dataset, columns=['grid_lon', 'grid_lat'])
    dataset.drop(['latitude', 'longitude'], 1, inplace=True)
    
    return dataset

In [5]:
train_dataset = make_dataset(train_raw)
train_labels = train_dataset.pop('18~20_ride')
test_dataset = make_dataset(test_raw)
train_dataset.tail()

Unnamed: 0,6~7_ride,7~8_ride,8~9_ride,9~10_ride,10~11_ride,11~12_ride,6~7_takeoff,7~8_takeoff,8~9_takeoff,9~10_takeoff,...,grid_lat_41,grid_lat_42,grid_lat_43,grid_lat_44,grid_lat_45,grid_lat_46,grid_lat_96,grid_lat_97,grid_lat_98,grid_lat_99
415418,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
415419,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
415420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
415421,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
415422,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
set(train_dataset.columns) - set(test_dataset.columns)

set()

In [7]:
test_dataset['grid_lat_47'] = 0

In [8]:
test_dataset.tail()

Unnamed: 0,6~7_ride,7~8_ride,8~9_ride,9~10_ride,10~11_ride,11~12_ride,6~7_takeoff,7~8_takeoff,8~9_takeoff,9~10_takeoff,...,grid_lat_42,grid_lat_43,grid_lat_44,grid_lat_45,grid_lat_46,grid_lat_96,grid_lat_97,grid_lat_98,grid_lat_99,grid_lat_47
228165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
228166,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
228167,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
228168,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
228169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
numeric_columns = [
    '6~7_ride', '7~8_ride', '8~9_ride',
    '9~10_ride', '10~11_ride', '11~12_ride',
    '6~7_takeoff', '7~8_takeoff', '8~9_takeoff',
    '9~10_takeoff', '10~11_takeoff'
]

categorical_columns = train_dataset.columns.difference(numeric_columns)

train_stats = train_dataset.describe()
train_stats.drop(categorical_columns, 1, inplace=True)
train_stats = train_stats.T
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
6~7_ride,415423.0,0.305893,1.109766,0.0,0.0,0.0,0.0,85.0
7~8_ride,415423.0,0.829699,2.255116,0.0,0.0,0.0,1.0,94.0
8~9_ride,415423.0,0.81535,2.317561,0.0,0.0,0.0,1.0,136.0
9~10_ride,415423.0,0.642475,1.959844,0.0,0.0,0.0,1.0,78.0
10~11_ride,415423.0,0.599618,1.885941,0.0,0.0,0.0,0.0,124.0
11~12_ride,415423.0,0.579393,1.942137,0.0,0.0,0.0,0.0,99.0
6~7_takeoff,415423.0,0.11287,0.597714,0.0,0.0,0.0,0.0,45.0
7~8_takeoff,415423.0,0.34487,1.279179,0.0,0.0,0.0,0.0,66.0
8~9_takeoff,415423.0,0.516481,1.65885,0.0,0.0,0.0,0.0,59.0
9~10_takeoff,415423.0,0.430922,1.485124,0.0,0.0,0.0,0.0,65.0


In [10]:
def norm(x):
    return (x - train_stats['mean']) / train_stats['std']

normed_train_data = norm(train_dataset[numeric_columns])
normed_train_data[categorical_columns] = train_dataset[categorical_columns]
normed_test_data = norm(test_dataset[numeric_columns])
normed_test_data[categorical_columns] = test_dataset[categorical_columns]

In [11]:
normed_train_data= normed_train_data.apply(lambda x: np.log(x+1))
normed_test_data= normed_test_data.apply(lambda x: np.log(x+1))

In [12]:
normed_train_data.tail()

Unnamed: 0,6~7_ride,7~8_ride,8~9_ride,9~10_ride,10~11_ride,11~12_ride,6~7_takeoff,7~8_takeoff,8~9_takeoff,9~10_takeoff,...,holidays,in,mon,out,sat,sun,thu,tue,wed,weekend
415418,1.465273,-0.458737,-0.433577,-0.397229,-0.382639,-0.354288,-0.209286,-0.314167,-0.37302,-0.342715,...,0.0,0.693147,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0
415419,1.465273,-0.458737,-0.433577,-0.397229,-0.382639,-0.354288,-0.209286,-0.314167,-0.37302,-0.342715,...,0.0,0.693147,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0
415420,-0.322463,-0.458737,-0.433577,-0.397229,-0.382639,-0.354288,-0.209286,0.413532,-0.37302,-0.342715,...,0.0,0.693147,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0
415421,0.485787,-0.458737,-0.433577,-0.397229,-0.382639,-0.354288,-0.209286,-0.314167,-0.37302,-0.342715,...,0.0,0.693147,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0
415422,-0.322463,-0.458737,-0.433577,-0.397229,-0.382639,-0.354288,-0.209286,-0.314167,1.131389,-0.342715,...,0.0,0.693147,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=[len(train_dataset.keys())]),
    layers.Dense(128, activation='relu'),
    layers.Dense(1)
])

model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mae', 'mse']
)

In [14]:
import matplotlib.pyplot as plt

def plot_history(history):
  hist = pd.DataFrame(history.history)
  hist['epoch'] = history.epoch

  plt.figure(figsize=(8,12))

  plt.subplot(2,1,1)
  plt.xlabel('Epoch')
  plt.ylabel('Mean Abs Error [MPG]')
  plt.plot(hist['epoch'], hist['mae'],
           label='Train Error')
  plt.plot(hist['epoch'], hist['val_mae'],
           label = 'Val Error')
  plt.ylim([0,5])
  plt.legend()

  plt.subplot(2,1,2)
  plt.xlabel('Epoch')
  plt.ylabel('Mean Square Error [$MPG^2$]')
  plt.plot(hist['epoch'], hist['mse'],
           label='Train Error')
  plt.plot(hist['epoch'], hist['val_mse'],
           label = 'Val Error')
  plt.ylim([0,20])
  plt.legend()
  plt.show()

In [15]:
# 에포크가 끝날 때마다 점(.)을 출력해 훈련 진행 과정을 표시합니다
class PrintDot(keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs):
    if epoch % 100 == 0: print('')
    print('.', end='')

In [None]:
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=15)

EPOCHS = 1000
history = model.fit(normed_train_data, train_labels, epochs=EPOCHS, batch_size=1000,
                    validation_split=0.2, verbose=2, callbacks=[early_stop])

plot_history(history)

Train on 332338 samples, validate on 83085 samples
Epoch 1/1000
332338/332338 - 4s - loss: 13.5348 - mae: 1.2588 - mse: 13.5348 - val_loss: 8.3748 - val_mae: 1.1289 - val_mse: 8.3748
Epoch 2/1000
332338/332338 - 3s - loss: 10.5574 - mae: 1.1699 - mse: 10.5574 - val_loss: 7.6648 - val_mae: 1.0766 - val_mse: 7.6648
Epoch 3/1000
332338/332338 - 3s - loss: 9.6996 - mae: 1.1433 - mse: 9.6996 - val_loss: 7.7420 - val_mae: 1.0619 - val_mse: 7.7420
Epoch 4/1000
332338/332338 - 3s - loss: 9.3745 - mae: 1.1343 - mse: 9.3745 - val_loss: 7.3490 - val_mae: 1.0608 - val_mse: 7.3490
Epoch 5/1000
332338/332338 - 3s - loss: 9.0417 - mae: 1.1216 - mse: 9.0417 - val_loss: 7.5191 - val_mae: 1.1050 - val_mse: 7.5191
Epoch 6/1000
332338/332338 - 3s - loss: 8.7649 - mae: 1.1167 - mse: 8.7649 - val_loss: 7.1477 - val_mae: 1.0510 - val_mse: 7.1477
Epoch 7/1000
332338/332338 - 3s - loss: 8.6034 - mae: 1.1105 - mse: 8.6034 - val_loss: 6.8007 - val_mae: 0.9895 - val_mse: 6.8007
Epoch 8/1000
332338/332338 - 3s - l

In [None]:
# EPOCHS = 30
# history = model.fit(normed_train_data, train_labels, epochs=EPOCHS, batch_size=1000,
#                     verbose=2, callbacks=[early_stop])

In [None]:
predictions = model.predict(normed_test_data)

In [None]:
submission_path = 'submission_sample.csv'
submission = pd.read_csv(submission_path)
submission['18~20_ride'] = predictions
submission.to_csv("submission_final.csv", index=False)

In [None]:
submission.tail()

In [None]:
import seabron as sns
