# New York City Taxi Trip Duration 06 - DNN with TensorFlow


###   
### 목표 : 추가로 수행한 EDA를 통해서 나온 새로운 변수들을 추가해 예측모델의 성능을 높인다.
###   
###   

### ● TensorFlow를 이용해 심층신경망(DNN)을 활용한 모델을 만든다.
###   
###   

## 1.  데이터 전처리

In [1]:
import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians 
import time
start = time.time()
print("시작")

train = pd.read_csv("train.csv", parse_dates = ['pickup_datetime', 'dropoff_datetime'])
test = pd.read_csv("test.csv", parse_dates = ['pickup_datetime'])
weather = pd.read_csv("weather.csv", parse_dates = ['datetime'])  # 데이터 전처리한 파일
submission = pd.read_csv("sample_submission.csv")

# 날짜데이터 분할

train['pickup_year'] = train['pickup_datetime'].dt.year  
train['pickup_month'] = train['pickup_datetime'].dt.month
train['pickup_day'] = train['pickup_datetime'].dt.day
train['pickup_hour'] = train['pickup_datetime'].dt.hour
train['pickup_minute'] = train['pickup_datetime'].dt.minute
train['pickup_second'] = train['pickup_datetime'].dt.second
train["pickup_dayofweek"] = train["pickup_datetime"].dt.dayofweek 

test['pickup_year'] = test['pickup_datetime'].dt.year  
test['pickup_month'] = test['pickup_datetime'].dt.month
test['pickup_day'] = test['pickup_datetime'].dt.day
test['pickup_hour'] = test['pickup_datetime'].dt.hour
test['pickup_minute'] = test['pickup_datetime'].dt.minute
test['pickup_second'] = test['pickup_datetime'].dt.second
test["pickup_dayofweek"] = test["pickup_datetime"].dt.dayofweek 

# 원핫인코딩

train['store_and_fwd_flag_Y'] = train['store_and_fwd_flag'] == 'Y'
train['store_and_fwd_flag_N'] = train['store_and_fwd_flag'] == 'N'

test['store_and_fwd_flag_Y'] = test['store_and_fwd_flag'] == 'Y'
test['store_and_fwd_flag_N'] = test['store_and_fwd_flag'] == 'N'


train['vendor_1'] = train['vendor_id'] == 1
train['vendor_2'] = train['vendor_id'] == 2

test['vendor_1'] = test['vendor_id'] == 1
test['vendor_2'] = test['vendor_id'] == 2


train['passenger_0'] = train['passenger_count'] == 0
train['passenger_1'] = train['passenger_count'] == 1
train['passenger_2'] = train['passenger_count'] == 2
train['passenger_3'] = train['passenger_count'] == 3
train['passenger_4'] = train['passenger_count'] == 4
train['passenger_5'] = train['passenger_count'] == 5
train['passenger_6'] = train['passenger_count'] == 6
train['passenger_7'] = train['passenger_count'] == 7
train['passenger_8'] = train['passenger_count'] == 8
train['passenger_9'] = train['passenger_count'] == 9

test['passenger_0'] = test['passenger_count'] == 0
test['passenger_1'] = test['passenger_count'] == 1
test['passenger_2'] = test['passenger_count'] == 2
test['passenger_3'] = test['passenger_count'] == 3
test['passenger_4'] = test['passenger_count'] == 4
test['passenger_5'] = test['passenger_count'] == 5
test['passenger_6'] = test['passenger_count'] == 6
test['passenger_7'] = test['passenger_count'] == 7
test['passenger_8'] = test['passenger_count'] == 8
test['passenger_9'] = test['passenger_count'] == 9

# 요일 원핫인코딩 추가

train["mon"] = train["pickup_dayofweek"] == 0
train["tue"] = train["pickup_dayofweek"] == 1
train["wed"] = train["pickup_dayofweek"] == 2
train["thu"] = train["pickup_dayofweek"] == 3
train["fri"] = train["pickup_dayofweek"] == 4
train["sat"] = train["pickup_dayofweek"] == 5
train["sun"] = train["pickup_dayofweek"] == 6

test["mon"] = test["pickup_dayofweek"] == 0
test["tue"] = test["pickup_dayofweek"] == 1
test["wed"] = test["pickup_dayofweek"] == 2
test["thu"] = test["pickup_dayofweek"] == 3
test["fri"] = test["pickup_dayofweek"] == 4
test["sat"] = test["pickup_dayofweek"] == 5
test["sun"] = test["pickup_dayofweek"] == 6

# 이동거리 추가(하버사인 + 맨하탄)

def haversine(train):
    lon1 = train['pickup_longitude']
    lat1 = train['pickup_latitude']
    lon2 = train['dropoff_longitude']
    lat2 = train['dropoff_latitude']
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * np.arcsin(sqrt(a)) 
    haversine_km = 6367 * c
    return haversine_km

def haversine(test):
    lon1 = test['pickup_longitude']
    lat1 = test['pickup_latitude']
    lon2 = test['dropoff_longitude']
    lat2 = test['dropoff_latitude']
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * np.arcsin(sqrt(a)) 
    haversine_km = 6367 * c
    return haversine_km

def manhattan_distance(train):
    lon1 = train['pickup_longitude']
    lat1 = train['pickup_latitude']
    lon2 = train['dropoff_longitude']
    lat2 = train['dropoff_latitude']
    manhattan_km = (abs(lon1 - lon2)*111) + (abs(lat1 - lat2)*85.397)
    return manhattan_km

def manhattan_distance(test):
    lon1 = test['pickup_longitude']
    lat1 = test['pickup_latitude']
    lon2 = test['dropoff_longitude']
    lat2 = test['dropoff_latitude']
    km = (abs(lon1 - lon2)*111) + (abs(lat1 - lat2)*85.397)
    return km

train['haversine_distance'] = train.apply(lambda train: haversine(train), axis=1)
test['haversine_distance'] = test.apply(lambda test: haversine(test), axis=1)
train['manhattan_distance'] = train.apply(lambda train: manhattan_distance(train), axis=1)
test['manhattan_distance'] = test.apply(lambda test: manhattan_distance(test), axis=1)


# 날씨데이터 추가

weather['year'] = weather['datetime'].dt.year  
weather['month'] = weather['datetime'].dt.month
weather['day'] = weather['datetime'].dt.day

weather = weather.rename(columns = {'year' : 'pickup_year', 'month' : 'pickup_month', 'day' : 'pickup_day'})

train = pd.merge(train, weather)
test = pd.merge(test, weather)

feature_names  = ['pickup_longitude', 'pickup_latitude',
                  'dropoff_longitude', 'dropoff_latitude',
                  'pickup_year', 'pickup_month', 'pickup_day',
                  'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
                  'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
                  'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
                  'vendor_id', 'vendor_1', 'vendor_2',
                  'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
                  'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
                  'passenger_7', 'passenger_8', 'passenger_9', 
                  'haversine_distance', 'manhattan_distance', 
                  'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
                  'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
                  'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
                  'Precip. (mm)', 'rain', 'snow', 'fog']

x_train = train[feature_names]
x_test = test[feature_names]
label_name = "trip_duration"
y_train = train[label_name]
y_train = np.log1p(y_train)

print("완료!")
print('Running Time : %.02f초' % (time.time() - start))


시작
완료!
Running Time : 519.65초


#### 대체적으로 좋은 결과를 보여주었단 basic_features를 사용
#### 텐서플로에는 행렬형태로 들어가고, placeholder에 넣을 때 컬럼 갯수를 조절할 수 있다.
#### 전처리가 완료된 pandas DataFrame 형태의 데이터를 TensorFlow의 Tensor로 변환한다.

In [2]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(1458644, 58)
(625134, 58)
(1458644,)


#### 데이터 프레임을 행렬 형태로 변환

In [3]:
x_train = np.array(x_train, dtype=np.float32)
y_train = np.array(y_train, dtype=np.float32)
x_test = np.array(x_test, dtype=np.float32)

#### y_train을 확실하게 (1458644 , 1) 형태로 변환

In [4]:
x_train = x_train.reshape(1458644, 58)
y_train = y_train.reshape(1458644, 1)
x_test = x_test.reshape(625134, 58)

In [5]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(1458644, 58)
(625134, 58)
(1458644, 1)


####  
####  

## 2. TensorFlow 신경망 모델 구성

### 텐서플로우 실행

In [6]:
import tensorflow as tf

hello = tf.constant('hello, tensorflow!')
print(hello)
sess = tf.Session()
print(sess.run(hello))

Tensor("Const:0", shape=(), dtype=string)
b'hello, tensorflow!'


### 1) 입출력 정의

In [7]:
X = tf.placeholder(tf.float32, [None, 58]) 
Y = tf.placeholder(tf.float32, [None, 1])  
keep_prob = tf.placeholder(tf.float32)

### 2) 은닉층 구성

#### (1) ReLU 함수

In [24]:
W1 = tf.Variable(tf.random_normal([58 ,256], stddev = 0.01))
b1 = tf.Variable(tf.random_normal([256]))
L1 = tf.add(tf.matmul(X, W1), b1)
L1 = tf.nn.relu(L1)
L1 = tf.nn.dropout(L1, keep_prob)


W2 = tf.Variable(tf.random_normal([256, 256], stddev = 0.01))
b2 = tf.Variable(tf.random_normal([256]))
L2 = tf.add(tf.matmul(L1, W2), b2)
L2 = tf.nn.relu(L2)
L2 = tf.nn.dropout(L2, keep_prob)

W3 = tf.Variable(tf.random_normal([256, 1], stddev = 0.01))
b3 = tf.Variable(tf.random_normal([1]))
model = tf.add(tf.matmul(L2, W3), b3)

In [11]:
W1 = tf.Variable(tf.random_normal([58 ,128], stddev = 0.01))
b1 = tf.Variable(tf.random_normal([128]))
L1 = tf.add(tf.matmul(X, W1), b1b
L1 = tf.nn.relu(L1)
L1 = tf.nn.dropout(L1, keep_prob)


W2 = tf.Variable(tf.random_normal([128, 128], stddev = 0.01))
b2 = tf.Variable(tf.random_normal([128]))
L2 = tf.add(tf.matmul(L1, W2), b2)
L2 = tf.nn.relu(L2)
L2 = tf.nn.dropout(L2, keep_prob)

W3 = tf.Variable(tf.random_normal([128, 1], stddev = 0.01))
b3 = tf.Variable(tf.random_normal([1]))
model = tf.add(tf.matmul(L2, W3), b3)

#### (1) 활성화 함수 X 

In [9]:
# 렐루 함수 X 

W1 = tf.Variable(tf.random_normal([58 ,256], stddev = 0.01))
b1 = tf.Variable(tf.random_normal([256]))
L1 = tf.add(tf.matmul(X, W1), b1)
L1 = tf.nn.dropout(L1, keep_prob)


W2 = tf.Variable(tf.random_normal([256, 256], stddev = 0.01))
b2 = tf.Variable(tf.random_normal([256]))
L2 = tf.add(tf.matmul(L1, W2), b2)
L2 = tf.nn.dropout(L2, keep_prob)

W3 = tf.Variable(tf.random_normal([256, 1], stddev = 0.01))
b3 = tf.Variable(tf.random_normal([1]))
model = tf.add(tf.matmul(L2, W3), b3)

### 3) 손실함수 및 최적화

#### (1) 손실함수

In [10]:
cost = tf.reduce_mean(tf.square(model - Y))

#### (2) 최적화 함수

In [11]:
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

In [33]:
optimizer = tf.train.GradientDescentOptimizer(0.001).minimize(cost) 

# NaN 값이 나와 사용 중지

#### 4)세션 열기  
#### 현재 GTX 1050으로는 약 140만 건의 데이터 처리를 못해 CPU로 학습

##### (1) 기본 세션 

In [12]:
init = tf.global_variables_initializer()
sess.run(init)

##### (2) CPU 사용

In [29]:
init = tf.global_variables_initializer()
config = tf.ConfigProto(device_count = {'GPU': 0})
sess = tf.Session(config=config)
sess.run(init)

##### (3) GPU 사용

In [23]:
init = tf.global_variables_initializer()
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.33)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

sess.run(init)

##### (4) CPU와 GPU 성능 확인

In [21]:
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
sess.list_devices()

[_DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 268435456),
 _DeviceAttributes(/job:localhost/replica:0/task:0/device:GPU:0, GPU, 1425725849)]

In [16]:
tf.__version__

'1.10.0'

### 5) 학습

In [13]:
# 학습 10번

print("시작")
start = time.time()

for epoch in range(10):
    sess.run(optimizer, feed_dict = {X : x_train, Y : y_train, keep_prob : 0.8})
    
    if (epoch+1) % 10 == 0:
        print((epoch+1)," 번 : ", sess.run(cost, feed_dict = {X : x_train, Y : y_train, keep_prob : 0.8}))
        intime = time.time() - start
        m, s = divmod(intime, 60)
        h, m = divmod(m, 60)
        print("Time ", "%d:%02d:%02d" % (h, m, s))
        
print("최적화 완료")
print("")
print("")



print("예측값 : ", sess.run(model, feed_dict = {X : x_train, Y : y_train, keep_prob : 0.8}))
print("실제값 : ", sess.run(Y, feed_dict = {X : x_train, Y : y_train, keep_prob : 0.8}))

end = time.time() - start
m, s = divmod(end, 60)
h, m = divmod(m, 60)
print("최종 소요 시간 ", "%d:%02d:%02d" % (h, m, s)) 

10  번 :  5.30902
Time  0:08:22
최적화 완료


예측값 :  [[ 8.03012562]
 [ 6.0846467 ]
 [ 8.27035809]
 ..., 
 [ 7.02959538]
 [ 9.34763432]
 [ 8.17633533]]
실제값 :  [[ 6.12249279]
 [ 7.20563507]
 [ 6.5453496 ]
 ..., 
 [ 6.59441328]
 [ 6.66568375]
 [ 5.29330492]]
최종 소요 시간  0:08:39


####  
####  

## 3. 예측 및 결과 확인

### 1) 예측

In [14]:
predictions = sess.run(model, feed_dict = {X : x_test, keep_prob : 0.8})
predictions = np.expm1(predictions)
predictions

array([[ 1349.14111328],
       [  683.11120605],
       [ 2508.55639648],
       ..., 
       [ 3487.59448242],
       [ 1471.28503418],
       [ 3873.23339844]], dtype=float32)

#### 2) 결과 제출

In [15]:
submission['trip_duration'] = predictions
submission.to_csv('TS10.csv', index = False)

####  
####  
## 4. 결과 보고서

#### 은닉층 3개 : [None, 256] / [256 ,256] / [256, 1]    keep_prob : 0.8.  
#### 손실함수 cost = tf.reduce_mean(tf.square(model - Y))
#### AdamOptimizer(0.001)   ReLU 함수

In [None]:
# 기본 변수(49)
100번 학습 : 0.92080 
    
# 기본 변수 + dropoff 위도 경도 추가(51)
10 번 학습 : 0.95022
    
# 기본 변수 + dropoff 위도 경도 추가(51)
10 번 학습 : 0.90311
           
# 기본 변수 + dropoff 위도 경도 요일 추가(58)
100번 학습 : 0.90646

#### GradientDescentOptimizer(0.001)

In [None]:
# 기본 변수 (49)
10번 학습 : NAN값

#### AdamOptimizer(0.01)   ReLU 함수

In [None]:
# 기본 변수(49)
10 번 학습 : 1.24397

#### 은닉층 3개 : [None, 128] / [128 ,128] / [128, 1]   keep_prob : 0.8
#### AdamOptimizer(0.001) ReLU 함수

In [None]:
# 기본 변수 + dropoff 위도 경도 요일 추가(58)
10 번 학습 : 1.34005
100번 학습 : 1.09061

#### AdamOptimizer(0.001) ReLU 함수 X

In [None]:
# 기본 변수 + dropoff 위도 경도 요일 추가(58)
10 번 학습 : 2.30351