# New York City Taxi Trip Duration 03 - EDA version2

###   
### 목표 : 예측 모델의 성능을 높이기 위해 EDA를 추가로 수행한다.
###   
###   

### ● 위도와 경도를 이용해 이동거리를 계산한다.
### ● 새로운 날씨데이터를 가져와 추가한다.
###   
###   

## 1. 기존 데이터 전처리

In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv("train.csv", parse_dates = ['pickup_datetime', 'dropoff_datetime'])
test = pd.read_csv("test.csv", parse_dates = ['pickup_datetime'])
submission = pd.read_csv("sample_submission.csv")

train['pickup_year'] = train['pickup_datetime'].dt.year  
train['pickup_month'] = train['pickup_datetime'].dt.month
train['pickup_day'] = train['pickup_datetime'].dt.day
train['pickup_hour'] = train['pickup_datetime'].dt.hour
train['pickup_minute'] = train['pickup_datetime'].dt.minute
train['pickup_second'] = train['pickup_datetime'].dt.second
train["pickup_dayofweek"] = train["pickup_datetime"].dt.dayofweek 

test['pickup_year'] = test['pickup_datetime'].dt.year  
test['pickup_month'] = test['pickup_datetime'].dt.month
test['pickup_day'] = test['pickup_datetime'].dt.day
test['pickup_hour'] = test['pickup_datetime'].dt.hour
test['pickup_minute'] = test['pickup_datetime'].dt.minute
test['pickup_second'] = test['pickup_datetime'].dt.second
test["pickup_dayofweek"] = test["pickup_datetime"].dt.dayofweek 


train['store_and_fwd_flag_Y'] = train['store_and_fwd_flag'] == 'Y'
train['store_and_fwd_flag_N'] = train['store_and_fwd_flag'] == 'N'

test['store_and_fwd_flag_Y'] = test['store_and_fwd_flag'] == 'Y'
test['store_and_fwd_flag_N'] = test['store_and_fwd_flag'] == 'N'


train['vendor_1'] = train['vendor_id'] == 1
train['vendor_2'] = train['vendor_id'] == 2

test['vendor_1'] = test['vendor_id'] == 1
test['vendor_2'] = test['vendor_id'] == 2


train['passenger_0'] = train['passenger_count'] == 0
train['passenger_1'] = train['passenger_count'] == 1
train['passenger_2'] = train['passenger_count'] == 2
train['passenger_3'] = train['passenger_count'] == 3
train['passenger_4'] = train['passenger_count'] == 4
train['passenger_5'] = train['passenger_count'] == 5
train['passenger_6'] = train['passenger_count'] == 6
train['passenger_7'] = train['passenger_count'] == 7
train['passenger_8'] = train['passenger_count'] == 8
train['passenger_9'] = train['passenger_count'] == 9

test['passenger_0'] = test['passenger_count'] == 0
test['passenger_1'] = test['passenger_count'] == 1
test['passenger_2'] = test['passenger_count'] == 2
test['passenger_3'] = test['passenger_count'] == 3
test['passenger_4'] = test['passenger_count'] == 4
test['passenger_5'] = test['passenger_count'] == 5
test['passenger_6'] = test['passenger_count'] == 6
test['passenger_7'] = test['passenger_count'] == 7
test['passenger_8'] = test['passenger_count'] == 8
test['passenger_9'] = test['passenger_count'] == 9

print("완료!")

완료!


####     
####     

## 2. 이동거리 구하기

### 1) 하버사인 공식(Haversine formula)

#### 구에서 두 점 사이의 거리 구하는 공식. 위도와 경도 좌표를 가지고 구인 지구에서의 좌표사이의 거리를 구한다.
#### 참고 : https://gist.github.com/rochacbruno/2883505

In [3]:
import time
start = time.time()

from math import sin, cos, sqrt, atan2, radians 
import numpy as np
def haversine(train):
    lon1 = train['pickup_longitude']
    lat1 = train['pickup_latitude']
    lon2 = train['dropoff_longitude']
    lat2 = train['dropoff_latitude']
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * np.arcsin(sqrt(a)) 
    haversine_km = 6367 * c
    return haversine_km

train['haversine_distance'] = train.apply(lambda train: haversine(train), axis=1)

print('Running Time : %.02f초' % (time.time() - start))
train['haversine_distance'].head()

Running Time : 189.10초


0    1.497580
1    1.804374
2    6.381090
3    1.484566
4    1.187842
Name: haversine_distance, dtype: float64

In [4]:
import time
start = time.time()

import numpy as np
def haversine(test):
    lon1 = test['pickup_longitude']
    lat1 = test['pickup_latitude']
    lon2 = test['dropoff_longitude']
    lat2 = test['dropoff_latitude']
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * np.arcsin(sqrt(a)) 
    haversine_km = 6367 * c
    return haversine_km

test['haversine_distance'] = test.apply(lambda test: haversine(test), axis=1)


print('Running Time : %.02f초' % (time.time() - start))
test['haversine_distance'].head()

Running Time : 78.99초


0    2.744701
1    2.757507
2    1.305335
3    5.265780
4    0.960239
Name: haversine_distance, dtype: float64

### 2) 맨하탄 거리(Manhattan Distance)

#### 택시거리 혹은 시가지 거리라고도 한다. 도로 사정 상 두 좌표 사이를 바로 잇는 선이 아닌 도로를 따라 이동하는 최단거리를 측정한다.
#### d = |x1 - x2| + |y1 - y2|
#### 경도 1도는 88.8km, 1분 1.48km  위도 1도 111km 

In [6]:
import time
start = time.time()

def manhattan_distance(train):
    lon1 = train['pickup_longitude']
    lat1 = train['pickup_latitude']
    lon2 = train['dropoff_longitude']
    lat2 = train['dropoff_latitude']
    manhattan_km = (abs(lon1 - lon2)*111) + (abs(lat1 - lat2)*85.397)
    return manhattan_km

train['manhattan_distance'] = train.apply(lambda train: manhattan_distance(train), axis=1)

print('Running Time : %.02f초' % (time.time() - start))
train['manhattan_distance'].head()

Running Time : 174.18초


0    2.144611
1    2.749269
2    7.518789
3    1.378987
4    0.927187
Name: manhattan_distance, dtype: float64

In [7]:
import time
start = time.time()

def manhattan_distance(test):
    lon1 = test['pickup_longitude']
    lat1 = test['pickup_latitude']
    lon2 = test['dropoff_longitude']
    lat2 = test['dropoff_latitude']
    km = (abs(lon1 - lon2)*111) + (abs(lat1 - lat2)*85.397)
    return km

test['manhattan_distance'] = test.apply(lambda test: manhattan_distance(test), axis=1)

print('Running Time : %.02f초' % (time.time() - start))
test['manhattan_distance'].head()

Running Time : 73.13초


0    2.332044
1    2.587666
2    1.940002
3    6.907787
4    1.443189
Name: manhattan_distance, dtype: float64

####   
####   

## 3. 날씨 데이터 추가하기