In [1]:
import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians 
import time
start = time.time()
print("시작")

train = pd.read_csv("train.csv", parse_dates = ['pickup_datetime', 'dropoff_datetime'])
test = pd.read_csv("test.csv", parse_dates = ['pickup_datetime'])
weather = pd.read_csv("weather.csv", parse_dates = ['datetime'])  # 데이터 전처리한 파일
submission = pd.read_csv("sample_submission.csv")

# 날짜데이터 분할

train['pickup_year'] = train['pickup_datetime'].dt.year  
train['pickup_month'] = train['pickup_datetime'].dt.month
train['pickup_day'] = train['pickup_datetime'].dt.day
train['pickup_hour'] = train['pickup_datetime'].dt.hour
train['pickup_minute'] = train['pickup_datetime'].dt.minute
train['pickup_second'] = train['pickup_datetime'].dt.second
train["pickup_dayofweek"] = train["pickup_datetime"].dt.dayofweek 

test['pickup_year'] = test['pickup_datetime'].dt.year  
test['pickup_month'] = test['pickup_datetime'].dt.month
test['pickup_day'] = test['pickup_datetime'].dt.day
test['pickup_hour'] = test['pickup_datetime'].dt.hour
test['pickup_minute'] = test['pickup_datetime'].dt.minute
test['pickup_second'] = test['pickup_datetime'].dt.second
test["pickup_dayofweek"] = test["pickup_datetime"].dt.dayofweek 

# 원핫인코딩

train['store_and_fwd_flag_Y'] = train['store_and_fwd_flag'] == 'Y'
train['store_and_fwd_flag_N'] = train['store_and_fwd_flag'] == 'N'

test['store_and_fwd_flag_Y'] = test['store_and_fwd_flag'] == 'Y'
test['store_and_fwd_flag_N'] = test['store_and_fwd_flag'] == 'N'


train['vendor_1'] = train['vendor_id'] == 1
train['vendor_2'] = train['vendor_id'] == 2

test['vendor_1'] = test['vendor_id'] == 1
test['vendor_2'] = test['vendor_id'] == 2


train['passenger_0'] = train['passenger_count'] == 0
train['passenger_1'] = train['passenger_count'] == 1
train['passenger_2'] = train['passenger_count'] == 2
train['passenger_3'] = train['passenger_count'] == 3
train['passenger_4'] = train['passenger_count'] == 4
train['passenger_5'] = train['passenger_count'] == 5
train['passenger_6'] = train['passenger_count'] == 6
train['passenger_7'] = train['passenger_count'] == 7
train['passenger_8'] = train['passenger_count'] == 8
train['passenger_9'] = train['passenger_count'] == 9

test['passenger_0'] = test['passenger_count'] == 0
test['passenger_1'] = test['passenger_count'] == 1
test['passenger_2'] = test['passenger_count'] == 2
test['passenger_3'] = test['passenger_count'] == 3
test['passenger_4'] = test['passenger_count'] == 4
test['passenger_5'] = test['passenger_count'] == 5
test['passenger_6'] = test['passenger_count'] == 6
test['passenger_7'] = test['passenger_count'] == 7
test['passenger_8'] = test['passenger_count'] == 8
test['passenger_9'] = test['passenger_count'] == 9

# 요일 원핫인코딩 추가

train["mon"] = train["pickup_dayofweek"] == 0
train["tue"] = train["pickup_dayofweek"] == 1
train["wed"] = train["pickup_dayofweek"] == 2
train["thu"] = train["pickup_dayofweek"] == 3
train["fri"] = train["pickup_dayofweek"] == 4
train["sat"] = train["pickup_dayofweek"] == 5
train["sun"] = train["pickup_dayofweek"] == 6

test["mon"] = test["pickup_dayofweek"] == 0
test["tue"] = test["pickup_dayofweek"] == 1
test["wed"] = test["pickup_dayofweek"] == 2
test["thu"] = test["pickup_dayofweek"] == 3
test["fri"] = test["pickup_dayofweek"] == 4
test["sat"] = test["pickup_dayofweek"] == 5
test["sun"] = test["pickup_dayofweek"] == 6

# 이동거리 추가(하버사인 + 맨하탄)

def haversine(train):
    lon1 = train['pickup_longitude']
    lat1 = train['pickup_latitude']
    lon2 = train['dropoff_longitude']
    lat2 = train['dropoff_latitude']
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * np.arcsin(sqrt(a)) 
    haversine_km = 6367 * c
    return haversine_km

def haversine(test):
    lon1 = test['pickup_longitude']
    lat1 = test['pickup_latitude']
    lon2 = test['dropoff_longitude']
    lat2 = test['dropoff_latitude']
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * np.arcsin(sqrt(a)) 
    haversine_km = 6367 * c
    return haversine_km

def manhattan_distance(train):
    lon1 = train['pickup_longitude']
    lat1 = train['pickup_latitude']
    lon2 = train['dropoff_longitude']
    lat2 = train['dropoff_latitude']
    manhattan_km = (abs(lon1 - lon2)*111) + (abs(lat1 - lat2)*85.397)
    return manhattan_km

def manhattan_distance(test):
    lon1 = test['pickup_longitude']
    lat1 = test['pickup_latitude']
    lon2 = test['dropoff_longitude']
    lat2 = test['dropoff_latitude']
    km = (abs(lon1 - lon2)*111) + (abs(lat1 - lat2)*85.397)
    return km

train['haversine_distance'] = train.apply(lambda train: haversine(train), axis=1)
test['haversine_distance'] = test.apply(lambda test: haversine(test), axis=1)
train['manhattan_distance'] = train.apply(lambda train: manhattan_distance(train), axis=1)
test['manhattan_distance'] = test.apply(lambda test: manhattan_distance(test), axis=1)


# 날씨데이터 추가

weather['year'] = weather['datetime'].dt.year  
weather['month'] = weather['datetime'].dt.month
weather['day'] = weather['datetime'].dt.day

weather = weather.rename(columns = {'year' : 'pickup_year', 'month' : 'pickup_month', 'day' : 'pickup_day'})

train = pd.merge(train, weather)
test = pd.merge(test, weather)


print("완료!")
print('Running Time : %.02f초' % (time.time() - start))


시작
완료!
Running Time : 197.03초


In [2]:
all = pd.concat([train, test], ignore_index = True)
print(train.shape)
print(test.shape)
print(all.shape)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


(1458644, 64)
(625134, 62)
(2083778, 64)


### 뉴욕주

#### 위도 	40°29'40"N - 45°0'42"N( 40.494444 ~ 45.011667 )
#### 경도 	71°47'25"W - 79°45'54"W( -71.790278 ~ -79.765 )


### 뉴욕시(1)   40.664167   ,  -73.938611
### 뉴욕시(2)   40.71250     ,  -740.00000
### 뉴욕시(3)   40.6643       ,  -73.9385  
### 뉴욕시(4)   40.7569545 ,  -73.990494

##  
##  
##  
### 뉴욕주 기준 

In [None]:
'pickup_latitude'
'pickup_longitude'
'dropoff_longitude'
'dropoff_latitude'

In [11]:
all_1 = all[all['pickup_latitude'] >= 40.494444] 
all_1 = all[all['pickup_latitude'] <= 45.011667] 
all_1 = all[all['dropoff_latitude'] >= 40.494444] 
all_1 = all[all['dropoff_latitude'] <= 45.011667] 

all_1 = all[all['pickup_longitude'] <= -71.790278] 
all_1 = all[all['pickup_longitude'] >= -79.765 ] 
all_1 = all[all['dropoff_longitude'] <= -71.790278] 
all_1 = all[all['dropoff_longitude'] >= -79.765 ] 

In [15]:
train1 = train[train['pickup_latitude'] >= 40.494444] 
train1 = train[train['pickup_latitude'] <= 45.011667] 
train1 = train[train['dropoff_latitude'] >= 40.494444] 
train1 = train[train['dropoff_latitude'] <= 45.011667] 

train1 = train[train['pickup_longitude'] <= -71.790278] 
train1 = train[train['pickup_longitude'] >= -79.765 ] 
train1 = train[train['dropoff_longitude'] <= -71.790278] 
train1 = train[train['dropoff_longitude'] >= -79.765 ] 

In [16]:
test1 = test[test['pickup_latitude'] >= 40.494444] 
test1 = test[test['pickup_latitude'] <= 45.011667] 
test1 = test[test['dropoff_latitude'] >= 40.494444] 
test1 = test[test['dropoff_latitude'] <= 45.011667] 

test1 = test[test['pickup_longitude'] <= -71.790278] 
test1 = test[test['pickup_longitude'] >= -79.765 ] 
test1 = test[test['dropoff_longitude'] <= -71.790278] 
test1 = test[test['dropoff_longitude'] >= -79.765 ] 

In [12]:
print('dropoff 위도(lat) 최고')

all_1[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'haversine_distance', 'manhattan_distance']][all['dropoff_latitude'] == all['dropoff_latitude'].max()] 

dropoff 위도(lat) 최고


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,haversine_distance,manhattan_distance
1970990,-73.981995,40.746468,-67.496796,48.857597,1035.392995,1412.523234


In [10]:
print(all.shape)
print(all_1.shape)

(2083778, 64)
(2083772, 64)


In [20]:
# features_4  : 초 제거
feature_names = ['pickup_longitude', 'pickup_latitude',
                  'dropoff_longitude', 'dropoff_latitude',
                  'pickup_year', 'pickup_month', 'pickup_day',
                  'pickup_hour', 'pickup_minute', 'pickup_dayofweek',
                  'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
                  'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
                  'vendor_id', 'vendor_1', 'vendor_2',
                  'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
                  'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
                  'passenger_7', 'passenger_8', 'passenger_9', 
                  'haversine_distance', 'manhattan_distance', 
                  'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
                  'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
                  'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
                  'Precip. (mm)', 'rain', 'snow', 'fog']


In [28]:
x_train = train1[feature_names]
x_test = test[feature_names]
label_name = "trip_duration"
y_train = train1[label_name]
y_train = np.log1p(y_train)

In [29]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(1458639, 57)
(625134, 57)
(1458639,)


In [30]:
start = time.time()
print("시작")
import xgboost as xgb

XGB_model = xgb.XGBRegressor(max_depth = 12, n_estimators = 2000, min_child_weight = 1, gamma = 0.1,
                             colsample_bytree = 1, seed = 36, learning_rate = 0.02, subsample = 0.9).fit(x_train, y_train)

predictions = XGB_model.predict(x_test)
predictions = np.expm1(predictions)
print("완료")
print('학습 및 예측시간 : %.02f초' % (time.time() - start))
predictions[0:10]

시작
완료
학습 및 예측시간 : 20495.47초


array([ 839.2719 ,  577.2296 ,  394.98343,  975.8435 ,  347.4576 ,
        899.0211 , 1270.925  , 1084.2709 , 2846.2808 ,  499.62845],
      dtype=float32)

In [31]:
submission['trip_duration'] = predictions
submission.to_csv('XGB127.csv', index = False)

In [None]:
# 0.38494  -> 가장 좋았던 파라미터 조건에서 0.00002가 더 오름