# New York City Taxi Trip Duration 04 - Machine Learning with new variables

###   
### 목표 : 추가로 수행한 EDA를 통해서 나온 새로운 변수들을 추가해 예측모델의 성능을 높인다.
###   
###   

### ● 이동거리와 날씨데이터를 조합해 모델링한다.
###   
###   

## 1.  데이터 전처리

In [3]:
import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians 
import time
start = time.time()

train = pd.read_csv("train.csv", parse_dates = ['pickup_datetime', 'dropoff_datetime'])
test = pd.read_csv("test.csv", parse_dates = ['pickup_datetime'])
weather = pd.read_csv("weather.csv", parse_dates = ['datetime'])  # 데이터 전처리한 파일
submission = pd.read_csv("sample_submission.csv")

# 날짜데이터 분할

train['pickup_year'] = train['pickup_datetime'].dt.year  
train['pickup_month'] = train['pickup_datetime'].dt.month
train['pickup_day'] = train['pickup_datetime'].dt.day
train['pickup_hour'] = train['pickup_datetime'].dt.hour
train['pickup_minute'] = train['pickup_datetime'].dt.minute
train['pickup_second'] = train['pickup_datetime'].dt.second
train["pickup_dayofweek"] = train["pickup_datetime"].dt.dayofweek 

test['pickup_year'] = test['pickup_datetime'].dt.year  
test['pickup_month'] = test['pickup_datetime'].dt.month
test['pickup_day'] = test['pickup_datetime'].dt.day
test['pickup_hour'] = test['pickup_datetime'].dt.hour
test['pickup_minute'] = test['pickup_datetime'].dt.minute
test['pickup_second'] = test['pickup_datetime'].dt.second
test["pickup_dayofweek"] = test["pickup_datetime"].dt.dayofweek 

# 원핫인코딩

train['store_and_fwd_flag_Y'] = train['store_and_fwd_flag'] == 'Y'
train['store_and_fwd_flag_N'] = train['store_and_fwd_flag'] == 'N'

test['store_and_fwd_flag_Y'] = test['store_and_fwd_flag'] == 'Y'
test['store_and_fwd_flag_N'] = test['store_and_fwd_flag'] == 'N'


train['vendor_1'] = train['vendor_id'] == 1
train['vendor_2'] = train['vendor_id'] == 2

test['vendor_1'] = test['vendor_id'] == 1
test['vendor_2'] = test['vendor_id'] == 2


train['passenger_0'] = train['passenger_count'] == 0
train['passenger_1'] = train['passenger_count'] == 1
train['passenger_2'] = train['passenger_count'] == 2
train['passenger_3'] = train['passenger_count'] == 3
train['passenger_4'] = train['passenger_count'] == 4
train['passenger_5'] = train['passenger_count'] == 5
train['passenger_6'] = train['passenger_count'] == 6
train['passenger_7'] = train['passenger_count'] == 7
train['passenger_8'] = train['passenger_count'] == 8
train['passenger_9'] = train['passenger_count'] == 9

test['passenger_0'] = test['passenger_count'] == 0
test['passenger_1'] = test['passenger_count'] == 1
test['passenger_2'] = test['passenger_count'] == 2
test['passenger_3'] = test['passenger_count'] == 3
test['passenger_4'] = test['passenger_count'] == 4
test['passenger_5'] = test['passenger_count'] == 5
test['passenger_6'] = test['passenger_count'] == 6
test['passenger_7'] = test['passenger_count'] == 7
test['passenger_8'] = test['passenger_count'] == 8
test['passenger_9'] = test['passenger_count'] == 9

# 이동거리 추가(하버사인 + 맨하탄)

def haversine(train):
    lon1 = train['pickup_longitude']
    lat1 = train['pickup_latitude']
    lon2 = train['dropoff_longitude']
    lat2 = train['dropoff_latitude']
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * np.arcsin(sqrt(a)) 
    haversine_km = 6367 * c
    return haversine_km

def haversine(test):
    lon1 = test['pickup_longitude']
    lat1 = test['pickup_latitude']
    lon2 = test['dropoff_longitude']
    lat2 = test['dropoff_latitude']
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * np.arcsin(sqrt(a)) 
    haversine_km = 6367 * c
    return haversine_km

def manhattan_distance(train):
    lon1 = train['pickup_longitude']
    lat1 = train['pickup_latitude']
    lon2 = train['dropoff_longitude']
    lat2 = train['dropoff_latitude']
    manhattan_km = (abs(lon1 - lon2)*111) + (abs(lat1 - lat2)*85.397)
    return manhattan_km

def manhattan_distance(test):
    lon1 = test['pickup_longitude']
    lat1 = test['pickup_latitude']
    lon2 = test['dropoff_longitude']
    lat2 = test['dropoff_latitude']
    km = (abs(lon1 - lon2)*111) + (abs(lat1 - lat2)*85.397)
    return km

train['haversine_distance'] = train.apply(lambda train: haversine(train), axis=1)
test['haversine_distance'] = test.apply(lambda test: haversine(test), axis=1)
train['manhattan_distance'] = train.apply(lambda train: manhattan_distance(train), axis=1)
test['manhattan_distance'] = test.apply(lambda test: manhattan_distance(test), axis=1)


# 날씨데이터 추가

weather['year'] = weather['datetime'].dt.year  
weather['month'] = weather['datetime'].dt.month
weather['day'] = weather['datetime'].dt.day

weather = weather.rename(columns = {'year' : 'pickup_year', 'month' : 'pickup_month', 'day' : 'pickup_day'})

train = pd.merge(train, weather)
test = pd.merge(test, weather)




print("완료!")
print('Running Time : %.02f초' % (time.time() - start))



완료!
Running Time : 519.83초


####  
####  

## 2. 변수 선택 및 데이터 조정

### 1) 변수 선택

In [2]:
print(train.shape)
train.columns

(1458644, 57)


Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'pickup_year', 'pickup_month', 'pickup_day',
       'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
       'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 'vendor_1', 'vendor_2',
       'passenger_0', 'passenger_1', 'passenger_2', 'passenger_3',
       'passenger_4', 'passenger_5', 'passenger_6', 'passenger_7',
       'passenger_8', 'passenger_9', 'haversine_distance',
       'manhattan_distance', 'datetime', 'T_high', 'T_avg', 'T_low', 'D_high',
       'D_avg', 'D_low', 'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg',
       'S_low', 'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
       'Precip. (mm)', 'rain', 'snow', 'fog'],
      dtype='object')

In [3]:
print(test.shape)
test.columns

(625134, 55)


Index(['id', 'vendor_id', 'pickup_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'store_and_fwd_flag', 'pickup_year', 'pickup_month',
       'pickup_day', 'pickup_hour', 'pickup_minute', 'pickup_second',
       'pickup_dayofweek', 'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
       'vendor_1', 'vendor_2', 'passenger_0', 'passenger_1', 'passenger_2',
       'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6',
       'passenger_7', 'passenger_8', 'passenger_9', 'haversine_distance',
       'manhattan_distance', 'datetime', 'T_high', 'T_avg', 'T_low', 'D_high',
       'D_avg', 'D_low', 'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg',
       'S_low', 'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
       'Precip. (mm)', 'rain', 'snow', 'fog'],
      dtype='object')

#### 전체 변수

In [4]:
feature_names = [  'pickup_longitude', 'pickup_latitude',
                 
                   'pickup_year', 'pickup_month', 'pickup_day',
                   'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
                 
                   'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
                   
                   'vendor_id', 'vendor_1', 'vendor_2',
                  
                   'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
                   'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
                   'passenger_7', 'passenger_8', 'passenger_9', 
                 
                   'haversine_distance', 'manhattan_distance', 
                 
                   'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
                   'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
                   'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
                   'Precip. (mm)', 'rain', 'snow', 'fog']

In [5]:
basic_features = ['pickup_longitude', 'pickup_latitude',
                  'pickup_year', 'pickup_month', 'pickup_day',
                  'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
                  'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
                  'vendor_id', 'vendor_1', 'vendor_2',
                  'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
                  'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
                  'passenger_7', 'passenger_8', 'passenger_9', 
                  'haversine_distance', 'manhattan_distance', 
                  'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
                  'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
                  'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
                  'Precip. (mm)', 'rain', 'snow', 'fog']

features_1  : 날씨데이터 제외
features_1 = [ 'pickup_longitude', 'pickup_latitude',
               'pickup_year', 'pickup_month', 'pickup_day',
               'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
               'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
               'vendor_id', 'vendor_1', 'vendor_2',
               'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
               'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
               'passenger_7', 'passenger_8', 'passenger_9', 
               'haversine_distance', 'manhattan_distance']

features_2  : 거리 데이터 제외
features_2 = [ 'pickup_longitude', 'pickup_latitude',
               'pickup_year', 'pickup_month', 'pickup_day',
               'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
               'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
               'vendor_id', 'vendor_1', 'vendor_2',
               'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
               'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
               'passenger_7', 'passenger_8', 'passenger_9', 
               'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
               'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
               'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
               'Precip. (mm)', 'rain', 'snow', 'fog']

features_3  : 날씨데이터 비, 눈, 안개만 적용
features_3 =['pickup_longitude', 'pickup_latitude',
             'pickup_year', 'pickup_month', 'pickup_day',
             'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
             'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
             'vendor_id', 'vendor_1', 'vendor_2',
             'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
             'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
             'passenger_7', 'passenger_8', 'passenger_9', 
             'haversine_distance', 'manhattan_distance', 
             'rain', 'snow', 'fog']

features_4  : 원핫인코딩 제거
features_4 = ['pickup_longitude', 'pickup_latitude',
              'pickup_year', 'pickup_month', 'pickup_day',
              'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
              'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
              'vendor_id', 'passenger_count', 
              'haversine_distance', 'manhattan_distance', 
              'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
              'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
              'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
              'Precip. (mm)', 'rain', 'snow', 'fog']

features_5  : 맨하탄 거리 데이터 제외
features_5 = [ 'pickup_longitude', 'pickup_latitude',
               'pickup_year', 'pickup_month', 'pickup_day',
               'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
               'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
               'vendor_id', 'vendor_1', 'vendor_2',
               'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
               'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
               'passenger_7', 'passenger_8', 'passenger_9', 
               'haversine_distance', 
               'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
               'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
               'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
               'Precip. (mm)', 'rain', 'snow', 'fog']

features_6  : 하버사인 거리 데이터 제외
features_6 = [ 'pickup_longitude', 'pickup_latitude',
               'pickup_year', 'pickup_month', 'pickup_day',
               'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
               'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
               'vendor_id', 'vendor_1', 'vendor_2',
               'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
               'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
               'passenger_7', 'passenger_8', 'passenger_9', 
               'manhattan_distance', 
               'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
               'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
               'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
               'Precip. (mm)', 'rain', 'snow', 'fog']

In [4]:
feature_names = basic_features = ['pickup_longitude', 'pickup_latitude',
                  'pickup_year', 'pickup_month', 'pickup_day',
                  'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
                  'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
                  'vendor_id', 'vendor_1', 'vendor_2',
                  'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
                  'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
                  'passenger_7', 'passenger_8', 'passenger_9', 
                  'haversine_distance', 'manhattan_distance', 
                  'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
                  'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
                  'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
                  'Precip. (mm)', 'rain', 'snow', 'fog']
# bf

### 2) 데이터 조정

In [5]:
x_train = train[feature_names]
print(x_train.shape)
x_train.head()

(1458644, 49)


Unnamed: 0,pickup_longitude,pickup_latitude,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_dayofweek,store_and_fwd_flag_Y,...,V_high,V_avg,V_low,W_high,W_avg,W_high.1,Precip. (mm),rain,snow,fog
0,-73.982155,40.767937,2016,3,14,17,24,55,0,False,...,16.0,9.0,3.0,45.0,23.0,64.0,7.37,1.0,0.0,0.0
1,-73.97509,40.758766,2016,3,14,14,5,39,0,False,...,16.0,9.0,3.0,45.0,23.0,64.0,7.37,1.0,0.0,0.0
2,-73.994484,40.745087,2016,3,14,15,4,38,0,False,...,16.0,9.0,3.0,45.0,23.0,64.0,7.37,1.0,0.0,0.0
3,-73.944359,40.714489,2016,3,14,4,24,36,0,False,...,16.0,9.0,3.0,45.0,23.0,64.0,7.37,1.0,0.0,0.0
4,-73.952881,40.766468,2016,3,14,14,57,56,0,False,...,16.0,9.0,3.0,45.0,23.0,64.0,7.37,1.0,0.0,0.0


In [6]:
x_test = test[feature_names]
print(x_test.shape)
x_test.head()

(625134, 49)


Unnamed: 0,pickup_longitude,pickup_latitude,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_dayofweek,store_and_fwd_flag_Y,...,V_high,V_avg,V_low,W_high,W_avg,W_high.1,Precip. (mm),rain,snow,fog
0,-73.988129,40.732029,2016,6,30,23,59,58,3,False,...,16.0,16.0,14.0,13.0,5.0,24.0,0.0,0.0,0.0,0.0
1,-73.964203,40.679993,2016,6,30,23,59,53,3,False,...,16.0,16.0,14.0,13.0,5.0,24.0,0.0,0.0,0.0,0.0
2,-73.997437,40.737583,2016,6,30,23,59,47,3,False,...,16.0,16.0,14.0,13.0,5.0,24.0,0.0,0.0,0.0,0.0
3,-73.95607,40.7719,2016,6,30,23,59,41,3,False,...,16.0,16.0,14.0,13.0,5.0,24.0,0.0,0.0,0.0,0.0
4,-73.970215,40.761475,2016,6,30,23,59,33,3,False,...,16.0,16.0,14.0,13.0,5.0,24.0,0.0,0.0,0.0,0.0


In [7]:
label_name = "trip_duration"

y_train = train[label_name]
print(y_train.shape)
y_train.head()

(1458644,)


0     455
1    1346
2     695
3     755
4    1050
Name: trip_duration, dtype: int64

In [8]:
y_train = np.log1p(y_train)
y_train.head()

0    6.122493
1    7.205635
2    6.545350
3    6.628041
4    6.957497
Name: trip_duration, dtype: float64

####     
####       
####   

## 3. Linear Regression

#### 모델 돌리기

In [14]:
from sklearn import linear_model

LR_model = linear_model.LinearRegression()
LR_model.fit(x_train, y_train)

predictions = LR_model.predict(x_test)
predictions = np.expm1(predictions)
predictions[0:10]

array([ 685.07408975,  702.87255708,  564.03686909,  853.14454653,
        528.57905577,  789.90883246,  707.14854398,  700.36404714,
       3877.86642006,  602.12794563])

#### 제출

In [15]:
submission['trip_duration'] = predictions
submission.to_csv('LR14.csv', index = False)

#### LinearRegression 결과보고서

In [None]:
basic_features : 0.64015
features_1     : 0.64069
features_2     : 0.77631 
features_3     : 0.64048
features_4     : 0.64039

#### 날씨와 거리 데이터 중 거리 데이터의 중요성이 더 높다
#### 하지만 날씨와 거리데이터 둘 다 있을 때의 모델 성능이 더 좋다

####     
####    
## 4. Decision Tree Regressor

#### 모델 돌리기 

In [45]:
from sklearn.tree import DecisionTreeRegressor

DT_model = DecisionTreeRegressor(max_depth = 10, random_state = 50)
DT_model.fit(x_train, y_train)

predictions = DT_model.predict(x_test)
predictions = np.expm1(predictions)
predictions[0:10]

array([ 820.48587235,  626.61969549,  497.69934466, 1236.69622674,
        341.09250067, 1235.0757096 ,  788.51670979,  820.48587235,
       2242.52379527,  594.44304042])

#### 제출

In [46]:
submission['trip_duration'] = predictions
submission.to_csv('DC24.csv', index = False)

#### DecisionTree 결과보고서

In [None]:
basic_features / max_depth = 10, random_state = 50  : 0.46417
features_1 / max_depth = 10, random_state = 50      : 0.46269
features_2 / max_depth = 10, random_state = 50      : 0.74402 
features_3 / max_depth = 10, random_state = 50      : 0.46275
features_4 / max_depth = 10, random_state = 50      : 0.46328

####    
####    
## 5. Random Forest Regressor

#### 모델 돌리기 

In [16]:
from sklearn.ensemble import RandomForestRegressor

RF_model = RandomForestRegressor(n_estimators = 250, random_state =  50, max_depth = 20, max_features = 12)
RF_model.fit(x_train, y_train)

predictions = RF_model.predict(x_test)
predictions = np.expm1(predictions)
predictions[0:10]

array([ 593.35975472,  504.26851688,  477.89505549, 1190.92422195,
        333.80019158,  919.15610774,  950.94966967,  961.53271751,
       2218.12467135,  619.7200613 ])

#### 제출

In [17]:
submission['trip_duration'] = predictions
submission.to_csv('RF41.csv', index = False)

#### Random Forest 결과보고서

In [None]:
basic_features / n_estimators = 75 random_state =  50 max_depth = 20 max_features = 12 : 0.44136
features_1    /  n_estimators = 75 random_state =  50 max_depth = 20 max_features = 12 : 0.43559
features_2    /  n_estimators = 75 random_state =  50 max_depth = 20 max_features = 12 : 0.73843
features_3    /  n_estimators = 75 random_state =  50 max_depth = 20 max_features = 12 : 0.43655
    
features_3    /  n_estimators = 100 random_state =  50 max_depth = 20 max_features = 12 : 0.43614
features_1    /  n_estimators = 100 random_state =  50 max_depth = 20 max_features = 12 : 0.43525
features_1    /  n_estimators = 150 random_state =  50 max_depth = 20 max_features = 12 : 0.43498
features_1    /  n_estimators = 250 random_state =  50 max_depth = 20 max_features = 12 : 0.43465  
features_5    /  n_estimators = 250 random_state =  50 max_depth = 20 max_features = 12 : 0.44902
features_6    /  n_estimators = 250 random_state =  50 max_depth = 20 max_features = 12 : 0.46349
    
features_1    /  n_estimators = 300 random_state =  50 V : 0.43459
features_1    /  n_estimators = 300 random_state =  50 : 0.43388
features_1    /  n_estimators = 400 random_state =  50 : 메모리 오류
features_5    /  n_estimators = 300 random_state =  50 : 메모리 오류??
    

#### 맨하탄 거리와 하버사인 거리 둘 다 있는 게 더 좋다. 둘 중 하나라면 하버사인 거리만 있는 게 더 좋다

####   
####   
## 6. XGBoost Regressor

#### 모델 돌리기

In [19]:
import xgboost as xgb

XGB_model = xgb.XGBRegressor(max_depth = 12, n_estimators = 2000, min_child_weight = 8, gamma = 0.4,
                             colsample_bytree = 0.6, nthread = 5,  seed = 25, learning_rate = 0.1, subsample = 0.9   )

XGB_model.fit(x_train, y_train)

predictions = XGB_model.predict(x_test)
predictions = np.expm1(predictions)
predictions[0:10]

array([ 785.45746,  535.2258 ,  338.36087,  989.7581 ,  356.78693,
        926.82916,  846.9355 , 1135.723  , 2662.5266 ,  477.0653 ],
      dtype=float32)

#### 제출

In [20]:
submission['trip_duration'] = predictions
submission.to_csv('XGB74.csv', index = False)

#### XGBoost 결과보고서

In [25]:
basic_features

max_depth = 12, n_estimators = 300, min_child_weight = 8, gamma = 0.4,
colsample_bytree = 0.6, nthread = 5,  seed = 25, learning_rate = 0.05, subsample = 0.9  : 0.42594
    
# n_estimators만 조정 
  
n_estimators = 200 : 0.42797
n_estimators = 300 : 0.42594
n_estimators = 400 : 0.42520 
n_estimators = 450 : 0.42494   
n_estimators = 500 : 0.42466 
n_estimators = 550 : 0.42448    
n_estimators = 600 : 0.42429
n_estimators = 700 : 0.42408
n_estimators = 800 : 0.42390
n_estimators = 1000 : 0.42353
n_estimators = 1200 : 0.42329
n_estimators = 1500 : 0.42325
n_estimators = 2000 : 0.42322 ☆
n_estimators = 2050 : 0.42324
n_estimators = 2100 : 0.42324
n_estimators = 2300 : 0.42329    
    
# learning_rate만 조정 

learning_rate = 0.01 : 0.42425
learning_rate = 0.001 : 0.93367    
learning_rate = 0.1 : 0.42720
learning_rate = 0.06 : 0.42457  
learning_rate = 0.04 : 
    
# subsample 만 조정
  
subsample = 0.5 : 

    
# colsample_bytree 만 조정

colsample_bytree = 0.7 : 


# gamma 만 조정

gamma = 0.3 : 
    
# min_child_weight 만 조정

min_child_weight = 9 : 0.72810
    
# seed만 조정  

seed = 30 : 0.72804
    
    
features_1

max_depth = 12, n_estimators = 300, min_child_weight = 8, gamma = 0.4,
colsample_bytree = 0.6, nthread = 5,  seed = 25, learning_rate = 0.05, subsample = 0.9  : 0.42647
    
    
features_2

max_depth = 12, n_estimators = 300, min_child_weight = 8, gamma = 0.4,
colsample_bytree = 0.6, nthread = 5,  seed = 25, learning_rate = 0.05, subsample = 0.9  : 0.72881
    
  
features_3

max_depth = 12, n_estimators = 300, min_child_weight = 8, gamma = 0.4,
colsample_bytree = 0.6, nthread = 5,  seed = 25, learning_rate = 0.05, subsample = 0.9  : 0.42605
    
    
features_4

max_depth = 12, n_estimators = 300, min_child_weight = 8, gamma = 0.4,
colsample_bytree = 0.6, nthread = 5,  seed = 25, learning_rate = 0.05, subsample = 0.9  : 0.42681 
    
    
features_5

max_depth = 12, n_estimators = 300, min_child_weight = 8, gamma = 0.4,
colsample_bytree = 0.6, nthread = 5,  seed = 25, learning_rate = 0.05, subsample = 0.9  : 0.43740 
   

features_6

max_depth = 12, n_estimators = 300, min_child_weight = 8, gamma = 0.4,
colsample_bytree = 0.6, nthread = 5,  seed = 25, learning_rate = 0.05, subsample = 0.9  : 0.45226

SyntaxError: invalid syntax (<ipython-input-25-ec500b9e0b63>, line 4)

####   
####   
## 7. New York City Taxi Trip Duration - Machine Learning의 결론

In [None]:
#### ◎  
#### ◎ 