In [116]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import  matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

# Pool of candidate integer seeds; a later cell draws one from it with `sample`
# to seed the shuffled cross-validation splitter.
from random import sample
seed_list = list(range(10000))

# Suppress every warning for the rest of the session.
import warnings
warnings.simplefilter('ignore')
# Switch to the data directory so the CSV files can be read by bare file name.
os.chdir('C:\\Users\\yeonjun.in\\Desktop\\연준\\캐글\\제주도\\data')

# Zero-padded YYYYMMDD stamp built from a single clock reading.
# The previous un-padded concatenation was ambiguous (Jan 11 and Nov 1 both gave
# "2019111") and read the clock three times, which could straddle midnight.
TODAY = datetime.now().strftime('%Y%m%d')

In [117]:
# Raw competition files, read from the current working directory
# (train set, test set, and the submission template, in that order).
train, test, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'submission_sample.csv')
)

# The running log of past experiments lives in a separate folder.
os.chdir('C:\\Users\\yeonjun.in\\Desktop\\연준\\캐글\\제주도\\code\\experiment')
experiment_db = pd.read_csv('experiment_DB.csv')

In [46]:
# Column names shared by both frames; the train frame carries one extra
# trailing column, the 18-20h ride count that serves as the target.
_base_columns = [
    'id', 'date', 'bus_route_id', 'in_out', 'station_code', 'station_name',
    'latitude', 'longitude',
    'ride_6_7', 'ride_7_8', 'ride_8_9', 'ride_9_10', 'ride_10_11', 'ride_11_12',
    'takeoff_6_7', 'takeoff_7_8', 'takeoff_8_9', 'takeoff_9_10', 'takeoff_10_11',
    'takeoff_11_12',
]
train.columns = _base_columns + ['ride_18_20']
test.columns = list(_base_columns)

In [47]:
def _add_weekday_dummies(frame):
    """Parse `date` in place and return a new frame with one-hot weekday columns.

    The weekday is stored as a categorical with the fixed categories 0..6, so
    `get_dummies` always emits all seven `weekday_0` .. `weekday_6` columns,
    even when a frame does not contain every day of the week. Without this,
    a missing weekday would surface later as a KeyError when selecting features.
    """
    frame['date'] = pd.to_datetime(frame['date'])
    frame['weekday'] = pd.Categorical(frame['date'].dt.weekday,
                                      categories=list(range(7)))
    return pd.get_dummies(frame, columns=['weekday'])


train = _add_weekday_dummies(train)
test = _add_weekday_dummies(test)

In [48]:
# Binary-encode the route type: '시내' -> 0, '시외' -> 1.
# Any value not listed in the mapping becomes NaN (behaviour of Series.map).
_in_out_codes = {'시내': 0, '시외': 1}
for _frame in (train, test):
    _frame['in_out'] = _frame['in_out'].map(_in_out_codes)

In [49]:
import geopy.distance

coords_jejusi = (33.500770, 126.522761)    # latitude, longitude of Jeju City
coords_seoquipo = (33.259429, 126.558217)  # latitude, longitude of Seogwipo City

# `vincenty` was removed in geopy 2.0; use `geodesic` when the installed geopy
# provides it and fall back to `vincenty` only on older releases.
_distance_fn = getattr(geopy.distance, 'geodesic', None) or geopy.distance.vincenty


def _distance_km(frame, origin):
    """Return a list with the distance in km from each row of `frame` to `origin`.

    `frame` must have `latitude` and `longitude` columns; `origin` is a
    (latitude, longitude) tuple. Rows are walked with `zip` instead of
    per-row `.iloc[i]` lookups, which is much cheaper on large frames.
    """
    return [_distance_fn(point, origin).km
            for point in zip(frame['latitude'], frame['longitude'])]


train['dis_jejusi'] = _distance_km(train, coords_jejusi)
train['dis_seoquipo'] = _distance_km(train, coords_seoquipo)

test['dis_jejusi'] = _distance_km(test, coords_jejusi)
test['dis_seoquipo'] = _distance_km(test, coords_seoquipo)

In [55]:
# Total boardings (`ride_*`) and alightings (`takeoff_*`) over the six hourly
# slots between 6h and 12h, added to both frames.
_hour_pairs = [(hour, hour + 1) for hour in range(6, 12)]
_ride_cols = ['ride_{}_{}'.format(lo, hi) for lo, hi in _hour_pairs]
_takeoff_cols = ['takeoff_{}_{}'.format(lo, hi) for lo, hi in _hour_pairs]

for _frame in (train, test):
    _frame['ride_6_12'] = _frame[_ride_cols].sum(axis=1)
    _frame['takeoff_6_12'] = _frame[_takeoff_cols].sum(axis=1)

In [56]:
# Hourly slots that exist as both `ride_<slot>` and `takeoff_<slot>` columns.
_morning_slots = ['6_7', '7_8', '8_9', '9_10', '10_11', '11_12']

# Model inputs, in order: route type and coordinates, hourly rides plus their
# 6-12h total, hourly takeoffs plus their total, weekday one-hots, and the
# distances to the two city centres.
input_var = (
    ['in_out', 'latitude', 'longitude']
    + ['ride_' + slot for slot in _morning_slots] + ['ride_6_12']
    + ['takeoff_' + slot for slot in _morning_slots] + ['takeoff_6_12']
    + ['weekday_' + str(day) for day in range(7)]
    + ['dis_jejusi', 'dis_seoquipo']
)
# Kept as a one-element list, so selecting it yields a single-column DataFrame.
target = ['ride_18_20']

In [57]:
# Feature matrix and single-column target frame for training, plus the
# matching feature matrix for the test set.
X_train, y_train = train[input_var], train[target]
X_test = test[input_var]

In [64]:
NFOLDS = 6
# `sample` returns a one-element list, but scikit-learn's `random_state` must be
# an int (or RandomState / None), so take the drawn integer itself.
random_seed = sample(seed_list, 1)[0]

# Two interchangeable splitters: a time-ordered split and a shuffled stratified split.
tscv = TimeSeriesSplit(max_train_size=None, n_splits=NFOLDS)
stk = StratifiedKFold(n_splits=NFOLDS, random_state=random_seed, shuffle=True)

In [97]:
# Free-text note stored in the 'explanation' column of the experiment log;
# left empty here.
experiment_info = ''

In [130]:
# `start` is only assigned by the training cell further down. On a fresh kernel
# run top to bottom it does not exist yet, so fall back to the current time
# instead of raising NameError; an existing `start` is reused unchanged.
try:
    start
except NameError:
    start = datetime.now()
minute = str(start.minute)+'min'

In [59]:
start = datetime.now()

# Hour / minute tags of the run start; later cells use them in the submission
# file name and in the experiment log.
time = str(start.hour)+'hr'
minute = str(start.minute)+'min'

cv_train = np.zeros(len(y_train))   # out-of-fold prediction for every training row
cv_pred = np.zeros(test.shape[0])   # test predictions, summed per fold and averaged below
fold_scores = []                    # validation RMSE of each fold

tst = X_test[input_var]             # loop-invariant, so selected once up front

# To validate on a time-ordered split instead, iterate over `tscv.split(train)`.
for fold_, (tr_index, vl_index) in enumerate(stk.split(X_train, y_train)):
    print('Fold:', fold_+1)
    tr, vl = X_train.iloc[tr_index], X_train.iloc[vl_index]
    y_tr, y_vl = y_train.iloc[tr_index], y_train.iloc[vl_index]

    # The squared-error criterion is the default. The explicit 'mse' spelling was
    # removed in scikit-learn 1.2, so it is omitted to run on old and new versions.
    rf = RandomForestRegressor(random_state=1217, n_estimators=100)
    # Flatten the single-column target frame to the 1-D shape `fit` expects.
    rf.fit(tr, y_tr.values.ravel())

    pred = rf.predict(vl)

    fold_rmse = np.sqrt(mean_squared_error(y_vl, pred))
    print(fold_rmse)
    fold_scores.append(fold_rmse)

    cv_train[vl_index] += pred
    cv_pred += rf.predict(tst)

    print('-'*40+'\n\n')

# Average the per-fold test predictions.
cv_pred /= NFOLDS

# Overall RMSE of the out-of-fold predictions against the full training target.
vl_error = np.sqrt(mean_squared_error(np.array(y_train).flatten(), cv_train))

print('cv score:')
print(vl_error)

sub['18~20_ride'] = cv_pred

end = datetime.now()

# Elapsed wall-clock time, displayed as the cell's result.
end - start

Fold: 1
2.8753174644073054
----------------------------------------
Fold: 2
3.0306451838366897
----------------------------------------
Fold: 3
2.7955248031319453
----------------------------------------
Fold: 4
2.5793443742928908
----------------------------------------
Fold: 5
2.8278930992691547
----------------------------------------
Fold: 6
2.900931575027954
----------------------------------------
cv score:
2.8382224883251794


In [133]:
# The original literal 'C:\\\Users...' contained a stray backslash, producing a
# `\U` sequence that Python 3 rejects as a truncated unicode escape (SyntaxError).
os.chdir('C:\\Users\\yeonjun.in\\Desktop\\연준\\캐글\\제주도\\submission')
# Write the averaged test predictions into the submission template and save it
# under a name built from the run date and start time.
sub['18~20_ride'] = cv_pred
sub.to_csv(TODAY+'_'+time+'_'+minute+'_'+'sub'+'.csv',index=False)

In [118]:
# One-row summary of this run. The list-wrapped entries fix the frame length at
# one row, and the scalar entries are broadcast to that row.
_run_record = dict(
    date=[TODAY],
    time=[time],
    score=[vl_error],
    feature_list=[input_var],
    model='rf',
    hyper_param=np.nan,
    explanation=experiment_info,
)
experiment_result = pd.DataFrame(_run_record)

In [119]:
# Append this run's row to the previously loaded log and overwrite the CSV.
os.chdir('C:\\Users\\yeonjun.in\\Desktop\\연준\\캐글\\제주도\\code\\experiment')
_updated_log = pd.concat([experiment_db, experiment_result], axis=0)
_updated_log.to_csv('experiment_DB.csv', index=False)