In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import plot_importance, plot_tree
plt.style.use('fivethirtyeight')
import pickle
from matplotlib.pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 150, 6

# Load train.csv file

In [2]:
# Parser for the 'day/month/year hour:minute' strings stored in train.csv.
# pd.to_datetime is used instead of the pd.datetime alias, which pandas
# deprecated in 1.0 and removed in 2.0.
dateparse = lambda dates: pd.to_datetime(dates, format='%d/%m/%Y %H:%M')
# The file is read without date parsing, so 'date' is still a column of strings here.
df = pd.read_csv('./train.csv')
df['date']

0           1/1/2017 0:00
1           1/1/2017 1:00
2           1/1/2017 2:00
3           1/1/2017 3:00
4           1/1/2017 4:00
               ...       
14001    31/12/2018 12:00
14002    31/12/2018 15:00
14003    31/12/2018 16:00
14004    31/12/2018 18:00
14005    31/12/2018 20:00
Name: date, Length: 14006, dtype: object

# Load feature.csv (daily weather data)

In [3]:
# Parser for the 'day/month/year' strings stored in feature.csv.
# pd.to_datetime is used instead of the pd.datetime alias, which pandas
# deprecated in 1.0 and removed in 2.0.
dateparse_weather = lambda dates: pd.to_datetime(dates, format='%d/%m/%Y')

# df_weather = pd.read_csv('./weather.csv',parse_dates=['date'], date_parser=dateparse_weather)
# The file is read without date parsing, so 'date' is still a column of strings here.
df_weather = pd.read_csv('./feature.csv')
df_weather['date']

0        1/1/2017
1        2/1/2017
2        3/1/2017
3        4/1/2017
4        5/1/2017
          ...    
725    27/12/2018
726    28/12/2018
727    29/12/2018
728    30/12/2018
729    31/12/2018
Name: date, Length: 730, dtype: object

In [4]:
# Expose the raw date string under the column name used as the merge key.
df_weather = df_weather.assign(join_weather=df_weather['date'])
df_weather

Unnamed: 0,date,precipitation,windSpeed,join_weather
0,1/1/2017,0.0,34.2,1/1/2017
1,2/1/2017,0.0,17.6,2/1/2017
2,3/1/2017,0.0,26.1,3/1/2017
3,4/1/2017,0.0,27.7,4/1/2017
4,5/1/2017,0.0,14.3,5/1/2017
...,...,...,...,...
725,27/12/2018,0.4,27.2,27/12/2018
726,28/12/2018,0.4,34.8,28/12/2018
727,29/12/2018,0.4,40.3,29/12/2018
728,30/12/2018,0.4,37.7,30/12/2018


# Parse the date 

In [5]:
df['join_weather'] = df['date']
def parsedate(x):
    """Return the text of ``x`` that precedes its first space.

    For a 'date time' string this is the date part; a string without any
    space is returned unchanged.
    """
    date_part, _, _ = x.partition(' ')
    return date_part
# Reduce every 'date time' string in the key column to its date part.
df['join_weather'] = df['join_weather'].map(parsedate)

# Convert df['date'] to timestamps

In [6]:
# Parse the 'day/month/year hour:minute' strings in one vectorised call.
# This replaces a per-row pd.datetime.strptime: the pd.datetime alias was
# deprecated in pandas 1.0 and removed in 2.0. The explicit format keeps the
# day-first interpretation, and re-running the cell on an already parsed
# column no longer raises.
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y %H:%M')

  """Entry point for launching an IPython kernel.


In [7]:
df['join_weather'][0] == df_weather['join_weather'][0]

True

In [8]:
df_weather['join_weather'].apply(lambda x : type(x))

0      <class 'str'>
1      <class 'str'>
2      <class 'str'>
3      <class 'str'>
4      <class 'str'>
           ...      
725    <class 'str'>
726    <class 'str'>
727    <class 'str'>
728    <class 'str'>
729    <class 'str'>
Name: join_weather, Length: 730, dtype: object

In [9]:
# Remove the 'date' column from the weather frame.
# Rebinding the result (instead of inplace=True) together with
# errors='ignore' lets this cell be re-run without a KeyError once the
# column is already gone.
df_weather = df_weather.drop(columns=['date'], errors='ignore')

In [10]:
df_weather

Unnamed: 0,precipitation,windSpeed,join_weather
0,0.0,34.2,1/1/2017
1,0.0,17.6,2/1/2017
2,0.0,26.1,3/1/2017
3,0.0,27.7,4/1/2017
4,0.0,14.3,5/1/2017
...,...,...,...
725,0.4,27.2,27/12/2018
726,0.4,34.8,28/12/2018
727,0.4,40.3,29/12/2018
728,0.4,37.7,30/12/2018


# join df and df_weather

In [11]:
# df_joined = df.set_index('join_weather').join(df_weather.set_index('join_weather'))
# Attach the weather columns to every row of df that shares the same
# 'join_weather' key. validate='many_to_one' makes pandas raise a MergeError
# if a key occurs more than once in df_weather, which would otherwise
# silently duplicate rows of df in the result.
df_joined = pd.merge(left=df, right=df_weather, how='inner', on='join_weather', validate='many_to_one')


In [12]:
df_joined

Unnamed: 0,id,date,speed,join_weather,precipitation,windSpeed
0,0,2017-01-01 00:00:00,43.002930,1/1/2017,0.0,34.2
1,1,2017-01-01 01:00:00,46.118696,1/1/2017,0.0,34.2
2,2,2017-01-01 02:00:00,44.294158,1/1/2017,0.0,34.2
3,3,2017-01-01 03:00:00,41.067468,1/1/2017,0.0,34.2
4,4,2017-01-01 04:00:00,46.448653,1/1/2017,0.0,34.2
...,...,...,...,...,...,...
14001,14001,2018-12-31 12:00:00,19.865269,31/12/2018,0.0,26.8
14002,14002,2018-12-31 15:00:00,17.820375,31/12/2018,0.0,26.8
14003,14003,2018-12-31 16:00:00,12.501851,31/12/2018,0.0,26.8
14004,14004,2018-12-31 18:00:00,15.979319,31/12/2018,0.0,26.8


In [13]:
# Sanity check: the merge must neither drop nor reorder rows.
# Prints the position of every row whose date differs between df_joined and
# df, and additionally reports a changed row count (the inner merge drops
# rows whose key has no match, which a position-by-position comparison over
# len(df_joined) alone would not reveal).
joined_dates = df_joined['date'].to_numpy()
source_dates = df['date'].to_numpy()
if len(joined_dates) != len(source_dates):
    print('row count changed by merge: %d -> %d' % (len(source_dates), len(joined_dates)))
overlap = min(len(joined_dates), len(source_dates))
for i in np.flatnonzero(joined_dates[:overlap] != source_dates[:overlap]):
    print(i)

In [14]:
df_joined.to_csv('df_join.csv')


In [15]:
# change 4
# Replace the speed column by its natural logarithm.
# NOTE: the cell overwrites the column it reads, so running it a second time
# takes the logarithm of the already transformed values.
df_joined['speed'] = df_joined['speed'].transform(np.log)



In [16]:
df_joined

Unnamed: 0,id,date,speed,join_weather,precipitation,windSpeed
0,0,2017-01-01 00:00:00,3.761268,1/1/2017,0.0,34.2
1,1,2017-01-01 01:00:00,3.831218,1/1/2017,0.0,34.2
2,2,2017-01-01 02:00:00,3.790853,1/1/2017,0.0,34.2
3,3,2017-01-01 03:00:00,3.715216,1/1/2017,0.0,34.2
4,4,2017-01-01 04:00:00,3.838347,1/1/2017,0.0,34.2
...,...,...,...,...,...,...
14001,14001,2018-12-31 12:00:00,2.988973,31/12/2018,0.0,26.8
14002,14002,2018-12-31 15:00:00,2.880342,31/12/2018,0.0,26.8
14003,14003,2018-12-31 16:00:00,2.525877,31/12/2018,0.0,26.8
14004,14004,2018-12-31 18:00:00,2.771295,31/12/2018,0.0,26.8


In [17]:
# change!!!!!!
# Mean and population standard deviation (ddof=0) of the speed column;
# both values are reused later to undo the scaling of the predictions.
avg = df_joined['speed'].mean()
print('avg = ', avg)
std = df_joined['speed'].std(ddof=0)
print('std = ',std)

avg =  3.378492406001744
std =  0.506381512763295


# Normalization

In [18]:
# Standardise the speed column with the avg/std computed above: (x - avg) / std.
df_joined['speed'] = df_joined['speed'].sub(avg).div(std)
df_joined

Unnamed: 0,id,date,speed,join_weather,precipitation,windSpeed
0,0,2017-01-01 00:00:00,0.755904,1/1/2017,0.0,34.2
1,1,2017-01-01 01:00:00,0.894041,1/1/2017,0.0,34.2
2,2,2017-01-01 02:00:00,0.814327,1/1/2017,0.0,34.2
3,3,2017-01-01 03:00:00,0.664961,1/1/2017,0.0,34.2
4,4,2017-01-01 04:00:00,0.908120,1/1/2017,0.0,34.2
...,...,...,...,...,...,...
14001,14001,2018-12-31 12:00:00,-0.769221,31/12/2018,0.0,26.8
14002,14002,2018-12-31 15:00:00,-0.983744,31/12/2018,0.0,26.8
14003,14003,2018-12-31 16:00:00,-1.683742,31/12/2018,0.0,26.8
14004,14004,2018-12-31 18:00:00,-1.199090,31/12/2018,0.0,26.8


In [19]:
# df = pd.read_csv('./train.csv', parse_dates=['date'], index_col='date', date_parser=dateparse)
# From here on `df` is the joined frame indexed by its 'date' column.
df = df_joined.set_index('date')
df

Unnamed: 0_level_0,id,speed,join_weather,precipitation,windSpeed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-01 00:00:00,0,0.755904,1/1/2017,0.0,34.2
2017-01-01 01:00:00,1,0.894041,1/1/2017,0.0,34.2
2017-01-01 02:00:00,2,0.814327,1/1/2017,0.0,34.2
2017-01-01 03:00:00,3,0.664961,1/1/2017,0.0,34.2
2017-01-01 04:00:00,4,0.908120,1/1/2017,0.0,34.2
...,...,...,...,...,...
2018-12-31 12:00:00,14001,-0.769221,31/12/2018,0.0,26.8
2018-12-31 15:00:00,14002,-0.983744,31/12/2018,0.0,26.8
2018-12-31 16:00:00,14003,-1.683742,31/12/2018,0.0,26.8
2018-12-31 18:00:00,14004,-1.199090,31/12/2018,0.0,26.8


In [20]:
df[0:len(df)]

Unnamed: 0_level_0,id,speed,join_weather,precipitation,windSpeed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-01 00:00:00,0,0.755904,1/1/2017,0.0,34.2
2017-01-01 01:00:00,1,0.894041,1/1/2017,0.0,34.2
2017-01-01 02:00:00,2,0.814327,1/1/2017,0.0,34.2
2017-01-01 03:00:00,3,0.664961,1/1/2017,0.0,34.2
2017-01-01 04:00:00,4,0.908120,1/1/2017,0.0,34.2
...,...,...,...,...,...
2018-12-31 12:00:00,14001,-0.769221,31/12/2018,0.0,26.8
2018-12-31 15:00:00,14002,-0.983744,31/12/2018,0.0,26.8
2018-12-31 16:00:00,14003,-1.683742,31/12/2018,0.0,26.8
2018-12-31 18:00:00,14004,-1.199090,31/12/2018,0.0,26.8


### Cross-validation

In [21]:
# Split df into `fold_num` contiguous blocks of rows. Fold k validates on
# block k and trains on all remaining rows; the last block also takes the
# rows left over by the integer division.
fold_num = 10
fold_len = len(df) // fold_num
data_train = []
data_valid = []
for k in range(fold_num):
    start = k * fold_len
    end = len(df) if k == fold_num - 1 else start + fold_len
    data_valid.append(df[start:end])

    head, tail = df[0:start], df[end:len(df)]
    if k == 0:
        # Nothing precedes the first block.
        data_train.append(tail)
    elif k == fold_num - 1:
        # Nothing follows the last block.
        data_train.append(head)
    else:
        data_train.append(pd.concat([head, tail]))
        

In [22]:
print(len(data_train))
print(len(data_valid))

10
10


In [23]:
def create_features(df, label=None):
    """Build the calendar + weather feature matrix for a timestamp-indexed frame.

    The calendar features are derived from ``df.index``; 'precipitation' and
    'windSpeed' are taken from the columns of the same name. The input frame
    is not modified: the features are added to a private copy, so passing a
    slice of a larger frame does not trigger pandas' SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by a DatetimeIndex, with the columns 'precipitation'
        and 'windSpeed' (and ``label``, when one is given).
    label : str, optional
        Name of the target column. When truthy, that column is returned
        next to the features.

    Returns
    -------
    X : pandas.DataFrame
        Columns, in order: hour, day_of_week, quarter, month, day_of_year,
        day_of_month, week_of_year, precipitation, windSpeed.
    (X, y) : tuple
        Returned instead of ``X`` when ``label`` is given; ``y`` is
        ``df[label]``.
    """
    frame = df.copy()
    stamps = frame.index

    frame['hour'] = stamps.hour
    frame['day_of_week'] = stamps.dayofweek
    frame['quarter'] = stamps.quarter
    frame['month'] = stamps.month
    frame['day_of_year'] = stamps.dayofyear
    frame['day_of_month'] = stamps.day
    # `weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is its replacement. Fall back to the old attribute
    # on pandas versions that predate isocalendar().
    if hasattr(stamps, 'isocalendar'):
        frame['week_of_year'] = stamps.isocalendar().week.astype('int64').to_numpy()
    else:
        frame['week_of_year'] = stamps.weekofyear

    X = frame[['hour', 'day_of_week', 'quarter', 'month', 'day_of_year', 'day_of_month', 'week_of_year','precipitation','windSpeed']]
    # X = df[['hour', 'day_of_week', 'day_of_year', 'day_of_month', 'week_of_year']]
    if label:
        y = df[label]
        return X, y
    return X

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import tree
import lightgbm as lgb


# Train one LightGBM regressor per fold, score it on that fold's validation
# block, and keep on disk the model with the lowest validation MSE.
best_i = 0
loss = 9999999
# Placeholders kept from the original cell; the loop below does not use them.
max_train = np.zeros(fold_num, dtype = int)
min_train = np.zeros(fold_num, dtype = int)
max_test = np.zeros(fold_num, dtype = int)
min_test = np.zeros(fold_num, dtype = int)
for i in range(fold_num):
    X_train, y_train = create_features(data_train[i], label='speed')
    X_test, y_test = create_features(data_valid[i], label='speed')
    #xgboost
#     reg = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05)
#     reg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=10)
#     y_pred = reg.predict(X_test)

    # lgbboost
    # NOTE(review): 'num_trees' and 'iterations' both look like boosting-round
    # settings with different values — confirm which one LightGBM honours.
    param = {'max_depth': 6,'num_leaves':31, 'num_trees':80, 'objective':'regression', 'iterations':1000, 'learning_rate':0.1, 'metric':'mse'}
    # NOTE(review): `silent` may not be accepted by lgb.Dataset in newer
    # LightGBM releases — confirm against the installed version.
    data_train2 = lgb.Dataset(X_train, y_train, silent=True)
    reg_lgb = lgb.train(param, data_train2)
    y_pred = reg_lgb.predict(X_test)

    # Mean squared error on the validation block (vectorised instead of the
    # Python-level sum over every element).
    new_loss = np.mean((y_pred - y_test) ** 2)
    print("=================================")
    print("(%d) MSE:%.8f"%(i, new_loss))
    if i==0 or new_loss < loss:
#         pickle.dump(reg, open("./reg_best.dat", "wb"))
        # The context manager closes (and flushes) the file even if
        # pickling fails; the bare open() used before never closed it.
        with open("./lgb_reg2feature_best.dat", "wb") as model_file:
            pickle.dump(reg_lgb, model_file)
        loss = new_loss
        best_i = i
print(best_i)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 794
[LightGBM] [Info] Number of data points in the train set: 12606, number of used features: 9
[LightGBM] [Info] Start training from score -0.006542
(0) MSE:0.17323950
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 788
[LightGBM] [Info] Number of data points in the train set: 12606, number of used features: 9
[LightGBM] [Info] Start training from score -0.003124
(1) MSE:0.13175249
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 12606, number of used features: 9
[LightGBM] [Info] Start training from score 0.002972
(2) MSE:0.12863314
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 778
[LightGBM] [Info] Number of data points in the train set: 12606, number of use

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 745
[LightGBM] [Info] Number of data points in the train set: 12606, number of used features: 9
[LightGBM] [Info] Start training from score 0.002134
(8) MSE:0.19533012
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 799
[LightGBM] [Info] Number of data points in the train set: 12600, number of used features: 9
[LightGBM] [Info] Start training from score 0.000733
(9) MSE:0.11944582
5


In [25]:
# Reload the model saved by the training loop. The context manager closes
# the file handle, which the bare open() used before left open.
# NOTE: pickle.load executes code embedded in the file; only load files
# written by this notebook.
with open("./lgb_reg2feature_best.dat", "rb") as model_file:
    reg_reload = pickle.load(model_file)
# NOTE(review): data_train[1] is the *training* split of fold 1, so this is
# not a held-out score — confirm that this is the intended evaluation set.
X_test, y_test = create_features(data_train[1], label='speed')

y_pred = reg_reload.predict(X_test)#*(max_train[best_i] - min_train[best_i])+min_train[best_i]

# Mean squared error in the normalised log-speed space.
loss = np.mean((y_pred - y_test) ** 2)


print(loss)

0.09829737773243206


  # Remove the CWD from sys.path while we load stuff.


# Load test data and create features

In [26]:
df_test = pd.read_csv('./test.csv')

In [27]:
df_test

Unnamed: 0,id,date
0,0,1/1/2018 2:00
1,1,1/1/2018 5:00
2,2,1/1/2018 7:00
3,3,1/1/2018 8:00
4,4,1/1/2018 10:00
...,...,...
3499,3499,31/12/2018 17:00
3500,3500,31/12/2018 19:00
3501,3501,31/12/2018 21:00
3502,3502,31/12/2018 22:00


In [28]:
# Merge key for the weather join: the text before the first space of each
# 'date time' string. The vectorised .str accessor replaces a second,
# duplicate definition of parsedate() that this cell used to carry.
df_test['join_weather'] = df_test['date'].str.split(' ').str[0]
df_test

Unnamed: 0,id,date,join_weather
0,0,1/1/2018 2:00,1/1/2018
1,1,1/1/2018 5:00,1/1/2018
2,2,1/1/2018 7:00,1/1/2018
3,3,1/1/2018 8:00,1/1/2018
4,4,1/1/2018 10:00,1/1/2018
...,...,...,...
3499,3499,31/12/2018 17:00,31/12/2018
3500,3500,31/12/2018 19:00,31/12/2018
3501,3501,31/12/2018 21:00,31/12/2018
3502,3502,31/12/2018 22:00,31/12/2018


In [29]:
# Parse the 'day/month/year hour:minute' strings in one vectorised call.
# This replaces a per-row pd.datetime.strptime: the pd.datetime alias was
# deprecated in pandas 1.0 and removed in 2.0. The explicit format keeps the
# day-first interpretation, and re-running the cell on an already parsed
# column no longer raises.
df_test['date'] = pd.to_datetime(df_test['date'], format='%d/%m/%Y %H:%M')

  """Entry point for launching an IPython kernel.


In [30]:
# Attach the weather columns to every test row that shares the same
# 'join_weather' key. validate='many_to_one' makes pandas raise a MergeError
# if a key occurs more than once in df_weather, which would otherwise
# silently duplicate test rows (and misalign the submission ids).
df_test_joined = pd.merge(left=df_test, right=df_weather, how='inner', on='join_weather', validate='many_to_one')

# Ensure the merge did not change the row order

In [31]:
# Sanity check: the merge must neither drop nor reorder test rows.
# Prints the position of every row whose date differs between
# df_test_joined and df_test, and additionally reports a changed row count
# (the inner merge drops rows whose key has no match, which a
# position-by-position comparison over len(df_test_joined) alone would not
# reveal).
joined_dates = df_test_joined['date'].to_numpy()
source_dates = df_test['date'].to_numpy()
if len(joined_dates) != len(source_dates):
    print('row count changed by merge: %d -> %d' % (len(source_dates), len(joined_dates)))
overlap = min(len(joined_dates), len(source_dates))
for i in np.flatnonzero(joined_dates[:overlap] != source_dates[:overlap]):
    print(i)

In [32]:
df_test_joined = df_test_joined.set_index('date')

In [33]:
df_test_joined

Unnamed: 0_level_0,id,join_weather,precipitation,windSpeed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01 02:00:00,0,1/1/2018,0.0,28.3
2018-01-01 05:00:00,1,1/1/2018,0.0,28.3
2018-01-01 07:00:00,2,1/1/2018,0.0,28.3
2018-01-01 08:00:00,3,1/1/2018,0.0,28.3
2018-01-01 10:00:00,4,1/1/2018,0.0,28.3
...,...,...,...,...
2018-12-31 17:00:00,3499,31/12/2018,0.0,26.8
2018-12-31 19:00:00,3500,31/12/2018,0.0,26.8
2018-12-31 21:00:00,3501,31/12/2018,0.0,26.8
2018-12-31 22:00:00,3502,31/12/2018,0.0,26.8


In [34]:
df_test_joined.to_csv('./df_test_joined.csv')

In [35]:
test_features = create_features(df_test_joined)

  # Remove the CWD from sys.path while we load stuff.


In [36]:
test_features

Unnamed: 0_level_0,hour,day_of_week,quarter,month,day_of_year,day_of_month,week_of_year,precipitation,windSpeed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-01 02:00:00,2,0,1,1,1,1,1,0.0,28.3
2018-01-01 05:00:00,5,0,1,1,1,1,1,0.0,28.3
2018-01-01 07:00:00,7,0,1,1,1,1,1,0.0,28.3
2018-01-01 08:00:00,8,0,1,1,1,1,1,0.0,28.3
2018-01-01 10:00:00,10,0,1,1,1,1,1,0.0,28.3
...,...,...,...,...,...,...,...,...,...
2018-12-31 17:00:00,17,0,4,12,365,31,1,0.0,26.8
2018-12-31 19:00:00,19,0,4,12,365,31,1,0.0,26.8
2018-12-31 21:00:00,21,0,4,12,365,31,1,0.0,26.8
2018-12-31 22:00:00,22,0,4,12,365,31,1,0.0,26.8


In [37]:
print('avg = ', avg)
print('std = ',std)

avg =  3.378492406001744
std =  0.506381512763295


In [38]:
# Predict in the normalised space, then undo the (x - avg) / std scaling.
y_pred_test = reg_reload.predict(test_features)
y_pred_test = y_pred_test * std + avg
y_pred_test

array([3.89047164, 3.87530504, 3.55386745, ..., 3.79180715, 3.7141735 ,
       3.77005882])

In [39]:
# Undo the np.log transform that was applied to the speed column before
# training, so the predictions are back on the original speed scale.
y_pred_test_final = np.exp(y_pred_test)
print('y_pred_test_final',y_pred_test_final)

y_pred_test_final [48.93396042 48.19739805 34.94821697 ... 44.33645069 41.02466631
 43.38261641]


In [40]:
# Build the submission frame: one predicted speed per test id.
# The ids are taken from the 'id' column carried through the merge instead
# of being regenerated as 0..n-1, so each prediction stays paired with the
# row it was computed from; this also stops the cell from rebinding the
# builtin name `id`.
ans = pd.DataFrame({'id': df_test_joined['id'].to_numpy(), 'speed': y_pred_test_final}).set_index('id')
ans

Unnamed: 0_level_0,speed
id,Unnamed: 1_level_1
0,48.933960
1,48.197398
2,34.948217
3,23.573881
4,33.061345
...,...
3499,14.022032
3500,25.532709
3501,44.336451
3502,41.024666


In [41]:
# ans.to_csv('./xgbresult.csv')
ans.to_csv('./lgbresult.csv')