In [1]:
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pickle
import os
import glob
import numpy as np
import time

In [2]:
# Load the per-day training CSVs (days 1-5) and stack them row-wise into one frame.
# Every file is read first and concatenated once: calling pd.concat inside the
# loop re-copies the rows accumulated so far on each iteration.
daily_frames = [
    pd.read_csv('../dataset/wind_data_day' + str(day) + '_max1.csv')
    for day in range(1, 6)
]
df = pd.concat(daily_frames, axis=0)
del daily_frames  # release the per-day frames; `df` owns its own copy of the data
df.shape

(20763720, 14)

In [3]:
# Features: the `hour` column plus the ten `predict_1` .. `predict_10` columns.
# Target: the `real` column.
cols = ['hour'] + ['predict_' + str(i) for i in range(1, 11)]
X = df[cols].values
y = df['real'].values
# NOTE: `df` is dropped to save memory, so this cell cannot be re-run without
# re-running the loading cell first.
del df

# Hold out 30% of the rows. The seed is pinned so the split -- and therefore
# every validation score reported during training -- is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
del X, y

# Short display names, in the same order as `cols`.
feature_name = ['hour', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']
lgb_train = lgb.Dataset(X_train, label=y_train)
# `reference=lgb_train` ties the validation set to the training Dataset.
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

In [4]:
# Hyper-parameters for a gradient-boosted regression model scored with L2 (MSE).
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 550,
    'learning_rate': 0.6,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
}

print ('start training')

# Train for at most 1500 rounds, stopping once the held-out L2 has not improved
# for 10 consecutive rounds. Early stopping is requested through the callback
# API: the `early_stopping_rounds=` keyword of lgb.train was removed in
# LightGBM 4.0, whereas the callback is accepted by both old and new releases.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1500,
                valid_sets=[lgb_eval],  # held-out split (not the training data)
                feature_name=feature_name,
                categorical_feature=[0],  # column 0 ('hour') is treated as categorical
                callbacks=[lgb.early_stopping(stopping_rounds=10)])

start training




[1]	valid_0's l2: 5.75944
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's l2: 1.71782
[3]	valid_0's l2: 1.03023
[4]	valid_0's l2: 0.896159
[5]	valid_0's l2: 0.854116
[6]	valid_0's l2: 0.83376
[7]	valid_0's l2: 0.82144
[8]	valid_0's l2: 0.804788
[9]	valid_0's l2: 0.795526
[10]	valid_0's l2: 0.785972
[11]	valid_0's l2: 0.77655
[12]	valid_0's l2: 0.772068
[13]	valid_0's l2: 0.763267
[14]	valid_0's l2: 0.757132
[15]	valid_0's l2: 0.751442
[16]	valid_0's l2: 0.747148
[17]	valid_0's l2: 0.742817
[18]	valid_0's l2: 0.738887
[19]	valid_0's l2: 0.735464
[20]	valid_0's l2: 0.730244
[21]	valid_0's l2: 0.726925
[22]	valid_0's l2: 0.724375
[23]	valid_0's l2: 0.722188
[24]	valid_0's l2: 0.719731
[25]	valid_0's l2: 0.71737
[26]	valid_0's l2: 0.713532
[27]	valid_0's l2: 0.711017
[28]	valid_0's l2: 0.70824
[29]	valid_0's l2: 0.706334
[30]	valid_0's l2: 0.703188
[31]	valid_0's l2: 0.700701
[32]	valid_0's l2: 0.698016
[33]	valid_0's l2: 0.696615
[34]	valid_0's l2: 0.694144
[35

[287]	valid_0's l2: 0.524718
[288]	valid_0's l2: 0.524296
[289]	valid_0's l2: 0.524154
[290]	valid_0's l2: 0.524038
[291]	valid_0's l2: 0.52368
[292]	valid_0's l2: 0.523439
[293]	valid_0's l2: 0.52323
[294]	valid_0's l2: 0.523167
[295]	valid_0's l2: 0.52265
[296]	valid_0's l2: 0.52253
[297]	valid_0's l2: 0.522241
[298]	valid_0's l2: 0.521913
[299]	valid_0's l2: 0.521551
[300]	valid_0's l2: 0.521304
[301]	valid_0's l2: 0.520974
[302]	valid_0's l2: 0.52064
[303]	valid_0's l2: 0.520292
[304]	valid_0's l2: 0.519981
[305]	valid_0's l2: 0.519834
[306]	valid_0's l2: 0.519535
[307]	valid_0's l2: 0.51945
[308]	valid_0's l2: 0.519209
[309]	valid_0's l2: 0.518928
[310]	valid_0's l2: 0.518835
[311]	valid_0's l2: 0.518486
[312]	valid_0's l2: 0.5184
[313]	valid_0's l2: 0.518286
[314]	valid_0's l2: 0.517975
[315]	valid_0's l2: 0.51773
[316]	valid_0's l2: 0.517613
[317]	valid_0's l2: 0.517302
[318]	valid_0's l2: 0.517128
[319]	valid_0's l2: 0.516837
[320]	valid_0's l2: 0.516534
[321]	valid_0's l2: 0.5

[571]	valid_0's l2: 0.478815
[572]	valid_0's l2: 0.478726
[573]	valid_0's l2: 0.478662
[574]	valid_0's l2: 0.478583
[575]	valid_0's l2: 0.478474
[576]	valid_0's l2: 0.478342
[577]	valid_0's l2: 0.478136
[578]	valid_0's l2: 0.478062
[579]	valid_0's l2: 0.477991
[580]	valid_0's l2: 0.47794
[581]	valid_0's l2: 0.477763
[582]	valid_0's l2: 0.477624
[583]	valid_0's l2: 0.477548
[584]	valid_0's l2: 0.477485
[585]	valid_0's l2: 0.477396
[586]	valid_0's l2: 0.477223
[587]	valid_0's l2: 0.477146
[588]	valid_0's l2: 0.477092
[589]	valid_0's l2: 0.477075
[590]	valid_0's l2: 0.476932
[591]	valid_0's l2: 0.476814
[592]	valid_0's l2: 0.476744
[593]	valid_0's l2: 0.476637
[594]	valid_0's l2: 0.476536
[595]	valid_0's l2: 0.476512
[596]	valid_0's l2: 0.476388
[597]	valid_0's l2: 0.476219
[598]	valid_0's l2: 0.476164
[599]	valid_0's l2: 0.476089
[600]	valid_0's l2: 0.475975
[601]	valid_0's l2: 0.47591
[602]	valid_0's l2: 0.475775
[603]	valid_0's l2: 0.475651
[604]	valid_0's l2: 0.475627
[605]	valid_0's 

[855]	valid_0's l2: 0.456945
[856]	valid_0's l2: 0.456883
[857]	valid_0's l2: 0.45679
[858]	valid_0's l2: 0.456663
[859]	valid_0's l2: 0.456626
[860]	valid_0's l2: 0.456625
[861]	valid_0's l2: 0.456526
[862]	valid_0's l2: 0.456444
[863]	valid_0's l2: 0.456343
[864]	valid_0's l2: 0.456288
[865]	valid_0's l2: 0.456253
[866]	valid_0's l2: 0.45618
[867]	valid_0's l2: 0.45613
[868]	valid_0's l2: 0.456093
[869]	valid_0's l2: 0.456073
[870]	valid_0's l2: 0.456032
[871]	valid_0's l2: 0.456018
[872]	valid_0's l2: 0.455999
[873]	valid_0's l2: 0.455898
[874]	valid_0's l2: 0.455873
[875]	valid_0's l2: 0.455796
[876]	valid_0's l2: 0.455732
[877]	valid_0's l2: 0.455718
[878]	valid_0's l2: 0.45569
[879]	valid_0's l2: 0.455603
[880]	valid_0's l2: 0.455533
[881]	valid_0's l2: 0.45547
[882]	valid_0's l2: 0.455448
[883]	valid_0's l2: 0.455366
[884]	valid_0's l2: 0.455318
[885]	valid_0's l2: 0.455221
[886]	valid_0's l2: 0.455135
[887]	valid_0's l2: 0.455095
[888]	valid_0's l2: 0.45505
[889]	valid_0's l2: 

[1135]	valid_0's l2: 0.443639
[1136]	valid_0's l2: 0.443584
[1137]	valid_0's l2: 0.443518
[1138]	valid_0's l2: 0.443494
[1139]	valid_0's l2: 0.443466
[1140]	valid_0's l2: 0.443453
[1141]	valid_0's l2: 0.443407
[1142]	valid_0's l2: 0.443396
[1143]	valid_0's l2: 0.443393
[1144]	valid_0's l2: 0.443337
[1145]	valid_0's l2: 0.443289
[1146]	valid_0's l2: 0.443277
[1147]	valid_0's l2: 0.44318
[1148]	valid_0's l2: 0.443119
[1149]	valid_0's l2: 0.443109
[1150]	valid_0's l2: 0.443086
[1151]	valid_0's l2: 0.443011
[1152]	valid_0's l2: 0.442976
[1153]	valid_0's l2: 0.442969
[1154]	valid_0's l2: 0.442948
[1155]	valid_0's l2: 0.442904
[1156]	valid_0's l2: 0.442882
[1157]	valid_0's l2: 0.442863
[1158]	valid_0's l2: 0.442821
[1159]	valid_0's l2: 0.442771
[1160]	valid_0's l2: 0.442672
[1161]	valid_0's l2: 0.442667
[1162]	valid_0's l2: 0.442636
[1163]	valid_0's l2: 0.442565
[1164]	valid_0's l2: 0.442524
[1165]	valid_0's l2: 0.442505
[1166]	valid_0's l2: 0.442508
[1167]	valid_0's l2: 0.442459
[1168]	vali

[1409]	valid_0's l2: 0.435433
[1410]	valid_0's l2: 0.435398
[1411]	valid_0's l2: 0.435382
[1412]	valid_0's l2: 0.435338
[1413]	valid_0's l2: 0.435313
[1414]	valid_0's l2: 0.435298
[1415]	valid_0's l2: 0.435291
[1416]	valid_0's l2: 0.435275
[1417]	valid_0's l2: 0.435246
[1418]	valid_0's l2: 0.435211
[1419]	valid_0's l2: 0.435175
[1420]	valid_0's l2: 0.435141
[1421]	valid_0's l2: 0.435114
[1422]	valid_0's l2: 0.4351
[1423]	valid_0's l2: 0.435086
[1424]	valid_0's l2: 0.435074
[1425]	valid_0's l2: 0.435062
[1426]	valid_0's l2: 0.43504
[1427]	valid_0's l2: 0.435036
[1428]	valid_0's l2: 0.435002
[1429]	valid_0's l2: 0.434988
[1430]	valid_0's l2: 0.434966
[1431]	valid_0's l2: 0.434947
[1432]	valid_0's l2: 0.434908
[1433]	valid_0's l2: 0.434877
[1434]	valid_0's l2: 0.434859
[1435]	valid_0's l2: 0.434804
[1436]	valid_0's l2: 0.434763
[1437]	valid_0's l2: 0.434745
[1438]	valid_0's l2: 0.434727
[1439]	valid_0's l2: 0.434722
[1440]	valid_0's l2: 0.43471
[1441]	valid_0's l2: 0.43472
[1442]	valid_0'

In [5]:
# Score days 6-10 with the trained booster, save the per-row predictions as CSV,
# and pickle one xid-by-yid table of predicted values for each hour 3..20.

# Same feature columns (and order) the model was trained on; built once
# instead of once per day.
feature_cols = ['hour'] + ['predict_' + str(i) for i in range(1, 11)]

for day in range(6, 11):
    print ('start day {}'.format(day))
    df_test = pd.read_csv('../dataset/wind_data_day' + str(day) + '_max1.csv')
    # Predict straight into the frame; no `X_test` temporary, so the hold-out
    # array created by the train/test split is not rebound or deleted here.
    df_test['predict_final'] = gbm.predict(df_test[feature_cols].values)
    df_test[['xid', 'yid', 'hour', 'predict_final']].to_csv(
        '../dataset/wind_data_day' + str(day) + '_lgb_all.csv')

    for hour in range(3, 21):
        t1 = time.time()
        # Filter the hour once and pivot the selected rows directly.
        hour_rows = df_test[df_test['hour'] == hour]
        # Rows sharing an (xid, yid) pair are summed, as before.
        pt = hour_rows.pivot_table(index='xid', columns='yid',
                                   values='predict_final', aggfunc='sum')
        with open('../dataset/day' + str(day) + 'hour' + str(hour) + '.pickle', 'wb') as f:
            pickle.dump(pt, f)
        t2 = time.time()
        print ('cost {}s'.format(t2 - t1))

start day 6
cost 0.6106243133544922s
cost 0.4341576099395752s
cost 0.43014097213745117s
cost 0.4582192897796631s
cost 0.4532041549682617s
cost 0.4522061347961426s
cost 0.5534682273864746s
cost 0.46223020553588867s
cost 0.4291391372680664s
cost 0.45320582389831543s
cost 0.4221217632293701s
cost 0.43114614486694336s
cost 0.43114662170410156s
cost 0.4451892375946045s
cost 0.4231276512145996s
cost 0.4421727657318115s
cost 0.4371640682220459s
cost 0.42513179779052734s
start day 7
cost 0.5364265441894531s
cost 0.4411740303039551s
cost 0.4501991271972656s
cost 0.48328471183776855s
cost 0.5023360252380371s
cost 0.4852907657623291s
cost 0.7189111709594727s
cost 0.47325873374938965s
cost 0.460223913192749s
cost 0.4842872619628906s
cost 0.6366934776306152s
cost 0.4782717227935791s
cost 0.4682443141937256s
cost 0.47626590728759766s
cost 0.5304107666015625s
cost 0.4612264633178711s
cost 0.4842872619628906s
cost 0.46624135971069336s
start day 8
cost 0.44217538833618164s
cost 0.4501991271972656s
cost

In [6]:
# For each of days 6-10, load the per-hour pickled tables (hours 3..20) and stack
# them along a new third axis, then pickle the resulting 3-D array per day.
for day in range(6, 11):
    dirpath = '../dataset/'
    hourly_matrices = []
    for hour in range(3, 21, 1):
        filename = 'day'+ str(day) +'hour' + str(hour) + '.pickle'
        # These pickles are produced by the prediction cell of this notebook;
        # never point this at files from an untrusted source.
        with open(os.path.join(dirpath, filename), 'rb') as f:
            hourly_matrices.append(np.array(pickle.load(f, encoding='latin1')))

    # One stack at the end replaces the sentinel + concatenate-in-a-loop
    # pattern, which re-copied the growing array on every hour.
    wind_matrix = np.stack(hourly_matrices, axis=2)

    out_dir = '../dataset/day' + str(day)
    # The per-day directory is not created anywhere else in this notebook.
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, 'wind_matrix_lgb.pickle'), 'wb') as f:
        pickle.dump(wind_matrix, f)