In [1]:
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pickle
import os
import glob
import numpy as np
import time

# CV1
### trainset: day 2, 3, 4, 5; testset: day 1

In [2]:
# Fold CV1: train on days 2-5, validate on day 1.
# '[2-5]' replaces '[2,3,4,5]', whose commas were literal members of the
# character class; sorted() fixes the read order, which glob does not guarantee.
train_paths = sorted(glob.glob('../dataset/wind_data_day[2-5]_max1.csv'))
test_paths = sorted(glob.glob('../dataset/wind_data_day[1]_max1.csv'))
if not train_paths or not test_paths:
    raise FileNotFoundError('CV1: no wind_data_day*_max1.csv files matched under ../dataset')

# Read every file once and concatenate in a single call; growing a DataFrame
# with concat inside the loop re-copies all earlier rows on each iteration.
train_frames = []
for filepath in train_paths:
    print(os.path.basename(filepath))
    train_frames.append(pd.read_csv(filepath))
df_train = pd.concat(train_frames, axis=0)

test_frames = []
for filepath in test_paths:
    print(os.path.basename(filepath))
    test_frames.append(pd.read_csv(filepath))
df_test = pd.concat(test_frames, axis=0)
del train_frames, test_frames

# np.int was only an alias of the builtin int and was removed in NumPy 1.24.
df_train['hour'] = df_train['hour'].astype(int)
df_test['hour'] = df_test['hour'].astype(int)

# 'real' is the regression target; 'xid'/'yid' are dropped from the features.
non_feature_cols = ['real', 'xid', 'yid']
X_train = df_train.drop(non_feature_cols, axis=1).values
y_train = df_train['real'].values
X_test = df_test.drop(non_feature_cols, axis=1).values
y_test = df_test['real'].values
del df_train, df_test

# Shuffle the training rows. One shared permutation keeps features and labels
# aligned without stacking them into a temporary combined array.
shuffle_idx = np.random.permutation(len(y_train))
X_train = X_train[shuffle_idx]
y_train = y_train[shuffle_idx]
del shuffle_idx

# NOTE(review): assumes the remaining CSV columns are exactly these 11, in
# this order -- confirm against the file header.
feature_name = ['hour', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

wind_data_day2_max1.csv
wind_data_day3_max1.csv
wind_data_day4_max1.csv
wind_data_day5_max1.csv
wind_data_day1_max1.csv


In [3]:
# Booster hyper-parameters for fold CV1.
# NOTE(review): num_leaves differs between the folds of this notebook
# (50/100/100/150/200), so the per-fold scores are not directly comparable.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 50,
    'learning_rate': 0.1,
    'feature_fraction': 0.3,
    'bagging_fraction': 0.5,
    'bagging_freq': 10,
}

print('start training')

# Feature metadata is attached to the Dataset rather than passed to
# lgb.train(): the train()-level feature_name/categorical_feature arguments
# are deprecated in recent LightGBM releases.
lgb_train.set_feature_name(feature_name)
lgb_train.set_categorical_feature([0])  # column 0 is 'hour'

# Early stopping goes through the callback API; the early_stopping_rounds
# keyword was removed from lgb.train() in LightGBM 4.0, while the callback is
# available in old and new releases alike.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1500,
                valid_sets=[lgb_eval],  # held-out day 1, not the training data
                callbacks=[lgb.early_stopping(stopping_rounds=10)])

start training




[1]	valid_0's l2: 12.1397
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's l2: 10.0439
[3]	valid_0's l2: 8.32088
[4]	valid_0's l2: 6.96414
[5]	valid_0's l2: 5.85571
[6]	valid_0's l2: 4.91476
[7]	valid_0's l2: 4.14499
[8]	valid_0's l2: 3.58533
[9]	valid_0's l2: 3.10137
[10]	valid_0's l2: 2.71014
[11]	valid_0's l2: 2.37505
[12]	valid_0's l2: 2.11136
[13]	valid_0's l2: 1.89628
[14]	valid_0's l2: 1.73429
[15]	valid_0's l2: 1.59905
[16]	valid_0's l2: 1.48275
[17]	valid_0's l2: 1.3899
[18]	valid_0's l2: 1.31966
[19]	valid_0's l2: 1.26005
[20]	valid_0's l2: 1.21542
[21]	valid_0's l2: 1.17911
[22]	valid_0's l2: 1.1492
[23]	valid_0's l2: 1.12293
[24]	valid_0's l2: 1.1058
[25]	valid_0's l2: 1.09156
[26]	valid_0's l2: 1.07978
[27]	valid_0's l2: 1.06937
[28]	valid_0's l2: 1.06143
[29]	valid_0's l2: 1.05688
[30]	valid_0's l2: 1.05249
[31]	valid_0's l2: 1.04904
[32]	valid_0's l2: 1.0471
[33]	valid_0's l2: 1.04487
[34]	valid_0's l2: 1.0427
[35]	valid_0's l2: 1.04087
[36]	va

In [4]:
# Persist the CV1 booster; the prediction cell reloads it from this path.
model_dir = '../dataset/lgb_cv1'
# exist_ok avoids the check-then-create race of exists() + makedirs().
os.makedirs(model_dir, exist_ok=True)
with open(os.path.join(model_dir, 'wind_predictor.pickle'), 'wb') as f:
    pickle.dump(gbm, f)

# CV2
### trainset: day 1, 3, 4, 5; testset: day 2

In [5]:
# Fold CV2: train on days 1, 3, 4, 5; validate on day 2.
# '[1345]' replaces '[1,3,4,5]', whose commas were literal members of the
# character class; sorted() fixes the read order, which glob does not guarantee.
train_paths = sorted(glob.glob('../dataset/wind_data_day[1345]_max1.csv'))
test_paths = sorted(glob.glob('../dataset/wind_data_day[2]_max1.csv'))
if not train_paths or not test_paths:
    raise FileNotFoundError('CV2: no wind_data_day*_max1.csv files matched under ../dataset')

# Read every file once and concatenate in a single call; growing a DataFrame
# with concat inside the loop re-copies all earlier rows on each iteration.
train_frames = []
for file in train_paths:
    print(os.path.basename(file))
    train_frames.append(pd.read_csv(file))
df_train = pd.concat(train_frames, axis=0)

test_frames = []
for file in test_paths:
    print(os.path.basename(file))
    test_frames.append(pd.read_csv(file))
df_test = pd.concat(test_frames, axis=0)
del train_frames, test_frames

# np.int was only an alias of the builtin int and was removed in NumPy 1.24.
df_train['hour'] = df_train['hour'].astype(int)
df_test['hour'] = df_test['hour'].astype(int)

# 'real' is the regression target; 'xid'/'yid' are dropped from the features.
non_feature_cols = ['real', 'xid', 'yid']
X_train = df_train.drop(non_feature_cols, axis=1).values
y_train = df_train['real'].values
X_test = df_test.drop(non_feature_cols, axis=1).values
y_test = df_test['real'].values
del df_train, df_test

# Shuffle the training rows. One shared permutation keeps features and labels
# aligned without stacking them into a temporary combined array.
shuffle_idx = np.random.permutation(len(y_train))
X_train = X_train[shuffle_idx]
y_train = y_train[shuffle_idx]
del shuffle_idx

# NOTE(review): assumes the remaining CSV columns are exactly these 11, in
# this order -- confirm against the file header.
feature_name = ['hour', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

wind_data_day1_max1.csv
wind_data_day3_max1.csv
wind_data_day4_max1.csv
wind_data_day5_max1.csv
wind_data_day2_max1.csv


In [6]:
# Booster hyper-parameters for fold CV2.
# NOTE(review): num_leaves differs between the folds of this notebook
# (50/100/100/150/200), so the per-fold scores are not directly comparable.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 100,
    'learning_rate': 0.1,
    'feature_fraction': 0.3,
    'bagging_fraction': 0.5,
    'bagging_freq': 10,
}

print('start training')

# Feature metadata is attached to the Dataset rather than passed to
# lgb.train(): the train()-level feature_name/categorical_feature arguments
# are deprecated in recent LightGBM releases.
lgb_train.set_feature_name(feature_name)
lgb_train.set_categorical_feature([0])  # column 0 is 'hour'

# Early stopping goes through the callback API; the early_stopping_rounds
# keyword was removed from lgb.train() in LightGBM 4.0, while the callback is
# available in old and new releases alike.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1500,
                valid_sets=[lgb_eval],  # held-out day 2, not the training data
                callbacks=[lgb.early_stopping(stopping_rounds=10)])

start training




[1]	valid_0's l2: 28.2546
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's l2: 23.2437
[3]	valid_0's l2: 19.2345
[4]	valid_0's l2: 16.0195
[5]	valid_0's l2: 13.3032
[6]	valid_0's l2: 11.0467
[7]	valid_0's l2: 9.28649
[8]	valid_0's l2: 7.84504
[9]	valid_0's l2: 6.64757
[10]	valid_0's l2: 5.70004
[11]	valid_0's l2: 4.96322
[12]	valid_0's l2: 4.31201
[13]	valid_0's l2: 3.81335
[14]	valid_0's l2: 3.41138
[15]	valid_0's l2: 3.08051
[16]	valid_0's l2: 2.81195
[17]	valid_0's l2: 2.61526
[18]	valid_0's l2: 2.43465
[19]	valid_0's l2: 2.30907
[20]	valid_0's l2: 2.18735
[21]	valid_0's l2: 2.09723
[22]	valid_0's l2: 2.02595
[23]	valid_0's l2: 1.97429
[24]	valid_0's l2: 1.93183
[25]	valid_0's l2: 1.89965
[26]	valid_0's l2: 1.87415
[27]	valid_0's l2: 1.85505
[28]	valid_0's l2: 1.84546
[29]	valid_0's l2: 1.83058
[30]	valid_0's l2: 1.82352
[31]	valid_0's l2: 1.81519
[32]	valid_0's l2: 1.8133
[33]	valid_0's l2: 1.80876
[34]	valid_0's l2: 1.80561
[35]	valid_0's l2: 1.80902
[36

In [7]:
# Persist the CV2 booster; the prediction cell reloads it from this path.
model_dir = '../dataset/lgb_cv2'
# exist_ok avoids the check-then-create race of exists() + makedirs().
os.makedirs(model_dir, exist_ok=True)
with open(os.path.join(model_dir, 'wind_predictor.pickle'), 'wb') as f:
    pickle.dump(gbm, f)

# CV3
### trainset: day 1, 2, 4, 5; testset: day 3

In [8]:
# Fold CV3: train on days 1, 2, 4, 5; validate on day 3.
# '[1245]' replaces '[1,2,4,5]', whose commas were literal members of the
# character class; sorted() fixes the read order, which glob does not guarantee.
train_paths = sorted(glob.glob('../dataset/wind_data_day[1245]_max1.csv'))
test_paths = sorted(glob.glob('../dataset/wind_data_day[3]_max1.csv'))
if not train_paths or not test_paths:
    raise FileNotFoundError('CV3: no wind_data_day*_max1.csv files matched under ../dataset')

# Read every file once and concatenate in a single call; growing a DataFrame
# with concat inside the loop re-copies all earlier rows on each iteration.
train_frames = []
for file in train_paths:
    print(os.path.basename(file))
    train_frames.append(pd.read_csv(file))
df_train = pd.concat(train_frames, axis=0)

test_frames = []
for file in test_paths:
    print(os.path.basename(file))
    test_frames.append(pd.read_csv(file))
df_test = pd.concat(test_frames, axis=0)
del train_frames, test_frames

# np.int was only an alias of the builtin int and was removed in NumPy 1.24.
df_train['hour'] = df_train['hour'].astype(int)
df_test['hour'] = df_test['hour'].astype(int)

# 'real' is the regression target; 'xid'/'yid' are dropped from the features.
non_feature_cols = ['real', 'xid', 'yid']
X_train = df_train.drop(non_feature_cols, axis=1).values
y_train = df_train['real'].values
X_test = df_test.drop(non_feature_cols, axis=1).values
y_test = df_test['real'].values
del df_train, df_test

# Shuffle the training rows. One shared permutation keeps features and labels
# aligned without stacking them into a temporary combined array.
shuffle_idx = np.random.permutation(len(y_train))
X_train = X_train[shuffle_idx]
y_train = y_train[shuffle_idx]
del shuffle_idx

# NOTE(review): assumes the remaining CSV columns are exactly these 11, in
# this order -- confirm against the file header.
feature_name = ['hour', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

wind_data_day1_max1.csv
wind_data_day2_max1.csv
wind_data_day4_max1.csv
wind_data_day5_max1.csv
wind_data_day3_max1.csv


In [9]:
# Booster hyper-parameters for fold CV3.
# NOTE(review): num_leaves differs between the folds of this notebook
# (50/100/100/150/200), so the per-fold scores are not directly comparable.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 100,
    'learning_rate': 0.1,
    'feature_fraction': 0.3,
    'bagging_fraction': 0.5,
    'bagging_freq': 10,
}

print('start training')

# Feature metadata is attached to the Dataset rather than passed to
# lgb.train(): the train()-level feature_name/categorical_feature arguments
# are deprecated in recent LightGBM releases.
lgb_train.set_feature_name(feature_name)
lgb_train.set_categorical_feature([0])  # column 0 is 'hour'

# Early stopping goes through the callback API; the early_stopping_rounds
# keyword was removed from lgb.train() in LightGBM 4.0, while the callback is
# available in old and new releases alike.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1500,
                valid_sets=[lgb_eval],  # held-out day 3, not the training data
                callbacks=[lgb.early_stopping(stopping_rounds=10)])

start training




[1]	valid_0's l2: 47.7442
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's l2: 39.0454
[3]	valid_0's l2: 32.0357
[4]	valid_0's l2: 26.1901
[5]	valid_0's l2: 21.5721
[6]	valid_0's l2: 17.6952
[7]	valid_0's l2: 14.522
[8]	valid_0's l2: 11.9288
[9]	valid_0's l2: 9.82064
[10]	valid_0's l2: 8.05616
[11]	valid_0's l2: 6.71671
[12]	valid_0's l2: 5.56501
[13]	valid_0's l2: 4.64846
[14]	valid_0's l2: 3.86736
[15]	valid_0's l2: 3.21628
[16]	valid_0's l2: 2.69797
[17]	valid_0's l2: 2.2592
[18]	valid_0's l2: 1.90795
[19]	valid_0's l2: 1.61883
[20]	valid_0's l2: 1.37434
[21]	valid_0's l2: 1.17716
[22]	valid_0's l2: 1.01933
[23]	valid_0's l2: 0.887141
[24]	valid_0's l2: 0.780027
[25]	valid_0's l2: 0.687335
[26]	valid_0's l2: 0.614321
[27]	valid_0's l2: 0.55591
[28]	valid_0's l2: 0.515893
[29]	valid_0's l2: 0.47349
[30]	valid_0's l2: 0.44032
[31]	valid_0's l2: 0.411673
[32]	valid_0's l2: 0.389363
[33]	valid_0's l2: 0.373081
[34]	valid_0's l2: 0.361123
[35]	valid_0's l2: 0.3

In [10]:
# Persist the CV3 booster; the prediction cell reloads it from this path.
model_dir = '../dataset/lgb_cv3'
# exist_ok avoids the check-then-create race of exists() + makedirs().
os.makedirs(model_dir, exist_ok=True)
with open(os.path.join(model_dir, 'wind_predictor.pickle'), 'wb') as f:
    pickle.dump(gbm, f)

# CV4
### trainset: day 1, 2, 3, 5; testset: day 4

In [11]:
# Fold CV4: train on days 1, 2, 3, 5; validate on day 4.
# '[1235]' replaces '[1,2,3,5]', whose commas were literal members of the
# character class; sorted() fixes the read order, which glob does not guarantee.
train_paths = sorted(glob.glob('../dataset/wind_data_day[1235]_max1.csv'))
test_paths = sorted(glob.glob('../dataset/wind_data_day[4]_max1.csv'))
if not train_paths or not test_paths:
    raise FileNotFoundError('CV4: no wind_data_day*_max1.csv files matched under ../dataset')

# Read every file once and concatenate in a single call; growing a DataFrame
# with concat inside the loop re-copies all earlier rows on each iteration.
train_frames = []
for file in train_paths:
    print(os.path.basename(file))
    train_frames.append(pd.read_csv(file))
df_train = pd.concat(train_frames, axis=0)

test_frames = []
for file in test_paths:
    print(os.path.basename(file))
    test_frames.append(pd.read_csv(file))
df_test = pd.concat(test_frames, axis=0)
del train_frames, test_frames

# np.int was only an alias of the builtin int and was removed in NumPy 1.24.
df_train['hour'] = df_train['hour'].astype(int)
df_test['hour'] = df_test['hour'].astype(int)

# 'real' is the regression target; 'xid'/'yid' are dropped from the features.
non_feature_cols = ['real', 'xid', 'yid']
X_train = df_train.drop(non_feature_cols, axis=1).values
y_train = df_train['real'].values
X_test = df_test.drop(non_feature_cols, axis=1).values
y_test = df_test['real'].values
del df_train, df_test

# Shuffle the training rows. One shared permutation keeps features and labels
# aligned without stacking them into a temporary combined array.
shuffle_idx = np.random.permutation(len(y_train))
X_train = X_train[shuffle_idx]
y_train = y_train[shuffle_idx]
del shuffle_idx

# NOTE(review): assumes the remaining CSV columns are exactly these 11, in
# this order -- confirm against the file header.
feature_name = ['hour', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

wind_data_day1_max1.csv
wind_data_day2_max1.csv
wind_data_day3_max1.csv
wind_data_day5_max1.csv
wind_data_day4_max1.csv


In [12]:
# Booster hyper-parameters for fold CV4.
# NOTE(review): num_leaves differs between the folds of this notebook
# (50/100/100/150/200), so the per-fold scores are not directly comparable.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 150,
    'learning_rate': 0.1,
    'feature_fraction': 0.3,
    'bagging_fraction': 0.5,
    'bagging_freq': 10,
}

print('start training')

# Feature metadata is attached to the Dataset rather than passed to
# lgb.train(): the train()-level feature_name/categorical_feature arguments
# are deprecated in recent LightGBM releases.
lgb_train.set_feature_name(feature_name)
lgb_train.set_categorical_feature([0])  # column 0 is 'hour'

# Early stopping goes through the callback API; the early_stopping_rounds
# keyword was removed from lgb.train() in LightGBM 4.0, while the callback is
# available in old and new releases alike.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1500,
                valid_sets=[lgb_eval],  # held-out day 4, not the training data
                callbacks=[lgb.early_stopping(stopping_rounds=10)])

start training




[1]	valid_0's l2: 20.1337
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's l2: 16.9946
[3]	valid_0's l2: 14.1676
[4]	valid_0's l2: 12.014
[5]	valid_0's l2: 10.1438
[6]	valid_0's l2: 8.77338
[7]	valid_0's l2: 7.53816
[8]	valid_0's l2: 6.54026
[9]	valid_0's l2: 5.67512
[10]	valid_0's l2: 5.03971
[11]	valid_0's l2: 4.50038
[12]	valid_0's l2: 4.04549
[13]	valid_0's l2: 3.68435
[14]	valid_0's l2: 3.36407
[15]	valid_0's l2: 3.09098
[16]	valid_0's l2: 2.88858
[17]	valid_0's l2: 2.72329
[18]	valid_0's l2: 2.57151
[19]	valid_0's l2: 2.44808
[20]	valid_0's l2: 2.3473
[21]	valid_0's l2: 2.25802
[22]	valid_0's l2: 2.18235
[23]	valid_0's l2: 2.12265
[24]	valid_0's l2: 2.0728
[25]	valid_0's l2: 2.02942
[26]	valid_0's l2: 1.99184
[27]	valid_0's l2: 1.96402
[28]	valid_0's l2: 1.93617
[29]	valid_0's l2: 1.91415
[30]	valid_0's l2: 1.89718
[31]	valid_0's l2: 1.88298
[32]	valid_0's l2: 1.86835
[33]	valid_0's l2: 1.85352
[34]	valid_0's l2: 1.84178
[35]	valid_0's l2: 1.8309
[36]	v

In [13]:
# Persist the CV4 booster; the prediction cell reloads it from this path.
model_dir = '../dataset/lgb_cv4'
# exist_ok avoids the check-then-create race of exists() + makedirs().
os.makedirs(model_dir, exist_ok=True)
with open(os.path.join(model_dir, 'wind_predictor.pickle'), 'wb') as f:
    pickle.dump(gbm, f)

# CV5
### trainset: day 1, 2, 3, 4; testset: day 5

In [14]:
# Fold CV5: train on days 1-4, validate on day 5.
# '[1-4]' replaces '[1,2,3,4]', whose commas were literal members of the
# character class; sorted() fixes the read order, which glob does not guarantee.
train_paths = sorted(glob.glob('../dataset/wind_data_day[1-4]_max1.csv'))
test_paths = sorted(glob.glob('../dataset/wind_data_day[5]_max1.csv'))
if not train_paths or not test_paths:
    raise FileNotFoundError('CV5: no wind_data_day*_max1.csv files matched under ../dataset')

# Read every file once and concatenate in a single call; growing a DataFrame
# with concat inside the loop re-copies all earlier rows on each iteration.
train_frames = []
for file in train_paths:
    print(os.path.basename(file))
    train_frames.append(pd.read_csv(file))
df_train = pd.concat(train_frames, axis=0)

test_frames = []
for file in test_paths:
    print(os.path.basename(file))
    test_frames.append(pd.read_csv(file))
df_test = pd.concat(test_frames, axis=0)
del train_frames, test_frames

# np.int was only an alias of the builtin int and was removed in NumPy 1.24.
df_train['hour'] = df_train['hour'].astype(int)
df_test['hour'] = df_test['hour'].astype(int)

# 'real' is the regression target; 'xid'/'yid' are dropped from the features.
non_feature_cols = ['real', 'xid', 'yid']
X_train = df_train.drop(non_feature_cols, axis=1).values
y_train = df_train['real'].values
X_test = df_test.drop(non_feature_cols, axis=1).values
y_test = df_test['real'].values
del df_train, df_test

# Shuffle the training rows. One shared permutation keeps features and labels
# aligned without stacking them into a temporary combined array.
shuffle_idx = np.random.permutation(len(y_train))
X_train = X_train[shuffle_idx]
y_train = y_train[shuffle_idx]
del shuffle_idx

# NOTE(review): assumes the remaining CSV columns are exactly these 11, in
# this order -- confirm against the file header.
feature_name = ['hour', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

wind_data_day1_max1.csv
wind_data_day2_max1.csv
wind_data_day3_max1.csv
wind_data_day4_max1.csv
wind_data_day5_max1.csv


In [15]:
# Booster hyper-parameters for fold CV5.
# NOTE(review): num_leaves differs between the folds of this notebook
# (50/100/100/150/200), so the per-fold scores are not directly comparable.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 200,
    'learning_rate': 0.1,
    'feature_fraction': 0.3,
    'bagging_fraction': 0.5,
    'bagging_freq': 10,
}

print('start training')

# Feature metadata is attached to the Dataset rather than passed to
# lgb.train(): the train()-level feature_name/categorical_feature arguments
# are deprecated in recent LightGBM releases.
lgb_train.set_feature_name(feature_name)
lgb_train.set_categorical_feature([0])  # column 0 is 'hour'

# Early stopping goes through the callback API; the early_stopping_rounds
# keyword was removed from lgb.train() in LightGBM 4.0, while the callback is
# available in old and new releases alike.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1500,
                valid_sets=[lgb_eval],  # held-out day 5, not the training data
                callbacks=[lgb.early_stopping(stopping_rounds=10)])

start training




[1]	valid_0's l2: 26.6
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's l2: 21.7759
[3]	valid_0's l2: 17.8829
[4]	valid_0's l2: 14.6802
[5]	valid_0's l2: 12.0758
[6]	valid_0's l2: 10.0436
[7]	valid_0's l2: 8.32718
[8]	valid_0's l2: 6.91574
[9]	valid_0's l2: 5.80176
[10]	valid_0's l2: 4.85734
[11]	valid_0's l2: 4.09453
[12]	valid_0's l2: 3.48908
[13]	valid_0's l2: 2.99919
[14]	valid_0's l2: 2.56847
[15]	valid_0's l2: 2.22827
[16]	valid_0's l2: 1.95622
[17]	valid_0's l2: 1.72378
[18]	valid_0's l2: 1.53499
[19]	valid_0's l2: 1.38607
[20]	valid_0's l2: 1.25554
[21]	valid_0's l2: 1.14684
[22]	valid_0's l2: 1.05556
[23]	valid_0's l2: 0.981741
[24]	valid_0's l2: 0.919629
[25]	valid_0's l2: 0.867578
[26]	valid_0's l2: 0.826864
[27]	valid_0's l2: 0.791424
[28]	valid_0's l2: 0.763044
[29]	valid_0's l2: 0.738866
[30]	valid_0's l2: 0.718978
[31]	valid_0's l2: 0.702244
[32]	valid_0's l2: 0.688101
[33]	valid_0's l2: 0.676299
[34]	valid_0's l2: 0.666766
[35]	valid_0's l2: 0

In [16]:
# Persist the CV5 booster; the prediction cell reloads it from this path.
model_dir = '../dataset/lgb_cv5'
# exist_ok avoids the check-then-create race of exists() + makedirs().
os.makedirs(model_dir, exist_ok=True)
with open(os.path.join(model_dir, 'wind_predictor.pickle'), 'wb') as f:
    pickle.dump(gbm, f)

In [17]:
# For every fold model: predict days 6-10, write the predictions as CSV, and
# pickle one xid-by-yid pivot table per hour (3..20) into the fold directory.
for vs in range(1, 6):
    fold_dir = '../dataset/lgb_cv' + str(vs)

    # SECURITY: pickle.load can execute arbitrary code; only load model files
    # that were written by this notebook.
    with open(fold_dir + '/wind_predictor.pickle', 'rb') as f:
        gbm = pickle.load(f)

    for day in range(6, 11, 1):
        print ('start day {}'.format(day))
        df_test = pd.read_csv('../dataset/wind_data_day' + str(day) + '_max1.csv')
        # NOTE(review): presumably these columns match, in order, the 11
        # features the boosters were trained on -- confirm against the CSVs.
        cols = ['hour'] + ['predict_' + str(model) for model in range(1, 11, 1)]
        df_test['predict_final'] = gbm.predict(df_test[cols].values)
        # The index is written on purpose: the ensemble cell reads these
        # files back with index_col=[0].
        df_test[['xid', 'yid', 'hour', 'predict_final']].to_csv(
            fold_dir + '/wind_lightgbm_day' + str(day) + '.csv')

        for i in range(3, 21):
            t1 = time.time()
            # Filter the hour once and pivot directly; the previous version
            # re-applied the same filter three more times and rebuilt the
            # frame from Python lists before pivoting.
            day_hour = df_test[df_test['hour'] == i]
            # 'sum' (string) avoids the pandas deprecation of passing np.sum.
            pt = day_hour.pivot_table(index='xid', columns='yid',
                                      values='predict_final', aggfunc='sum')
            with open(fold_dir + '/day' + str(day) + 'hour' + str(i) + '.pickle', 'wb') as f:
                pickle.dump(pt, f)
            t2 = time.time()
            print ('cost {}s'.format(t2 - t1))

start day 6
cost 0.5805401802062988s
cost 0.4301445484161377s
cost 0.42816662788391113s
cost 0.4331519603729248s
cost 0.5093553066253662s
cost 0.43515610694885254s
cost 0.42713356018066406s
cost 0.5645020008087158s
cost 0.43214869499206543s
cost 0.4331512451171875s
cost 0.4301438331604004s
cost 0.42415857315063477s
cost 0.4301128387451172s
cost 0.4341566562652588s
cost 0.43114566802978516s
cost 0.4291422367095947s
cost 0.4291400909423828s
cost 0.4371633529663086s
start day 7
cost 0.4331545829772949s
cost 0.45921826362609863s
cost 0.5183651447296143s
cost 0.44217538833618164s
cost 0.5314478874206543s
cost 0.47225522994995117s
cost 0.719916820526123s
cost 0.4882979393005371s
cost 0.5855557918548584s
cost 0.5634996891021729s
cost 0.5274014472961426s
cost 0.4271361827850342s
cost 0.4431779384613037s
cost 0.4501969814300537s
cost 0.4582178592681885s
cost 0.44618678092956543s
cost 0.8312108516693115s
cost 0.7299158573150635s
start day 8
cost 0.44217610359191895s
cost 0.4361598491668701s
cost

cost 0.4742605686187744s
cost 0.4371626377105713s
cost 0.4572439193725586s
cost 0.43816494941711426s
cost 0.6928412914276123s
cost 0.5304107666015625s
cost 0.4882967472076416s
cost 0.4822838306427002s
start day 9
cost 0.5984914302825928s
cost 0.6492259502410889s
cost 0.5384387969970703s
cost 0.5433762073516846s
cost 0.5228898525238037s
cost 0.5239062309265137s
cost 0.522878885269165s
cost 0.5325517654418945s
cost 0.5380122661590576s
cost 0.5285389423370361s
cost 0.5509638786315918s
cost 0.6618061065673828s
cost 0.5386340618133545s
cost 0.531268835067749s
cost 0.536513090133667s
cost 0.5345287322998047s
cost 0.5183801651000977s
cost 0.5178961753845215s
start day 10
cost 0.6378202438354492s
cost 0.6507303714752197s
cost 0.5178749561309814s
cost 0.531909704208374s
cost 0.5400240421295166s
cost 0.5078494548797607s
cost 0.5294129848480225s
cost 0.5460026264190674s
cost 0.5401005744934082s
cost 0.5183815956115723s
cost 0.601600170135498s
cost 0.7085142135620117s
cost 0.5870611667633057s
cost

In [18]:
# For every fold and day, stack the 18 hourly pivot tables (hours 3..20) into
# one array with the hour on the last axis, and pickle it in the fold directory.
for vs in range(1, 6, 1):
    dirpath = '../dataset/lgb_cv' + str(vs)
    for day in range(6, 11, 1):
        hourly = []
        for hour in range(3, 21, 1):
            filename = 'day' + str(day) + 'hour' + str(hour) + '.pickle'
            with open(os.path.join(dirpath, filename), 'rb') as f:
                hourly.append(np.array(pickle.load(f, encoding='latin1')))
        # Equivalent to concatenating matrix[:, :, np.newaxis] hour by hour,
        # without the integer sentinel or the repeated re-allocation.
        wind_matrix = np.stack(hourly, axis=2)

        # BUG FIX: the output name used to lack the day, so every day
        # overwrote the previous one and only day 10 survived.
        day_file = 'wind_matrix' + str(vs) + '_day' + str(day) + '.pickle'
        with open(os.path.join(dirpath, day_file), 'wb') as f:
            pickle.dump(wind_matrix, f)

    # Legacy path kept for existing consumers: as before, it ends up holding
    # only the last day's (day 10) matrix.
    with open(os.path.join(dirpath, 'wind_matrix' + str(vs) + '.pickle'), 'wb') as f:
        pickle.dump(wind_matrix, f)

In [20]:
# Ensemble the five fold predictions for days 6-10 by taking the per-row
# maximum, then pickle one xid-by-yid pivot table per hour into ../dataset.
for day in range(6, 11, 1):
    print ('start day {}'.format(day))
    pred_cols = ['predict_final' + str(vs) for vs in range(1, 6, 1)]

    # The first fold contributes xid/yid/hour plus its prediction; the other
    # folds contribute only their prediction column. One concat at the end
    # replaces the empty-frame sentinel and the incremental concat.
    fold_frames = []
    for vs in range(1, 6, 1):
        file_dir = '../dataset/lgb_cv' + str(vs)
        fold_df = pd.read_csv(os.path.join(file_dir, 'wind_lightgbm_day' + str(day) + '.csv'),
                              index_col=[0])
        if vs > 1:
            fold_df = fold_df[['predict_final']]
        fold_frames.append(fold_df)
    df = pd.concat(fold_frames, axis=1)
    del fold_frames
    df.columns = ['xid', 'yid', 'hour'] + pred_cols

    # Vectorised row maximum instead of a Python-level apply per row.
    df['max_val'] = df[pred_cols].max(axis=1)

    for mode in ['max']:
        dir_name = '../dataset/'
        for i in range(3, 21):
            # Filter the hour once and pivot directly on the selected column.
            day_hour = df[df['hour'] == i]
            # 'sum' (string) avoids the pandas deprecation of passing np.sum.
            pt = day_hour.pivot_table(index='xid', columns='yid',
                                      values=mode + '_val', aggfunc='sum')
            # os.path.join avoids the '../dataset//day...' double slash.
            out_path = os.path.join(dir_name, 'day' + str(day) + 'hour' + str(i) + '.pickle')
            with open(out_path, 'wb') as f:
                pickle.dump(pt, f)
        print ('mode: {}, day{} done'.format(mode, day))
    del df

start day 6


  mask |= (ar1 == a)


mode: max, day6 done
start day 7
mode: max, day7 done
start day 8
mode: max, day8 done
start day 9
mode: max, day9 done
start day 10
mode: max, day10 done


In [21]:
# For every day, stack the 18 hourly ensemble pivot tables (hours 3..20) read
# from ../dataset into one array with the hour on the last axis, and pickle it
# under ../dataset/day<day>/.
for day in range(6, 11, 1):
    dirpath = '../dataset/day' + str(day)
    # Nothing earlier in this notebook creates the per-day directory, so make
    # sure it exists before writing into it.
    os.makedirs(dirpath, exist_ok=True)

    hourly = []
    for hour in range(3, 21, 1):
        filename = 'day' + str(day) + 'hour' + str(hour) + '.pickle'
        with open(os.path.join('../dataset', filename), 'rb') as f:
            hourly.append(np.array(pickle.load(f, encoding='latin1')))
    # Equivalent to concatenating matrix[:, :, np.newaxis] hour by hour,
    # without the integer sentinel or the repeated re-allocation.
    wind_matrix = np.stack(hourly, axis=2)

    with open(os.path.join(dirpath, 'wind_matrix_lgb_cv.pickle'), 'wb') as f:
        pickle.dump(wind_matrix, f)