In [None]:
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pickle
import os
import glob
import numpy as np
import time

# 1. import data

In [None]:
# Load the five training days (day 1-5) and stack them row-wise into one frame.
# The per-day frames are collected first and concatenated once: calling
# pd.concat inside the loop re-copies every previously loaded row on each pass.
daily_frames = [
    pd.read_csv('../dataset/rainfall_data_day' + str(day) + '_max1.csv')
    for day in range(1, 6)
]
df = pd.concat(daily_frames, axis=0)
del daily_frames  # only the combined frame is used from here on
df.shape

# 2. Split the data: 70% train + 30% validation

In [None]:
# Features: the `hour` column followed by `predict_1` .. `predict_10`;
# the regression target is the `real` column.
cols = ['hour'] + ['predict_' + str(i) for i in range(1, 11)]
X = df[cols].values
y = df['real'].values
del df  # the raw frame is no longer needed once X and y are extracted

# Fix the shuffle seed so the 70/30 split is identical on every
# Restart & Run All; without it each run trains/validates on different rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
del X, y

# Short feature names, in the same order as `cols`.
feature_name = ['hour', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']
lgb_train = lgb.Dataset(X_train, label=y_train)
# The validation set references the training set when it is built.
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

# 3. model training

In [None]:
# LightGBM hyper-parameters for an L2 regression with gradient-boosted trees.
# NOTE(review): learning_rate=0.6 together with num_leaves=550 is aggressive
# (LightGBM's default learning rate is 0.1) — confirm this is intentional.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 550,
    'learning_rate': 0.6,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
}

print('start training')

# Attach the feature names and mark column 0 (`hour`) as categorical on the
# Dataset itself. Older lgb.train() forwarded its `feature_name` /
# `categorical_feature` arguments to exactly these setters; newer LightGBM
# releases no longer accept those arguments on train().
lgb_train.set_feature_name(feature_name)
lgb_train.set_categorical_feature([0])

# Early stopping is requested through the callback because the
# `early_stopping_rounds` keyword was removed from lgb.train() in LightGBM 4.0.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1500,
                valid_sets=[lgb_eval],  # hold-out validation split, not the training data
                callbacks=[lgb.early_stopping(stopping_rounds=10)])

# 4. predict 

In [None]:
# Predict rainfall for days 6-10. For each day this writes
#   * one CSV with the per-row prediction (`..._lgb_all.csv`), and
#   * one pickle per hour (3..20) holding an xid-by-yid pivot table of the
#     predicted rainfall.

# The feature columns are identical for every day, so build the list once.
cols = ['hour'] + ['predict_' + str(i) for i in range(1, 11, 1)]

for day in range(6, 11):
    print('start day {}'.format(day))
    df_test = pd.read_csv('../dataset/rainfall_data_day' + str(day) + '_max1.csv')
    X_test = df_test[cols].values
    y = gbm.predict(X_test)
    df_test['predict_final'] = y
    df_to_csv = df_test[['xid', 'yid', 'hour', 'predict_final']]
    df_to_csv.to_csv('../dataset/rainfall_data_day' + str(day) + '_lgb_all.csv')
    del X_test, y, df_to_csv

    for i in range(3, 21):
        t1 = time.time()
        # Select this hour's rows once (the original filtered on `hour` four
        # times and copied the frame in between).
        hour_rows = df_test.loc[df_test['hour'] == i, ['xid', 'yid', 'predict_final']]
        # Rows = xid, columns = yid, cells = summed prediction for that cell.
        # The string 'sum' replaces np.sum, which pandas now warns about.
        pt = hour_rows.pivot_table(index='xid', columns='yid',
                                   values='predict_final', aggfunc='sum')
        with open('../dataset/day' + str(day) + 'hour' + str(i) + '.pickle', 'wb') as f:
            pickle.dump(pt, f)
        t2 = time.time()
        print('cost {}s'.format(t2 - t1))

# 5. generate the rain matrix

In [None]:
# For each predicted day, load the 18 hourly pivot tables (hours 3..20) that
# the prediction step pickled, stack them along a new third axis and save the
# resulting 3-D array as `../dataset/day<day>/rain_matrix_lgb.pickle`.
for day in range(6, 11):
    dirpath = '../dataset/'
    hourly_matrices = []
    for hour in range(3, 21, 1):
        filename = 'day' + str(day) + 'hour' + str(hour) + '.pickle'
        # These pickles are produced by this notebook; do not point this
        # loader at files from an untrusted source.
        with open(os.path.join(dirpath, filename), 'rb') as f:
            hourly_matrices.append(np.array(pickle.load(f, encoding='latin1')))

    # One stack call replaces the int sentinel plus repeated np.concatenate,
    # which re-copied the growing array for every hour. Axis 2 indexes the hour.
    # The variable keeps its original name, but it holds rainfall predictions.
    wind_matrix = np.stack(hourly_matrices, axis=2)

    dirpath = '../dataset/day' + str(day)
    # Create the per-day output folder if it is missing; otherwise opening the
    # file for writing raises FileNotFoundError.
    os.makedirs(dirpath, exist_ok=True)
    with open(os.path.join(dirpath, 'rain_matrix_lgb.pickle'), 'wb') as f:
        pickle.dump(wind_matrix, f)