In [1]:
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pickle
import os
import glob
import numpy as np
import time

# 1. Generate Temporal Features

# Step 1: Precompute the row-index lists so the temporal features can be built quickly

In [2]:
def getNextHourIndexByCurHour(df_day, hour):
    """Return the row positions of the slab that follows ``hour``.

    Positions are computed as consecutive slabs of 548*421 entries, offset so
    that hour 3 corresponds to slab 0 (assumes the day frame is laid out one
    hour after another in that order -- TODO confirm against the CSV).

    Hour 20 is handled like hour 19, so it yields the same slab as hour 19.

    Args:
        df_day: accepted for call compatibility; never read.
        hour: current hour.

    Returns:
        A ``range`` of integer row positions.
    """
    rows_per_hour = 548 * 421
    # The last hour has no successor; reuse the slab that hour 19 maps to.
    effective_hour = 19 if hour == 20 else hour
    next_slot = effective_hour - 3 + 1
    return range(next_slot * rows_per_hour, (next_slot + 1) * rows_per_hour)

In [3]:
def getPreviousHourIndexByCurHour(df_day, hour):
    """Return the row positions of the slab that precedes ``hour``.

    Positions are computed as consecutive slabs of 548*421 entries, offset so
    that hour 3 corresponds to slab 0 (assumes the day frame is laid out one
    hour after another in that order -- TODO confirm against the CSV).

    Hour 3 is handled like hour 4, so it yields slab 0.

    Args:
        df_day: accepted for call compatibility; never read.
        hour: current hour.

    Returns:
        A ``range`` of integer row positions.
    """
    rows_per_hour = 548 * 421
    # The first hour has no predecessor; reuse the slab that hour 4 maps to.
    effective_hour = 4 if hour == 3 else hour
    prev_slot = effective_hour - 3 - 1
    return range(prev_slot * rows_per_hour, (prev_slot + 1) * rows_per_hour)

In [4]:
# Day-1 frame; it is only handed to the index helpers below.
df_rain_day1 = pd.read_csv('../dataset/rainfall_data_day1.csv')

# For every hour 3..20, collect the row positions of the preceding hour and
# flatten them into one 1-D positional index.
previous_hour_list = np.ravel(
    [getPreviousHourIndexByCurHour(df_rain_day1, hour) for hour in range(3, 21)]
)

# Same construction for the following hour.
next_hour_list = np.ravel(
    [getNextHourIndexByCurHour(df_rain_day1, hour) for hour in range(3, 21)]
)

# Step 2: Generate the training & testing sets with temporal features added

In [5]:
# Names of the ten base prediction columns and of their time-shifted copies.
feature = ['predict_' + str(i) for i in range(1, 11)]
feature_previous_hour = [name + '_previous_hour' for name in feature]
feature_next_hour = [name + '_next_hour' for name in feature]

for day in range(1, 11):
    out_path = '../dataset/rainfall_data_add_temporal_feature_day' + str(day) + '.csv'

    # Skip days whose augmented file was already written by an earlier run.
    if os.path.exists(out_path):
        print(out_path + ' already exists')
        continue

    print('day {} is begin'.format(day))
    start = time.time()
    df_day = pd.read_csv('../dataset/rainfall_data_day' + str(day) + '.csv')

    # Rows of the previous hour, re-indexed 0..n-1 so they line up with df_day.
    df_day_previous_hour = df_day[feature].iloc[previous_hour_list].reset_index(drop=True)
    df_day_previous_hour.columns = feature_previous_hour

    # Rows of the next hour, treated the same way.
    df_day_next_hour = df_day[feature].iloc[next_hour_list].reset_index(drop=True)
    df_day_next_hour.columns = feature_next_hour

    # Side-by-side join on the row index: original columns + previous + next.
    df_day_concat = pd.concat([df_day, df_day_previous_hour, df_day_next_hour], axis=1)
    df_day_concat.to_csv(out_path, index=False)

    # Release the per-day frames before the next iteration loads another day.
    del df_day, df_day_concat, df_day_previous_hour, df_day_next_hour

    print('day {} is done'.format(day))
    cost_time = time.time() - start
    print('cost time: {0:.2f} min'.format(cost_time / 60.0))

../dataset/rainfall_data_add_temporal_feature_day1.csv already exists
../dataset/rainfall_data_add_temporal_feature_day2.csv already exists
../dataset/rainfall_data_add_temporal_feature_day3.csv already exists
../dataset/rainfall_data_add_temporal_feature_day4.csv already exists
../dataset/rainfall_data_add_temporal_feature_day5.csv already exists
../dataset/rainfall_data_add_temporal_feature_day6.csv already exists
../dataset/rainfall_data_add_temporal_feature_day7.csv already exists
../dataset/rainfall_data_add_temporal_feature_day8.csv already exists
../dataset/rainfall_data_add_temporal_feature_day9.csv already exists
../dataset/rainfall_data_add_temporal_feature_day10.csv already exists


# 2. Train Model

In [6]:
# Load the augmented files for days 1-5 and stack them into one training frame.
# The frames are concatenated once at the end: concatenating inside the loop
# re-copies every previously loaded day on each iteration.
day_frames = [
    pd.read_csv('../dataset/rainfall_data_add_temporal_feature_day' + str(day) + '.csv')
    for day in range(1, 6)
]
df = pd.concat(day_frames, axis=0)
# Drop the per-day frames so only the combined frame stays in memory.
del day_frames
df.shape

(20763720, 34)

In [8]:
# Model inputs: the hour plus the ten predictions for the current, previous
# and next hour (31 columns in total).
cols = (
    ['hour']
    + ['predict_' + str(i) for i in range(1, 11)]
    + ['predict_' + str(i) + '_previous_hour' for i in range(1, 11)]
    + ['predict_' + str(i) + '_next_hour' for i in range(1, 11)]
)

# Build the feature matrix as float32 rather than float64. With ~20.7M rows
# and 31 columns this halves the size of X (and of the train/test copies made
# from it), which matters because the subsequent split ran out of memory.
X = df[cols].astype(np.float32).values
y = df['real'].values

# The frame is no longer needed once the arrays exist.
del df
print(X.shape, y.shape)

(20763720, 31) (20763720,)


In [12]:
# Fixed seed so the 70/30 partition (and therefore the early-stopping point and
# the reported validation error) is the same on every run.
SPLIT_SEED = 42

# NOTE: the split materialises new train and test arrays while X and y are
# still alive, so peak memory here is roughly twice the size of X.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)
del X, y
print(X_train.shape, X_test.shape)

# The validation set references the training set so both share one binning.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

MemoryError: 

In [None]:
# LightGBM settings: gradient-boosted trees fitted to an L2 regression target.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric={'l2'},
    num_leaves=550,
    learning_rate=0.6,
    feature_fraction=0.7,
    bagging_fraction=0.8,
    bagging_freq=5,
)

print('start training')

gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=1500,
    valid_sets=lgb_eval,        # held-out split; drives early stopping
    categorical_feature=[0],    # column 0 of the feature matrix is 'hour'
    early_stopping_rounds=10,   # stop after 10 rounds without improvement
)

# 3. Predict

In [None]:
# The booster was fitted on 31 columns (hour + current/previous/next-hour
# predictions), so prediction must be fed the same columns in the same order.
# Passing only 'hour' + 'predict_1..10' is a feature-count mismatch.
cols = (
    ['hour']
    + ['predict_' + str(i) for i in range(1, 11)]
    + ['predict_' + str(i) + '_previous_hour' for i in range(1, 11)]
    + ['predict_' + str(i) + '_next_hour' for i in range(1, 11)]
)

for day in range(6, 11):
    print('start day {}'.format(day))
    df_test = pd.read_csv('../dataset/rainfall_data_add_temporal_feature_day' + str(day) + '.csv')

    # Score every row of the day and keep the result next to its coordinates.
    X_day = df_test[cols].values
    df_test['predict_final'] = gbm.predict(X_day)
    del X_day

    # Flat per-day export of the predictions.
    df_test[['xid', 'yid', 'hour', 'predict_final']].to_csv(
        '../dataset/rainfall_data_day' + str(day) + '_lgb_time.csv')

    # One xid-by-yid grid per hour, pickled for the later matrix assembly.
    for i in range(3, 21):
        t1 = time.time()
        day_hour = df_test[df_test['hour'] == i]
        pt = day_hour.pivot_table(index='xid', columns='yid',
                                  values='predict_final', aggfunc='sum')
        with open('../dataset/day' + str(day) + 'hour' + str(i) + '.pickle', 'wb') as f:
            pickle.dump(pt, f)
        t2 = time.time()
        print('cost {}s'.format(t2 - t1))

In [None]:
# Replace negative predicted values with 0 in every per-hour grid.
#
# The grids are written to, and later read from, '../dataset/day{d}hour{h}.pickle',
# so this step must rewrite those same files. Pointing it at a per-day
# sub-directory would leave the files used downstream unclipped.
for day in range(6, 11):
    for hour in range(3, 21):
        pickle_path = os.path.join(
            '../dataset', 'day' + str(day) + 'hour' + str(hour) + '.pickle')

        with open(pickle_path, 'rb') as f:
            wind_map = pickle.load(f)

        # Entries below zero are set to 0; everything else is left untouched.
        wind_map[wind_map < 0] = 0

        with open(pickle_path, 'wb') as f:
            pickle.dump(wind_map, f)
        print('day' + str(day) + 'hour' + str(hour) + 'done')

# 4. Generate the rain matrix

In [None]:
# For each test day, stack the 18 hourly grids (hours 3..20) along a third
# axis and pickle the resulting 3-D array into that day's directory.
for day in range(6, 11):
    hourly_grids = []
    for hour in range(3, 21):
        grid_path = os.path.join('../dataset/', 'day' + str(day) + 'hour' + str(hour) + '.pickle')
        with open(grid_path, 'rb') as f:
            hourly_grids.append(np.array(pickle.load(f, encoding='latin1')))

    # Result has shape (rows, cols, 18), with hours in ascending order on axis 2.
    wind_matrix = np.stack(hourly_grids, axis=2)

    out_dir = '../dataset/day' + str(day)
    with open(os.path.join(out_dir, 'rain_matrix_lgb_time.pickle'), 'wb') as f:
        pickle.dump(wind_matrix, f)