# 1. Import all the packages we need

In [1]:
import pandas as pd
import numpy as np 
import time
import pickle
import sys
import os

# 2. Run the following cell to define the helper functions used in the later steps

In [5]:
def outlier_finder(data):
    """
    Locate the outliers of ``data`` with the 1.5 * IQR rule.

    A value counts as an outlier when it is strictly greater than
    ``q3 + 1.5 * (q3 - q1)`` or strictly smaller than ``q1 - 1.5 * (q3 - q1)``,
    where ``q1`` and ``q3`` are the 25th and 75th percentiles of ``data``.

    data: sequence of numbers; anything that is not already a NumPy array
          is converted with ``np.array``
    return: list with the positional indexes of the outliers, ascending
    """
    values = data if isinstance(data, np.ndarray) else np.array(data)

    q1, q3 = np.percentile(values, 25), np.percentile(values, 75)
    iqr = q3 - q1
    low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr

    return [pos for pos, val in enumerate(values) if val > high or val < low]

def replace_val(data, outliers, mode='max'):
    """
    Compute the value that should replace the outliers of ``data``.

    The result is the maximum (``mode='max'``) or the mean (``mode='mean'``)
    of the entries whose position is NOT listed in ``outliers``.

    data: 1-D sequence of numbers (list, NumPy array or pandas Series)
    outliers: positional indexes to leave out of the statistic
    mode: 'max' or 'mean'
    return: the replacement value (a NumPy scalar)
    raises: ValueError when ``mode`` is neither 'max' nor 'mean'. An unknown
            mode used to print a message and return None, which callers
            would then write into their data.
    """
    if mode not in ('max', 'mean'):
        raise ValueError('No such mode: {!r}'.format(mode))

    # Go through a plain array so that positions are always positional, even
    # when ``data`` is a pandas Series carrying string labels.
    values = np.asarray(data)
    excluded = set(outliers)  # O(1) membership tests instead of list scans
    kept = [val for pos, val in enumerate(values) if pos not in excluded]

    return np.max(kept) if mode == 'max' else np.mean(kept)

class ProgressBar:
    """
    Text progress bar redrawn in place on ``sys.stdout``.

    count: number of items processed so far
    total: number of items expected; while it is 0 (the default) or negative
           nothing is drawn, because no ratio can be computed
    width: number of character cells of the bar

    Typical use: call ``move()`` once per processed item, then ``log()``.
    """
    def __init__(self, count = 0, total = 0, width = 50):
        self.count = count
        self.total = total
        self.width = width
        # number of '#' cells shown by the most recent redraw
        self.progress = 0

    def move(self):
        """Advance the item counter by one."""
        self.count += 1

    def log(self):
        """Redraw the bar if it gained at least one cell since the last draw."""
        if self.total <= 0:
            # Unknown or empty workload: drawing would divide by zero.
            return
        new_progress = self.width * self.count // self.total
        if new_progress <= self.progress:
            return
        self.progress = new_progress

        out = sys.stdout
        # Blank the previous line first; 9 is the minimum length of the
        # '{0:3}/{1:3}: ' prefix written below.
        out.write(' ' * (self.width + 9) + '\r')
        out.write('{0:3}/{1:3}: '.format(self.count, self.total))
        out.write('#' * self.progress + '-' * int(self.width - self.progress) + '\r')
        if self.progress == self.width:
            # Bar is full: move to a fresh line so later output does not overwrite it.
            out.write('\n')
        out.flush()

def execute_func(input_file, output_file, mode='max'):
    """
    Replace the outliers of every row of a prediction file and save the result.

    For each row, the outliers among the columns ``predict_1`` ... ``predict_10``
    (as found by ``outlier_finder``) are overwritten with the value returned by
    ``replace_val`` for that row. The output CSV holds ``xid``, ``yid``,
    ``hour`` followed by the ten corrected prediction columns.

    input_file: path of the CSV to read; it must contain the columns
                ``predict_1`` ... ``predict_10``, ``xid``, ``yid`` and ``hour``
    output_file: path of the CSV to write
    mode: replacement strategy forwarded to ``replace_val`` ('max' or 'mean')
    """
    df = pd.read_csv(input_file)
    cols = ['predict_' + str(i) for i in range(1, 11)]

    # Work on an explicit NumPy copy of the predictions. Rows yielded by
    # DataFrame.iterrows() may be copies, so writing into them is not
    # guaranteed to change the frame that is saved afterwards.
    predict_vals = df[cols].values.copy()
    n_rows = predict_vals.shape[0]

    t1 = time.time()
    modified = {}  # row label -> positions that were replaced in that row
    bar = ProgressBar(total=n_rows)
    for index, row in zip(df.index, predict_vals):
        # ``row`` is a view into ``predict_vals``: assigning to it updates the array.
        outliers = outlier_finder(row)
        if outliers:
            row[outliers] = replace_val(row, outliers, mode)
            modified[index] = outliers
        bar.move()
        bar.log()
    t2 = time.time()
    print ('cost time {0:.2f}min'.format((t2 - t1) / 60))
    print ('{}/{} rows are modified'.format(len(modified), n_rows))

    df_predict_vals = pd.DataFrame(predict_vals, columns=cols, index=df.index)
    df_features = pd.concat([df[['xid', 'yid', 'hour']], df_predict_vals], axis=1)
    df_features.to_csv(output_file, index=False)

# 3. Split the data into 5 files according to the days

In [4]:
# data directory
file_path = '../dataset'
# forecast file that holds every day; each missing day is filtered out of it
source_file = os.path.join(file_path, 'ForecastDataforTesting_201802.csv')

for day in range(6, 11):
    time_start = time.time()

    # dataset name
    file_name = 'ForecastDataforTesting_day' + str(day) + '.csv'
    target = os.path.join(file_path, file_name)

    # skip the days that were already extracted
    if os.path.isfile(target):
        print ('{} already exists'.format(file_name))
        continue

    print ('start Day {}'.format(day))

    # Read the source in chunks and keep only the rows of this day.
    # The whole source file is scanned once per missing day.
    reader = pd.read_csv(source_file, chunksize=10 ** 7)
    day_chunks = [chunk[chunk.date_id == day] for chunk in reader]

    # One concat at the end instead of growing a DataFrame chunk by chunk.
    df = pd.concat(day_chunks) if day_chunks else pd.DataFrame()
    del day_chunks

    # store data
    df.to_csv(target, index=False)
    del df

    time_end = time.time()
    print ('Day {0} done! cost {1:.2f} min'.format(day, (time_end - time_start) / 60.0))

start Day 6
Day 6 done! cost 6.26 min
start Day 7
Day 7 done! cost 6.07 min
start Day 8
Day 8 done! cost 5.63 min
start Day 9
Day 9 done! cost 5.47 min
start Day 10
Day 10 done! cost 5.44 min


# 4. Change the data into the format we want

### This step may raise a memory error on a machine with little RAM (8 GB may be just enough; 12 GB is recommended). If that happens, close the other programs you are running, restart the kernel and run the cell again.

In [3]:
def find_index_by_xyh(xid, yid, hour, n_x=548, n_y=421, first_hour=3):
    """
    Map an (xid, yid, hour) triple to a flat, 0-based row index.

    Rows are numbered hour by hour, then by xid, then by yid; xid and yid
    are treated as 1-based and ``first_hour`` maps to offset 0.

    xid, yid, hour: coordinates of the cell
    n_x, n_y: number of distinct xid / yid values per hour; the defaults are
              the constants that used to be hard-coded in the formula
    first_hour: hour value that corresponds to the first block of rows
    return: ``(hour - first_hour) * n_x * n_y + (xid - 1) * n_y + yid - 1``
    """
    return (hour - first_hour) * n_x * n_y + (xid - 1) * n_y + yid - 1

def replace_index(index):
    """
    Turn a key of the form ``'<xid>_<yid>_<hour>'`` into its flat row index.

    index: string whose first three '_'-separated fields are integers
    return: the value of ``find_index_by_xyh`` for those three integers
    """
    fields = index.split('_')
    xid, yid, hour = int(fields[0]), int(fields[1]), int(fields[2])
    return find_index_by_xyh(xid, yid, hour)


# Build (once) the 'xid_yid_hour' key of every row and cache it on disk.
# NOTE(review): the day-splitting cell visible in this notebook only writes
# day6..day10 files -- confirm that ForecastDataforTesting_day1.csv exists.
if not os.path.exists('../dataset/x_y_hour.pickle'):
    t1 = time.time()
    df = pd.read_csv('../dataset/ForecastDataforTesting_day1.csv')
    df['x_y_hour'] = df['xid'].astype(int).astype(str) + '_' + df['yid'].astype(int).astype(str)  + '_' + df['hour'].astype(int).astype(str) 
    with open('../dataset/x_y_hour.pickle', 'wb') as f:
        pickle.dump(df['x_y_hour'], f)
    t2 = time.time()
    print ('cost {0:.2f}min to get x_y_hour.pickle'.format((t2 - t1) / 60))
    del df

cols = [i for i in range(1, 11)]          # model ids become the pivoted columns
cols_for_real = ['xid', 'yid', 'hour']
features = ['predict_' + str(i) for i in range(1, 11)] + cols_for_real

# Loop-invariant inputs, loaded on first use only so that nothing is read
# when every output file already exists.
x_y_hour = None
df_real_day = None

for day in range(6, 11):
    for mode in ['wind', 'rainfall']:
        print ('start day {}, mode {}'.format(day, mode))
        if os.path.exists('../dataset/' + mode + '_data_day' + str(day) + '.csv'):
            print (mode + '_data_day' + str(day) + '.csv is already there' )
            continue
        t1 = time.time()

        if x_y_hour is None:
            with open('../dataset/x_y_hour.pickle', 'rb') as f:
                x_y_hour = pickle.load(f)
        if df_real_day is None:
            # any single day of this file provides the xid, yid, hour columns;
            # day 5 is used
            df_real = pd.read_csv('../dataset/In_situMeasurementforTraining_201802.csv')
            df_real_day = df_real.loc[df_real.date_id == 5, cols_for_real].reset_index(drop=True)
            del df_real

        df = pd.read_csv('../dataset/ForecastDataforTesting_day' + str(day) + '.csv')
        # assumes the rows of every day file are in the same order as the rows
        # the pickle was built from -- TODO confirm
        df['x_y_hour'] = x_y_hour
        # one row per x_y_hour key, one column per model; keyword arguments
        # are required because pandas >= 2.0 no longer accepts positional ones
        df = df.pivot(index='x_y_hour', columns='model', values=mode)
        df['x_y_hour'] = df.index
        df['real_index'] = df['x_y_hour'].apply(replace_index)
        df = df.sort_values(by=['real_index'])
        df = df.reset_index(drop=True)

        df = pd.concat([df[cols], df_real_day], axis=1)
        df.columns = features
        df.to_csv('../dataset/' + mode + '_data_day' + str(day)+ '.csv', index=False)
        del df
        t2 = time.time()
        print ('cost {0:.2f}min'.format((t2 - t1) / 60))

start day 6, mode wind
wind_data_day6.csv is already there
start day 6, mode rainfall
rainfall_data_day6.csv is already there
start day 7, mode wind
wind_data_day7.csv is already there
start day 7, mode rainfall
rainfall_data_day7.csv is already there
start day 8, mode wind
wind_data_day8.csv is already there
start day 8, mode rainfall
rainfall_data_day8.csv is already there
start day 9, mode wind
cost 2.33min
start day 9, mode rainfall
cost 2.94min
start day 10, mode wind
cost 3.55min
start day 10, mode rainfall
cost 4.17min


# 5. Outliers processing

In [6]:
# Run the outlier replacement (max strategy) on every day/mode file;
# results go next to the inputs with a '_max1' suffix.
for day in range(6, 11):
    for mode in ('wind', 'rainfall'):
        print ('start day {} mode {}'.format(day, mode))
        stem = '../dataset/' + mode + '_data_day' + str(day)
        execute_func(stem + '.csv', stem + '_max1.csv', mode='max')

start day 6 mode wind
4152744/4152744: ##################################################
cost time 17.59min
1252845/4152744 rows are modified
start day 6 mode rainfall
4152744/4152744: ##################################################
cost time 18.47min
1855259/4152744 rows are modified
start day 7 mode wind
4152744/4152744: ##################################################
cost time 16.39min
1250292/4152744 rows are modified
start day 7 mode rainfall
4152744/4152744: ##################################################
cost time 18.50min
2075366/4152744 rows are modified
start day 8 mode wind
4152744/4152744: ##################################################
cost time 16.17min
1159772/4152744 rows are modified
start day 8 mode rainfall
4152744/4152744: ##################################################
cost time 15.63min
1151703/4152744 rows are modified
start day 9 mode wind
4152744/4152744: ##################################################
cost time 16.44min
1282209/4152744 rows 