In [56]:
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler

import pandas as pd
import os
# Name of the output folder (under ../Data) that receives the processed CSVs.
save_dir = 'ProcessedData'

# Keep only CSV files named like "L<number>_...": anything else that happens
# to live in the folder (e.g. ".ipynb_checkpoints", ".DS_Store") would make
# the numeric sort key below raise ValueError.
files = [
    name for name in os.listdir('..//Data//OriginalData')
    if name.endswith('.csv') and name[1:name.find('_')].isdecimal()
]
# Sort by the numeric location id first, then by name, so that files for the
# same location (e.g. "L2_Train.csv" / "L2_Train_2.csv") come out in a
# deterministic order instead of whatever order os.listdir() returned.
files.sort(key=lambda x: (int(x[1:x.index('_')]), x))
files

['L1_Train.csv',
 'L2_Train.csv',
 'L2_Train_2.csv',
 'L3_Train.csv',
 'L4_Train.csv',
 'L4_Train_2.csv',
 'L5_Train.csv',
 'L6_Train.csv',
 'L7_Train.csv',
 'L7_Train_2.csv',
 'L8_Train.csv',
 'L8_Train_2.csv',
 'L9_Train.csv',
 'L9_Train_2.csv',
 'L10_Train.csv',
 'L10_Train_2.csv',
 'L11_Train.csv',
 'L12_Train.csv',
 'L12_Train_2.csv',
 'L13_Train.csv',
 'L14_Train.csv',
 'L15_Train.csv',
 'L16_Train.csv',
 'L17_Train.csv']

In [59]:
def get_period(time_str, period_bounds=None):
    """Return the key of the time bucket that contains ``time_str``.

    Args:
        time_str: Minute-and-second text in ``"%M:%S"`` form, e.g. ``"07:42"``.
        period_bounds: Optional mapping of bucket key to an inclusive
            ``(start, end)`` pair of ``datetime.time`` values. When omitted,
            the module-level ``periods`` mapping is used, so ``periods`` must
            already be defined when the function is called.

    Returns:
        The key of the first bucket (in mapping order) whose inclusive range
        contains the parsed time, or ``None`` when no bucket matches.

    Raises:
        ValueError: If ``time_str`` does not match the ``"%M:%S"`` format.
    """
    if period_bounds is None:
        # Backward-compatible default: fall back to the global lookup table.
        period_bounds = periods

    moment = datetime.strptime(time_str, "%M:%S").time()

    for period, (start, end) in period_bounds.items():
        if start <= moment <= end:
            return period
    return None

# Inclusive "%M:%S" bounds of each ten-minute bucket, keyed by the bucket's
# starting minute. They are the same for every file, so they are built once
# here instead of on every loop iteration. get_period() looks up the
# module-level ``periods`` mapping.
period = {
    '00': ("00:00", "09:59"),
    '10': ("10:00", "19:59"),
    '20': ("20:00", "29:59"),
    '30': ("30:00", "39:59"),
    '40': ("40:00", "49:59"),
    '50': ("50:00", "59:59")
}

periods = {
    key: (
        datetime.strptime(value[0], "%M:%S").time(),
        datetime.strptime(value[1], "%M:%S").time()
    )
    for key, value in period.items()
}

# Measurement columns that are aggregated per ten-minute bucket.
selected_features = [
    'WindSpeed(m/s)',
    'Pressure(hpa)',
    'Temperature(°C)',
    'Humidity(%)',
    'Sunlight(Lux)',
    'Power(mW)',
]

# LocationCode is a grouping key so that every aggregated row carries the
# code of the rows it was computed from, rather than being paired by position
# with the first N raw rows (which is only right for single-location files).
group_keys = ['LocationCode', 'Year', 'Month', 'Day', 'Hour', 'Minute']

# to_csv() does not create missing folders, so make sure the target exists.
os.makedirs(f"..//Data//{save_dir}", exist_ok=True)

for file in files:
    file_name = file.split('.')[0]

    data_df = pd.read_csv(f'..//Data//OriginalData//{file}')

    # Work out which ten-minute bucket each reading falls into.
    data_df['DateTime'] = pd.to_datetime(data_df['DateTime'])
    data_df['Time'] = data_df['DateTime'].dt.strftime('%M:%S')
    data_df['Minute'] = data_df['Time'].apply(get_period)

    # Calendar parts used as grouping keys and later in SeqNumber.
    data_df['Year'] = data_df['DateTime'].dt.year
    data_df['Month'] = data_df['DateTime'].dt.month
    data_df['Day'] = data_df['DateTime'].dt.day
    data_df['Hour'] = data_df['DateTime'].dt.hour

    # Average, Min and Max of every feature for each ten-minute bucket.
    # All three frames come from the same groupby, so they share one index
    # and can be joined side by side without a key-based merge.
    grouped = data_df.groupby(group_keys)[selected_features]
    stat_frames = [
        grouped.mean().round(2).add_prefix('Avg_'),
        grouped.min().round(2).add_prefix('Min_'),
        grouped.max().round(2).add_prefix('Max_'),
    ]

    # Standardised copy of each aggregate; a fresh scaler is fitted per
    # aggregate and per file, as before.
    std_frames = [
        pd.DataFrame(
            StandardScaler().fit_transform(frame),
            index=frame.index,
            columns=['Std_' + col for col in frame.columns],
        )
        for frame in stat_frames
    ]

    # Column order: keys, Avg_*, Min_*, Max_*, Std_Avg_*, Std_Min_*, Std_Max_*.
    processed_df = pd.concat(stat_frames + std_frames, axis=1).reset_index()
    processed_df.rename(columns={'LocationCode': 'Device_ID'}, inplace=True)

    # Attach SeqNumber.
    # Serial = year (4 digits) + month (2) + day (2) + prediction time (4)
    # + device code (2), 14 digits in total.
    seq_number = (
        processed_df['Year'].astype(str)
        + processed_df['Month'].astype(str).str.zfill(2)
        + processed_df['Day'].astype(str).str.zfill(2)
        + processed_df['Hour'].astype(str).str.zfill(2)
        + processed_df['Minute'].astype(str).str.zfill(2)
        + processed_df['Device_ID'].astype(str).str.zfill(2)
    )
    processed_df.insert(0, 'SeqNumber', seq_number)

    processed_df.to_csv(f"..//Data//{save_dir}//{file_name}_Processed.csv", index=False)
