In [1]:
import pandas as pd
import glob
import os
from tqdm import tqdm

# Define the file path
file_path = "C:\\Users\\User\\Desktop\\Final Year Project\\HKO Raw Data\\aws\\"

# List all CSV files in the aws directory
csv_files = glob.glob(os.path.join(file_path, "*.csv"))

# Initialize an empty list to store the individual DataFrames
df_list = []

# Loop over the list of CSV files with tqdm for progress tracking
for file in tqdm(csv_files, desc="Reading CSV files"):
    
    # Check if the file is empty
    if os.stat(file).st_size == 0:
        print(f"Skipping empty file: {file}")
        continue

    # Read the csv file and append to df_list without checking for errors
    df_list.append(pd.read_csv(file, header=None))

# Concatenate all DataFrames in the list into one DataFrame
df = pd.concat(df_list, ignore_index=True)

# Now assign column names to the concatenated DataFrame
df.columns = ['aws_category', 'datetime', 'category', 'value']

Reading CSV files:   0%|          | 6/2098 [00:00<00:54, 38.45it/s]

Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-01.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-02.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-03.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-04.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-05.csv


Reading CSV files:  12%|█▏        | 244/2098 [00:02<00:04, 445.52it/s]

Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-23.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-24.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-25.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-26.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-27.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-28.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-29.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-30.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-31.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-02-01.csv
Skipping empty file:

Reading CSV files: 100%|██████████| 2098/2098 [07:52<00:00,  4.44it/s]


In [None]:
# Pivot the DataFrame to have categories as columns and fill NaN with zeros
pivot_df = df.pivot_table(
    index='datetime', 
    columns=['aws_category', 'category'], 
    values='value', 
    aggfunc='first'
).fillna(0)

# Flatten the MultiIndex columns
pivot_df.columns = ['{}_{}'.format(aws_cat, cat) for aws_cat, cat in pivot_df.columns]

# Reset index to turn the datetime index into a column
pivot_df.reset_index(inplace=True)

# Convert the 'datetime' column to datetime objects without timezone information
pivot_df['datetime'] = pd.to_datetime(pivot_df['datetime'], utc=False).dt.tz_localize(None)

# Your pivot_df now has columns like 'A1C_A1', 'A1C_B1', ..., 'YTS_A1', etc.
pivot_df

In [None]:
pivot_df.to_csv('C:\\Users\\User\\Desktop\\Final Year Project\\Code\\Processed_HKO\\HKO_aws_concatenated_data.csv', index=False)

In [1]:
import pandas as pd
import glob
import os
from tqdm import tqdm

# Define the input file path
input_file_path = "C:\\Users\\User\\Desktop\\Final Year Project\\HKO Raw Data\\aws\\"

# Define the output file path
output_file_path = "C:\\Users\\User\\Desktop\\Final Year Project\\Code\\Processed_HKO\\processing_aws\\"

# Ensure the output directory exists
os.makedirs(output_file_path, exist_ok=True)

# List all CSV files in the aws directory
csv_files = glob.glob(os.path.join(input_file_path, "*.csv"))

# Loop over the list of CSV files with tqdm for progress tracking
for file in tqdm(csv_files, desc="Processing CSV files"):
    
    # Check if the file is empty
    if os.stat(file).st_size == 0:
        print(f"Skipping empty file: {file}")
        continue

    # Read the CSV file
    df = pd.read_csv(file, header=None)

    # Assign column names
    df.columns = ['aws_category', 'datetime', 'category', 'value']

    # Pivot the DataFrame to have categories as columns and fill NaN with zeros
    pivot_df = df.pivot_table(
        index='datetime', 
        columns=['aws_category', 'category'], 
        values='value', 
        aggfunc='first'
    ).fillna(0)

    # Flatten the MultiIndex columns
    pivot_df.columns = ['{}_{}'.format(aws_cat, cat) for aws_cat, cat in pivot_df.columns]

    # Reset index to turn the datetime index into a column
    pivot_df.reset_index(inplace=True)

    # Convert the 'datetime' column to datetime objects without timezone information
    pivot_df['datetime'] = pd.to_datetime(pivot_df['datetime'], utc=False).dt.tz_localize(None)

    # Get the original file name without the path and the extension
    original_file_name = os.path.splitext(os.path.basename(file))[0]

    # Define the output file name
    output_file = os.path.join(output_file_path, f"{original_file_name}.csv")

    # Save the processed DataFrame to the specified path with the original name
    pivot_df.to_csv(output_file, index=False)

Processing CSV files:   0%|          | 0/2098 [00:00<?, ?it/s]

Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-01.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-02.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-03.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-04.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-05.csv


Processing CSV files:   2%|▏         | 33/2098 [00:11<04:19,  7.96it/s]

Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-23.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-24.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-25.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-26.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-27.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-28.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-29.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-30.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-01-31.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-02-01.csv
Skipping empty file:

Processing CSV files:   2%|▏         | 37/2098 [00:11<03:17, 10.45it/s]

Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-02-06.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-02-07.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-02-08.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-02-09.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-02-10.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-02-11.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-02-12.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-02-13.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-02-14.csv
Skipping empty file: C:\Users\User\Desktop\Final Year Project\HKO Raw Data\aws\aws_2018-02-15.csv
Skipping empty file:

Processing CSV files: 100%|██████████| 2098/2098 [22:16<00:00,  1.57it/s]


In [2]:
import pandas as pd
import glob
import os
from tqdm import tqdm

# Define the output file path where the individual processed files are saved
output_file_path = "C:\\Users\\User\\Desktop\\Final Year Project\\Code\\Processed_HKO\\processing_aws\\"

# Define the path where the concatenated file will be saved
final_output_path = "C:\\Users\\User\\Desktop\\Final Year Project\\Code\\Processed_HKO\\"

# List all processed CSV files in the output directory
processed_csv_files = glob.glob(os.path.join(output_file_path, "*.csv"))

# Initialize an empty list to store the individual DataFrames
concat_df_list = []

# Loop over the list of processed CSV files with tqdm for progress tracking
for file in tqdm(processed_csv_files, desc="Concatenating CSV files"):
    # Read the csv file and append to concat_df_list
    concat_df_list.append(pd.read_csv(file))

# Concatenate all DataFrames in the list into one DataFrame
combined_df = pd.concat(concat_df_list, ignore_index=True)

# Save the combined DataFrame to the final output path
combined_file_name = "HKO_aws_concatenated_data.csv"
combined_file_path = os.path.join(final_output_path, combined_file_name)
combined_df.to_csv(combined_file_path, index=False)

print(f"Combined CSV saved to: {combined_file_path}")

Concatenating CSV files: 100%|██████████| 1872/1872 [01:30<00:00, 20.76it/s]


Combined CSV saved to: C:\Users\User\Desktop\Final Year Project\Code\Processed_HKO\HKO_aws_concatenated_data.csv


In [4]:
import pandas as pd
df = pd.read_csv("C:\\Users\\User\\Desktop\\Final Year Project\\Code\\Processed_HKO\\HKO_aws_concatenated_data.csv")

In [5]:
df

Unnamed: 0,datetime,A1C_A1,A1C_B1,A1C_C1,A1C_D1,A1C_E1,A1C_F1,A1C_G1,A1C_H1,A1C_I1,...,R3E_y1,R3E_wg2,R3W_wg2,CT5_J1,CT5_L1,CT5_M1,CT5_N1,CT5_O1,CT5_P1,CT5_Q1
0,2018-01-06 00:00:00,93.0,41.0,49.0,101.0,42.0,57.0,100.0,46.0,78.0,...,,,,,,,,,,
1,2018-01-06 00:01:00,95.0,33.0,38.0,99.0,41.0,57.0,100.0,46.0,78.0,...,,,,,,,,,,
2,2018-01-06 00:02:00,96.0,37.0,40.0,98.0,40.0,54.0,100.0,46.0,78.0,...,,,,,,,,,,
3,2018-01-06 00:03:00,96.0,44.0,61.0,97.0,40.0,61.0,100.0,46.0,78.0,...,,,,,,,,,,
4,2018-01-06 00:04:00,98.0,52.0,62.0,97.0,42.0,62.0,100.0,46.0,78.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2695670,2023-09-30 23:55:00,175.0,24.0,26.0,179.0,23.0,28.0,240.0,20.0,43.0,...,120.0,,,232.0,224.0,95.0,281.0,221.0,1603.0,401.0
2695671,2023-09-30 23:56:00,182.0,27.0,30.0,180.0,23.0,30.0,240.0,20.0,43.0,...,120.0,,,232.0,226.0,96.0,281.0,221.0,1603.0,401.0
2695672,2023-09-30 23:57:00,179.0,29.0,31.0,181.0,24.0,31.0,240.0,20.0,43.0,...,120.0,,,232.0,226.0,96.0,281.0,221.0,1603.0,401.0
2695673,2023-09-30 23:58:00,179.0,28.0,31.0,180.0,24.0,31.0,240.0,20.0,43.0,...,120.0,,,232.0,226.0,96.0,281.0,221.0,1603.0,401.0


In [14]:
for i in df.iloc[1000000]:
    print(i)


2020-07-09 10:40:00
238.0
65.0
84.0
225.0
60.0
105.0
220.0
56.0
105.0
55.0
238.0
86.0
225.0
70.0
86.0
232.0
73.0
108.0
220.0
70.0
110.0
69.0
228.0
90.0
236.0
78.0
86.0
235.0
93.0
132.0
230.0
91.0
132.0
82.0
236.0
88.0
159.0
42.0
73.0
186.0
42.0
104.0
190.0
45.0
122.0
nan
nan
nan
nan
nan
nan
nan
232.0
94.0
125.0
251.0
76.0
173.0
220.0
73.0
177.0
309.0
274.0
262.0
76.0
314.0
293.0
10039.0
10047.0
0.0
0.0
0.0
9999.0
3005.0
2411.0
0.0
0.0
9999.0
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
238.0
238.0
238.0
100.0
242.0
232.0
nan
158.0
89.0
105.0
156.0
81.0
150.0
160.0
81.0
150.0
106.0
238.0
66.0
79.0
234.0
60.0
83.0
230.0
56.0
119.0
65.0
237.0
80.0
225.0
79.0
91.0
227.0
83.0
126.0
220.0
77.0
126.0
80.0
232.0
92.0
227.0
101.0
128.0
230.0
90.0
131.0
230.0
74.0
137.0
97.0
227.0
130.0
226.0
87.0
107.0
221.0
93.0
121.0
220.0
92.0
129.0
82.0
224.0
108.0
229.0
80.0
95.0
227.0
87.0
130.0
230.0
80.0
130.0
79.0
228.0
96.0
226.0
80.0
98.0
224.0
83.0
104.0
220.0
81.0
117.0
83.0
226.0
100.0
189.0
50