In [5]:
from IPython.core.interactiveshell import InteractiveShell
# Notebook-wide display setting: with "all", IPython shows the value of every
# top-level expression statement in a cell rather than only the last one.
InteractiveShell.ast_node_interactivity = "all" 

In [6]:
import pandas as pd

# Load the hourly BTC dataset from its Parquet file into a DataFrame.
btc_data = pd.read_parquet(path='../datasets/btc_data_hourly.parquet')

# Preview the first five rows.
btc_data.head(n=5)

Unnamed: 0,datetime,Open,High,Low,Close,Volume
0,2021-01-01 00:00:00,28912.47,28940.0,28896.36,28897.2,14.556408
1,2021-01-01 01:00:00,28949.71,28971.19,28934.74,28951.62,2.90925
2,2021-01-01 02:00:00,29200.0,29200.0,29141.89,29182.39,6.57342
3,2021-01-01 03:00:00,28987.6,29036.54,28987.6,29036.54,2.05827
4,2021-01-01 04:00:00,29135.25,29161.46,29084.48,29086.47,4.388466


In [8]:
# Earliest and latest values of the 'datetime' column, i.e. the span the data covers.
min_timestamp, max_timestamp = btc_data['datetime'].min(), btc_data['datetime'].max()

# Shown as (latest, earliest).
(max_timestamp, min_timestamp)

(Timestamp('2023-02-01 00:00:00'), Timestamp('2021-01-01 00:00:00'))

In [10]:
# Order the rows newest-first (into a new frame; btc_data itself is left as-is)
# and print the five most recent rows.
btc_data_sorted = btc_data.sort_values('datetime', ascending=False)
print(btc_data_sorted.head(5))

                 datetime     Open     High      Low    Close     Volume
18264 2023-02-01 00:00:00  23170.0  23177.0  23165.0  23175.0  19.798608
18263 2023-01-31 23:00:00  23157.0  23157.0  23140.0  23140.0   0.101278
18262 2023-01-31 22:00:00  23110.0  23118.0  23108.0  23118.0   1.162838
18261 2023-01-31 21:00:00  23128.0  23131.0  23128.0  23131.0   0.242541
18260 2023-01-31 20:00:00  23129.0  23140.0  23122.0  23124.0   0.304229


In [12]:
import json
import os

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Min-max normalise the price/volume columns of `btc_data` and persist both the
# scaled data (Parquet) and the per-column min/max needed to invert the scaling
# later (JSON).

# Output locations
parquet_file_path = "../datasets/normalised_bitcoin_price_hours.parquet"
json_file_path = "../datasets/scaler_params_hours.json"

# Make sure the directories we actually write to exist. They are derived from
# the output paths so they cannot drift apart from where the files are saved.
for output_path in (parquet_file_path, json_file_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Columns to scale. 'datetime' is already a regular column of btc_data and is
# deliberately left out of the scaling; btc_data itself is not modified, so
# this cell can be re-run safely.
features = ['Open', 'High', 'Low', 'Close', 'Volume']
btc_data_subset = btc_data[features]

# Fit the scaler and map every feature column into the range [0, 1].
# The original index is kept on the scaled frame so that the 'datetime'
# column inserted below lines up row-for-row, whatever btc_data's index is.
scaler = MinMaxScaler(feature_range=(0, 1))
btc_data_scaled = pd.DataFrame(
    scaler.fit_transform(btc_data_subset),
    columns=features,
    index=btc_data.index,
)

# Put the unscaled 'datetime' column back as the first column.
btc_data_scaled.insert(0, 'datetime', btc_data['datetime'])

# Save scaled data as a Parquet file (the index carries no information).
btc_data_scaled.to_parquet(parquet_file_path, index=False)

# Save the per-column min/max of the unscaled data so values can be
# de-normalised later without the fitted scaler object.
scaler_params = {
    "min": btc_data_subset.min().to_dict(),
    "max": btc_data_subset.max().to_dict()
}
with open(json_file_path, "w") as f:
    json.dump(scaler_params, f, indent=4)

print(f"Normalized data saved at: {parquet_file_path}")
print(f"Scaler parameters saved at: {json_file_path}")


Normalized data saved at: ../datasets/normalised_bitcoin_price_hours.parquet
Scaler parameters saved at: ../datasets/scaler_params_hours.json


In [13]:
# Load the normalised dataset back from its Parquet file.
normalized_btc_data = pd.read_parquet(path='../datasets/normalised_bitcoin_price_hours.parquet')

# Preview the first five rows.
normalized_btc_data.head(n=5)

Unnamed: 0,datetime,Open,High,Low,Close,Volume
0,2021-01-01 00:00:00,0.25073,0.251106,0.250628,0.250493,0.076344
1,2021-01-01 01:00:00,0.251433,0.251695,0.251353,0.25152,0.015258
2,2021-01-01 02:00:00,0.256154,0.256011,0.255263,0.255877,0.034476
3,2021-01-01 03:00:00,0.252147,0.252928,0.252351,0.253123,0.010795
4,2021-01-01 04:00:00,0.254932,0.255284,0.25418,0.254066,0.023016
