In [None]:
# Silence TensorFlow's native (C++) log output. The variable has to be in the
# environment BEFORE TensorFlow is imported; setting it afterwards is too late
# to suppress the messages emitted while the library loads.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense, Input
from tensorflow.keras.models import Sequential

# Suppress Python-level warnings to keep the notebook output compact.
warnings.filterwarnings('ignore')

In [None]:
# --- 1. CONFIGURATION  ---
FILE_PATH = '/kaggle/input/cryto-data/crypto-2/BITCOIN24.csv' # Replace with your own path
# Cut-off date: rows dated before it form the "pre" sample, rows on/after it the "post" sample.
SPLIT_DATE = '2020-01-01'

In [None]:
# Rolling-window settings. The window was enlarged so each model has enough data to learn from.
WINDOW_SIZE = 365       # Number of past days in each training window
TEST_SAMPLES = 50       # Evaluate on the last 50 days
LAGS = 10                # Number of past days fed to the model to forecast the next day (timesteps)

In [None]:
# --- 2. DATA LOADING & PREPROCESSING (UNCHANGED) ---
def load_and_process_data(filepath, split_date=None):
    """Load a price CSV and split its log-return series around a cut-off date.

    The file must provide a ``Date`` and a ``Close`` column. Rows are sorted
    chronologically, ``Log_Price`` (natural log of ``Close``) and
    ``Log_Return`` (first difference of ``Log_Price``) are added, and every
    row containing a missing value is dropped (this always removes the first
    row, whose return is undefined).

    Args:
        filepath: Path of the CSV file to read.
        split_date: Cut-off date. Rows strictly before it go to the first
            frame, rows on or after it go to the second. Defaults to the
            module-level ``SPLIT_DATE``.

    Returns:
        ``(df_pre, df_post)``: two DataFrames, each with a fresh 0-based index.
    """
    if split_date is None:
        split_date = SPLIT_DATE

    df = pd.read_csv(filepath)
    try:
        # Prefer day-first parsing (e.g. 31/12/2019).
        df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
    except (ValueError, TypeError):
        # Only parsing failures trigger the fallback to pandas' default
        # inference; anything else (e.g. a missing column) must surface.
        df['Date'] = pd.to_datetime(df['Date'])

    df = df.sort_values('Date').reset_index(drop=True)

    # Log transformation and first differencing.
    df['Log_Price'] = np.log(df['Close'])
    df['Log_Return'] = df['Log_Price'].diff()
    df = df.dropna().reset_index(drop=True)

    # Split into the two periods.
    df_pre = df[df['Date'] < split_date].reset_index(drop=True)
    df_post = df[df['Date'] >= split_date].reset_index(drop=True)

    return df_pre, df_post

In [None]:
# Load both periods and report how many observations each one holds.
df_pre, df_post = load_and_process_data(FILE_PATH)
print(len(df_pre), len(df_post), sep='\n')

In [None]:
# --- 3. METRICS ---
def calculate_metrics(y_true, y_pred, model_name, period_name):
    """Summarise forecast quality for one model on one period.

    Both inputs are flattened to 1-D before any metric is computed.

    Args:
        y_true: Realised values.
        y_pred: Forecast values, aligned with ``y_true``.
        model_name: Label stored under ``"Model"``.
        period_name: Label stored under ``"Period"``.

    Returns:
        dict with the labels plus RMSE, MAE, MDA (share of matching signs,
        in percent), MDA+ (the same share restricted to observations whose
        realised value is positive; 0 when there are none) and the summed
        return of trading on the sign of the forecast.
    """
    actual = np.array(y_true).flatten()
    forecast = np.array(y_pred).flatten()

    # Error metrics first, so invalid inputs fail here exactly as before.
    rmse = np.sqrt(mean_squared_error(actual, forecast))
    mae = mean_absolute_error(actual, forecast)

    actual_sign = np.sign(actual)
    forecast_sign = np.sign(forecast)
    sign_hits = actual_sign == forecast_sign

    rising = actual_sign == 1
    mda_plus = np.mean(sign_hits[rising]) * 100 if np.sum(rising) > 0 else 0

    return {
        "Period": period_name,
        "Model": model_name,
        "RMSE": rmse,
        "MAE": mae,
        "MDA (%)": np.mean(sign_hits) * 100,
        "MDA+ (%)": mda_plus,
        "Total Return (Log)": np.sum(forecast_sign * actual),
    }

In [None]:
# ---ROLLING WINDOW ENGINE---
def run_rolling_window(df, model_type, lags=LAGS):
    """Walk-forward, one-step-ahead forecast of ``Log_Return`` on a rolling window.

    For every evaluated index ``i`` the ``WINDOW_SIZE`` returns preceding ``i``
    are min-max scaled (scaler fitted on that window only), turned into
    samples of ``lags`` consecutive values -> the next value, and a fresh
    model is fitted on them. The value at ``i`` is then predicted from the
    last ``lags`` scaled returns of the window and mapped back to the
    original scale.

    The evaluated indices are the last ``TEST_SAMPLES`` observations, or
    everything from ``WINDOW_SIZE`` onwards when ``TEST_SAMPLES`` is falsy.

    Args:
        df: DataFrame with a ``Log_Return`` column.
        model_type: ``'MLP'`` or ``'LSTM'``.
        lags: Number of consecutive past returns used as model input.

    Returns:
        A 4-tuple ``(actuals, predictions, last_mlp, last_scaler_mlp)`` on
        every path. ``last_mlp`` / ``last_scaler_mlp`` are the MLP and scaler
        from the final step, or ``None`` when ``model_type`` is ``'LSTM'`` or
        no step was executed. The lists are empty when the series is shorter
        than ``WINDOW_SIZE + TEST_SAMPLES``.

    Raises:
        ValueError: If ``model_type`` is neither ``'MLP'`` nor ``'LSTM'``.
    """
    if model_type not in ('MLP', 'LSTM'):
        # Previously an unknown type fell through and crashed later with an
        # AttributeError on the integer placeholder prediction.
        raise ValueError(f"Unknown model_type {model_type!r}; expected 'MLP' or 'LSTM'")

    data = df['Log_Return'].values

    predictions = []
    actuals = []
    last_mlp = None
    last_scaler_mlp = None
    n_total = len(data)

    # Not enough history for one full training window plus the test span.
    if n_total < WINDOW_SIZE + TEST_SAMPLES:
        print(f"Lỗi: Dữ liệu quá ngắn ({n_total}) so với Window ({WINDOW_SIZE}) + Test ({TEST_SAMPLES})")
        # Same arity as the normal return so callers can always unpack 4 values.
        return [], [], None, None

    start_index = n_total - TEST_SAMPLES if TEST_SAMPLES else WINDOW_SIZE

    print(f"   + Rolling Window: {start_index} -> {n_total} (Train Window Size: {WINDOW_SIZE})")

    for i in range(start_index, n_total):
        # Training window [i - WINDOW_SIZE, i); the target is the value at i.
        train_data = data[i - WINDOW_SIZE:i].reshape(-1, 1)
        test_val = data[i]

        # Fit the scaler on the training window only (no look-ahead).
        scaler = MinMaxScaler(feature_range=(0, 1))
        train_scaled = scaler.fit_transform(train_data)

        # Supervised samples: `lags` consecutive values -> the following value.
        X_train = np.array([train_scaled[j - lags:j, 0] for j in range(lags, len(train_scaled))])
        y_train = train_scaled[lags:, 0]

        # No sample can be built when the window is not longer than `lags`.
        if len(X_train) == 0:
            print("Lỗi: Window Size quá nhỏ so với Lags, không tạo được mẫu train.")
            break

        # Most recent `lags` observations: the input for the forecast of index i.
        last_sequence = train_scaled[-lags:].reshape(1, -1)

        if model_type == 'MLP':
            # Small network: faster to fit and less prone to overfitting.
            model = MLPRegressor(hidden_layer_sizes=(32, 16), activation='relu',
                                 solver='adam', max_iter=300, random_state=42, early_stopping=True)
            model.fit(X_train, y_train)
            train_loss = model.loss_
            print(f"     MLP Loss: {train_loss:.6f}")
            pred_scaled = model.predict(last_sequence)
            last_mlp = model
            last_scaler_mlp = scaler

        else:  # model_type == 'LSTM' (validated above)
            X_train_lstm = X_train.reshape(X_train.shape[0], lags, 1)
            last_sequence_lstm = last_sequence.reshape(1, lags, 1)

            # A new model is built at every step; drop the global Keras state
            # left by the previous one so memory does not grow over the run.
            tf.keras.backend.clear_session()
            tf.random.set_seed(42)
            model = Sequential()
            model.add(Input(shape=(lags, 1)))
            model.add(LSTM(32, activation='tanh'))
            model.add(Dense(1))
            model.compile(optimizer='adam', loss='mse')
            model.fit(X_train_lstm, y_train, epochs=20, batch_size=32, verbose=0, shuffle=False)
            pred_scaled = model.predict(last_sequence_lstm, verbose=0)

        # Back to the original return scale.
        pred = scaler.inverse_transform(pred_scaled.reshape(-1, 1))[0][0]

        predictions.append(pred)
        actuals.append(test_val)

        if (i - start_index) % 10 == 0:
            print(f"     Step {i}/{n_total}: True={test_val:.5f}, Pred={pred:.5f}")

    return actuals, predictions, last_mlp, last_scaler_mlp

In [None]:
# --- 5. MAIN EXECUTION ---
# Run both models on both periods and collect one metrics row per run.
# run_rolling_window returns (actuals, predictions, last_mlp, last_scaler_mlp);
# star-unpacking keeps every call site valid whatever the number of trailing
# values, instead of failing with "too many values to unpack".
df_pre, df_post = load_and_process_data(FILE_PATH)
results = []

print("\n=== EXPERIMENT 1: PRE-COVID ===")
y_true_pre_mlp, y_pred_pre_mlp, *_mlp_state = run_rolling_window(df_pre, 'MLP')
# Keep the last fitted MLP and its scaler when the engine provides them.
last_mlp, last_scaler_mlp = _mlp_state if len(_mlp_state) == 2 else (None, None)
if y_true_pre_mlp:
    results.append(calculate_metrics(y_true_pre_mlp, y_pred_pre_mlp, "MLP", "Pre-Covid"))

y_true_pre_lstm, y_pred_pre_lstm, *_unused = run_rolling_window(df_pre, 'LSTM')
if y_true_pre_lstm:
    results.append(calculate_metrics(y_true_pre_lstm, y_pred_pre_lstm, "LSTM", "Pre-Covid"))

print("\n=== EXPERIMENT 2: POST-COVID ===")
y_true_post_mlp, y_pred_post_mlp, *_unused = run_rolling_window(df_post, 'MLP')
if y_true_post_mlp:
    results.append(calculate_metrics(y_true_post_mlp, y_pred_post_mlp, "MLP", "Post-Covid"))

y_true_post_lstm, y_pred_post_lstm, *_unused = run_rolling_window(df_post, 'LSTM')
if y_true_post_lstm:
    results.append(calculate_metrics(y_true_post_lstm, y_pred_post_lstm, "LSTM", "Post-Covid"))

# Display
final_df = pd.DataFrame(results)
print("\n================ FINAL RESULTS ================")
print(final_df)

In [None]:
import pandas as pd

# Quick look at the raw file: column names, dtypes and non-null counts.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/cryto-data/crypto-2/BITCOIN24.csv')
df.info()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv("/kaggle/input/cryto-data/crypto-2/BITCOIN24.csv")

# Datetime & sort. Parse day-first, exactly like the loaders used for the
# modelling cells, so the plotted series is ordered the same way as the one
# the models are trained on.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df = df.sort_values('Date').reset_index(drop=True)

# Log-transform
df['log_price'] = np.log(df['Close'])

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.plot(df['Date'], df['log_price'])
ax.set_title("Logarithmic Bitcoin Price Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Log Price")
plt.show()

In [None]:
# Log return = first difference of the log price computed in the previous cell.
df['log_return'] = df['log_price'].diff()

fig, ax = plt.subplots()
ax.plot(df['Date'], df['log_return'])
ax.set_title("First Difference of Log Price (Log Return)")
ax.set_xlabel("Date")
ax.set_ylabel("Log Return")
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
import warnings

warnings.filterwarnings('ignore')

# 1. Load the raw price history
file_path = '/kaggle/input/cryto-data/crypto-2/BITCOIN24.csv' # Adjusted for standard path

# 2. Global preprocessing, as one pipeline: parse dates day-first, order
#    chronologically, add the log price and its first difference, then drop
#    every row with a missing value.
df = (
    pd.read_csv(file_path)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], dayfirst=True))
    .sort_values('Date')
    .reset_index(drop=True)
    .assign(log_price=lambda frame: np.log(frame['Close']))
    .assign(returns=lambda frame: frame['log_price'].diff())
    .dropna()
    .reset_index(drop=True)
)

# --- Data split strategy ---
# Only the observations on or after the split date are used in this experiment.
split_date = '2020-01-01'
post_covid_mask = df['Date'] >= split_date
df_post_covid = df.loc[post_covid_mask].reset_index(drop=True)
print(f"Analyzing Post-Covid Period: {len(df_post_covid)} samples")

# --- Rolling window setup ---
# Rather than one fixed train/test split, a training window of this many rows is used.
TRAIN_WINDOW_SIZE = 300 # e.g., Train on 300 days
TEST_HORIZON = 1        # Predict 1 day ahead

# Lagged returns are the model inputs, laid out oldest first
# (lag_5, lag_4, ..., lag_1) so the LSTM sees them in chronological order.
n_lags = 5
lag_orders = range(n_lags, 0, -1) # 5, 4, 3, 2, 1
df_post_covid = df_post_covid.assign(
    **{f'lag_{k}': df_post_covid['returns'].shift(k) for k in lag_orders}
)

# The first n_lags rows have incomplete lags and are dropped here.
df_model = df_post_covid.dropna().reset_index(drop=True)

# Features & target
feature_cols = [f'lag_{k}' for k in lag_orders] # Chronological order
X = df_model[feature_cols].to_numpy()
y = df_model['returns'].to_numpy()
dates = df_model['Date'].to_numpy()

# --- 3. METRICS DEFINITION ---
def calculate_metrics(y_true, y_pred, model_name="Model", period_name=None):
    """Summarise forecast quality for one model.

    Both inputs are flattened to 1-D first. Without that, a column-shaped
    prediction array ``(n, 1)`` compared against a flat ``(n,)`` target
    broadcasts to ``(n, n)`` and silently corrupts the directional accuracy
    and the strategy return.

    Note: this definition rebinds the name ``calculate_metrics`` used by the
    rolling-window engine earlier in the notebook. ``period_name`` is accepted
    so those four-argument call sites still work after this cell has run.

    Args:
        y_true: Realised values.
        y_pred: Forecast values, aligned with ``y_true``.
        model_name: Label stored under ``"Model"``.
        period_name: Optional label; when given it is stored under
            ``"Period"`` as the first key.

    Returns:
        dict with ``"Model"``, ``"RMSE"``, ``"MAE"``, ``"MDA (%)"`` (share of
        matching signs, in percent) and ``"Total Return"`` (sum of
        ``sign(y_pred) * y_true``), preceded by ``"Period"`` when requested.
    """
    y_true = np.asarray(y_true).flatten()
    y_pred = np.asarray(y_pred).flatten()

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    # Directional accuracy
    direction_true = np.sign(y_true)
    direction_pred = np.sign(y_pred)
    mda = np.mean(direction_true == direction_pred) * 100

    # Trading strategy return: take the position given by the forecast sign.
    total_return = np.sum(direction_pred * y_true)

    metrics = {
        "Model": model_name,
        "RMSE": rmse,
        "MAE": mae,
        "MDA (%)": mda,
        "Total Return": total_return
    }
    if period_name is not None:
        metrics = {"Period": period_name, **metrics}
    return metrics

# --- 4. ROLLING WINDOW TRAINING LOOP ---
# Walk-forward evaluation: for each evaluated row i, both models are refitted
# on the TRAIN_WINDOW_SIZE rows immediately before i and asked to predict row i.
N_EVAL_DAYS = 50  # only the most recent days are evaluated (a full run takes long)

if len(X) <= TRAIN_WINDOW_SIZE:
    raise ValueError(
        f"Need more than {TRAIN_WINDOW_SIZE} samples for one rolling window, got {len(X)}"
    )

# Never start before a full training window is available: a negative slice
# start would silently wrap around to the end of the arrays.
start_index = max(TRAIN_WINDOW_SIZE, len(X) - N_EVAL_DAYS)

print(f"\nStarting Rolling Window Evaluation (Last {len(X) - start_index} days demo)...")

preds_mlp = []
preds_lstm = []
actuals = []

for i in range(start_index, len(X)):
    # Rolling train/test sets: train on [i - TRAIN_WINDOW_SIZE, i), test on row i.
    X_train_roll = X[i-TRAIN_WINDOW_SIZE : i]
    y_train_roll = y[i-TRAIN_WINDOW_SIZE : i]
    X_test_roll = X[i : i+1]
    y_test_roll = y[i : i+1]

    # --- MODEL 1: MLP ---
    # The scaler is fitted on the training window only, then applied to the test row.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_roll)
    X_test_scaled = scaler.transform(X_test_roll)

    mlp = MLPRegressor(hidden_layer_sizes=(50,), max_iter=200, random_state=42)
    mlp.fit(X_train_scaled, y_train_roll)
    pred_mlp = mlp.predict(X_test_scaled)[0]
    preds_mlp.append(pred_mlp)

    # --- MODEL 2: LSTM ---
    # Reshape for LSTM: (Samples, TimeSteps, Features)
    # TimeSteps = n_lags, Features = 1
    X_train_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], n_lags, 1))
    X_test_lstm = X_test_scaled.reshape((X_test_scaled.shape[0], n_lags, 1))

    # A fresh LSTM is built at every step so nothing carries over between windows.
    # Clear the global Keras state left by the previous model (keeps memory flat
    # over the run) and reseed so the run is reproducible.
    tf.keras.backend.clear_session()
    tf.random.set_seed(42)
    model_lstm = Sequential()
    model_lstm.add(Input(shape=(n_lags, 1)))
    # `verbose` is a fit()/predict() option, not a layer argument; passing it
    # to the LSTM constructor raises an unrecognised-keyword error.
    model_lstm.add(LSTM(50, activation='tanh'))
    model_lstm.add(Dense(1))
    model_lstm.compile(optimizer='adam', loss='mse')
    model_lstm.fit(X_train_lstm, y_train_roll, epochs=5, batch_size=32, verbose=0, shuffle=False)

    pred_lstm = model_lstm.predict(X_test_lstm, verbose=0)[0][0]
    preds_lstm.append(pred_lstm)

    actuals.append(y_test_roll[0])

    if i % 10 == 0:
        print(f"Processed step {i}/{len(X)}")

# --- 5. EVALUATION ---
metrics_mlp = calculate_metrics(np.array(actuals), np.array(preds_mlp), "MLP (Rolling)")
metrics_lstm = calculate_metrics(np.array(actuals), np.array(preds_lstm), "LSTM (Rolling)")

results_df = pd.DataFrame([metrics_mlp, metrics_lstm])
print("\n--- Rolling Window Results (Post-Covid Subset) ---")
print(results_df)