In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset
df = pd.read_csv('Dataset\doge_dataset_day_ohlcvm.csv')

# Drop rows with missing values
df.dropna(inplace=True)

# Features and target
X = df[['open', 'high', 'low', 'volume_DOGE', 'market_cap', 'volume']]  # You can add more features if available
y = df['close']  # Target variable: closing price

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")


Mean Squared Error: 0.0000
R² Score: 0.9962


## Correlation Matrix

In [3]:
print(df[['open', 'close', 'high', 'low', 'volume_DOGE', 'market_cap', 'volume']].corr())

                 open     close      high       low  volume_DOGE  market_cap  \
open         1.000000  0.995383  0.996608  0.996618     0.068783    0.748095   
close        0.995383  1.000000  0.998233  0.996842     0.079557    0.748373   
high         0.996608  0.998233  1.000000  0.994104     0.088358    0.740723   
low          0.996618  0.996842  0.994104  1.000000     0.060686    0.757410   
volume_DOGE  0.068783  0.079557  0.088358  0.060686     1.000000    0.066979   
market_cap   0.748095  0.748373  0.740723  0.757410     0.066979    1.000000   
volume       0.724094  0.727230  0.731679  0.719526     0.219756    0.531926   

               volume  
open         0.724094  
close        0.727230  
high         0.731679  
low          0.719526  
volume_DOGE  0.219756  
market_cap   0.531926  
volume       1.000000  


From the correlation matrix, we can see that market cap and price is correlated. Now there are two options:
- Building a time aware model: Model depend on the past data instead of the derived current values.
- Remove market cap from the features
I will focus more one first option as it could be more realistic

# Building a time-aware model

## Linear Regression, Random Forest, XGBoost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src import lag_features
from src.tuning import tune_random_forest, tune_xgboost, tune_lightgbm
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score

# --- Loop through different lag settings ---
for lag in range(1, 8):
    print(f"\n📊 Evaluating models with {lag}-day lag features:")
    X_train, X_test, y_train, y_test = lag_features.create_lag_features(lag)

    # Tune hyperparameters for Random Forest, XGBoost, and LightGBM
    best_rf, mse_rf, r2_rf = tune_random_forest(X_train, y_train, X_test, y_test)
    best_xgb, mse_xgb, r2_xgb = tune_xgboost(X_train, y_train, X_test, y_test)
    best_lgb, mse_lgb, r2_lgb = tune_lightgbm(X_train, y_train, X_test, y_test)

    # --- Train Models ---
    models = {
        "Linear Regression": LinearRegression(),
        "Random Forest": best_rf,
        "XGBoost": best_xgb,
        "LightGBM": best_lgb
    }

    predictions = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        predictions[name] = preds
        mse = mean_squared_error(y_test, preds)
        r2 = r2_score(y_test, preds)
        print(f"{name} - MSE: {mse:.6f}, R²: {r2:.4f}")

    # --- Plot Predictions ---
    plt.figure(figsize=(12, 6))
    plt.plot(y_test.values, label='Actual Price', color='black')

    for name, preds in predictions.items():
        linestyle = {
            "Linear Regression": "--",
            "Random Forest": "-.",
            "XGBoost": ":",
            "LightGBM": "-"
        }[name]
        color = {
            "Linear Regression": "red",
            "Random Forest": "green",
            "XGBoost": "orange",
            "LightGBM": "purple"
        }[name]
        plt.plot(preds, label=name, linestyle=linestyle, color=color)

    plt.title(f'📈 Actual vs Predicted Dogecoin Prices ({lag}-Day Lag)')
    plt.xlabel('Test Sample Index')
    plt.ylabel('Price (USD)')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    # plt.show()



📊 Evaluating models with 1-day lag features:


# Tuning
## Random Forest

In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

param_dist_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None] 
}


rf_random = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=20,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1
)

rf_random.fit(X_train, y_train)
best_rf = rf_random.best_estimator_
y_pred_random = best_rf.predict(X_test)
# Evaluation
mse_rf = mean_squared_error(y_test, y_pred_random)
r2_rf = r2_score(y_test, y_pred_random)
print(f"Best Random Forest Parameters: {rf_random.best_params_}")
print(f"Mean Squared Error: {mse_rf:.4f}")
print(f"R² Score: {r2_rf:.4f}")

Best Random Forest Parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10}
Mean Squared Error: 0.0007
R² Score: 0.8873


## XGBoost Tuning

In [6]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

param_dist_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb_random = RandomizedSearchCV(
    XGBRegressor(random_state=42),
    param_distributions=param_dist_xgb,
    n_iter=20,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1
)

xgb_random.fit(X_train, y_train)
best_xgb = xgb_random.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)
# Evaluation
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print(f"Best XGBoost Parameters: {xgb_random.best_params_}")
print(f"Mean Squared Error: {mse_xgb:.4f}")
print(f"R² Score: {r2_xgb:.4f}")

Best XGBoost Parameters: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
Mean Squared Error: 0.0014
R² Score: 0.7749


## LightGBM Tuning

In [7]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV

param_dist_lgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, -1],
    'num_leaves': [31, 50, 100],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

lgb_random = RandomizedSearchCV(
    LGBMRegressor(random_state=42),
    param_distributions=param_dist_lgb,
    n_iter=20,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1
)

lgb_random.fit(X_train, y_train)
best_lgb = lgb_random.best_estimator_
y_pred_lgb = best_lgb.predict(X_test)
# Evaluation
mse_lgb = mean_squared_error(y_test, y_pred_lgb)
r2_lgb = r2_score(y_test, y_pred_lgb)
print(f"Best LightGBM Parameters: {lgb_random.best_params_}")
print(f"Mean Squared Error: {mse_lgb:.4f}")
print(f"R² Score: {r2_lgb:.4f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000285 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1027
[LightGBM] [Info] Number of data points in the train set: 2929, number of used features: 6
[LightGBM] [Info] Start training from score 0.027435
Best LightGBM Parameters: {'subsample': 1.0, 'num_leaves': 50, 'n_estimators': 300, 'max_depth': -1, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
Mean Squared Error: 0.0005
R² Score: 0.9215


## Tommorrows price

In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Load & Sort Data
df = pd.read_csv('Dataset/doge_dataset_day_ohlcvm.csv', parse_dates=['date'])
df = df.sort_values('date')

# Step 2: Create Lag Feature for Prediction
last = df.iloc[-1]
X_tomorrow = pd.DataFrame([[
    last['market_cap'], 
    last['volume'], 
    last['volume_DOGE'], 
    last['open'], 
    last['high'], 
    last['low']
]], columns=[
    'market_cap_lag1', 'volume_lag1', 'volume_DOGE_lag1', 
    'open_lag1', 'high_lag1', 'low_lag1'
])

# Step 3: Build Lagged Dataset
df['market_cap_lag1'] = df['market_cap'].shift(1)
df['volume_lag1'] = df['volume'].shift(1)
df['volume_DOGE_lag1'] = df['volume_DOGE'].shift(1)
df['open_lag1'] = df['open'].shift(1)
df['high_lag1'] = df['high'].shift(1)
df['low_lag1'] = df['low'].shift(1)
df.dropna(inplace=True)

# Step 4: Split Data
features = ['market_cap_lag1', 'volume_lag1', 'volume_DOGE_lag1', 
            'open_lag1', 'high_lag1', 'low_lag1']
X = df[features]
y = df['close']
train_size = int(len(df) * 0.7)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]

# Step 5: Train All Models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=200, min_samples_split=2, min_samples_leaf=1, max_depth=10),
    'XGBoost': XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    'LightGBM': LGBMRegressor(n_estimators=300, learning_rate=0.1, random_state=42)
}

# Step 6: Fit and Predict for Today
print(f"📅 Based on yesterday’s data ({last['date'].date()}),")
for name, model in models.items():
    model.fit(X_train, y_train)
    predicted_price = model.predict(X_tomorrow)[0]
    print(f"🔹 {name} predicted Dogecoin price: ${predicted_price:.6f}")


📅 Based on yesterday’s data (2025-06-27),
🔹 Linear Regression predicted Dogecoin price: $0.157493
🔹 Random Forest predicted Dogecoin price: $0.163134
🔹 XGBoost predicted Dogecoin price: $0.169618
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000087 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1027
[LightGBM] [Info] Number of data points in the train set: 2929, number of used features: 6
[LightGBM] [Info] Start training from score 0.027435
🔹 LightGBM predicted Dogecoin price: $0.161885
