In [3]:
import pandas as pd
import math
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import pickle

# Location of the price CSV used when get_data() is called without arguments.
_DEFAULT_CSV_PATH = '/Users/devshah/Documents/WorkSpace/University/year 3/CSC392/Trading_Simulator/data/updated_data.csv'


def get_data(csv_path=_DEFAULT_CSV_PATH, test_size=0.30, random_state=2):
    """
    Load price data from a CSV file and split it into training and test sets.

    The rows are sorted by ``Date`` and then partitioned with
    ``train_test_split``. Features and target are read from the *same* row:
    no shift is applied, so the target is that row's Close, not the
    following day's Close.

    Features: High, Open, Low
    Target: Close

    Parameters
    ----------
    csv_path : str or path-like
        CSV file to read. It must contain the columns ``Date``, ``High``,
        ``Open``, ``Low`` and ``Close``. Defaults to the path originally
        hard-coded in this function.
    test_size : float
        Value forwarded to ``train_test_split`` as ``test_size``
        (default 0.30, i.e. a 70% / 30% split).
    random_state : int
        Seed forwarded to ``train_test_split`` (default 2).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If any of the High/Open/Low/Close columns is absent from the file.
    """
    df = pd.read_csv(csv_path, parse_dates=['Date'])
    print(f"Initial data shape: {df.shape}")

    feature_cols = ['High', 'Open', 'Low']
    target_col = 'Close'

    # Fail early with a message that names the file and the missing columns.
    # KeyError is kept because that is what the plain column lookup raises.
    missing = [col for col in feature_cols + [target_col] if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_path}: missing required column(s): {missing}")

    # Sort data by date
    df = df.sort_values(by='Date')

    # Define feature set and target
    X = df[feature_cols]
    y = df[target_col]

    # NOTE(review): train_test_split shuffles by default, so test rows are
    # interleaved in time with training rows. If a chronological hold-out is
    # intended, confirm and split by date instead.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    print(f"Training data shape: {X_train.shape}, Test data shape: {X_test.shape}")

    return X_train, X_test, y_train, y_test

def main():
    """
    Tune, evaluate and save a Nystroem-approximated kernel ridge regressor.

    Steps, in order:
      1. Obtain the train/test split from ``get_data()``.
      2. Grid-search a StandardScaler -> Nystroem(rbf) -> Ridge pipeline over
         ``n_components`` and ``alpha`` with 5-fold CV, scored by negative MSE.
      3. Print the best parameters, then MSE, R-squared and RMSE of the best
         estimator on the held-out test set.
      4. Print the mean R2 (multiplied by 100) of a shuffled 20-fold
         cross-validation run on the test set.
      5. Pickle the best estimator to 'kernel_ridge_model_approx.pkl' in the
         current working directory.
    """
    X_train, X_test, y_train, y_test = get_data()

    # Features are standardised before the RBF feature map is applied.
    steps = [
        ('scaler', StandardScaler()),
        ('nystroem', Nystroem(kernel='rbf', random_state=2)),
        ('ridge', Ridge()),
    ]

    # Candidate values for the feature-map size and the ridge penalty.
    search_space = {
        'nystroem__n_components': [50, 100, 200, 300],
        'ridge__alpha': [0.01, 0.1, 1.0, 10.0],
    }

    # Exhaustive search over the grid; n_jobs=-1 uses all available cores.
    search = GridSearchCV(
        Pipeline(steps),
        search_space,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    tuned_model = search.best_estimator_

    print("Best parameters found:")
    print(search.best_params_)

    # Hold-out metrics for the tuned pipeline.
    predictions = tuned_model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    rmse = math.sqrt(mse)
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R-squared: {r2:.4f}")
    print(f"RMSE: {rmse:.2f}")

    # 20-fold CV over the test set. Per the scikit-learn docs, cross_val_score
    # fits a fresh clone per fold, so this scores models trained on test-set
    # folds rather than the fitted tuned_model itself.
    splitter = KFold(n_splits=20, shuffle=True, random_state=2)
    fold_r2 = cross_val_score(tuned_model, X_test, y_test, cv=splitter, scoring='r2')
    print("Cross-validated R2: {:.2f}%".format(fold_r2.mean() * 100))

    # Persist the fitted pipeline (scaler + feature map + regressor).
    with open('kernel_ridge_model_approx.pkl', "wb") as out_file:
        pickle.dump(tuned_model, out_file)
    print("Model saved as 'kernel_ridge_model_approx.pkl'")

# Run the training workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Initial data shape: (75100, 7)
Training data shape: (52570, 3), Test data shape: (22530, 3)
Best parameters found:
{'nystroem__n_components': 300, 'ridge__alpha': 0.01}
Mean Squared Error: 1742.93
R-squared: 0.9863
RMSE: 41.75
Cross-validated R2: 97.51%
Model saved as 'kernel_ridge_model_approx.pkl'
