In [7]:
import os
os.chdir("D:\\github\\vn_backtester")
os.getcwd()

'D:\\github\\vn_backtester'

In [27]:
import pandas as pd
import numpy as np

__closepath__ = 'data/vn_stock/price_volume/close_matrix_top30_20120101-20240101.txt'
__openpath__ = 'data/vn_stock/price_volume/open_matrix_top30_20120101-20240101.txt'
__highpath__ = 'data/vn_stock/price_volume/high_matrix_top30_20120101-20240101.txt'
__lowpath__ = 'data/vn_stock/price_volume/low_matrix_top30_20120101-20240101.txt'
__volumepath = 'data/vn_stock/price_volume/volume_matrix_top30_20120101-20240101.txt'

def load_and_process_data(file_path):
    df = pd.read_csv(file_path, sep='\t')
    df.set_index('time', inplace=True)
    df = df.astype(float)
    return df

close = load_and_process_data(__closepath__)
open = load_and_process_data(__openpath__)
high = load_and_process_data(__highpath__)
low = load_and_process_data(__lowpath__)
volume = load_and_process_data(__volumepath)

In [11]:
close

Unnamed: 0_level_0,A32,AAA,AAM,AAS,AAT,AAV,ABB,ABC,ABI,ABR,...,XHC,XLV,XMC,XMD,XMP,XPH,YBC,YBM,YEG,YTC
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-01-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-01-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-12-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-12-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-12-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score

def prepare_data(df, ticker, feature_windows=[9, 13, 21]):
    data = df[[ticker]].dropna()
    data['time_index'] = np.arange(len(data))
    
    for window in feature_windows:
        data[f'SMA_{window}'] = data[ticker].rolling(window=window).mean()  # Simple Moving Average
        data[f'EMA_{window}'] = data[ticker].ewm(span=window, adjust=False).mean()  # Exponential Moving Average
        data[f'Rolling_Var_{window}'] = data[ticker].rolling(window=window).var()  # Rolling Variance
        data[f'Rolling_Std_{window}'] = data[ticker].rolling(window=window).std()  # Rolling Std Dev
    
    data = data.fillna(0)

    # Add lag features
    data['Lag_1'] = data[ticker].shift(1)
    data['Lag_2'] = data[ticker].shift(2)
    data['Lag_3'] = data[ticker].shift(3)
    data = data.fillna(0)
    
    features = [col for col in data.columns if col != ticker]
    X = data[features]
    y = data[ticker]
    
    # Split the data before and after 2022
    X_train = X[X.index < '2022-01-01']
    y_train = y[X.index < '2022-01-01']
    
    X_test = X[X.index >= '2022-01-01']
    y_test = y[X.index >= '2022-01-01']
    
    return X_train, X_test, y_train, y_test

def tune_hyperparameters(X_train, y_train, model_type='ridge', cv_folds=5):
    if model_type == 'ridge':
        model = Ridge()
        param_grid = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0]}
    else:
        model = LinearRegression()
        param_grid = {}

    grid_search = GridSearchCV(model, param_grid, cv=cv_folds, scoring='neg_mean_absolute_error')
    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_, grid_search.best_params_, -grid_search.best_score_

def linear_regression_for_tickers(df, model_type='ridge', cv_folds=5):
    results = {}
    
    overall_train_actuals = []
    overall_train_preds = []
    overall_test_actuals = []
    overall_test_preds = []
    
    tickers = df.columns
    for ticker in tickers:
        X_train, X_test, y_train, y_test = prepare_data(df, ticker)
        
        best_model, best_params, best_cv_mae = tune_hyperparameters(X_train, y_train, model_type, cv_folds)
        best_model.fit(X_train, y_train)
        
        y_train_pred = best_model.predict(X_train)
        y_train_pred = np.clip(y_train_pred, a_min=0, a_max=None)  # Ensure no negative values
        mae_train = mean_absolute_error(y_train, y_train_pred)
        r2_train = r2_score(y_train, y_train_pred)
        
        overall_train_actuals.extend(y_train)
        overall_train_preds.extend(y_train_pred)
        
        y_test_pred = best_model.predict(X_test)
        y_test_pred = np.clip(y_test_pred, a_min=0, a_max=None)  # Ensure no negative values
        mae_test = mean_absolute_error(y_test, y_test_pred)
        r2_test = r2_score(y_test, y_test_pred)
        
        overall_test_actuals.extend(y_test)
        overall_test_preds.extend(y_test_pred)
        
        results[ticker] = {
            'model': best_model,
            'best_params': best_params,
            'best_cv_mae': best_cv_mae,  # Best cross-validated MAE
            'train_predictions': y_train_pred,
            'train_mae': mae_train,
            'train_r2': r2_train,
            'test_predictions': y_test_pred,
            'test_mae': mae_test,
            'test_r2': r2_test,
        }
    
    overall_mae_train = mean_absolute_error(overall_train_actuals, overall_train_preds)
    overall_r2_train = r2_score(overall_train_actuals, overall_train_preds)
    overall_mae_test = mean_absolute_error(overall_test_actuals, overall_test_preds)
    overall_r2_test = r2_score(overall_test_actuals, overall_test_preds)
    
    results['overall'] = {
        'train_mae': overall_mae_train,
        'train_r2': overall_r2_train,
        'test_mae': overall_mae_test,
        'test_r2': overall_r2_test,
    }
    
    return results

In [51]:
results = linear_regression_for_tickers(close, model_type='ridge', cv_folds=5)

overall = results['overall']
print(f'Overall Mean Absolute Error (Train set): {overall["train_mae"]}')
print(f'Overall R-squared (Train set): {overall["train_r2"]}')
print(f'Overall Mean Absolute Error (Test set): {overall["test_mae"]}')
print(f'Overall R-squared (Test set): {overall["test_r2"]}')

for ticker, result in results.items():
    if ticker != 'overall':
        print(f'{ticker} Best Params: {result["best_params"]}, Best Cross-Validation MAE: {result["best_cv_mae"]}')

Overall Mean Absolute Error (Train set): 0.0018202504005618886
Overall R-squared (Train set): 0.9996236695688293
Overall Mean Absolute Error (Test set): 0.08350306496030364
Overall R-squared (Test set): 0.5949736480820393
A32 Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAA Best Params: {'alpha': 0.1}, Best Cross-Validation MAE: 1.297727871776558
AAM Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAS Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAT Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAV Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABB Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABC Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABI Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABR Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABS Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABT Best Params: {'alpha': 0.0

In [52]:
def complete_predictions(df, results):
    full_predictions_df = pd.DataFrame(index=df.index, columns=df.columns)
    
    for ticker, result in results.items():
        if ticker != 'overall':
            train_indices = df[df.index < '2022-01-01'].index
            test_indices = df[df.index >= '2022-01-01'].index
            
            full_predictions_df.loc[train_indices, ticker] = result['train_predictions']
            full_predictions_df.loc[test_indices, ticker] = result['test_predictions']
    
    return full_predictions_df

In [55]:
close_predictions = complete_predictions(close, results)
close_predictions.shape
close_predictions.to_csv('data/vn_stock/linear_models/ridge_close_top30.txt', sep='\t')

In [57]:
results = linear_regression_for_tickers(open, model_type='ridge', cv_folds=5)

overall = results['overall']
print(f'Overall Mean Absolute Error (Train set): {overall["train_mae"]}')
print(f'Overall R-squared (Train set): {overall["train_r2"]}')
print(f'Overall Mean Absolute Error (Test set): {overall["test_mae"]}')
print(f'Overall R-squared (Test set): {overall["test_r2"]}')

for ticker, result in results.items():
    if ticker != 'overall':
        print(f'{ticker} Best Params: {result["best_params"]}, Best Cross-Validation MAE: {result["best_cv_mae"]}')

open_predictions = complete_predictions(open, results)
print(open_predictions.shape)
open_predictions.to_csv('data/vn_stock/linear_models/ridge_open_top30.txt', sep='\t')

Overall Mean Absolute Error (Train set): 0.0019128466357899387
Overall R-squared (Train set): 0.9996088768817099
Overall Mean Absolute Error (Test set): 0.08389403738872889
Overall R-squared (Test set): 0.5933423664856394
A32 Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAA Best Params: {'alpha': 0.1}, Best Cross-Validation MAE: 1.3004980606060648
AAM Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAS Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAT Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAV Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABB Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABC Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABI Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABR Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABS Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABT Best Params: {'alpha': 0.

In [58]:
results = linear_regression_for_tickers(low, model_type='ridge', cv_folds=5)

overall = results['overall']
print(f'Overall Mean Absolute Error (Train set): {overall["train_mae"]}')
print(f'Overall R-squared (Train set): {overall["train_r2"]}')
print(f'Overall Mean Absolute Error (Test set): {overall["test_mae"]}')
print(f'Overall R-squared (Test set): {overall["test_r2"]}')

for ticker, result in results.items():
    if ticker != 'overall':
        print(f'{ticker} Best Params: {result["best_params"]}, Best Cross-Validation MAE: {result["best_cv_mae"]}')

low_predictions = complete_predictions(low, results)
print(low_predictions.shape)
low_predictions.to_csv('data/vn_stock/linear_models/ridge_low_top30.txt', sep='\t')

Overall Mean Absolute Error (Train set): 0.0017863765514465378
Overall R-squared (Train set): 0.9996243428626925
Overall Mean Absolute Error (Test set): 0.08147433372723127
Overall R-squared (Test set): 0.6006767880585194
A32 Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAA Best Params: {'alpha': 0.1}, Best Cross-Validation MAE: 1.2862593589356537
AAM Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAS Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAT Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAV Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABB Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABC Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABI Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABR Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABS Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABT Best Params: {'alpha': 0.

In [59]:
results = linear_regression_for_tickers(high, model_type='ridge', cv_folds=5)

overall = results['overall']
print(f'Overall Mean Absolute Error (Train set): {overall["train_mae"]}')
print(f'Overall R-squared (Train set): {overall["train_r2"]}')
print(f'Overall Mean Absolute Error (Test set): {overall["test_mae"]}')
print(f'Overall R-squared (Test set): {overall["test_r2"]}')

for ticker, result in results.items():
    if ticker != 'overall':
        print(f'{ticker} Best Params: {result["best_params"]}, Best Cross-Validation MAE: {result["best_cv_mae"]}')

high_predictions = complete_predictions(high, results)
print(high_predictions.shape)
high_predictions.to_csv('data/vn_stock/linear_models/ridge_high_top30.txt', sep='\t')

Overall Mean Absolute Error (Train set): 0.001803603586170665
Overall R-squared (Train set): 0.9996311749284944
Overall Mean Absolute Error (Test set): 0.08471479846475546
Overall R-squared (Test set): 0.6011874820260593
A32 Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAA Best Params: {'alpha': 0.1}, Best Cross-Validation MAE: 1.316130735240021
AAM Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAS Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAT Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
AAV Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABB Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABC Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABI Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABR Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABS Best Params: {'alpha': 0.001}, Best Cross-Validation MAE: -0.0
ABT Best Params: {'alpha': 0.00