In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Load the data
file_path = 'data/bsmax_0_1.csv'
data = pd.read_csv(file_path, delimiter='\t')

# Define the target variable and features
target = 'rtn'
features = data.columns.drop(target).drop('ymd')

X = data[features]
y = data[target]

# Normalize the data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the neural network model
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model on the test set
test_loss = model.evaluate(X_test, y_test, verbose=1)

print(f'Test Mean Squared Error: {test_loss}')


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 14569781.0000 - val_loss: 17435394.0000
Epoch 2/100
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 770us/step - loss: 13093278.0000 - val_loss: 17435736.0000
Epoch 3/100
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 682us/step - loss: 13085485.0000 - val_loss: 17436602.0000
Epoch 4/100
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 682us/step - loss: 15290623.0000 - val_loss: 17438854.0000
Epoch 5/100
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 663us/step - loss: 12757020.0000 - val_loss: 17440302.0000
Epoch 6/100
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 684us/step - loss: 14658835.0000 - val_loss: 17441386.0000
Epoch 7/100
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 663us/step - loss: 14216333.0000 - val_loss: 17452274.0000
Epoch 8/100
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the model
rf = RandomForestRegressor(random_state=42)

# Perform grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Score (MSE): {best_score}')


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   5.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   5.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   5.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   5.2s
[CV] END m

In [3]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Initialize the model
xgb = XGBRegressor(random_state=42)

# Perform grid search
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search_xgb.fit(X_train, y_train)

# Best parameters and best score
best_params_xgb = grid_search_xgb.best_params_
best_score_xgb = -grid_search_xgb.best_score_

print(f'Best Parameters for XGBoost: {best_params_xgb}')
print(f'Best Score (MSE) for XGBoost: {best_score_xgb}')


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END 



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estimators=200, subsample=1.0; total time=   2.0s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estimators=200, subsample=1.0; total time=   2.1s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estimators=200, subsample=1.0; total time=   2.0s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estimators=200, subsample=0.8; total time=   2.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estimators=200, subsample=1.0; total time=   2.1s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estimators=200, subsample=1.0; total time=   2.1s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estimators=300, subsample=0.8; total time=   3.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estimators=300, subsample=0.8; total time=   3.3s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estima

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the data
file_path = 'data/bsmax_0_1.csv'
data = pd.read_csv(file_path, delimiter='\t')

# Define the target variable and features
target = 'rtn'
features = data.columns.drop(target)

X = data[features]
y = data[target]

# Normalize the data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)

# Predict on the test set
y_pred = linear_regression_model.predict(X_test)

# Calculate the Mean Squared Error on the test set
test_mse = mean_squared_error(y_test, y_pred)

print(f'Test Mean Squared Error: {test_mse}')


Test Mean Squared Error: 3157969.1203146223


In [6]:
from sklearn.linear_model import Ridge

# Initialize and train the Ridge Regression model
ridge_regression_model = Ridge(alpha=1.0)
ridge_regression_model.fit(X_train, y_train)

# Predict on the test set
y_pred_ridge = ridge_regression_model.predict(X_test)

# Calculate the Mean Squared Error on the test set
test_mse_ridge = mean_squared_error(y_test, y_pred_ridge)

print(f'Test Mean Squared Error (Ridge Regression): {test_mse_ridge}')


Test Mean Squared Error (Ridge Regression): 13493437.643980566


In [8]:
from sklearn.linear_model import Lasso

# Initialize and train the Lasso Regression model
lasso_regression_model = Lasso(alpha=0.1)
lasso_regression_model.fit(X_train, y_train)

# Predict on the test set
y_pred_lasso = lasso_regression_model.predict(X_test)

# Calculate the Mean Squared Error on the test set
test_mse_lasso = mean_squared_error(y_test, y_pred_lasso)

print(f'Test Mean Squared Error (Lasso Regression): {test_mse_lasso}')


Test Mean Squared Error (Lasso Regression): 3346413.874615754


  model = cd_fast.enet_coordinate_descent(


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the data
file_path = 'data/bsmax_0_1.csv'
data = pd.read_csv(file_path, delimiter='\t')

# Define the target variable and features
target = 'rtn'
features = data.columns.drop(target)

X = data[features]
y = data[target]

# Normalize the data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)

# Predict on the test set
y_pred = linear_regression_model.predict(X_test)

# Calculate the Mean Squared Error on the test set
test_mse = mean_squared_error(y_test, y_pred)

print(f'Test Mean Squared Error: {test_mse}')

# Extract the coefficients and intercept
coefficients = linear_regression_model.coef_
intercept = linear_regression_model.intercept_

# Display the coefficients and intercept
print("Coefficients:", coefficients)
print("Intercept:", intercept)

# Create the linear regression formula
formula = f"{intercept}"
for i, feature in enumerate(features):
    formula += f" + ({coefficients[i]} * {feature})"

print("Linear Regression Formula:")
print(f"rtn = {formula}")


Test Mean Squared Error: 3157969.1203146223
Coefficients: [-6.95672408e+00 -2.76584779e+05  2.73136584e+05  2.02960303e+03
 -3.75427720e+02  2.91911840e+02  1.59936659e+03  2.43928600e+02
 -9.84855004e+02 -2.75968356e+02  1.32325046e+02  1.00667524e+03
 -9.51097920e+02  3.34975667e+02  2.08223985e+03 -1.35553773e+03
 -8.91060520e+02  1.01091936e+02 -5.28421638e+02  2.61214705e+03
 -2.28733102e+03 -3.21895297e+03  9.59072334e+02 -1.81947364e+03
 -7.48749157e+01  2.21672255e+02 -1.24692881e+03  2.07257753e+02]
Intercept: 288.56409730386065
Linear Regression Formula:
rtn = 288.56409730386065 + (-6.9567240817954294 * ymd) + (-276584.77920420776 * init) + (273136.58443820616 * avg) + (2029.6030314611844 * dev) + (-375.4277197300624 * bkmax) + (291.911840490251 * skmax) + (1599.3665855090696 * bomega) + (243.92859954530905 * somega) + (-984.8550042158589 * bpri) + (-275.96835550639247 * bfor) + (132.3250461959597 * bmom) + (1006.6752365869751 * bc) + (-951.0979199894239 * bpmax) + (334.97566

In [11]:
# Load the data
file_path = 'data/bsmax_0_1.csv'
data1 = pd.read_csv(file_path, delimiter='\t')
data1 = data1.columns.drop('rtn').drop('ymd')
data1

Index(['init', 'avg', 'dev', 'bkmax', 'skmax', 'bomega', 'somega', 'bpri',
       'bfor', 'bmom', 'bc', 'bpmax', 'bmax', 'bmomn', 'bcn', 'bpmaxn',
       'bmaxn', 'spri', 'sfor', 'smom', 'sc', 'spmax', 'smax', 'smomn', 'scn',
       'spmaxn', 'smaxn'],
      dtype='object')