In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Load the data
file_path = 'data/bsmax_0_1.csv'
data = pd.read_csv(file_path, delimiter='\t')

# Define the target variable and features
target = 'rtn'
features = data.columns.drop(target).drop('ymd')

X = data[features]
y = data[target]

# Normalize the data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the neural network model
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model on the test set
test_loss = model.evaluate(X_test, y_test, verbose=1)

print(f'Test Mean Squared Error: {test_loss}')


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 14034982.0000 - val_loss: 17435814.0000
Epoch 2/100
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 770us/step - loss: 12702629.0000 - val_loss: 17435948.0000
Epoch 3/100
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 735us/step - loss: 13432244.0000 - val_loss: 17437202.0000
Epoch 4/100
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 733us/step - loss: 16744237.0000 - val_loss: 17438298.0000
Epoch 5/100
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 708us/step - loss: 13135622.0000 - val_loss: 17442350.0000
Epoch 6/100
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 695us/step - loss: 13427006.0000 - val_loss: 17444050.0000
Epoch 7/100
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 730us/step - loss: 13244628.0000 - val_loss: 17448170.0000
Epoch 8/100
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the model
rf = RandomForestRegressor(random_state=42)

# Perform grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Score (MSE): {best_score}')


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   4.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   4.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   4.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   5.1s
[CV] END m

In [14]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Initialize the model
xgb = XGBRegressor(random_state=42)

# Perform grid search
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search_xgb.fit(X_train, y_train)

# Best parameters and best score
best_params_xgb = grid_search_xgb.best_params_
best_score_xgb = -grid_search_xgb.best_score_

print(f'Best Parameters for XGBoost: {best_params_xgb}')
print(f'Best Score (MSE) for XGBoost: {best_score_xgb}')


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END 



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estimators=200, subsample=1.0; total time=   1.9s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estimators=200, subsample=1.0; total time=   2.0s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estimators=300, subsample=0.8; total time=   2.9s
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estimators=300, subsample=0.8; total time=   2.9s
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estimators=300, subsample=0.8; total time=   3.2s
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=3, n_estimators

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the data
file_path = 'data/bsmax_0_1.csv'
data = pd.read_csv(file_path, delimiter='\t')

# Define the target variable and features
target = 'rtn'
features = data.columns.drop(target)

X = data[features]
y = data[target]

# Normalize the data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)

# Predict on the test set
y_pred = linear_regression_model.predict(X_test)

# Calculate the Mean Squared Error on the test set
test_mse = mean_squared_error(y_test, y_pred)

print(f'Test Mean Squared Error: {test_mse}')


Test Mean Squared Error: 3157969.1203146223


In [16]:
from sklearn.linear_model import Ridge

# Initialize and train the Ridge Regression model
ridge_regression_model = Ridge(alpha=1.0)
ridge_regression_model.fit(X_train, y_train)

# Predict on the test set
y_pred_ridge = ridge_regression_model.predict(X_test)

# Calculate the Mean Squared Error on the test set
test_mse_ridge = mean_squared_error(y_test, y_pred_ridge)

print(f'Test Mean Squared Error (Ridge Regression): {test_mse_ridge}')


Test Mean Squared Error (Ridge Regression): 13493437.643980566


In [17]:
from sklearn.linear_model import Lasso

# Initialize and train the Lasso Regression model
lasso_regression_model = Lasso(alpha=0.1)
lasso_regression_model.fit(X_train, y_train)

# Predict on the test set
y_pred_lasso = lasso_regression_model.predict(X_test)

# Calculate the Mean Squared Error on the test set
test_mse_lasso = mean_squared_error(y_test, y_pred_lasso)

print(f'Test Mean Squared Error (Lasso Regression): {test_mse_lasso}')


Test Mean Squared Error (Lasso Regression): 3346413.874615754


  model = cd_fast.enet_coordinate_descent(


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the data
file_path = 'data/bsmax_0_1.csv'
data = pd.read_csv(file_path, delimiter='\t')

# Define the target variable and features
target = 'rtn'
features = data.columns.drop(target).drop('ymd')

X = data[features]
y = data[target]

# Normalize the data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)

# Predict on the test set
y_pred = linear_regression_model.predict(X_test)

# Calculate the Mean Squared Error on the test set
test_mse = mean_squared_error(y_test, y_pred)

print(f'Test Mean Squared Error: {test_mse}')

# Extract the coefficients and intercept
coefficients = linear_regression_model.coef_
intercept = linear_regression_model.intercept_

# Display the coefficients and intercept
print("Coefficients:", coefficients)
print("Intercept:", intercept)

# Create the linear regression formula
formula = f"{intercept}"
for i, feature in enumerate(features):
    formula += f" + ({coefficients[i]} * {feature})"

print("Linear Regression Formula:")
print(f"rtn = {formula}")


Test Mean Squared Error: 3158201.164546838
Coefficients: [-2.76580320e+05  2.73125773e+05  2.03104533e+03 -3.75155317e+02
  2.91684357e+02  1.59949591e+03  2.43092373e+02 -9.85474884e+02
 -2.75135364e+02  1.32234034e+02  1.00626893e+03 -9.49620536e+02
  3.34555684e+02  2.08063896e+03 -1.35406550e+03 -8.90454504e+02
  1.01031854e+02 -5.27928051e+02  2.61449762e+03 -2.28880958e+03
 -3.22261494e+03  9.58150200e+02 -1.82057525e+03 -7.67354513e+01
  2.23402802e+02 -1.24515777e+03  2.09067479e+02]
Intercept: 288.87741937231016
Linear Regression Formula:
rtn = 288.87741937231016 + (-276580.32019477966 * init) + (273125.7730055981 * avg) + (2031.0453307002142 * dev) + (-375.1553165083148 * bkmax) + (291.68435685121784 * skmax) + (1599.4959123373858 * bomega) + (243.09237277792903 * somega) + (-985.4748836215142 * bpri) + (-275.13536372971714 * bfor) + (132.23403426949199 * bmom) + (1006.2689339465715 * bc) + (-949.6205362201598 * bpmax) + (334.5556836728103 * bmax) + (2080.638958311242 * bmomn

In [22]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Load the dataset
file_path = 'data/bsmax_0_1.csv'
data = pd.read_csv(file_path, sep='\t')

# Split the data into 80% for training and 20% for prediction
split_index = int(len(data) * 0.8)
train_data = data[:split_index]
predict_data = data[split_index:]

# Separate features and target variable for training data
X_train = train_data.drop('rtn', axis=1).drop('ymd', axis=1)
y_train = train_data['rtn']

# Normalize the features for training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Separate features and target variable for prediction data
X_predict = predict_data.drop('rtn', axis=1).drop('ymd', axis=1)
y_actual = predict_data['rtn']

# Normalize the features for prediction data
X_predict_scaled = scaler.transform(X_predict)

# Predict on the prediction set
y_pred = model.predict(X_predict_scaled)

# Evaluate the model on the prediction set
r2 = r2_score(y_actual, y_pred)
print(f'R-squared value on prediction data: {r2}')


R-squared value on prediction data: 0.8084930194429341
