In [25]:
# Import Packages
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from boruta import BorutaPy
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Suppress warnings
warnings.filterwarnings('ignore')

df = pd.read_csv("data.csv")

def updown_nextDay(row):
    if row['diff_Close_next_1day'] > 0:
        return 1        
    else:
        return 0

# Apply the function to each row
df['nextDay_updown'] = df.apply(updown_nextDay, axis=1)
df.drop(columns=['Close','Close_next_1day','diff_Close_next_1day'],inplace=True)



### Feature Selection

In [26]:
def boruta_top_10_features(df, target_column, max_iter=200):
    # Separate features and target
    X = df.drop(target_column, axis=1).values
    y = df[target_column].values
    
    # Define a random forest classifier
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5, random_state=42)
    
    # Initialize Boruta
    boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42, max_iter=max_iter)
    
    # Fit Boruta
    boruta_selector.fit(X, y)
    
    # Get feature importance ranking
    ranking = boruta_selector.ranking_
    
    # Get feature names
    feature_names = df.drop(target_column, axis=1).columns
    
    # Create a DataFrame with feature names and their rankings
    feature_ranking_df = pd.DataFrame({
        'Feature': feature_names,
        'Ranking': ranking
    })
    
    # Sort the features by ranking (lower is better)
    feature_ranking_df = feature_ranking_df.sort_values(by='Ranking').reset_index(drop=True)
    
    # Select top 10 features
    top_10_features = feature_ranking_df.head(10)['Feature'].values
    
    # Return a DataFrame with the top 10 features
    top_10_features_df = df[top_10_features]
    top_10_features_df[target_column] = df[target_column]

    return top_10_features_df

df_FS = boruta_top_10_features(df,'nextDay_updown')

X = df_FS.drop('nextDay_updown', axis=1)
y = df_FS['nextDay_updown']


Iteration: 	1 / 200
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	2 / 200
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	3 / 200
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	4 / 200
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	5 / 200
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	6 / 200
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	7 / 200
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	8 / 200
Confirmed: 	0
Tentative: 	0
Rejected: 	30


BorutaPy finished running.

Iteration: 	9 / 200
Confirmed: 	0
Tentative: 	0
Rejected: 	30


### Logistic Regression

In [27]:
acc_max = 0

for i in range(1000):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

    # Initialize the logistic regression model
    log_reg = LogisticRegression(solver='liblinear')  # Use 'liblinear' solver for small datasets

    
    log_reg.fit(X_train, y_train)

    # Predict on the test set
    y_pred = log_reg.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    
    if accuracy > acc_max:
        acc_max = accuracy

print(accuracy)
    

0.532258064516129


### Naive Bayes

In [28]:
acc_max = 0

for i in range(100):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Initialize the Naive Bayes
    nb_classifier = GaussianNB()

    # Fit GridSearchCV to the training data
    nb_classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = nb_classifier.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred) 

    if accuracy > acc_max:
        acc_max = accuracy

print(accuracy)

0.6190476190476191


### Random Forest

In [29]:
acc_max = 0

for i in range(100):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Initialize the Random Forest Classifier
    rf_classifier = RandomForestClassifier()

    # Fit GridSearchCV to the training data
    rf_classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = rf_classifier.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred) 

    if accuracy > acc_max:
        acc_max = accuracy

print(accuracy)

0.6904761904761905


In [30]:
acc_max = 0

for i in range(100):

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Initialize the XGBoost Classifier
    xgb_classifier = XGBClassifier()

    # Fit GridSearchCV to the training data
    xgb_classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = xgb_classifier.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    
    if accuracy > acc_max:
        acc_max = accuracy

print(accuracy)

0.5952380952380952
