In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load data from CSV
# Only the columns used below are read; the file is expected in the working directory.
csv_path = "stock_data.csv"
try:
    data = pd.read_csv(
        csv_path,
        usecols=["date", "Price", "High", "Low", "Volume", "Direction"],
    )
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() stops the interpreter/kernel
    # without signalling a failure, so a missing file looked like a clean stop.
    # Re-raising keeps the session alive and shows a traceback naming the file.
    raise FileNotFoundError(
        f"CSV file not found: {csv_path!r}. "
        "Place it in the working directory and re-run this cell."
    ) from err

# Data cleaning
# Keep only the rows that have a value in every loaded column.
complete_rows = data.dropna()

# Extract useful features from date
# The date column is parsed once; day, month and year are derived from the
# parsed values and appended as new columns (date itself is replaced in place).
parsed_dates = pd.to_datetime(complete_rows["date"])
data = complete_rows.assign(
    date=parsed_dates,
    Day=parsed_dates.dt.day,
    Month=parsed_dates.dt.month,
    Year=parsed_dates.dt.year,
)

# Define features and target variable
feature_columns = ["Day", "Month", "Year", "Price", "High", "Low", "Volume"]
X = data[feature_columns]
y = data["Direction"]

# Split the data into training and testing sets
# 20% of the rows are held out; the fixed seed makes the split repeatable.
# NOTE(review): the split is shuffled, so test rows can pre-date training rows.
# If the rows are a time series this may leak future information - confirm.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Feature scaling
# The scaler learns its statistics from the training rows only and then
# applies that same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize Random Forest Classifier
model = RandomForestClassifier(random_state=42)

# Accuracy we would like to see on the held-out test set. It is only reported.
# The search is NOT repeated until it is met: with a fixed random_state, a
# fixed grid and fixed data every repetition yields the same model, so the
# former `while current_accuracy < target_accuracy` loop could never finish
# once the first pass fell short. Looping until the test score passes would
# also turn the test set into a model-selection criterion.
target_accuracy = 0.99

# Define parameter grid for hyperparameter tuning
param_grid = {
    "n_estimators": [50, 100, 150],
    "max_depth": [None, 5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# Initialize GridSearchCV (5-fold cross-validation on the training set only)
grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, cv=5, scoring="accuracy"
)

# Perform grid search to find the best parameters
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and their mean cross-validated score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

# GridSearchCV refits the winning configuration on the whole training set by
# default (refit=True), using a clone of `model` and therefore the same
# random_state. Reusing that estimator avoids training the same forest twice.
best_model = grid_search.best_estimator_

# Make predictions on the testing set
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model
current_accuracy = accuracy_score(y_test, y_pred)
print(f"Current Accuracy: {current_accuracy:.2f}")

if current_accuracy < target_accuracy:
    print(
        f"Target accuracy of {target_accuracy:.2f} not reached; "
        "change the features or the parameter grid rather than re-running."
    )
else:
    print("Desired accuracy achieved.")

# Display classification report
print(classification_report(y_test, y_pred))

# Predict whether the stock will go up or down
# One hypothetical observation; the columns are in the same order as the
# training features so the fitted scaler and model can consume it directly.
new_data = pd.DataFrame(
    {
        "Day": [15],
        "Month": [3],
        "Year": [2024],
        "Price": [120],
        "High": [130],
        "Low": [115],
        "Volume": [15000],
    }
)

# Scale the new data with the statistics learned from the training set
new_data_scaled = scaler.transform(new_data)

# Predict
prediction = best_model.predict(new_data_scaled)

# The model returns the class labels it was trained on. The classification
# report above lists them as "Down" and "Up", so the previous comparison with
# the integer 1 could never be true and the cell always printed "go down".
# Compare on the string form so both "Up" and a 1/0 encoding are recognised.
up_labels = {"Up", "1"}
if str(prediction[0]) in up_labels:
    print("The stock is predicted to go up.")
else:
    print("The stock is predicted to go down.")


Best Parameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best Score: 0.9914667128415706
Current Accuracy: 1.00
Desired accuracy achieved.
              precision    recall  f1-score   support

        Down       1.00      1.00      1.00       359
          Up       1.00      1.00      1.00       257

    accuracy                           1.00       616
   macro avg       1.00      1.00      1.00       616
weighted avg       1.00      1.00      1.00       616

The stock is predicted to go down.
