In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load data from CSV
data = pd.read_csv('stock_data.csv', usecols=['Date', 'Price', 'High', 'Low', 'Volume'])

# Data cleaning
# Handling missing values
data.dropna(inplace=True)

# Extract useful features from date
data['Date'] = pd.to_datetime(data['Date'])
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year

# Define features and target variable
X = data[['Day', 'Month', 'Year', 'Price', 'High', 'Low', 'Volume']]
y = data['Direction']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize Random Forest Classifier
model = RandomForestClassifier(random_state=42)

target_accuracy = 0.8
current_accuracy = 0

while current_accuracy < target_accuracy:
    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')

    # Perform grid search to find the best parameters
    grid_search.fit(X_train_scaled, y_train)

    # Get the best parameters and score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print("Best Parameters:", best_params)
    print("Best Score:", best_score)

    # Train the model with best parameters
    best_model = RandomForestClassifier(**best_params, random_state=42)
    best_model.fit(X_train_scaled, y_train)

    # Make predictions on the testing set
    y_pred = best_model.predict(X_test_scaled)

    # Evaluate the model
    current_accuracy = accuracy_score(y_test, y_pred)
    print(f'Current Accuracy: {current_accuracy:.2f}')

    if current_accuracy < target_accuracy:
        print("Retraining the model...")
    else:
        print("Desired accuracy achieved.")

# Display classification report
print(classification_report(y_test, y_pred))

# Predict whether the stock will go up or down
new_data = pd.DataFrame({
    'Day': [15],
    'Month': [3],
    'Year': [2024],
    'Price': [120],
    'High': [130],
    'Low': [115],
    'Volume': [15000]
})

# Scale the new data
new_data_scaled = scaler.transform(new_data)

# Predict
prediction = best_model.predict(new_data_scaled)
if prediction[0] == 1:
    print("The stock is predicted to go up.")
else:
    print("The stock is predicted to go down.")


ValueError: Usecols do not match columns, columns expected but not found: ['Low', 'Price', 'Date', 'Volume', 'High']

: 

In [None]:
pip install alpha-vantage

: 

: 