In [2]:
# Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Fetch stock data
stock_data = yf.Ticker("AAPL")  # Example: Apple Inc.
stock_data = stock_data.history(period="max")

stock_data = stock_data.loc["2000-01-01":"2024-12-31"].copy()

# Prepare features
stock_data['Return'] = stock_data['Close'].pct_change()
stock_data['Moving_Avg_5'] = stock_data['Close'].rolling(window=5).mean()
stock_data['Moving_Avg_20'] = stock_data['Close'].rolling(window=20).mean()
stock_data['Volatility'] = stock_data['Close'].rolling(window=5).std()

# Target Variables
stock_data['Price_Tomorrow'] = stock_data['Close'].shift(-1)
stock_data['Direction'] = (stock_data['Price_Tomorrow'] > stock_data['Close']).astype(int)  # 1 for Up, 0 for Down

# stock_data

# Drop rows with NaN values introduced by rolling and shifting
stock_data = stock_data.dropna()

# Features and target
X = stock_data[['Close', 'Return', 'Moving_Avg_5', 'Moving_Avg_20', 'Volatility']]
y = stock_data['Direction']

# Align X and y to ensure proper indexing
X, y = X.align(y, axis=0)

# Check alignment
print("X shape:", X.shape)
print("y shape:", y.shape)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

# Predictions
y_pred = classifier.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


X shape: (6269, 5)
y shape: (6269,)
Accuracy: 0.5167464114832536

Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.43      0.47       628
           1       0.51      0.60      0.55       626

    accuracy                           0.52      1254
   macro avg       0.52      0.52      0.51      1254
weighted avg       0.52      0.52      0.51      1254



In [None]:
# Use a library like yfinance to fetch data for major indices, such as:
## S&P 500 (^GSPC)
## Nasdaq Composite (^IXIC)
## Dow Jones Industrial Average (^DJI)

# Fetch S&P 500 data
sp500 = yf.Ticker("^GSPC")  # Example: Apple Inc.
sp500 = sp500.history(period="max")

sp500 = sp500.loc["2000-01-01":"2024-12-31"].copy()

sp500['SP500_Return'] = sp500['Close'].pct_change()

# Fetch Nasdaq Composite data
NC = yf.Ticker("^GSPC")  # Example: Apple Inc.
NC = NC.history(period="max")

NC = NC.loc["2000-01-01":"2024-12-31"].copy()

sp500['SP500_Return'] = sp500['Close'].pct_change()

# Align dates with your stock data
sp500 = sp500[['SP500_Return']].reset_index()
stock_data = stock_data.reset_index()

# Merge S&P 500 data with stock data
stock_data = pd.merge(stock_data, sp500, on='Date', how='left')

# Drop NaN values introduced by alignment
stock_data = stock_data.dropna()

stock_data

In [None]:
# Use ETFs (Exchange Traded Funds) representing specific sectors. Examples include:
## Technology: XLK
## Healthcare: XLV
## Financials: XLF
### Fetch and process data for relevant ETFs as with the S&P 500.


# Fetch Technology sector ETF data
tech_etf = yf.Ticker("XLK")  # Example: Apple Inc.
tech_etf = tech_etf.history(period="max")

tech_etf = tech_etf.loc["2000-01-01":"2024-12-31"].copy()

tech_etf['Tech_Return'] = tech_etf['Close'].pct_change()

# Merge Technology ETF data with stock data
tech_etf = tech_etf[['Tech_Return']].reset_index()
stock_data = pd.merge(stock_data, tech_etf, on='Date', how='left')

# Drop NaN values introduced by alignment
stock_data = stock_data.dropna()

stock_data

In [None]:
def add_lagged_features(df, columns, lags):
    """
    Adds lagged features for specified columns.
    Args:
        df (DataFrame): The original DataFrame.
        columns (list): List of column names to lag.
        lags (list): List of lag periods to apply.
    Returns:
        DataFrame: Updated DataFrame with lagged features.
    """
    for col in columns:
        for lag in lags:
            df[f"{col}_lag_{lag}"] = df[col].shift(lag)
    return df

# Add lagged features
lag_columns = ['Close', 'SP500_Return', 'Tech_Return']
lags = [1, 3, 7]
stock_data = add_lagged_features(stock_data, lag_columns, lags)

# Drop NaN values introduced by lagging
stock_data = stock_data.dropna()


In [None]:
def fetch_news_sentiment(ticker):
    """
    Fetches sentiment data for a given stock ticker from a news API.
    Args:
        ticker (str): Stock ticker symbol.
    Returns:
        DataFrame: DataFrame with sentiment scores.
    """
    # Example: Using a hypothetical API to fetch sentiment data
    import requests
    
    # Replace with actual API endpoint and parameters
    api_url = "https://newsapi.org/v2/everything"
    params = {
        'q': ticker,
        'apiKey': 'your_api_key',
        'sortBy': 'relevance',
        'language': 'en',
    }
    response = requests.get(api_url, params=params)
    data = response.json()
    
    # Placeholder: Compute sentiment score (e.g., positive, neutral, negative)
    sentiment_scores = []
    for article in data.get('articles', []):
        # Example sentiment logic: Positive if "positive" keyword found
        sentiment = 1 if "positive" in article['title'].lower() else 0
        sentiment_scores.append({'date': article['publishedAt'], 'sentiment': sentiment})
    
    sentiment_df = pd.DataFrame(sentiment_scores)
    sentiment_df['date'] = pd.to_datetime(sentiment_df['date']).dt.date
    return sentiment_df

In [None]:
def fetch_macro_indicators(start_date, end_date):
    """
    Fetches macroeconomic indicators such as interest rates and inflation.
    Args:
        start_date (str): Start date in 'YYYY-MM-DD' format.
        end_date (str): End date in 'YYYY-MM-DD' format.
    Returns:
        DataFrame: DataFrame with macroeconomic indicators.
    """
    from fredapi import Fred

    # Initialize FRED API with your API key
    fred = Fred(api_key='your_fred_api_key')

    # Fetch indicators (examples: Federal Funds Rate, CPI)
    interest_rates = fred.get_series('FEDFUNDS', start_date, end_date)
    inflation = fred.get_series('CPIAUCSL', start_date, end_date)

    # Combine into a DataFrame
    macro_data = pd.DataFrame({
        'Date': interest_rates.index,
        'Interest_Rate': interest_rates.values,
        'Inflation': inflation.reindex(interest_rates.index, method='ffill').values
    }).reset_index(drop=True)
    return macro_data

In [None]:
# Create a feature for relative strength
stock_data['Relative_SP500'] = stock_data['Return'] - stock_data['SP500_Return']
stock_data['Relative_Tech'] = stock_data['Return'] - stock_data['Tech_Return']

# Moving averages
stock_data['SP500_MA_5'] = stock_data['SP500_Return'].rolling(window=5).mean()
stock_data['Tech_MA_5'] = stock_data['Tech_Return'].rolling(window=5).mean()

stock_data


In [None]:
# Features and target
X = stock_data[['Close', 'Return', 'Moving_Avg_5', 'Moving_Avg_20', 'Volatility',
                'SP500_Return', 'Tech_Return', 'Relative_SP500', 'Relative_Tech', 
                'SP500_MA_5', 'Tech_MA_5']]
y = stock_data['Direction']

# Align X and y to ensure proper indexing
X, y = X.align(y, axis=0)

# Check alignment
print("X shape:", X.shape)
print("y shape:", y.shape)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
# Synthetic Minority Oversampling Technique (SMOTE) generates synthetic samples for the minority class
smote = SMOTE(random_state=42) 
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train Random Forest Classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

# Predictions
y_pred = classifier.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


In [None]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Initialize Random Forest model
rf = RandomForestClassifier(random_state=42)

# GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Use the best model for evaluation
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
