In [None]:
# importing packages and loading data
import pandas as pd
import yfinance as yf
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import numpy as np
import joblib

# Fetching the data for the given period
aapl = yf.download("AAPL", start='2024-12-01', end='2024-12-13')

# Generating relevant columns and removing NaN values
aapl['close-open'] = aapl['Close'] - aapl['Open']
# Next day's close minus today's close; the final row has no "next day", so it is NaN
aapl['increase'] = aapl['Close'].shift(-1) - aapl['Close']
# Dependent variable: 1 = up, 0 = unchanged, -1 = down.
# A NaN 'increase' fails both comparisons and would be labelled -1, but that
# row is removed by the dropna() below, so it never reaches the model.
aapl['Y'] = aapl['increase'].apply(lambda x: 1 if x > 0 else (0 if x == 0 else -1))
aapl = aapl.dropna()  # Dropping rows with NaN values

# Features fed to the scaler/PCA/model (predict_next_day_movement reads this list too)
features = ['close-open', 'Open', 'High', 'Low', 'Close', 'Volume']
X = aapl[features]
Y = aapl['Y']

# Splitting the data into training and testing sets.
# The split happens BEFORE any transformer is fitted so that the held-out rows
# cannot influence the scaling statistics or the principal components
# (fitting on the full data set leaks test information into preprocessing).
# NOTE(review): this is a shuffled split of time-ordered data; consider
# shuffle=False for a chronological hold-out.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Scaling the features: fit on the training rows only, then reuse for the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Applying PCA: components are likewise learned from the training rows only
pca = PCA(n_components=3)
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# Full-data projections, kept so that any later cell relying on these names still works
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)

# Training the Logistic Regression model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Making predictions and evaluating the model
y_pred = model.predict(X_test)
joblib.dump(model, 'model.pkl')  # Save the model to disk
joblib.dump(scaler, 'scaler.pkl')  # Save the scaler
joblib.dump(pca, 'pca.pkl')  # Save the PCA
print(classification_report(y_test, y_pred))  # Print classification report


# Function to fetch the latest data and make predictions
def predict_next_day_movement(stock_symbol: str = "AAPL") -> str:
    """Predict the next trading day's price direction for a stock.

    Downloads the last five daily bars for ``stock_symbol``, builds the same
    feature columns used in training (the module-level ``features`` list),
    runs the most recent row through the saved scaler, PCA and model, and
    returns a human-readable sentence.

    Args:
        stock_symbol: Ticker symbol passed to ``yf.download``.

    Returns:
        One of "Stock is predicted to go up.", "Stock is predicted to stay
        the same." or "Stock is predicted to go down.".

    Raises:
        ValueError: If no usable price rows were downloaded, or if the PCA
            output width does not match what the model was trained on.
    """
    # Fetch the latest stock data (last 5 days)
    data = yf.download(stock_symbol, period="5d", interval="1d")

    # Generate the same derived feature as in training
    data['close-open'] = data['Close'] - data['Open']

    # Only the feature columns matter here. The training-time 'increase'
    # column (next close - close) is deliberately NOT computed: it is NaN for
    # the newest row, so a blanket dropna() would discard exactly the day we
    # need to predict from and the forecast would be one day stale.
    feature_rows = data[features].dropna()
    if feature_rows.empty:
        raise ValueError(f"No usable price data returned for {stock_symbol!r}.")

    # Most recent row as a (1, n_features) array
    latest_data = feature_rows.iloc[[-1]].to_numpy()

    # Load the pre-trained scaler, PCA and model (from the saved files)
    scaler = joblib.load('scaler.pkl')
    pca = joblib.load('pca.pkl')
    model = joblib.load('model.pkl')

    # Apply the same scaling and PCA projection as used during training
    latest_data_scaled = scaler.transform(latest_data)
    latest_data_pca = pca.transform(latest_data_scaled)

    # Ensure the PCA output width matches what the model was fitted on
    expected_width = model.n_features_in_
    if latest_data_pca.shape[1] != expected_width:
        raise ValueError(
            f"Expected {expected_width} features after PCA, "
            f"got {latest_data_pca.shape[1]} features."
        )

    # predict() returns a length-1 array; take the scalar label
    predicted_label = model.predict(latest_data_pca)[0]

    # Translate the prediction to a readable format
    if predicted_label == 1:
        prediction_text = "Stock is predicted to go up."
    elif predicted_label == 0:
        prediction_text = "Stock is predicted to stay the same."
    else:
        prediction_text = "Stock is predicted to go down."

    # Return the prediction
    return prediction_text


# Smoke-test: run the predictor for the default ticker and show its message
prediction = predict_next_day_movement(stock_symbol="AAPL")
print(prediction)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

Stock is predicted to go up.





Some ML Projects that I've found:


*   HK/Non-HK Usernames: Analyses the classification of IG usernames as "HK" or non-HK. Shows how to do data cleaning and labelling. Methods used include t-SNE, **logistic regression, random forest**, support vector machines (SVM), and a **confusion matrix**. (https://www.reddit.com/r/learnmachinelearning/comments/1bhvfnd/rate_my_first_ml_project/)

*   Simple NVIDIA stock price predictor: More digestible. Uses random forest. The code for ML, prediction, and backtesting seems easier to understand. Variables used: Close, Volume, Open, High, Low, and the rolling average of the stock price over n days (https://github.com/mar-antaya/predict_nvda/blob/main/predict_nvda.ipynb)

*   GeeksforGeeks Stock Price Predictor: Uses variables such as "open-close", "high-low", and "is-quarter-end" (i.e., whether it is the end of a quarter, when companies release their earnings reports) (https://www.geeksforgeeks.org/stock-price-prediction-using-machine-learning-in-python/)

*   ML Stock Prediction (Neural Network): A different approach. Using just a few predictors to generate forecasts. Seems a bit beyond us for now. But still including this because of what it said in the conclusion - "it would be interesting to factor in sentiment analysis on news and social media regarding the stock market in general" (https://www.analyticsvidhya.com/blog/2021/10/machine-learning-for-stock-market-prediction-with-step-by-step-implementation/)


More finance statistics we could look at (fundamental analysis - which I'm also quite clueless about):

*   Investopedia: Fundamental Analysis.
(https://www.investopedia.com/terms/f/fundamentalanalysis.asp#:~:text=Fundamental%20analysis%20(FA)%20is%20a,related%20economic%20and%20financial%20factors.&text=The%20end%20goal%20is%20to,security%20is%20undervalued%20or%20overvalued)
*   Some key ratios to look at include:  
Price-to-earnings (P/E) ratio - indicates whether a stock is overvalued or undervalued, \
Earnings per share (EPS), \
Return on equity (ROE), \
Debt-to-equity (D/E) ratio - how much of the company's assets are financed by debt (as opposed to equity), \
EBITDA (earnings before interest, taxes, depreciation and amortization)


*   List item
*   List item





