In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
import alpaca_trade_api as trade_api
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report


In [2]:
# laod the .env variables to the notebook
load_dotenv()

True

## Part 1 - Data Collection

### Collect the historical price data using alpaca API

In [3]:
# Import the keys required to draw data from the API
alpaca_secret_key = os.getenv('ALPACA_SECRET_KEY') 
alpaca_api_key = os.getenv('ALPACA_API_KEYID')

# Create an API object
alpaca = trade_api.REST(alpaca_api_key,
                       alpaca_secret_key,
                       api_version = 'v2')

In [4]:
# Set up the start and end dates in ISO format
start = pd.Timestamp('2022-05-31', tz = 'America/New_York').isoformat()
end = pd.Timestamp('2025-05-31', tz = 'America/New_York').isoformat()

# Set the ticker
ticker = 'AAPL'

# Set the timeframe to 1 hour for the API
timeframe = '1Hour'

# Get the current closing prices of the stock
df_prices = alpaca.get_bars(ticker,
                           timeframe,
                           start = start,
                           end = end).df

# Rename the index
df_prices.index.rename('Date', inplace = True)

# Display data acquired
df_prices

Unnamed: 0_level_0,close,high,low,trade_count,open,volume,vwap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-05-31 08:00:00+00:00,150.0100,151.0000,148.8900,2814,150.00,100023,149.774887
2022-05-31 09:00:00+00:00,149.0500,150.0000,148.9900,1943,150.00,71538,149.262514
2022-05-31 10:00:00+00:00,149.2300,149.3700,148.7300,1139,149.01,52145,149.007077
2022-05-31 11:00:00+00:00,149.5000,149.7800,149.1800,2972,149.25,147931,149.450164
2022-05-31 12:00:00+00:00,148.6898,150.6900,148.5900,8864,149.44,707354,149.125127
...,...,...,...,...,...,...,...
2025-05-30 19:00:00+00:00,200.6500,201.9600,199.2889,154879,199.32,15011350,200.734151
2025-05-30 20:00:00+00:00,200.6100,201.1000,200.2000,4475,200.65,29375065,200.847757
2025-05-30 21:00:00+00:00,200.6400,200.6500,200.4000,700,200.60,162112,200.508232
2025-05-30 22:00:00+00:00,200.4000,200.6500,200.4000,507,200.60,25292,200.484558


## Part 2 - Feature Engineering

Some technical indicators are to be calculated to use as features to be fed to the machine learining model

These indicators include:

- 50-day and 200-day Simple Moving Averages (SMA): Trend-following indicators.

- Relative Strength Index (RSI): Measures the speed and change of price movements.

- Average True Range (ATR): Measures volatility.

In [5]:
# Calculate moving averages
df_prices['SMA_50'] = df_prices['close'].rolling(50).mean()
df_prices['SMA_200'] = df_prices['close'].rolling(200).mean()


In [6]:
# Calculate Relative Strength Index (RSI)
delta = df_prices['close'].diff()
gains = delta.where(delta > 0, 0)
losses = -delta.where(delta < 0, 0)

gains_mean = gains.rolling(14).mean()
losses_mean = losses.rolling(14).mean()

rs = gains_mean / losses_mean
df_prices['RSI'] = 100 - (100 / (1 + rs))


In [7]:
# Calculate the 14-day Average True Range (ATR) for volatility
high_low = df_prices['close'] - df_prices['low']
high_close = df_prices['high'] - (df_prices['close'].shift()).abs()
low_close = df_prices['low'] - (df_prices['close'].shift()).abs()

tr = pd.concat([high_low, high_close, low_close], axis = 1)
df_prices['ATR'] = tr.max(axis = 1).rolling(14).mean()


In [8]:
# Drop missing values
df_prices.dropna(inplace = True)

# display the dataframe
df_prices

Unnamed: 0_level_0,close,high,low,trade_count,open,volume,vwap,SMA_50,SMA_200,RSI,ATR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-06-16 19:00:00+00:00,130.10,130.3000,129.0400,183006,129.54,21078573,129.726817,132.938662,142.841393,26.166438,0.701786
2022-06-16 20:00:00+00:00,130.55,131.1950,130.0500,5773,130.11,25903353,130.064895,132.894762,142.744093,26.531234,0.744286
2022-06-16 21:00:00+00:00,130.48,130.5300,130.3300,2139,130.50,89622,130.430462,132.866762,142.651243,24.840770,0.722143
2022-06-16 22:00:00+00:00,130.46,130.8000,130.4000,1491,130.48,108764,130.556641,132.826962,142.557393,41.542956,0.733571
2022-06-16 23:00:00+00:00,130.53,130.6000,130.2000,1541,130.45,88013,130.382198,132.790562,142.462543,41.624557,0.719286
...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 19:00:00+00:00,200.65,201.9600,199.2889,154879,199.32,15011350,200.734151,200.722122,204.957559,56.764794,0.848393
2025-05-30 20:00:00+00:00,200.61,201.1000,200.2000,4475,200.65,29375065,200.847757,200.725046,204.906809,57.379593,0.864807
2025-05-30 21:00:00+00:00,200.64,200.6500,200.4000,700,200.60,162112,200.508232,200.734046,204.860358,59.116581,0.868379
2025-05-30 22:00:00+00:00,200.40,200.6500,200.4000,507,200.60,25292,200.484558,200.736738,204.804859,51.425985,0.776950


## Part 3  - Machine Learining Model

A machine learning model will be created to predict the price movement in the hour.

The labelling is done in a binary classification model with '1's for the prices going up and '0's for the prices going dowm.

In [9]:
# Define the label variables with 1 if prices going up and 0 otherwise
df_prices['Next_close'] = df_prices['close'].shift(-1)
df_prices['Price_variation'] = df_prices['Next_close'] - df_prices['close']
df_prices['Label'] = (df_prices['Price_variation'] > 0).astype(int)

# Remove the NaNs present in the dataframe
df_prices.dropna(inplace = True)

In [10]:
# Define the data features(X) and labels(y) for machine learning
X = df_prices.drop(columns = 'Label', axis = 1)
y = df_prices['Label']

# Split the data into test and train sets with 75% train and 25% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)


In [11]:
# Initialize StandardScaler
X_scaler = StandardScaler()

# Fit X_train to scaler
X_scaler.fit(X_train)

# Apply Scaling as required to the datasets
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Initialize PCA with 3 components
pca = PCA(n_components = 3)

# Fit pca to the training data and transform both the train and test data sets
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)


In [13]:
# Initialize the model
gb_model = GradientBoostingClassifier(n_estimators = 300, learning_rate = 0.1, max_depth = 5, random_state = 1)

# Fit the model with the training data
gb_model.fit(X_train_pca, y_train)

In [14]:
# Make predictions on the X_test_pca data set with the model
y_pred = gb_model.predict(X_test_pca)

# Display the classification report comparing the y_test and predicted labels
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.44      0.47      1488
           1       0.49      0.54      0.51      1445

    accuracy                           0.49      2933
   macro avg       0.49      0.49      0.49      2933
weighted avg       0.49      0.49      0.49      2933

