In [6]:
# Import dependencies
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
import alpaca_trade_api as trade_api


In [7]:
# laod the .env variables to the notebook
load_dotenv()

True

## Part 1 - Data Collection

### Collect the historical price data using alpaca API

In [8]:
# Import the keys required to draw data from the API
alpaca_secret_key = os.getenv('ALPACA_SECRET_KEY') 
alpaca_api_key = os.getenv('ALPACA_API_KEYID')

# Create an API object
alpaca = trade_api.REST(alpaca_api_key,
                       alpaca_secret_key,
                       api_version = 'v2')

In [9]:
# Set up the start and end dates in ISO format
start = pd.Timestamp('2022-05-31', tz = 'America/New_York').isoformat()
end = pd.Timestamp('2025-05-31', tz = 'America/New_York').isoformat()

# Set the ticker
ticker = 'AAPL'

# Set the timeframe to 1 hour for the API
timeframe = '1Hour'

# Get the current closing prices of the stock
df_prices = alpaca.get_bars(ticker,
                           timeframe,
                           start = start,
                           end = end).df

# Rename the index
df_prices.index.rename('Date', inplace = True)

# Display data acquired
df_prices

Unnamed: 0_level_0,close,high,low,trade_count,open,volume,vwap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-05-31 08:00:00+00:00,150.0100,151.0000,148.8900,2814,150.00,100023,149.774887
2022-05-31 09:00:00+00:00,149.0500,150.0000,148.9900,1943,150.00,71538,149.262514
2022-05-31 10:00:00+00:00,149.2300,149.3700,148.7300,1139,149.01,52145,149.007077
2022-05-31 11:00:00+00:00,149.5000,149.7800,149.1800,2972,149.25,147931,149.450164
2022-05-31 12:00:00+00:00,148.6898,150.6900,148.5900,8864,149.44,707354,149.125127
...,...,...,...,...,...,...,...
2025-05-30 19:00:00+00:00,200.6500,201.9600,199.2889,154879,199.32,15011350,200.734151
2025-05-30 20:00:00+00:00,200.6100,201.1000,200.2000,4475,200.65,29375065,200.847757
2025-05-30 21:00:00+00:00,200.6400,200.6500,200.4000,700,200.60,162112,200.508232
2025-05-30 22:00:00+00:00,200.4000,200.6500,200.4000,507,200.60,25292,200.484558


## Part 2 - Feature Engineering

Some technical indicators are to be calculated to use as features to be fed to the machine learining model

These indicators include:

- 50-day and 200-day Simple Moving Averages (SMA): Trend-following indicators.

- Relative Strength Index (RSI): Measures the speed and change of price movements.

- Average True Range (ATR): Measures volatility.

In [18]:
# Calculate moving averages
df_prices['SMA_50'] = df_prices['close'].rolling(50).mean()
df_prices['SMA_200'] = df_prices['close'].rolling(200).mean()


In [19]:
# Calculate Relative Strength Index (RSI)
delta = df_prices['close'].diff()
gains = delta.where(delta > 0, 0)
losses = -delta.where(delta < 0, 0)

gains_mean = gains.rolling(14).mean()
losses_mean = losses.rolling(14).mean()

rs = gains_mean / losses_mean
df_prices['RSI'] = 100 - (100 / (1 + rs))


In [20]:
# Calculate the 14-day Average True Range (ATR) for volatility
high_low = df_prices['close'] - df_prices['low']
high_close = df_prices['high'] - (df_prices['close'].shift()).abs()
low_close = df_prices['low'] - (df_prices['close'].shift()).abs()

tr = pd.concat([high_low, high_close, low_close], axis = 1)
df_prices['ATR'] = tr.max(axis = 1).rolling(14).mean()


In [21]:
# Drop missing values
df_prices.dropna(inplace = True)

# display the dataframe
df_prices

Unnamed: 0_level_0,close,high,low,trade_count,open,volume,vwap,SMA_50,SMA_200,RSI,ATR,Next_close,Price_variation,Label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-07-26 13:00:00+00:00,152.180,153.085,151.5400,83027,151.96,8790418,152.162344,154.011120,149.343662,28.526618,0.411207,151.605,-0.575,0
2022-07-26 14:00:00+00:00,151.605,152.470,151.1884,76869,152.18,7511472,151.747749,153.976020,149.372237,23.178318,0.408179,150.990,-0.615,0
2022-07-26 15:00:00+00:00,150.990,151.680,150.8500,56008,151.61,5289524,151.233205,153.926220,149.395387,22.128913,0.399536,151.630,0.640,1
2022-07-26 16:00:00+00:00,151.630,151.690,150.8000,49014,151.00,4628752,151.241230,153.875820,149.422561,37.985827,0.450614,151.855,0.225,1
2022-07-26 17:00:00+00:00,151.855,152.180,151.5500,45103,151.63,4747934,151.888237,153.845520,149.450086,36.200807,0.442114,151.590,-0.265,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 18:00:00+00:00,199.320,199.360,198.1800,57044,198.22,3770397,198.966497,200.719936,205.006208,46.696009,0.666236,200.650,1.330,1
2025-05-30 19:00:00+00:00,200.650,201.960,199.2889,154879,199.32,15011350,200.734151,200.722122,204.957559,56.764794,0.848393,200.610,-0.040,0
2025-05-30 20:00:00+00:00,200.610,201.100,200.2000,4475,200.65,29375065,200.847757,200.725046,204.906809,57.379593,0.864807,200.640,0.030,1
2025-05-30 21:00:00+00:00,200.640,200.650,200.4000,700,200.60,162112,200.508232,200.734046,204.860358,59.116581,0.868379,200.400,-0.240,0


## Part 3  - Machine Learining Model

A machine learning model will be created to predict the price movement in the hour.

The labelling is done in a binary classification model with '1's for the prices going up and '0's for the prices going dowm.

In [23]:
# Define the label variables with 1 if prices going up and 0 otherwise
df_prices['Next_close'] = df_prices['close'].shift(-1)
df_prices['Price_variation'] = df_prices['Next_close'] - df_prices['close']
df_prices['Label'] = (df_prices['Price_variation'] > 0).astype(int)

# Remove the NaNs present in the dataframe
df_prices.dropna(inplace = True)