In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Locate the CSV under <parent of the working directory>/data and load it,
# parsing the 'timestamp' column as datetimes.
parent_of_cwd = os.path.join(os.getcwd(), "..")
project_root = os.path.abspath(parent_of_cwd)
data_path = os.path.join(
    project_root,
    "data",
    "historical_stock_data_5min_6months.csv",
)
df = pd.read_csv(data_path, parse_dates=["timestamp"])
df.head()

Unnamed: 0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap
0,AAPL,2024-08-26 08:00:00+00:00,226.25,226.41,226.25,226.41,1965.0,219.0,226.365909
1,AMZN,2024-08-26 08:00:00+00:00,177.0,177.6,177.0,177.3,6356.0,150.0,177.363163
2,AVGO,2024-08-26 08:00:00+00:00,165.75,165.75,165.75,165.75,359.0,13.0,165.75
3,BAC,2024-08-26 08:00:00+00:00,39.32,39.34,39.31,39.34,1742.0,32.0,39.32
4,DIS,2024-08-26 08:00:00+00:00,90.25,90.39,90.25,90.39,651.0,44.0,90.294025


In [3]:
# Report the raw frame's dimensions, then show its first three rows as text.
raw_shape = df.shape
first_rows = df.head(3)
print("Raw data shape:", raw_shape)
print("\nFirst 3 rows:", first_rows, sep="\n")

Raw data shape: (843934, 9)

First 3 rows:
  symbol                 timestamp    open    high     low   close  volume  \
0   AAPL 2024-08-26 08:00:00+00:00  226.25  226.41  226.25  226.41  1965.0   
1   AMZN 2024-08-26 08:00:00+00:00  177.00  177.60  177.00  177.30  6356.0   
2   AVGO 2024-08-26 08:00:00+00:00  165.75  165.75  165.75  165.75   359.0   

   trade_count        vwap  
0        219.0  226.365909  
1        150.0  177.363163  
2         13.0  165.750000  


In [4]:
# Integrity summary for the raw frame: row count, total null cells, and the
# number of repeated (symbol, timestamp) pairs.
row_count = len(df)
null_cells = df.isnull().sum().sum()
repeated_bars = df.duplicated(subset=['symbol', 'timestamp']).sum()

print("Data Validation:")
print(f"Total Rows: {row_count}")
print(f"Missing Values: {null_cells}")
print(f"Duplicates: {repeated_bars}")

# Earliest and latest timestamp for one sample symbol (AAPL)
is_aapl = df['symbol'] == 'AAPL'
aapl_times = df.loc[is_aapl, 'timestamp'].agg(['min', 'max'])
print("\nAAPL Time Range:")
print(f"Start: {aapl_times['min']}\nEnd: {aapl_times['max']}")

Data Validation:
Total Rows: 843934
Missing Values: 0
Duplicates: 0

AAPL Time Range:
Start: 2024-08-26 08:00:00+00:00
End: 2025-02-20 00:55:00+00:00


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the raw frame;
# pooled across all symbols, so price-level figures mix different tickers.
df.describe()

Unnamed: 0,open,high,low,close,volume,trade_count,vwap
count,843934.0,843934.0,843934.0,843934.0,843934.0,843934.0,843934.0
mean,253.24423,253.43022,253.052107,253.243953,153235.8,1485.715804,253.242906
std,212.964695,213.11953,212.805855,212.965264,693363.7,4026.025645,212.964384
min,18.525,18.56,18.51,18.525,100.0,1.0,18.534677
25%,89.814,89.88,89.74,89.81,3431.0,57.0,89.808729
50%,190.0,190.16,189.83,190.0,25736.0,601.0,189.998287
75%,369.46,369.8,369.08,369.46,93795.0,1327.0,369.474174
max,1078.181,1078.235,1078.0,1078.0,130782800.0,356453.0,1078.0


In [6]:
# Order the bars by symbol, then chronologically within each symbol, and
# renumber the index 0..n-1 so it matches the new row order.
df_sorted = df.sort_values(by=['symbol', 'timestamp'], ignore_index=True)

In [7]:
# Preview the first rows of the frame sorted by (symbol, timestamp).
df_sorted.head()

Unnamed: 0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap
0,AAPL,2024-08-26 08:00:00+00:00,226.25,226.41,226.25,226.41,1965.0,219.0,226.365909
1,AAPL,2024-08-26 08:05:00+00:00,226.32,226.81,226.32,226.8,5482.0,235.0,226.670652
2,AAPL,2024-08-26 08:10:00+00:00,226.72,226.72,226.66,226.66,1918.0,53.0,226.712568
3,AAPL,2024-08-26 08:15:00+00:00,226.71,226.84,226.71,226.82,1427.0,50.0,226.7925
4,AAPL,2024-08-26 08:20:00+00:00,226.84,226.84,226.84,226.84,369.0,16.0,226.84


In [8]:
# MACD features, computed independently for each symbol.
#
# df_sorted stacks every symbol in (symbol, timestamp) order. An exponential
# moving average is recursive, so running .ewm() over the whole column would
# seed each symbol's first bars with the previous symbol's prices. Grouping by
# 'symbol' restarts every average at the symbol boundary.
def _ema_per_symbol(column, span):
    """Return the `span`-period EMA of df_sorted[column], restarted for each symbol.

    The result is a Series aligned to df_sorted's index, so it can be assigned
    straight back as a new column.
    """
    return df_sorted.groupby('symbol', sort=False)[column].transform(
        lambda values: values.ewm(span=span, adjust=False).mean()
    )


# 1. Fast and slow EMAs of the close. Spans are counted in bars (5-minute
#    bars in this dataset), not in days.
df_sorted['ema_12'] = _ema_per_symbol('close', 12)
df_sorted['ema_26'] = _ema_per_symbol('close', 26)

# 2. MACD line = ema_12 - ema_26
df_sorted['macd'] = df_sorted['ema_12'] - df_sorted['ema_26']

# 3. Signal line (9-period EMA of MACD), also restarted per symbol
df_sorted['signal_line'] = _ema_per_symbol('macd', 9)

# 4. MACD histogram (optional, often used for visualization)
df_sorted['macd_hist'] = df_sorted['macd'] - df_sorted['signal_line']

# 5. Trading signal:
#    +1 when MACD > Signal (bullish)
#    -1 when MACD < Signal (bearish)
#     0 when they are equal (e.g. the first bar of each symbol)
df_sorted['macd_signal'] = 0
df_sorted.loc[df_sorted['macd'] > df_sorted['signal_line'], 'macd_signal'] = 1
df_sorted.loc[df_sorted['macd'] < df_sorted['signal_line'], 'macd_signal'] = -1

In [9]:
# Preview the first rows, now including the EMA / MACD / signal columns.
df_sorted.head()

Unnamed: 0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap,ema_12,ema_26,macd,signal_line,macd_hist,macd_signal
0,AAPL,2024-08-26 08:00:00+00:00,226.25,226.41,226.25,226.41,1965.0,219.0,226.365909,226.41,226.41,0.0,0.0,0.0,0
1,AAPL,2024-08-26 08:05:00+00:00,226.32,226.81,226.32,226.8,5482.0,235.0,226.670652,226.47,226.438889,0.031111,0.006222,0.024889,1
2,AAPL,2024-08-26 08:10:00+00:00,226.72,226.72,226.66,226.66,1918.0,53.0,226.712568,226.499231,226.455267,0.043963,0.01377,0.030193,1
3,AAPL,2024-08-26 08:15:00+00:00,226.71,226.84,226.71,226.82,1427.0,50.0,226.7925,226.54858,226.482285,0.066295,0.024275,0.04202,1
4,AAPL,2024-08-26 08:20:00+00:00,226.84,226.84,226.84,226.84,369.0,16.0,226.84,226.593414,226.508782,0.084632,0.036347,0.048285,1
