# Prepare

### Google Drive

In [None]:
# `google.colab` is the Colab helper package; this cell is expected to run on a Colab runtime.
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths used later in this notebook live under it.
drive.mount('/content/drive')

### Import Original Stock Price

The stock prices loaded below run up to February 10, 2026.

In [None]:
import pandas as pd

# Tickers to load; each one is expected to have a "<SYMBOL>.csv" file in base_path.
stock_symbols = ['AMD', 'GLD', 'GS', 'INTC', 'JPM', 'META', 'MSFT', 'MU', 'NVDA', 'RXRX', 'TSLA']

# Drive folder that holds the per-ticker CSV files (trailing slash is required,
# because the file name is appended by plain string formatting).
base_path = '/content/drive/MyDrive/Quant Trading/Stock Price/'

# Maps ticker -> DataFrame holding the contents of that ticker's CSV.
stock_data = {}

for ticker in stock_symbols:
    csv_path = f'{base_path}{ticker}.csv'
    try:
        stock_data[ticker] = pd.read_csv(csv_path)
    except FileNotFoundError:
        # A missing file is reported and skipped so the remaining tickers still load.
        print(f"Error: {ticker}.csv not found at {csv_path}")
    except Exception as exc:
        # Any other read/parse problem is also reported and skipped (best-effort load).
        print(f"Error loading {ticker}.csv: {exc}")
    else:
        print(f"Successfully loaded {ticker}.csv")

# Show which tickers actually made it into the dictionary.
print("\nLoaded stock symbols:")
print(stock_data.keys())

# Save / Load / Display 'stock_data'

### Save 'stock_data'

In [None]:
import pickle
import os

# The pickled snapshot is written into the same folder as the per-ticker CSVs.
save_path = os.path.join(base_path, 'all_stock_data.pkl')

# Serialise the whole ticker -> DataFrame dictionary in one file.
# Failures are reported rather than raised so the notebook keeps running.
try:
    with open(save_path, 'wb') as pickle_file:
        pickle.dump(stock_data, pickle_file)
except Exception as exc:
    print(f"Error saving 'stock_data' dictionary: {exc}")
else:
    print(f"'stock_data' dictionary successfully saved to: {save_path}")

### Load 'stock_data'

In [None]:
import pickle
import os

# Folder that holds the pickled snapshot. It is repeated here so this cell can
# be run on its own (e.g. right after a kernel restart) without the CSV cell.
base_path = '/content/drive/MyDrive/Quant Trading/Stock Price/'

# Full path of the snapshot file to read.
load_path = os.path.join(base_path, 'all_stock_data.pkl')

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load a snapshot that was written by this notebook's own save cell.
#
# Only the file read is guarded by the try block; the reporting and the sample
# display run in the `else` branch so that a problem there is not mislabelled
# as a loading error.
try:
    with open(load_path, 'rb') as f:
        loaded_stock_data = pickle.load(f)
except FileNotFoundError:
    print(f"Error: File not found at {load_path}")
except Exception as e:
    print(f"Error loading 'stock_data' dictionary: {e}")
else:
    print(f"'stock_data' dictionary successfully loaded from: {load_path}")
    print(f"Loaded stock symbols: {loaded_stock_data.keys()}")

    # Assign the loaded data back to the stock_data variable for continued use
    stock_data = loaded_stock_data

    print("\n--- Sample DataFrame (AMD from loaded data) ---")
    if 'AMD' in stock_data:
        display(stock_data['AMD'].head(10))
    else:
        # The snapshot loaded fine but does not contain the sample ticker.
        print("Error: AMD not found in stock_data dictionary.")

### Display All Feature Names

In [None]:
import pandas as pd

# AMD is used as the representative frame whose columns are inspected.
sample_df = stock_data['AMD'].copy()

# Column names treated as raw price data rather than engineered features.
original_cols = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume', 'Adj Close']

# Everything that is not a raw price column counts as an engineered feature.
engineered_features = []
for column_name in sample_df.columns:
    if column_name not in original_cols:
        engineered_features.append(column_name)

print("Current Engineered Feature Names:")
for feature_name in engineered_features:
    print(feature_name)

feature_count = len(engineered_features)
column_count = len(sample_df.columns)
print(f"\nTotal Engineered Features: {feature_count}")
print(f"Total Columns in a DataFrame: {column_count}")


### Display Sample Data for Selected Stocks for Convenience of Checking

In [None]:
import pandas as pd

# Tickers whose frames are previewed below.
sample_symbols = ['AMD', 'GLD', 'MSFT']

print("--- Sample Data for Selected Stocks (First 5 rows, after initial NaNs) ---\n")

for ticker in sample_symbols:
    # Report tickers that were never loaded and move on to the next one.
    if ticker not in stock_data:
        print(f"Error: {ticker} not found in stock_data dictionary.")
        continue

    print(f"Displaying data for {ticker}:")
    frame = stock_data[ticker].copy()

    # Keep only rows where every EMA_/SMA_ column has a value, so the preview
    # starts where the moving averages are fully populated.
    ma_cols = [name for name in frame.columns if name.startswith(('EMA_', 'SMA_'))]
    populated_rows = frame.dropna(subset=ma_cols)

    display(populated_rows.head(5))
    print("\n" + "-" * 50 + "\n")  # visual separator between tickers


# Feature Calculation

### Calculate 5,10,20,50-day EMA and 50,100,200-day SMA

In [None]:
import pandas as pd

# Look-back windows (in rows) for the moving averages of 'Close'.
ema_periods = [5, 10, 20, 50]
sma_periods = [50, 100, 200]

# Iterate over a snapshot of the keys; each entry is replaced with its updated frame.
for ticker in list(stock_data):
    frame = stock_data[ticker]

    # Non-numeric closes become NaN instead of raising.
    frame['Close'] = pd.to_numeric(frame['Close'], errors='coerce')

    # Moving averages depend on row order, so sort chronologically first.
    if 'Date' in frame.columns:
        frame['Date'] = pd.to_datetime(frame['Date'], errors='coerce')
        frame = frame.sort_values(by='Date').reset_index(drop=True)

    close = frame['Close']

    # Recursive EMA (adjust=False) of the close for each span.
    for span in ema_periods:
        frame[f'EMA_{span}'] = close.ewm(span=span, adjust=False).mean()

    # Simple rolling mean; NaN until a full window of rows is available.
    for window in sma_periods:
        frame[f'SMA_{window}'] = close.rolling(window=window).mean()

    stock_data[ticker] = frame
    print(f"Successfully calculated EMAs and SMAs for {ticker}.")

print("\n--- Sample DataFrame (AMD with new MA columns) ---")
# First 10 rows: the EMAs are already populated while the SMAs are still NaN.
display(stock_data['AMD'].head(10))


### Calculate 5-day, 14-day, and 21-day RSI

In [None]:
import pandas as pd

# Look-back lengths (in rows) for which an RSI column is produced.
rsi_periods = [5, 14, 21]

# Iterate over a snapshot of the keys; each entry is replaced with its updated frame.
for ticker in list(stock_data):
    frame = stock_data[ticker]

    # Non-numeric closes become NaN instead of raising.
    frame['Close'] = pd.to_numeric(frame['Close'], errors='coerce')

    # RSI is order-dependent, so sort chronologically first.
    if 'Date' in frame.columns:
        frame['Date'] = pd.to_datetime(frame['Date'], errors='coerce')
        frame = frame.sort_values(by='Date').reset_index(drop=True)

    # Row-over-row change in close, split into its up-move and down-move parts.
    # Rows where the change is NaN (e.g. the first row) contribute 0 to both.
    delta = frame['Close'].diff()
    is_up = delta > 0
    is_down = delta < 0
    gain = delta.where(is_up, 0)
    loss = -delta.where(is_down, 0)

    for lookback in rsi_periods:
        # com = lookback - 1 gives a smoothing factor alpha = 1 / lookback.
        smoothing = {'com': lookback - 1, 'adjust': False}
        avg_gain = gain.ewm(**smoothing).mean()
        avg_loss = loss.ewm(**smoothing).mean()

        # Relative strength; a zero average loss yields inf (RSI 100) or NaN (0/0).
        rs = avg_gain / avg_loss
        frame[f'RSI_{lookback}'] = 100 - (100 / (1 + rs))

    stock_data[ticker] = frame
    print(f"Successfully calculated RSIs for {ticker}.")

print("\n--- Sample DataFrame (AMD with new RSI columns, after initial NaNs) ---")
# Preview only rows where every EMA_/SMA_/RSI_ column has a value.
sample_df = stock_data['AMD'].copy()
ma_rsi_cols = [name for name in sample_df.columns if name.startswith(('EMA_', 'SMA_', 'RSI_'))]
sample_df_clean = sample_df.dropna(subset=ma_rsi_cols)
display(sample_df_clean.head(10))


### Calculate MACD, Signal Line, and MACD Histogram


MACD line is based on 12-day EMA - 26-day EMA here.\
Signal Line is based on 9-day EMA of MACD.\
MACD Histogram = MACD - Signal.

In [None]:
import pandas as pd

# MACD parameters: fast/slow EMA spans of the close, and the EMA span of the MACD line.
ema_fast_period = 12
ema_slow_period = 26
signal_period = 9

# Iterate over a snapshot of the keys; each entry is replaced with its updated frame.
for ticker in list(stock_data):
    frame = stock_data[ticker]

    # Non-numeric closes become NaN instead of raising.
    frame['Close'] = pd.to_numeric(frame['Close'], errors='coerce')

    # EMAs are order-dependent, so sort chronologically first.
    if 'Date' in frame.columns:
        frame['Date'] = pd.to_datetime(frame['Date'], errors='coerce')
        frame = frame.sort_values(by='Date').reset_index(drop=True)

    close = frame['Close']

    # The fast and slow EMAs are intermediates only; they are not stored as columns.
    fast_ema = close.ewm(span=ema_fast_period, adjust=False).mean()
    slow_ema = close.ewm(span=ema_slow_period, adjust=False).mean()

    # MACD line, its EMA (signal line), and their difference (histogram).
    macd_line = fast_ema - slow_ema
    signal_line = macd_line.ewm(span=signal_period, adjust=False).mean()

    frame['MACD'] = macd_line
    frame['Signal_Line'] = signal_line
    frame['MACD_Histogram'] = macd_line - signal_line

    stock_data[ticker] = frame
    print(f"Successfully calculated MACD indicators for {ticker}.")

print("\n--- Sample DataFrame (AMD with new MACD columns, after initial NaNs) ---")
# Preview only rows where every MACD-related column has a value.
sample_df = stock_data['AMD'].copy()
macd_cols = [name for name in sample_df.columns if name.startswith(('MACD', 'Signal_Line'))]
sample_df_clean = sample_df.dropna(subset=macd_cols)
display(sample_df_clean.head(10))


### Calculate 5-day, 14-day, and 21-day Average True Range (ATR)



Using EMA for smoothing here.

In [None]:
import pandas as pd
import numpy as np

# EMA spans (in rows) for which an ATR column is produced.
atr_periods = [5, 14, 21]

# Loop through each stock in the stock_data dictionary
for symbol, df in stock_data.items():
    # Coerce the price columns to numbers; unparseable entries become NaN.
    for price_col in ('Close', 'High', 'Low'):
        df[price_col] = pd.to_numeric(df[price_col], errors='coerce')

    # True Range uses the previous row's close, so rows must be chronological.
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.sort_values(by='Date').reset_index(drop=True)

    # The three True Range candidates for each row.
    prev_close = df['Close'].shift(1)
    high_low = df['High'] - df['Low']
    candidates = pd.concat(
        [
            high_low,
            (df['High'] - prev_close).abs(),
            (df['Low'] - prev_close).abs(),
        ],
        axis=1,
    )

    # Vectorised row-wise maximum (replaces an element-by-element Python max).
    # NaN candidates are skipped, so a row without a previous close (the first
    # row) falls back to High - Low. Rows where High - Low itself is missing
    # stay NaN, matching the values the per-element version produced.
    true_range = candidates.max(axis=1).where(high_low.notna())

    # ATR here is an EMA (span=period) of the True Range.
    for period in atr_periods:
        df[f'ATR_{period}'] = true_range.ewm(span=period, adjust=False).mean()

    # Update the DataFrame in the dictionary
    stock_data[symbol] = df
    print(f"Successfully calculated ATR indicators for {symbol}.")

print("\n--- Sample DataFrame (AMD with new ATR columns, after initial NaNs) ---")
# Display a sample with cleaned NaNs to see valid ATR values
sample_df = stock_data['AMD'].copy()
atr_cols = [col for col in sample_df.columns if col.startswith('ATR_')]
sample_df_clean = sample_df.dropna(subset=atr_cols)
display(sample_df_clean.head(10))


### Calculate 5-day, 20-day, and 60-day Volume SMA


In [None]:
import pandas as pd

# Rolling windows (in rows) for the simple moving averages of 'Volume'.
volume_sma_periods = [5, 20, 60]

# Iterate over a snapshot of the keys; each entry is replaced with its updated frame.
for ticker in list(stock_data):
    frame = stock_data[ticker]

    # Non-numeric volumes become NaN instead of raising.
    frame['Volume'] = pd.to_numeric(frame['Volume'], errors='coerce')

    # Rolling means are order-dependent, so sort chronologically first.
    if 'Date' in frame.columns:
        frame['Date'] = pd.to_datetime(frame['Date'], errors='coerce')
        frame = frame.sort_values(by='Date').reset_index(drop=True)

    volume = frame['Volume']

    # NaN until a full window of rows is available.
    for window in volume_sma_periods:
        frame[f'Volume_SMA_{window}'] = volume.rolling(window=window).mean()

    stock_data[ticker] = frame
    print(f"Successfully calculated Volume SMAs for {ticker}.")

print("\n--- Sample DataFrame (AMD with new Volume SMA columns, after initial NaNs) ---")
# Preview only rows where every Volume_SMA_ column has a value.
sample_df = stock_data['AMD'].copy()
volume_sma_cols = [name for name in sample_df.columns if name.startswith('Volume_SMA_')]
sample_df_clean = sample_df.dropna(subset=volume_sma_cols)
display(sample_df_clean.head(10))


### Calculate On-Balance Volume (OBV) and its Rolling Z-Scores (20-day and 50-day)


In [None]:
import pandas as pd
import numpy as np

# Rolling windows (in rows) used to standardise OBV into z-scores.
obv_zscore_periods = [20, 50]

# Loop through each stock in the stock_data dictionary
for symbol, df in stock_data.items():
    # Coerce inputs to numbers; unparseable entries become NaN.
    df['Close'] = pd.to_numeric(df['Close'], errors='coerce')
    df['Volume'] = pd.to_numeric(df['Volume'], errors='coerce')

    # OBV is a running total, so rows must be in chronological order.
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.sort_values(by='Date').reset_index(drop=True)

    # --- On-Balance Volume (vectorised; replaces a row-by-row df.loc loop) ---
    # Each row contributes +Volume when the close rose, -Volume when it fell and
    # 0 when it was unchanged. Rows where the close, the previous close or the
    # volume is missing also contribute 0, i.e. OBV carries forward unchanged.
    direction = np.sign(df['Close'].diff())
    signed_volume = (direction * df['Volume']).fillna(0.0)

    if len(signed_volume) > 0:
        # The running total is seeded with the first row's volume.
        signed_volume.iloc[0] = df['Volume'].iloc[0]

    # skipna=False: a missing seed volume leaves the whole OBV series NaN,
    # exactly as the sequential computation did.
    df['OBV'] = signed_volume.cumsum(skipna=False)

    # --- Rolling z-scores of OBV ---
    for period in obv_zscore_periods:
        # min_periods=1 lets the statistics start on partial windows. The std of
        # a single observation is NaN, so the first row's z-score is NaN.
        rolling_obv = df['OBV'].rolling(window=period, min_periods=1)
        rolling_mean = rolling_obv.mean()
        rolling_std = rolling_obv.std()

        z_score = (df['OBV'] - rolling_mean) / rolling_std
        # A zero std produces +/-inf (or NaN for 0/0); store NaN in either case.
        df[f'OBV_Z_Score_{period}'] = z_score.replace([np.inf, -np.inf], np.nan)

    # Update the DataFrame in the dictionary
    stock_data[symbol] = df
    print(f"Successfully calculated OBV and its Rolling Z-scores for {symbol}.")

print("\n--- Sample DataFrame (AMD with new OBV and OBV_Z_Score columns, after initial NaNs) ---")
# Display a sample with cleaned NaNs to see valid OBV and OBV_Z_Score values
sample_df = stock_data['AMD'].copy()
obv_zscore_cols = ['OBV', 'OBV_Z_Score_20', 'OBV_Z_Score_50']
sample_df_clean = sample_df.dropna(subset=obv_zscore_cols)
display(sample_df_clean.head(10))

### Calculate 5-day and 14-day Stochastic Oscillator



%K and %D lines for periods of 5 and 14 days.\
%D is 3-day SMA of %K.

In [None]:
import pandas as pd
import numpy as np

# Look-back windows (in rows) for %K, and the rolling-mean window that turns %K into %D.
stochastic_periods = [5, 14]
d_period = 3

# Iterate over a snapshot of the keys; each entry is replaced with its updated frame.
for ticker in list(stock_data):
    frame = stock_data[ticker]

    # Coerce the price columns to numbers; unparseable entries become NaN.
    for price_col in ('Close', 'High', 'Low'):
        frame[price_col] = pd.to_numeric(frame[price_col], errors='coerce')

    # Rolling extremes are order-dependent, so sort chronologically first.
    if 'Date' in frame.columns:
        frame['Date'] = pd.to_datetime(frame['Date'], errors='coerce')
        frame = frame.sort_values(by='Date').reset_index(drop=True)

    for lookback in stochastic_periods:
        # Highest high / lowest low over the window; intermediates are not stored.
        highest_high = frame['High'].rolling(window=lookback).max()
        lowest_low = frame['Low'].rolling(window=lookback).min()

        # A flat window (high == low) would divide by zero; use NaN there instead.
        price_range = (highest_high - lowest_low).replace(0, np.nan)

        # %K: position of the close inside the window's range, scaled to 0-100.
        percent_k = ((frame['Close'] - lowest_low) / price_range) * 100
        frame[f'K_{lookback}'] = percent_k

        # %D: simple rolling mean of %K.
        frame[f'D_{lookback}'] = percent_k.rolling(window=d_period).mean()

    stock_data[ticker] = frame
    print(f"Successfully calculated Stochastic Oscillators for {ticker}.")

print("\n--- Sample DataFrame (AMD with new Stochastic columns, after initial NaNs) ---")
# Preview only rows where every K_/D_ column has a value.
sample_df = stock_data['AMD'].copy()
stochastic_cols = [name for name in sample_df.columns if name.startswith(('K_', 'D_'))]
sample_df_clean = sample_df.dropna(subset=stochastic_cols)
display(sample_df_clean.head(10))


### Calculate 5-day, 14-day, and 21-day Money Flow Index (MFI)

In [None]:
import pandas as pd
import numpy as np

# Rolling windows (in rows) for which an MFI column is produced.
mfi_periods = [5, 14, 21]

# Loop through each stock in the stock_data dictionary
for symbol, df in stock_data.items():
    # Coerce inputs to numbers; unparseable entries become NaN.
    for numeric_col in ('Close', 'High', 'Low', 'Volume'):
        df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

    # Money flow is classified against the previous row, so rows must be chronological.
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.sort_values(by='Date').reset_index(drop=True)

    # Typical price and raw money flow (intermediates; not stored as columns).
    typical_price = (df['High'] + df['Low'] + df['Close']) / 3
    money_flow = typical_price * df['Volume']

    # A row's money flow counts as positive when the typical price rose versus
    # the previous row, negative when it fell, and as neither when it was
    # unchanged or when there is no previous row.
    prev_typical_price = typical_price.shift(1)
    positive_mf = money_flow.where(typical_price > prev_typical_price, 0.0)
    negative_mf = money_flow.where(typical_price < prev_typical_price, 0.0)

    for period in mfi_periods:
        # min_periods=1 keeps the original behaviour of reporting values on
        # partial windows at the start of the series.
        positive_sum = positive_mf.rolling(window=period, min_periods=1).sum()
        negative_sum = negative_mf.rolling(window=period, min_periods=1).sum()
        total_flow = positive_sum + negative_sum

        # MFI = 100 * positive / (positive + negative), which is algebraically
        # the same as 100 - 100 / (1 + positive / negative) but stays finite:
        #   * no negative flow in the window -> 100 (previously the infinite
        #     money ratio was turned into 0, which reported MFI = 0);
        #   * no flow at all in the window (e.g. the first row) -> NaN, because
        #     the index is undefined there (previously forced to 0).
        mfi = 100 * positive_sum / total_flow.where(total_flow != 0)

        # Guard against tiny floating-point residue from the rolling sums.
        df[f'MFI_{period}'] = mfi.clip(lower=0, upper=100)

    # Update the DataFrame in the dictionary
    stock_data[symbol] = df
    print(f"Successfully calculated MFI indicators for {symbol}.")

print("\n--- Sample DataFrame (AMD with new MFI columns, after initial NaNs) ---")
# Display a sample with cleaned NaNs to see valid MFI values
sample_df = stock_data['AMD'].copy()
mfi_cols = [col for col in sample_df.columns if col.startswith('MFI_')]
sample_df_clean = sample_df.dropna(subset=mfi_cols)
display(sample_df_clean.head(10))


### Calculate Bollinger Bands (Middle, Upper, and Lower Bands)


A standard 20-day period for the Middle Band (SMA) and 2 standard deviations for the Upper and Lower Bands are used.

In [None]:
import pandas as pd

# Bollinger Band parameters: rolling window (in rows) and band width in standard deviations.
bb_period = 20
bb_std_dev_multiplier = 2

# Iterate over a snapshot of the keys; each entry is replaced with its updated frame.
for ticker in list(stock_data):
    frame = stock_data[ticker]

    # Non-numeric closes become NaN instead of raising.
    frame['Close'] = pd.to_numeric(frame['Close'], errors='coerce')

    # Rolling statistics are order-dependent, so sort chronologically first.
    if 'Date' in frame.columns:
        frame['Date'] = pd.to_datetime(frame['Date'], errors='coerce')
        frame = frame.sort_values(by='Date').reset_index(drop=True)

    # One rolling view of the close feeds both the mean and the standard deviation.
    rolling_close = frame['Close'].rolling(window=bb_period)
    middle_band = rolling_close.mean()

    # Distance from the middle band to either outer band; the standard deviation
    # itself is an intermediate and is not stored as a column.
    band_offset = rolling_close.std() * bb_std_dev_multiplier

    frame[f'BB_Middle_{bb_period}'] = middle_band
    frame[f'BB_Upper_{bb_period}'] = middle_band + band_offset
    frame[f'BB_Lower_{bb_period}'] = middle_band - band_offset

    stock_data[ticker] = frame
    print(f"Successfully calculated Bollinger Bands for {ticker}.")

print("\n--- Sample DataFrame (AMD with new Bollinger Band columns, after initial NaNs) ---")
# Preview only rows where every BB_ column has a value.
sample_df = stock_data['AMD'].copy()
bb_cols = [name for name in sample_df.columns if name.startswith('BB_')]
sample_df_clean = sample_df.dropna(subset=bb_cols)
display(sample_df_clean.head(10))
