<a href="https://colab.research.google.com/github/thunder913/Deep-Learning/blob/main/stocks_deep_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [102]:
import kagglehub
import numpy as np
import os
import pandas as pd
import kagglehub
import matplotlib.pyplot as plt
import pickle
import ta

from sklearn.preprocessing import LabelEncoder

In [16]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Stocks LSTM Prediction

In [19]:
google_drive_path = 'drive/MyDrive/DL_Pickles/'
local_path = './pickles/'

path = kagglehub.dataset_download("tsaustin/us-historical-stock-prices-with-earnings-data")

stock_prices_path = f"{path}/stocks_latest/stock_prices_latest.csv"

stock_data_path = f"./{google_drive_path}/stock_data_processed.pkl"

In [65]:
def calculate_rsi(group, period = 14):
    delta = group['close'].diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    avg_gain = gain.rolling(window=period, min_periods=1).mean()
    avg_loss = loss.rolling(window=period, min_periods=1).mean()
    rs = avg_gain / avg_loss
    group['RSI'] = 100 - (100 / (1 + rs))
    return group

def calculate_macd(group, short_window=12, long_window=26, signal_window=9):
    # Calculate EMA short and long
    ema_short = group['close'].ewm(span=short_window, adjust=False).mean()
    ema_long = group['close'].ewm(span=long_window, adjust=False).mean()

    # Calculate MACD and Signal Line
    group['MACD'] = ema_short - ema_long
    group['Signal_Line'] = group['MACD'].ewm(span=signal_window, adjust=False).mean()

    return group

In [64]:
def get_peak_price_and_days_ago(df):
    """
    Calculate the peak price over a rolling 90-day window and determine the number of days
    since the peak price for each row in the DataFrame.
    """

    df['peak_price'] = df['close'].rolling(window=90, min_periods=1).max()
    peak_day_index = df['close'].rolling(window=90, min_periods=1).apply(lambda x: (len(x) - 1 - x.argmax()), raw=False)
    df['peak_days_ago'] = peak_day_index.fillna(0).astype(int)  # Fill NaN for rows that don't have enough data
    return df

def process_ticker_data(df):
    """
    Add historical price features for specific days ago for each stock (grouped by ticker).
    Also calculate the peak price and days since the peak price for each stock.
    """

    print('Calculated RSI')
    # Calculate RSI (14-day by default)
    df = df.groupby('ticker', group_keys=False).apply(calculate_rsi)

    print('Calculated moving averages')
    # Add moving averages
    moving_average_windows = [5, 10, 20, 50, 100, 200]  # Specify moving average windows
    for window in moving_average_windows:
        df[f'{window}_day_MA'] = df.groupby('ticker')['close'].rolling(window=window, min_periods=1).mean().reset_index(level=0, drop=True)

    print('Calculated MACD')
    # Calculate MACD and Signal Line

    df = df.groupby('ticker', group_keys=False).apply(calculate_macd)

    print('Calculated price days ago')
    # Add columns for prices X days ago
    for days in [1,2,3,4,5, 10, 15, 30, 60, 90]:
        df[f'price_{days}_days_ago'] = df.groupby('ticker')['close'].shift(days)

    df = df.groupby('ticker', group_keys=False).apply(get_peak_price_and_days_ago)

    # Remove the first 200 rows for each ticker
    df = df.groupby('ticker').apply(lambda group: group.iloc[200:]).reset_index(drop=True)

    return df

In [60]:
def getTidiedStockData():
    """
    Read and tidy the stock data dataset. If the processed stock data already exists as a pickle file,
    it loads and returns the data. Otherwise, it processes the raw stock data, cleans it, adds features,
    and saves the tidy data to a pickle file for future use.
    """

    # Check if the file exists
    if os.path.exists(stock_data_path):
        return pd.read_pickle(stock_data_path)

    # Read the basic data and some tidying
    stock_data = pd.read_csv(stock_prices_path)
    stock_data_tidy = stock_data

    stock_data_tidy.date = pd.to_datetime(stock_data_tidy.date)
    stock_data_tidy = stock_data_tidy.rename(columns={'symbol': 'ticker'})
    stock_data_tidy = stock_data_tidy.drop(columns=['split_coefficient'])

    stock_data_tidy = stock_data_tidy.sort_values(by=['ticker', 'date'], ascending=[True, True])

    stock_data_tidy = process_ticker_data(stock_data_tidy)

    stock_data_tidy = stock_data_tidy[stock_data_tidy.price_90_days_ago.isna() == False]

    # Encode labels
    le = LabelEncoder()
    stock_data_tidy['ticker_encoded'] = le.fit_transform(stock_data_tidy['ticker'])

    stock_data_tidy.to_pickle(stock_data_path)

    return stock_data_tidy

In [66]:
stock_data_tidy = getTidiedStockData()

Calculated RSI


  df = df.groupby('ticker', group_keys=False).apply(calculate_rsi)


Calculated moving averages
Calculated MACD


  df = df.groupby('ticker', group_keys=False).apply(calculate_macd)


Calculated price days ago


  df = df.groupby('ticker', group_keys=False).apply(get_peak_price_and_days_ago)
  df = df.groupby('ticker').apply(lambda group: group.iloc[200:]).reset_index(drop=True)


In [77]:
stock_data_tidy[stock_data_tidy.ticker == 'AAPL']

Unnamed: 0,ticker,date,open,high,low,close,close_adjusted,volume,RSI,5_day_MA,...,price_4_days_ago,price_5_days_ago,price_10_days_ago,price_15_days_ago,price_30_days_ago,price_60_days_ago,price_90_days_ago,peak_price,peak_days_ago,ticker_encoded
45385,AAPL,1998-10-19,36.69,38.06,35.8800,37.50,1.1777,4248000,45.088409,37.390,...,38.75,37.44,32.19,39.06,35.13,34.69,27.81,43.00,40,14
45386,AAPL,1998-10-20,37.94,38.19,36.0000,36.06,1.1325,3411500,44.933921,36.852,...,37.38,38.75,32.56,39.50,38.25,34.44,28.12,43.00,41,14
45387,AAPL,1998-10-21,36.75,37.44,35.7500,37.13,1.1661,3844800,53.777545,36.802,...,36.63,37.38,31.94,38.13,37.38,33.63,27.50,43.00,42,14
45388,AAPL,1998-10-22,36.88,37.63,36.2500,36.75,1.1541,2833700,54.492291,36.826,...,36.69,36.63,30.81,35.69,38.13,35.13,28.00,43.00,43,14
45389,AAPL,1998-10-23,36.75,36.88,35.1300,35.50,1.1149,3178400,59.627691,36.588,...,37.50,36.69,35.13,35.06,37.63,36.50,28.12,43.00,44,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51079,AAPL,2021-06-07,126.17,126.32,124.8321,125.90,125.9000,71057550,48.785292,124.934,...,124.28,124.61,125.43,127.45,134.32,121.96,142.06,137.39,84,14
51080,AAPL,2021-06-08,126.60,128.46,126.2101,126.74,126.7400,74403774,56.450512,125.426,...,125.06,124.28,127.10,126.27,134.72,121.03,137.09,137.39,85,14
51081,AAPL,2021-06-09,127.21,127.75,126.5200,127.13,127.1300,56877937,58.198925,125.840,...,123.54,125.06,126.90,124.85,134.39,123.99,131.96,137.39,86,14
51082,AAPL,2021-06-10,127.02,128.19,125.9400,126.11,126.1100,71186421,45.481928,126.354,...,125.89,123.54,126.85,124.69,133.58,125.57,134.14,137.39,87,14


In [82]:
import unittest
import pandas as pd

class TestCalculateRSI(unittest.TestCase):

    def setUp(self):
        # Sample data for testing
        self.data = {
            'close': [44.34, 44.09, 44.15, 43.61, 44.33, 44.83, 45.10, 45.42, 45.84, 46.08, 45.89, 46.03, 45.61, 46.28, 46.28]
        }
        self.expected_rsi = [
            None, None, None, None, None, None, None, None, None, None,
            70.53, 66.36, 57.92, 62.84, 62.84
        ]

        # Create a DataFrame for testing
        self.df = pd.DataFrame(self.data)

    def test_calculate_rsi(self):

        print(self.df)
        # Apply the function
        result = calculate_rsi(self.df.copy())

        # Assert that RSI is calculated correctly
        for i, expected in enumerate(self.expected_rsi):
            if expected is None:
                self.assertTrue(pd.isna(result.iloc[i]['RSI']))
            else:
                self.assertAlmostEqual(result.iloc[i]['RSI'], expected, places=2)

if __name__ == '__main__':
    unittest.main()


E
ERROR: /root/ (unittest.loader._FailedTest./root/)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute '/root/'

----------------------------------------------------------------------
Ran 1 test in 0.001s

FAILED (errors=1)


SystemExit: True

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [97]:
def calculate_rsi1(group, period=14):
    # Calculate the price differences
    delta = group['close'].diff()

    # Separate gains and losses
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)

    # Calculate the first average gain and loss (simple average for first period)
    avg_gain = gain.rolling(window=period, min_periods=period).mean()
    avg_loss = loss.rolling(window=period, min_periods=period).mean()

    # Calculate smoothed average gain and loss
    avg_gain = avg_gain.shift(1) * (period - 1) / period + gain / period
    avg_loss = avg_loss.shift(1) * (period - 1) / period + loss / period

    # Handle the case where avg_loss is zero to avoid division by zero
    rs = avg_gain / avg_loss
    rs = rs.fillna(0)

    # Calculate RSI
    rsi = 100 - (100 / (1 + rs))

    # Assign the RSI back to the DataFrame
    group['RSI'] = rsi

    return group

In [96]:
data = {
            'close': [44.34, 44.09, 44.15, 43.61, 44.33, 44.83, 45.10, 45.42, 45.84, 46.08, 45.89, 46.03, 45.61, 46.28, 46.28]
        }

calculate_rsi1(df,7)
df = pd.DataFrame(data)

In [98]:
calculate_rsi1(df,7)


Unnamed: 0,close,RSI
0,44.34,0.0
1,44.09,0.0
2,44.15,0.0
3,43.61,0.0
4,44.33,0.0
5,44.83,0.0
6,45.1,0.0
7,45.42,70.884521
8,45.84,74.920635
9,46.08,82.636656
