## Necessary imports

In [1]:
import os
import sys

sys.path.append(os.path.abspath('..'))
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# To ignore warnings
import warnings

import pandas as pd
import requests
import yfinance as yf
from bs4 import BeautifulSoup

from utils.data import (
    _get_talib_momentum_indicators,
    _get_talib_pattern_indicators,
    clean_market_cap,
    features_based_on_fundamentals,
    features_based_on_price,
    get_date_of_previous_month,
)

warnings.filterwarnings('ignore')

# Display settings for pandas
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

## Fetch data

In [2]:
STOCKS_URL = "https://stockanalysis.com/list/nasdaq-100-stocks/"
START_DATE = '2020-01-01'
END_DATE = get_date_of_previous_month()

assert pd.to_datetime(END_DATE) < pd.to_datetime('today'), "END_DATE must be in the past"
assert pd.to_datetime(START_DATE) < pd.to_datetime(END_DATE), "START_DATE must be before END_DATE"

In [3]:
response = requests.get(STOCKS_URL)
soup = BeautifulSoup(response.text, "lxml")

table = soup.find('table')
rows = table.find_all('tr')

# Make sure to call find_all on the first row only for headers
headers = [th.text.strip() for th in rows[0].find_all('th')]
data = []
for row in rows[1:]:
    cols = [td.text.strip() for td in row.find_all('td')]
    if cols:
        data.append(cols)

df = pd.DataFrame(data, columns=headers)
number_of_stocks = len(df)
print(f"**Number of stocks in the list: {number_of_stocks}\n\n")
print(df.head())

**Number of stocks in the list: 101


  No. Symbol           Company Name Market Cap Stock Price % Change  Revenue
0   1   NVDA     NVIDIA Corporation  4,293.08B      176.67    0.24%  165.22B
1   2   MSFT  Microsoft Corporation  3,849.86B      517.93    1.86%  281.72B
2   3   AAPL             Apple Inc.  3,643.32B      245.50    3.20%  408.63B
3   4   GOOG          Alphabet Inc.  3,083.41B      255.24    1.15%  371.40B
4   5  GOOGL          Alphabet Inc.  3,080.58B      254.72    1.07%  371.40B


In [4]:
# Clean 'Market Cap' column and convert to numeric
df = clean_market_cap(df)

In [5]:
# select the top 24 stocks by Market Cap (it is alreasdy sorted by volume on the website,
# but let's be sure in case that changes in the future)

df.sort_values(by='Market Cap', ascending=False, inplace=True)
top_24_by_volume = df.head(24)

print(top_24_by_volume[['Symbol', 'Company Name', 'Market Cap']])

   Symbol                  Company Name  Market Cap
0    NVDA            NVIDIA Corporation     4293.08
1    MSFT         Microsoft Corporation     3849.86
2    AAPL                    Apple Inc.     3643.32
3    GOOG                 Alphabet Inc.     3083.41
4   GOOGL                 Alphabet Inc.     3080.58
5    AMZN              Amazon.com, Inc.     2468.71
6    META          Meta Platforms, Inc.     1955.40
7    AVGO                 Broadcom Inc.     1628.93
8    TSLA                   Tesla, Inc.     1416.75
9    NFLX                 Netflix, Inc.      521.37
10   PLTR    Palantir Technologies Inc.      432.69
11   COST  Costco Wholesale Corporation      421.82
12   ASML             ASML Holding N.V.      361.06
13   CSCO           Cisco Systems, Inc.      269.65
14   TMUS             T-Mobile US, Inc.      268.04
15    AMD  Advanced Micro Devices, Inc.      255.42
16    AZN               AstraZeneca PLC      236.24
17    LIN                     Linde plc      224.61
18    APP   

In [6]:
top_24_by_volume

Unnamed: 0,No.,Symbol,Company Name,Market Cap,Stock Price,% Change,Revenue
0,1,NVDA,NVIDIA Corporation,4293.08,176.67,0.24%,165.22B
1,2,MSFT,Microsoft Corporation,3849.86,517.93,1.86%,281.72B
2,3,AAPL,Apple Inc.,3643.32,245.5,3.20%,408.63B
3,4,GOOG,Alphabet Inc.,3083.41,255.24,1.15%,371.40B
4,5,GOOGL,Alphabet Inc.,3080.58,254.72,1.07%,371.40B
5,6,AMZN,"Amazon.com, Inc.",2468.71,231.48,0.11%,670.04B
6,7,META,"Meta Platforms, Inc.",1955.4,778.38,-0.24%,178.80B
7,8,AVGO,Broadcom Inc.,1628.93,344.94,-0.12%,59.93B
8,9,TSLA,"Tesla, Inc.",1416.75,426.07,2.21%,92.72B
9,10,NFLX,"Netflix, Inc.",521.37,1226.97,1.59%,41.69B


In [7]:
tickers_list =  top_24_by_volume['Symbol'].tolist()[0:]

In [8]:
tickers_list

['NVDA',
 'MSFT',
 'AAPL',
 'GOOG',
 'GOOGL',
 'AMZN',
 'META',
 'AVGO',
 'TSLA',
 'NFLX',
 'PLTR',
 'COST',
 'ASML',
 'CSCO',
 'TMUS',
 'AMD',
 'AZN',
 'LIN',
 'APP',
 'SHOP',
 'PEP',
 'INTU',
 'PDD',
 'MU']

In [9]:
%%time 

ticker_data = dict()
dataset_price = dict()
dataset_price_df = pd.DataFrame()

for i,ticker in enumerate(tickers_list):

  print(f"{i} - Fetching data for {ticker} stock...\n")
  print("-"*24, "\n")
  
  ticker_data[ticker] = yf.Ticker(ticker).history(start=START_DATE, end=END_DATE, interval='1d')
  price_data_by_ticker = features_based_on_price(ticker_data[ticker], ticker=ticker)
  print("-"*24, "\n")
  
  dataset_price_df = pd.concat([dataset_price_df, price_data_by_ticker], axis=0)
  dataset_price[ticker] = price_data_by_ticker
  
  if i == 1:
    break

0 - Fetching data for NVDA stock...

------------------------ 

'Adj Close' not present in columns for NVDA.Using 'Close' instead.
Data columns:  Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits'], dtype='object')
Data index type:  <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
Data shape:  (1423, 7)
Resampling to month-end frequency... 

Resampling done. The new data shape is  (68, 22)
Final data shape for NVDA:  (63, 33)
------------------------ 

1 - Fetching data for MSFT stock...

------------------------ 

'Adj Close' not present in columns for MSFT.Using 'Close' instead.
Data columns:  Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits'], dtype='object')
Data index type:  <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
Data shape:  (1423, 7)
Resampling to month-end frequency... 

Resampling done. The new data shape is  (68, 22)
Final data shape for MSFT:  (63, 33)
------------------------ 

CPU times: user 88 ms, s

In [10]:
dataset_price_df.ticker.value_counts()

ticker
NVDA    63
MSFT    63
Name: count, dtype: int64

In [11]:
dataset_price_df.iloc[-1]

Date                      2025-08-31 00:00:00
Close                              506.690002
ln_volume_min                       16.558349
ln_volume_median                    16.858203
ln_volume_max                       17.244184
daily_growth_min                    -0.017601
daily_growth_median                 -0.004421
daily_growth_max                     0.021999
weekly_growth_min                   -0.036789
weekly_growth_median                 -0.00395
weekly_growth_max                    0.045151
biweekly_growth_min                 -0.049835
biweekly_growth_median              -0.020656
biweekly_growth_max                  0.050151
monthly_growth_min                  -0.048692
monthly_growth_median                0.016553
monthly_growth_max                   0.073771
rel_spread_oc_median                 0.004621
rel_spread_hl_median                 0.014583
vol_10d_mean_median                 -0.002008
vol_10d_std_median                   0.012003
vol_10d_min_median                

In [None]:
%%time 

dataset_fundamentals = dict()

for i,ticker in enumerate(tickers_list):

    print(f"{i} - Fetching data for {ticker} stock...\n")
    print("-"*24, "\n")

    dataset_fundamentals[ticker] = features_based_on_fundamentals(ticker, END_DATE)

    print("-"*24, "\n")

    if i == 1:
        break

0 - Fetching data for NVDA stock...

------------------------ 

Number of missing values in fund_feats_float: 0
------------------------ 

1 - Fetching data for MSFT stock...

------------------------ 

Number of missing values in fund_feats_float: 0
------------------------ 

CPU times: user 236 ms, sys: 22.8 ms, total: 259 ms
Wall time: 5.67 s


In [None]:
%%time 

dataset_talib_pattern = dict()
dataset_talib_momentum = dict()

for i,ticker in enumerate(tickers_list):

    print(f"{i} - Calculate TALIB-based fearures for {ticker} stock...\n")
    print("-"*24, "\n")

    data_with_date = ticker_data[ticker].reset_index()
    data_with_date_ticker = data_with_date.copy()
    data_with_date_ticker['Ticker'] = ticker

    rows_initial = len(data_with_date_ticker)
    
    # Ensure columns are float64 before passing to TA-Lib
    # That part should resolve the issue with the `mfi` indicator, but it did not
    # TODO: investigate further if needed
    for col in ['Open', 'High', 'Low', 'Close']:
        data_with_date_ticker[col] = pd.to_numeric(data_with_date_ticker[col], errors='coerce')

    rows_after_conversion = len(data_with_date_ticker.dropna(subset=['Open', 'High', 'Low', 'Close']))
    
    if rows_initial != rows_after_conversion:
        print(f"Warning: Number of rows changed after numeric conversion for {ticker} stock: {rows_initial} -> {rows_after_conversion}")
    
    dataset_talib_pattern[ticker] = _get_talib_pattern_indicators(data_with_date_ticker)
    print("# of pattern indicators calculated:", len(dataset_talib_pattern[ticker].columns))
    
    dataset_talib_momentum[ticker] = _get_talib_momentum_indicators(data_with_date_ticker)
    print("# of momentum indicators calculated:", len(dataset_talib_momentum[ticker].columns))

    zeros_pattern, zeros_momentum = (dataset_talib_pattern[ticker] == 0).sum().sum(),\
                                        (dataset_talib_momentum[ticker] == 0).sum().sum()
    print(f"% of zeros in pattern indicators: {zeros_pattern /  (dataset_talib_pattern[ticker].shape[0]*dataset_talib_pattern[ticker].shape[1]):.2%} ")
    print(f"% of zeros in momentum indicators: {zeros_momentum / (dataset_talib_momentum[ticker].shape[0]*dataset_talib_momentum[ticker].shape[1]):.2%} ")

    print("-"*24, "\n")

    if i == 1:
        break

0 - Calculate TALIB-based fearures for NVDA stock...

------------------------ 

# of pattern indicators calculated: 63
# of momentum indicators calculated: 40
% of zeros in pattern indicators: 93.62% 
% of zeros in momentum indicators: 1.46% 
------------------------ 

1 - Calculate TALIB-based fearures for MSFT stock...

------------------------ 

# of pattern indicators calculated: 63
# of momentum indicators calculated: 40
% of zeros in pattern indicators: 93.60% 
% of zeros in momentum indicators: 1.41% 
------------------------ 

CPU times: user 17.3 ms, sys: 2.94 ms, total: 20.2 ms
Wall time: 21.2 ms


## Final dataset creation by merging separate dataframes

In [19]:
#df_resampled = df.resample('ME').agg(agg_funcs)
dataset_talib_momentum[ticker].set_index('Date').drop(columns='Ticker').resample('ME').median().reset_index().dropna()
dataset_talib_pattern[ticker].set_index('Date').drop(columns='Ticker').resample('ME').median().reset_index().dropna()



Unnamed: 0,Date,cdl2crows,cdl3blackrows,cdl3inside,cdl3linestrike,cdl3outside,cdl3starsinsouth,cdl3whitesoldiers,cdlabandonedbaby,cdladvancedblock,cdlbelthold,cdlbreakaway,cdlclosingmarubozu,cdlconcealbabyswall,cdlcounterattack,cdldarkcloudcover,cdldoji,cdldojistar,cdldragonflydoji,cdlengulfing,cdleveningdojistar,cdleveningstar,cdlgapsidesidewhite,cdlgravestonedoji,cdlhammer,...,cdlladderbottom,cdllongleggeddoji,cdllongline,cdlmarubozu,cdlmatchinglow,cdlmathold,cdlmorningdojistar,cdlmorningstar,cdlonneck,cdlpiercing,cdlrickshawman,cdlrisefall3methods,cdlseparatinglines,cdlshootingstar,cdlshortline,cdlspinningtop,cdlstalledpattern,cdlsticksandwich,cdltakuru,cdltasukigap,cdlthrusting,cdltristar,cdlunique3river,cdlupsidegap2crows,cdlxsidegap3methods
0,2020-01-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-02-29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-03-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-04-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-05-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,2025-04-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64,2025-05-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65,2025-06-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66,2025-07-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
test = dataset_talib_pattern[ticker].set_index('Date').drop(columns='Ticker').resample('ME').median().reset_index().dropna()

In [21]:
test.columns

Index(['Date', 'cdl2crows', 'cdl3blackrows', 'cdl3inside', 'cdl3linestrike',
       'cdl3outside', 'cdl3starsinsouth', 'cdl3whitesoldiers',
       'cdlabandonedbaby', 'cdladvancedblock', 'cdlbelthold', 'cdlbreakaway',
       'cdlclosingmarubozu', 'cdlconcealbabyswall', 'cdlcounterattack',
       'cdldarkcloudcover', 'cdldoji', 'cdldojistar', 'cdldragonflydoji',
       'cdlengulfing', 'cdleveningdojistar', 'cdleveningstar',
       'cdlgapsidesidewhite', 'cdlgravestonedoji', 'cdlhammer',
       'cdlhangingman', 'cdlharami', 'cdlharamicross', 'cdlhighwave',
       'cdlhikkake', 'cdlhikkakemod', 'cdlhomingpigeon', 'cdlidentical3crows',
       'cdlinneck', 'cdlinvertedhammer', 'cdlkicking', 'cdlkickingbylength',
       'cdlladderbottom', 'cdllongleggeddoji', 'cdllongline', 'cdlmarubozu',
       'cdlmatchinglow', 'cdlmathold', 'cdlmorningdojistar', 'cdlmorningstar',
       'cdlonneck', 'cdlpiercing', 'cdlrickshawman', 'cdlrisefall3methods',
       'cdlseparatinglines', 'cdlshootingstar', 'cd

In [22]:
msft = price_data[ price_data['ticker'] == 'MSFT']

NameError: name 'price_data' is not defined

In [None]:
msft.columns

Index(['Date', 'Close', 'ln_volume_min', 'ln_volume_median', 'ln_volume_max',
       'daily_growth_min', 'daily_growth_median', 'daily_growth_max',
       'weekly_growth_min', 'weekly_growth_median', 'weekly_growth_max',
       'biweekly_growth_min', 'biweekly_growth_median', 'biweekly_growth_max',
       'monthly_growth_min', 'monthly_growth_median', 'monthly_growth_max',
       'rel_spread_oc_median', 'rel_spread_hl_median', 'vol_10d_mean_median',
       'vol_10d_std_median', 'vol_10d_min_median', 'vol_10d_max_median',
       'return_1m', 'return_2m', 'return_3m', 'vol_3m', 'momentum_3m',
       'mma_3_scaled', 'mma_6_scaled', 'year', 'month', 'ticker'],
      dtype='object')

In [None]:
data_m1 = pd.merge(msft, test, how='left',
                   on=['Date'],
                   validate = "many_to_one"
                   )

In [None]:
data_m1.shape, test.shape, msft.shape

((63, 94), (68, 62), (63, 33))