## Necessary imports

In [1]:
import os
import sys

sys.path.append(os.path.abspath('..'))
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import pandas as pd
import requests
import yfinance as yf
from bs4 import BeautifulSoup

from utils.data import (
    _get_talib_momentum_indicators,
    _get_talib_pattern_indicators,
    clean_market_cap,
    features_based_on_fundamentals,
    features_based_on_price,
    get_date_of_previous_month,
)

# Display settings for pandas: show at most 50 columns / 50 rows before truncating
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50


## Fetch data

In [2]:
# Source page for the ticker universe and the date window of the price history.
STOCKS_URL = "https://stockanalysis.com/list/nasdaq-100-stocks/"
START_DATE = '2020-01-01'
END_DATE = get_date_of_previous_month()

# Sanity-check the window before any data is downloaded.
start_ts = pd.to_datetime(START_DATE)
end_ts = pd.to_datetime(END_DATE)
assert end_ts < pd.to_datetime('today'), "END_DATE must be in the past"
assert start_ts < end_ts, "START_DATE must be before END_DATE"

In [3]:
# Download the constituents page. A timeout keeps the cell from hanging forever and
# raise_for_status() stops us from parsing an HTTP error page as if it were data.
response = requests.get(STOCKS_URL, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")

# The page is expected to contain a single HTML table with the stock list.
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> found at {STOCKS_URL}; the page layout may have changed")

rows = table.find_all('tr')
if not rows:
    raise ValueError(f"The table at {STOCKS_URL} contains no rows")

# Column names come from the <th> cells of the first row only.
headers = [th.text.strip() for th in rows[0].find_all('th')]

# Every following row contributes its <td> texts; rows without <td> cells are skipped.
data = []
for row in rows[1:]:
    cols = [td.text.strip() for td in row.find_all('td')]
    if cols:
        data.append(cols)

df = pd.DataFrame(data, columns=headers)
number_of_stocks = len(df)
print(f"**Number of stocks in the list: {number_of_stocks}\n\n")
print(df.head())

**Number of stocks in the list: 101


  No. Symbol           Company Name Market Cap Stock Price % Change  Revenue
0   1   NVDA     NVIDIA Corporation  4,293.08B      176.67    0.24%  165.22B
1   2   MSFT  Microsoft Corporation  3,849.86B      517.93    1.86%  281.72B
2   3   AAPL             Apple Inc.  3,643.32B      245.50    3.20%  408.63B
3   4   GOOG          Alphabet Inc.  3,083.41B      255.24    1.15%  371.40B
4   5  GOOGL          Alphabet Inc.  3,080.58B      254.72    1.07%  371.40B


In [4]:
# Convert the scraped 'Market Cap' strings (e.g. '4,293.08B') to numbers using the
# project helper from utils.data.
# NOTE(review): an earlier inline version stripped '$', ',', 'B' and 'M' without
# rescaling -- confirm that clean_market_cap puts million- and billion-denominated
# values on the same scale.
# NOTE(review): this cell overwrites `df`; re-running it feeds already-cleaned data
# back into the helper -- confirm that is safe.

df = clean_market_cap(df)

In [5]:
# Keep the 24 largest companies by 'Market Cap'. The website already lists them in
# that order, but sort explicitly in case that changes in the future.
# (The variable is called `top_24_by_volume`, yet the ranking key is market cap.)
df = df.sort_values(by='Market Cap', ascending=False)
top_24_by_volume = df.iloc[:24]

print(top_24_by_volume.loc[:, ['Symbol', 'Company Name', 'Market Cap']])

   Symbol                  Company Name  Market Cap
0    NVDA            NVIDIA Corporation     4293.08
1    MSFT         Microsoft Corporation     3849.86
2    AAPL                    Apple Inc.     3643.32
3    GOOG                 Alphabet Inc.     3083.41
4   GOOGL                 Alphabet Inc.     3080.58
5    AMZN              Amazon.com, Inc.     2468.71
6    META          Meta Platforms, Inc.     1955.40
7    AVGO                 Broadcom Inc.     1628.93
8    TSLA                   Tesla, Inc.     1416.75
9    NFLX                 Netflix, Inc.      521.37
10   PLTR    Palantir Technologies Inc.      432.69
11   COST  Costco Wholesale Corporation      421.82
12   ASML             ASML Holding N.V.      361.06
13   CSCO           Cisco Systems, Inc.      269.65
14   TMUS             T-Mobile US, Inc.      268.04
15    AMD  Advanced Micro Devices, Inc.      255.42
16    AZN               AstraZeneca PLC      236.24
17    LIN                     Linde plc      224.61
18    APP   

In [6]:
# Rich display of the selected rows (ranked by 'Market Cap', despite the variable name).
top_24_by_volume

Unnamed: 0,No.,Symbol,Company Name,Market Cap,Stock Price,% Change,Revenue
0,1,NVDA,NVIDIA Corporation,4293.08,176.67,0.24%,165.22B
1,2,MSFT,Microsoft Corporation,3849.86,517.93,1.86%,281.72B
2,3,AAPL,Apple Inc.,3643.32,245.5,3.20%,408.63B
3,4,GOOG,Alphabet Inc.,3083.41,255.24,1.15%,371.40B
4,5,GOOGL,Alphabet Inc.,3080.58,254.72,1.07%,371.40B
5,6,AMZN,"Amazon.com, Inc.",2468.71,231.48,0.11%,670.04B
6,7,META,"Meta Platforms, Inc.",1955.4,778.38,-0.24%,178.80B
7,8,AVGO,Broadcom Inc.,1628.93,344.94,-0.12%,59.93B
8,9,TSLA,"Tesla, Inc.",1416.75,426.07,2.21%,92.72B
9,10,NFLX,"Netflix, Inc.",521.37,1226.97,1.59%,41.69B


In [7]:
# Plain Python list of the selected symbols, in ranking order.
tickers_list = top_24_by_volume['Symbol'].to_list()

In [8]:
# Symbols that the fetch / feature loops below iterate over.
tickers_list

['NVDA',
 'MSFT',
 'AAPL',
 'GOOG',
 'GOOGL',
 'AMZN',
 'META',
 'AVGO',
 'TSLA',
 'NFLX',
 'PLTR',
 'COST',
 'ASML',
 'CSCO',
 'TMUS',
 'AMD',
 'AZN',
 'LIN',
 'APP',
 'SHOP',
 'PEP',
 'INTU',
 'PDD',
 'MU']

In [9]:
%%time 

# Development cap: only the first MAX_TICKERS symbols are downloaded for now.
# NOTE: later cells read the loop leftovers `ticker` and `price_data`, so changing
# the cap changes which stock those cells look at.
MAX_TICKERS = 2

ticker_data = {}      # ticker -> daily OHLCV history returned by yfinance
price_frames = []     # per-ticker feature frames, concatenated once after the loop

for i, ticker in enumerate(tickers_list[:MAX_TICKERS]):

    print(f"{i} - Fetching data for {ticker} stock...\n")
    print("-"*24, "\n")

    ticker_data[ticker] = yf.Ticker(ticker).history(start=START_DATE, end=END_DATE, interval='1d')
    price_data = features_based_on_price(ticker_data[ticker], ticker=ticker)
    print("-"*24, "\n")

    price_frames.append(price_data)

# A single concat avoids re-copying the accumulated frame on every iteration and
# avoids concatenating onto an empty DataFrame.
dataset_price = pd.concat(price_frames, axis=0) if price_frames else pd.DataFrame()
  


0 - Fetching data for NVDA stock...

------------------------ 

'Adj Close' not present in columns for NVDA.Using 'Close' instead.
Data columns:  Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits'], dtype='object')
Data index type:  <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
Data shape:  (1423, 7)
Resampling to month-end frequency... 

Resampling done. The new data shape is  (68, 22)
Final data shape for NVDA:  (63, 33)
------------------------ 

1 - Fetching data for MSFT stock...

------------------------ 

'Adj Close' not present in columns for MSFT.Using 'Close' instead.
Data columns:  Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits'], dtype='object')
Data index type:  <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
Data shape:  (1423, 7)
Resampling to month-end frequency... 

Resampling done. The new data shape is  (68, 22)
Final data shape for MSFT:  (63, 33)
------------------------ 

CPU times: user 99.7 ms,

In [10]:
# Number of feature rows collected per ticker.
dataset_price['ticker'].value_counts()

ticker
NVDA    63
MSFT    63
Name: count, dtype: int64

In [11]:
# Last row of the combined price-feature frame, shown as a Series (one entry per column).
dataset_price.iloc[-1]

Date                      2025-08-31 00:00:00
Close                                  506.69
ln_volume_min                         16.5583
ln_volume_median                      16.8582
ln_volume_max                         17.2442
daily_growth_min                      -0.0176
daily_growth_median                   -0.0044
daily_growth_max                        0.022
weekly_growth_min                     -0.0368
weekly_growth_median                  -0.0039
weekly_growth_max                      0.0452
biweekly_growth_min                   -0.0498
biweekly_growth_median                -0.0207
biweekly_growth_max                    0.0502
monthly_growth_min                    -0.0487
monthly_growth_median                  0.0166
monthly_growth_max                     0.0738
rel_spread_oc_median                   0.0046
rel_spread_hl_median                   0.0146
vol_10d_mean_median                    -0.002
vol_10d_std_median                      0.012
vol_10d_min_median                

In [12]:
%%time 

# ticker -> fundamentals-based features as of END_DATE
dataset_fundamentals = {}

separator = "-"*24

# Only the first two tickers are processed for now (i takes the values 0 and 1).
for i, ticker in enumerate(tickers_list[:2]):

    print(f"{i} - Fetching data for {ticker} stock...\n")
    print(separator, "\n")

    dataset_fundamentals[ticker] = features_based_on_fundamentals(ticker, END_DATE)

    print(separator, "\n")

    

0 - Fetching data for NVDA stock...

------------------------ 

Number of missing values in fund_feats_float: 0
------------------------ 

1 - Fetching data for MSFT stock...

------------------------ 

Number of missing values in fund_feats_float: 0
------------------------ 

CPU times: user 310 ms, sys: 36.6 ms, total: 347 ms
Wall time: 5.42 s


In [13]:
# `ticker` is the loop variable left over from the loop above, so this shows the daily
# frame of the last ticker processed there.
# NOTE(review): the frame carries derived columns (ln_volume, daily_growth, ...) next to
# the raw OHLCV data -- presumably added in place by features_based_on_price; verify.
ticker_data[ticker]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,ln_volume,daily_growth,weekly_growth,biweekly_growth,monthly_growth,rel_spread_oc,rel_spread_hl,vol_10d_mean,vol_10d_std,vol_10d_min,vol_10d_max
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-01-02 00:00:00-05:00,151.040810,152.895761,150.612747,152.791122,22622100,0.0,0.0,16.934438,,,,,-0.011456,0.014942,,,,
2020-01-03 00:00:00-05:00,150.603230,152.153771,150.355893,150.888596,21116200,0.0,0.0,16.865551,-0.012452,,,,-0.001891,0.011915,,,,
2020-01-06 00:00:00-05:00,149.423674,151.345221,148.881450,151.278625,20813700,0.0,0.0,16.851122,0.002585,,,,-0.012262,0.016286,,,,
2020-01-07 00:00:00-05:00,151.554471,151.887403,149.651955,149.899277,21634100,0.0,0.0,16.889781,-0.009118,,,,0.011042,0.014913,,,,
2020-01-08 00:00:00-05:00,151.183478,152.962341,150.251249,152.286942,27746500,0.0,0.0,17.138620,0.015928,,,,-0.007246,0.017803,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-08-25 00:00:00-04:00,506.630005,508.190002,504.119995,504.260010,21638600,0.0,0.0,16.889989,-0.005855,-0.023228,-0.031970,-0.016782,0.004700,0.008071,-0.003205,0.009267,-0.016363,0.014317
2025-08-26 00:00:00-04:00,504.359985,504.980011,498.510010,502.040009,30835700,0.0,0.0,17.244184,-0.004402,-0.013545,-0.049835,-0.018799,0.004621,0.012887,-0.005077,0.006930,-0.016363,0.005930
2025-08-27 00:00:00-04:00,502.000000,507.290009,499.899994,506.739990,17277900,0.0,0.0,16.664939,0.009362,0.003664,-0.024986,-0.009749,-0.009354,0.014583,-0.002505,0.007049,-0.014175,0.009362
2025-08-28 00:00:00-04:00,507.089996,511.089996,505.500000,509.640015,18015600,0.0,0.0,16.706749,0.005723,0.010709,-0.022972,-0.005382,-0.005004,0.010969,-0.002297,0.007277,-0.014175,0.009362


In [14]:
%%time 

dataset_talib_pattern = {}    # ticker -> candlestick-pattern indicator frame
dataset_talib_momentum = {}   # ticker -> momentum indicator frame

# TA-Lib's Python wrapper only accepts float64 ("double") input arrays.
# pd.to_numeric alone keeps integer columns as int64, and yfinance delivers 'Volume'
# as integers, so every input column is cast explicitly.
# NOTE(review): the un-cast 'Volume' column is the likely cause of the earlier `mfi`
# failure -- re-enable `mfi` in the helper to confirm.
TALIB_INPUT_COLS = ['Open', 'High', 'Low', 'Close', 'Volume']

# Iterate over the tickers that actually have price history, so ticker_data[ticker]
# can never raise a KeyError if the fetch loop covered fewer symbols than tickers_list.
for i, ticker in enumerate(ticker_data):

    print(f"{i} - Calculate TALIB-based fearures for {ticker} stock...\n")
    print("-"*24, "\n")

    # reset_index() already returns a new frame, so the stored history is not modified.
    data_with_date_ticker = ticker_data[ticker].reset_index()
    data_with_date_ticker['Ticker'] = ticker

    rows_initial = len(data_with_date_ticker)

    for col in TALIB_INPUT_COLS:
        data_with_date_ticker[col] = pd.to_numeric(
            data_with_date_ticker[col], errors='coerce'
        ).astype('float64')

    # Values that could not be parsed became NaN; report how many rows are affected.
    rows_after_conversion = len(data_with_date_ticker.dropna(subset=TALIB_INPUT_COLS))

    if rows_initial != rows_after_conversion:
        print(f"Warning: Number of rows changed after numeric conversion for {ticker} stock: {rows_initial} -> {rows_after_conversion}")

    dataset_talib_pattern[ticker] = _get_talib_pattern_indicators(data_with_date_ticker)
    dataset_talib_momentum[ticker] = _get_talib_momentum_indicators(data_with_date_ticker)

    print("-"*24, "\n")

    


0 - Calculate TALIB-based fearures for NVDA stock...

------------------------ 

------------------------ 

1 - Calculate TALIB-based fearures for MSFT stock...

------------------------ 

------------------------ 

CPU times: user 12.2 ms, sys: 1.94 ms, total: 14.1 ms
Wall time: 39.7 ms


In [15]:
# Per column, count the rows equal to 0 in the pattern indicators of the last
# processed ticker (`ticker` is left over from the loop above).
(dataset_talib_pattern[ticker] == 0).sum()

Date                      0
Ticker                    0
cdl2crows              1421
cdl3blackrows          1423
cdl3inside             1388
                       ... 
cdlthrusting           1414
cdltristar             1422
cdlunique3river        1418
cdlupsidegap2crows     1421
cdlxsidegap3methods    1414
Length: 63, dtype: int64

In [16]:
# Same zero count for the momentum indicators of the last processed ticker.
(dataset_talib_momentum[ticker] == 0).sum()

Date                0
Ticker              0
adx                 0
adxr                0
apo                 0
aroon_1           302
aroon_2           153
aroonosc            0
bop                 2
cci                 0
cmo                 0
dx                  0
macd                0
macdsignal          0
macdhist            0
macd_ext            0
macdsignal_ext      0
macdhist_ext        0
macd_fix            0
macdsignal_fix      0
macdhist_fix        0
minus_di            0
mom                 0
plus_di             0
dm                  0
ppo                 0
roc                 0
rocp                0
rocr                0
rocr100             0
rsi                 0
slowk               0
slowd               0
fastk               0
fastd               0
fastk_rsi         339
fastd_rsi           2
trix                0
ultosc              0
willr               2
dtype: int64

In [17]:
# Daily momentum indicators for the last processed ticker; the first rows are NaN for
# most indicators (presumably their look-back / warm-up period).
dataset_talib_momentum[ticker]

Unnamed: 0,Date,Ticker,adx,adxr,apo,aroon_1,aroon_2,aroonosc,bop,cci,cmo,dx,macd,macdsignal,macdhist,macd_ext,macdsignal_ext,macdhist_ext,macd_fix,macdsignal_fix,macdhist_fix,minus_di,mom,plus_di,dm,ppo,roc,rocp,rocr,rocr100,rsi,slowk,slowd,fastk,fastd,fastk_rsi,fastd_rsi,trix,ultosc,willr
0,2020-01-02 05:00:00,MSFT,,,,,,,0.766667,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2020-01-03 05:00:00,MSFT,,,,,,,0.158724,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2020-01-06 05:00:00,MSFT,,,,,,,0.752891,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2020-01-07 05:00:00,MSFT,,,,,,,-0.740431,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2020-01-08 05:00:00,MSFT,,,,,,,0.407018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1418,2025-08-25 04:00:00,MSFT,27.675275,37.693457,-1.252033,92.857143,0.000000,-92.857143,-0.582307,-104.710467,-13.612419,1.512952,0.306354,3.673391,-3.367037,-1.252033,4.648874,-5.900906,0.329747,3.528813,-3.199065,24.718683,-16.653687,22.058674,25.478137,-0.242644,-3.197015,-0.031970,0.968030,96.802985,43.193790,15.647768,11.329724,15.540414,15.647768,16.815806,16.043623,0.266401,30.057058,-93.710831
1419,2025-08-26 04:00:00,MSFT,26.369348,36.389775,-2.571037,100.000000,35.714286,-64.285714,-0.358574,-112.675507,-17.147257,9.392291,-0.506427,2.837427,-3.343855,-2.571037,3.377160,-5.948197,-0.446522,2.733746,-3.180268,28.563048,-26.331390,20.803682,23.658270,-0.498533,-4.983500,-0.049835,0.950165,95.016500,41.426372,23.026606,17.037280,28.887058,23.026606,0.000000,16.043623,0.257357,33.848168,-89.404412
1420,2025-08-27 04:00:00,MSFT,24.801400,35.001562,-3.840612,92.857143,28.571429,-64.285714,0.641405,-75.728541,-7.150903,4.418082,-0.762523,2.117437,-2.879960,-3.840612,1.953709,-5.794321,-0.699341,2.047129,-2.746470,26.522831,-12.985657,21.487470,24.278391,-0.744580,-2.498560,-0.024986,0.975014,97.501440,46.424548,37.258638,25.311004,67.348442,37.258638,100.000000,38.938602,0.248182,40.568655,-75.297021
1421,2025-08-28 04:00:00,MSFT,23.270318,33.786525,-5.578494,85.714286,21.428571,-64.285714,0.456175,-35.921469,-1.401234,3.366254,-0.723137,1.549322,-2.272460,-5.578494,0.352838,-5.931332,-0.672751,1.503153,-2.175904,24.628343,-11.982483,23.839190,26.344208,-1.081133,-2.297156,-0.022972,0.977028,97.702844,49.299383,61.569800,40.618348,88.473901,61.569800,100.000000,66.666667,0.239107,46.433017,-66.592354


In [18]:
# Column dtypes and non-null counts of the momentum frame.
dataset_talib_momentum[ticker].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1423 entries, 0 to 1422
Data columns (total 40 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            1423 non-null   datetime64[ns]
 1   Ticker          1423 non-null   object        
 2   adx             1396 non-null   float64       
 3   adxr            1383 non-null   float64       
 4   apo             1398 non-null   float64       
 5   aroon_1         1409 non-null   float64       
 6   aroon_2         1409 non-null   float64       
 7   aroonosc        1409 non-null   float64       
 8   bop             1423 non-null   float64       
 9   cci             1410 non-null   float64       
 10  cmo             1409 non-null   float64       
 11  dx              1409 non-null   float64       
 12  macd            1390 non-null   float64       
 13  macdsignal      1390 non-null   float64       
 14  macdhist        1390 non-null   float64       
 15  macd

In [19]:
def monthly_median(indicators: pd.DataFrame) -> pd.DataFrame:
    """Aggregate a daily indicator frame to month-end medians.

    Parameters
    ----------
    indicators : pd.DataFrame
        Daily frame with a 'Date' column, a 'Ticker' column and numeric indicator columns.

    Returns
    -------
    pd.DataFrame
        One row per month end ('Date' restored as a column); months containing any
        missing value are dropped.
    """
    return (
        indicators
        .set_index('Date')
        .drop(columns='Ticker')
        .resample('ME')
        .median()
        .reset_index()
        .dropna()
    )


# `ticker` is the loop variable left over from the TA-Lib loop above.
momentum_monthly = monthly_median(dataset_talib_momentum[ticker])
pattern_monthly = monthly_median(dataset_talib_pattern[ticker])

# NOTE(review): pattern signals are 0 on almost every day (see the zero counts above),
# so their monthly median is 0 nearly everywhere -- a sum or a count of non-zero days
# may carry more information.

# Only the last expression of a cell is rendered automatically, so the first result
# is shown explicitly (`display` is provided by the Jupyter/IPython kernel).
display(momentum_monthly)
pattern_monthly



Unnamed: 0,Date,cdl2crows,cdl3blackrows,cdl3inside,cdl3linestrike,cdl3outside,cdl3starsinsouth,cdl3whitesoldiers,cdlabandonedbaby,cdladvancedblock,cdlbelthold,cdlbreakaway,cdlclosingmarubozu,cdlconcealbabyswall,cdlcounterattack,cdldarkcloudcover,cdldoji,cdldojistar,cdldragonflydoji,cdlengulfing,cdleveningdojistar,cdleveningstar,cdlgapsidesidewhite,cdlgravestonedoji,cdlhammer,...,cdlladderbottom,cdllongleggeddoji,cdllongline,cdlmarubozu,cdlmatchinglow,cdlmathold,cdlmorningdojistar,cdlmorningstar,cdlonneck,cdlpiercing,cdlrickshawman,cdlrisefall3methods,cdlseparatinglines,cdlshootingstar,cdlshortline,cdlspinningtop,cdlstalledpattern,cdlsticksandwich,cdltakuru,cdltasukigap,cdlthrusting,cdltristar,cdlunique3river,cdlupsidegap2crows,cdlxsidegap3methods
0,2020-01-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-02-29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-03-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-04-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-05-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,2025-04-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64,2025-05-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65,2025-06-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66,2025-07-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Month-end median of every candlestick-pattern column for the last processed ticker;
# months containing any missing value are dropped.
test = (
    dataset_talib_pattern[ticker]
    .set_index('Date')
    .drop(columns=['Ticker'])
    .resample('ME')
    .median()
    .reset_index()
    .dropna()
)

In [21]:
# Column names of the monthly pattern frame ('Date' plus one column per pattern).
test.columns

Index(['Date', 'cdl2crows', 'cdl3blackrows', 'cdl3inside', 'cdl3linestrike',
       'cdl3outside', 'cdl3starsinsouth', 'cdl3whitesoldiers',
       'cdlabandonedbaby', 'cdladvancedblock', 'cdlbelthold', 'cdlbreakaway',
       'cdlclosingmarubozu', 'cdlconcealbabyswall', 'cdlcounterattack',
       'cdldarkcloudcover', 'cdldoji', 'cdldojistar', 'cdldragonflydoji',
       'cdlengulfing', 'cdleveningdojistar', 'cdleveningstar',
       'cdlgapsidesidewhite', 'cdlgravestonedoji', 'cdlhammer',
       'cdlhangingman', 'cdlharami', 'cdlharamicross', 'cdlhighwave',
       'cdlhikkake', 'cdlhikkakemod', 'cdlhomingpigeon', 'cdlidentical3crows',
       'cdlinneck', 'cdlinvertedhammer', 'cdlkicking', 'cdlkickingbylength',
       'cdlladderbottom', 'cdllongleggeddoji', 'cdllongline', 'cdlmarubozu',
       'cdlmatchinglow', 'cdlmathold', 'cdlmorningdojistar', 'cdlmorningstar',
       'cdlonneck', 'cdlpiercing', 'cdlrickshawman', 'cdlrisefall3methods',
       'cdlseparatinglines', 'cdlshootingstar', 'cd

In [22]:
# Take the MSFT rows from the accumulated feature frame. Filtering `price_data` (the
# leftover of the last loop iteration) only worked while MSFT was the last ticker fetched.
msft = dataset_price[dataset_price['ticker'] == 'MSFT']

In [23]:
# Column names of the monthly price-feature frame for MSFT.
msft.columns

Index(['Date', 'Close', 'ln_volume_min', 'ln_volume_median', 'ln_volume_max',
       'daily_growth_min', 'daily_growth_median', 'daily_growth_max',
       'weekly_growth_min', 'weekly_growth_median', 'weekly_growth_max',
       'biweekly_growth_min', 'biweekly_growth_median', 'biweekly_growth_max',
       'monthly_growth_min', 'monthly_growth_median', 'monthly_growth_max',
       'rel_spread_oc_median', 'rel_spread_hl_median', 'vol_10d_mean_median',
       'vol_10d_std_median', 'vol_10d_min_median', 'vol_10d_max_median',
       'return_1m', 'return_2m', 'return_3m', 'vol_3m', 'momentum_3m',
       'mma_3_scaled', 'mma_6_scaled', 'year', 'month', 'ticker'],
      dtype='object')

In [24]:
# Attach the monthly pattern features to the monthly price features. The left join
# keeps every price row; `validate` raises if 'Date' is not unique in `test`.
data_m1 = msft.merge(
    test,
    how='left',
    on='Date',
    validate="many_to_one",
)

In [25]:
# Shape check: the left join must keep the row count of `msft`, and the column count
# is the sum of both frames minus the shared 'Date' key.
data_m1.shape, test.shape, msft.shape

((63, 94), (68, 62), (63, 33))