In [254]:
import yfinance as yf
import pandas as pd
import os
from os.path import join
from datetime import datetime, timezone


def load_ticker_list(pth: str) -> list[str]:
    ticker_list = pd.read_csv(pth)['ticker_name'].to_list()
    return ticker_list

Schema = {
    'Date': 'datetime64[ns, Europe/Berlin]',
    'Open': 'float64',
    'High': 'float64',
    'Low': 'float64',
    'Close': 'float64',
    'Volume': 'int64',
    'Dividends': 'float64',
    'Stock Splits': 'float64',
    'Repaired?': 'bool'
}



In [None]:
load_ticker_list

In [1]:
from yforseer.scraping import update_raw_tables, load_ticker_list
data_dir = 'data/yahoo/tickerdaily'
ticker_list = load_ticker_list('yforseer/ticker_list.csv')
update_raw_tables(data_dir, ticker_list)

0/5: NVD.DE
Existing NVD.DE table found.
NVD.DE is up to date. Last UTC date is 2024-07-09 >= 2024-07-09.
1/5: MBG.DE
Downloading MBG.DE for the whole period
2/5: MSF.DE
Downloading MSF.DE for the whole period
3/5: APC.DE
Downloading APC.DE for the whole period
4/5: 1COV.DE
Existing 1COV.DE table found.
1COV.DE is up to date. Last UTC date is 2024-07-09 >= 2024-07-09.


In [204]:
today_de = datetime.now().date()
nowtime_utc = datetime.now(timezone.utc)
today_de = nowtime_de.date()
today_utc = nowtime_utc.date()



datetime.datetime(2024, 7, 10, 12, 17, 9, 285333, tzinfo=datetime.timezone.utc)

In [9]:
pth = 'yforseer/ticker_list.csv'
ticker_list = load_ticker_list(pth)

In [17]:
data_dir = 'data/yahoo/tickerdaily'
def update_raw_tables(data_dir, ticker_list):
    '''
    Control flow
    - If ticker_name.csv not exists, download and store
    - If ticker_name.csv exists
        - If last date is not today, download and update
        - If last date is today, do nothing
    '''
    N = len(ticker_list)
    for i, ticker_name in enumerate(ticker_list):
        print('%d/%d: %s'%(i, N, ticker_name))

        ticker = yf.Ticker(ticker_name)
        csv_pth = join(data_dir, f'{ticker_name}.csv')

        today = datetime.now().date()

        if os.path.exists(csv_pth):
            print('Existing %s table found.'%(ticker_name))

            df = pd.read_csv(csv_pth, index_col=0)
            df = df.astype(Schema)
            latest_datetime = df['Date'].max()  
            latest_day = latest_datetime.date() 
            endday = today - pd.Timedelta(days=1)
            if latest_day >= endday:
                print(f'{ticker_name} is up to date. Last UTC date is {str(latest_day)} >= {endday}.')
            else:
                print(f'{ticker_name} = {str(latest_day)}, while today is {today}')

                start_scrape_day = latest_day + pd.Timedelta(days=1)  # in DE time
                hist = ticker.history(
                    interval = '1d',
                    period = None,
                    end = today,  # exclusive, so the still-updating data today is not included.
                    start= start_scrape_day,  # Inclusive, so starting one day after the last day.
                    repair=True)
                hist.reset_index(inplace=True)
                hist = hist.astype(Schema)
                hist['Date'] = pd.to_datetime(hist['Date'])
                print('Extracted date %s to %s'%(str(hist['Date'].min().date()), str(hist['Date'].max().date())))
            df2 = pd.concat([df, hist], ignore_index=True)
            df2.to_csv(csv_pth)

        else:
            print('Downloading %s for the whole period'%(ticker_name))
            hist = ticker.history(interval = '1d', period = None, end=today, repair=True)
            hist.reset_index(inplace=True)
            hist = hist.astype(Schema)
            hist.to_csv(csv_pth)








In [233]:
ticker_name = 'NVD.DE'
ticker = yf.Ticker(ticker_name)
csv_pth = join(data_dir, f'{ticker_name}.csv')

today = datetime.now().date()

if os.path.exists(csv_pth):
    print('Existing %s table found.'%(ticker_name))

    df = pd.read_csv(csv_pth, index_col=0)
    df = df.astype(Schema)
    latest_datetime = df['Date'].max()  
    latest_day = latest_datetime.date() 
    endday = today - pd.Timedelta(days=1)
    if latest_day >= endday:
        print(f'{ticker_name} is up to date. Last UTC date is {str(latest_day)} >= {endday}.')
    else:
        print(f'{ticker_name} = {str(latest_day)}, while today is {today}')

        start_scrape_day = latest_day + pd.Timedelta(days=1)  # in DE time
        hist = ticker.history(
            interval = '1d',
            period = None,
            end = today,  # exclusive, so the still-updating data today is not included.
            start= start_scrape_day,  # Inclusive, so starting one day after the last day.
            repair=True)
        hist.reset_index(inplace=True)
        hist = hist.astype(Schema)
        hist['Date'] = pd.to_datetime(hist['Date'])
        print('Extracted date %s to %s'%(str(hist['Date'].min().date()), str(hist['Date'].max().date())))
    df2 = pd.concat([df, hist], ignore_index=True)
    df2.to_csv(csv_pth)

else:
    print('Downloading %s for the whole period'%(ticker_name))
    hist = ticker.history(interval = '1d', period = None, end=today, repair=True)
    hist.reset_index(inplace=True)
    hist = hist.astype(Schema)
    hist.to_csv(csv_pth)


Existing NVD.DE table found.
NVD.DE is up to date. Last UTC date is 2024-07-09 >= 2024-07-09.


  df['Date'] = pd.to_datetime(df['Date'])


In [251]:
df.dtypes

Date            datetime64[ns, Europe/Berlin]
Open                                  float64
High                                  float64
Low                                   float64
Close                                 float64
Volume                                  int64
Dividends                             float64
Stock Splits                          float64
Repaired?                                bool
dtype: object

In [253]:


df = pd.read_csv(csv_pth, index_col=0)
df = df.astype(Schema)
df['Date']

0      2007-12-13 00:00:00+01:00
1      2007-12-14 00:00:00+01:00
2      2007-12-17 00:00:00+01:00
3      2007-12-18 00:00:00+01:00
4      2007-12-19 00:00:00+01:00
                  ...           
4222   2024-07-03 00:00:00+02:00
4223   2024-07-04 00:00:00+02:00
4224   2024-07-05 00:00:00+02:00
4225   2024-07-08 00:00:00+02:00
4226   2024-07-09 00:00:00+02:00
Name: Date, Length: 4227, dtype: datetime64[ns, Europe/Berlin]

In [245]:
df = pd.read_csv(csv_pth, index_col=0)
df['Date'] = pd.to_datetime(df['Date'], utc=True)
df['Date'] = df['Date'].dt.tz_convert('Europe/Berlin')
df['Date']

0      2007-12-13 00:00:00+01:00
1      2007-12-14 00:00:00+01:00
2      2007-12-17 00:00:00+01:00
3      2007-12-18 00:00:00+01:00
4      2007-12-19 00:00:00+01:00
                  ...           
4222   2024-07-03 00:00:00+02:00
4223   2024-07-04 00:00:00+02:00
4224   2024-07-05 00:00:00+02:00
4225   2024-07-08 00:00:00+02:00
4226   2024-07-09 00:00:00+02:00
Name: Date, Length: 4227, dtype: datetime64[ns, Europe/Berlin]

In [244]:
df['Date'].dt.tz_convert('Europe/Berlin')

0      2007-12-13 00:00:00+01:00
1      2007-12-14 00:00:00+01:00
2      2007-12-17 00:00:00+01:00
3      2007-12-18 00:00:00+01:00
4      2007-12-19 00:00:00+01:00
                  ...           
4222   2024-07-03 00:00:00+02:00
4223   2024-07-04 00:00:00+02:00
4224   2024-07-05 00:00:00+02:00
4225   2024-07-08 00:00:00+02:00
4226   2024-07-09 00:00:00+02:00
Name: Date, Length: 4227, dtype: datetime64[ns, Europe/Berlin]

In [213]:
today_utc = datetime.now(timezone.utc).date()
today_utc

datetime.date(2024, 7, 10)

In [211]:
endday_utc = today_utc - pd.Timedelta(days=1)
endday_utc

datetime.date(2024, 7, 9)

In [184]:
hist2 = ticks.history(interval = '1d', period = None, start=today_de - pd.Timedelta(days=2), repair=True)
hist2.reset_index(inplace=True)
hist2['Date'] = pd.to_datetime(hist2['Date'], utc=True)

hist2

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Repaired?
0,2024-07-07 22:00:00+00:00,65.290001,65.93,65.169998,65.389999,1568407,0.0,0.0,False
1,2024-07-08 22:00:00+00:00,63.84,64.220001,62.849998,63.150002,5259368,0.0,0.0,False
2,2024-07-09 22:00:00+00:00,63.23,63.639999,62.689999,63.25,735967,0.0,0.0,False


# Processing return Hist data

In [153]:
ticker_name = '1COV.DE'

ticker = yf.Ticker(ticker_name)

In [157]:
hist2 = ticker.history(
    interval = '1d',
    start= '2024-05-08',
    end = '2024-06-20',
    repair=True)
hist2.reset_index(inplace=True)

hist2['Date']   = pd.to_datetime(hist2['Date'], utc=True)

hist2.to_csv('data/yahoo/tickerdaily/%s.csv'%(ticker_name))

In [145]:
hist2 = ticker.history(
    interval = '1d',
    start= '2024-07-08',
    # end = '2024-06-20',
    repair=True)
hist2.reset_index(inplace=True)

hist2

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Repaired?
0,2024-07-08 00:00:00+02:00,432.600006,433.950012,428.700012,430.25,15313,0.0,0.0,False
1,2024-07-09 00:00:00+02:00,431.450012,432.450012,427.75,429.100006,12949,0.0,0.0,False
2,2024-07-10 00:00:00+02:00,426.399994,427.549988,426.350006,426.649994,3864,0.0,0.0,False


In [140]:
pd.Timestamp.today().date()

datetime.date(2024, 7, 10)

In [134]:
hist = ticker.history(
    interval = '1d',
    start= '2024-06-10',
    end = '2024-06-20',
    repair=True)
hist.reset_index(inplace=True)
hist['Date'] = pd.to_datetime(hist['Date'], utc=True)
hist

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Repaired?
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-06-10 00:00:00+02:00,393.600006,398.200012,393.299988,397.700012,13203,0.0,0.0,False
2024-06-11 00:00:00+02:00,397.350006,399.450012,396.450012,399.450012,11820,0.0,0.0,False
2024-06-12 00:00:00+02:00,403.100006,405.75,399.950012,404.299988,16874,0.0,0.0,False
2024-06-13 00:00:00+02:00,409.049988,412.0,405.549988,411.700012,26773,0.0,0.0,False
2024-06-14 00:00:00+02:00,411.399994,414.350006,409.0,413.350006,23054,0.0,0.0,False
2024-06-17 00:00:00+02:00,414.700012,415.0,412.0,414.149994,19386,0.0,0.0,False
2024-06-18 00:00:00+02:00,417.899994,420.299988,414.450012,415.299988,23324,0.0,0.0,False
2024-06-19 00:00:00+02:00,416.399994,419.200012,416.350006,418.700012,15721,0.0,0.0,False


In [130]:
ticker_name = 'MSF.DE'
ticker = yf.Ticker(ticker_name)
hist = ticker.history(
    interval = '1d',
    start= '2024-06-10',
    end = '2024-06-15',
    repair=True)
hist.reset_index(inplace=True)
hist['Date'] = pd.to_datetime(hist['Date'], utc=True)


hist2 = ticker.history(
    interval = '1d',
    start= '2024-06-15',
    # end = '2024-06-20',
    repair=True)
hist2.reset_index(inplace=True)
hist2['Date'] = pd.to_datetime(hist2['Date'], utc=True)





In [132]:
hist2

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Repaired?
0,2024-06-16 22:00:00+00:00,414.700012,415.0,412.0,414.149994,19386,0.0,0.0,False
1,2024-06-17 22:00:00+00:00,417.899994,420.299988,414.450012,415.299988,23324,0.0,0.0,False
2,2024-06-18 22:00:00+00:00,416.399994,419.200012,416.350006,418.700012,15721,0.0,0.0,False
3,2024-06-19 22:00:00+00:00,416.799988,419.0,411.850006,415.149994,35123,0.0,0.0,False
4,2024-06-20 22:00:00+00:00,416.049988,420.600006,415.200012,419.75,21982,0.0,0.0,False
5,2024-06-23 22:00:00+00:00,420.799988,421.0,416.100006,419.450012,17177,0.0,0.0,False
6,2024-06-24 22:00:00+00:00,417.100006,420.600006,415.850006,419.700012,20836,0.0,0.0,False
7,2024-06-25 22:00:00+00:00,422.200012,424.549988,419.950012,423.899994,12949,0.0,0.0,False
8,2024-06-26 22:00:00+00:00,422.149994,426.149994,421.700012,424.0,21851,0.0,0.0,False
9,2024-06-27 22:00:00+00:00,425.100006,425.850006,422.25,423.399994,13079,0.0,0.0,False


In [117]:
hist['Date'].min().date()

datetime.date(2023, 6, 13)

In [114]:
pd.to_datetime(hist['Date'], utc=True).min().date()

datetime.date(2023, 6, 12)

In [99]:
df2[df2['Date'] > pd.to_datetime('2023-06-11', utc=)]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Repaired?
3931,2023-06-11 22:00:00+00:00,36.473219,36.488214,35.898404,36.208305,178390,0.0,0.0,False
3932,2023-06-12 22:00:00+00:00,36.803108,37.832777,36.708141,37.662830,376930,0.0,0.0,False
3933,2023-06-12 22:00:00+00:00,68.640107,69.001907,67.257844,69.001907,2562255,0.0,0.0,False
3934,2023-06-13 22:00:00+00:00,69.103954,70.365619,69.029744,69.901772,3412044,0.0,0.0,False
3935,2023-06-14 22:00:00+00:00,69.725521,69.781181,69.159627,69.744072,2180019,0.0,0.0,False
...,...,...,...,...,...,...,...,...,...
4204,2024-07-03 22:00:00+00:00,65.580002,66.279999,65.519997,65.940002,1860262,0.0,0.0,False
4205,2024-07-04 22:00:00+00:00,65.849998,66.250000,65.180000,65.470001,1852800,0.0,0.0,False
4206,2024-07-07 22:00:00+00:00,65.290001,65.930000,65.169998,65.389999,1568407,0.0,0.0,False
4207,2024-07-08 22:00:00+00:00,63.840000,64.220001,62.849998,63.150002,5259368,0.0,0.0,False


In [76]:
pd.to_datetime(hist['Date'], utc=True)


0     2023-06-13 22:00:00+00:00
1     2023-06-14 22:00:00+00:00
2     2023-06-15 22:00:00+00:00
3     2023-06-18 22:00:00+00:00
4     2023-06-19 22:00:00+00:00
                 ...           
270   2024-07-03 22:00:00+00:00
271   2024-07-04 22:00:00+00:00
272   2024-07-07 22:00:00+00:00
273   2024-07-08 22:00:00+00:00
274   2024-07-09 22:00:00+00:00
Name: Date, Length: 275, dtype: datetime64[ns, UTC]

In [81]:
df = pd.read_csv(csv_pth)
df['Date'] = pd.to_datetime(df['Date'])
df.dtypes

  df['Date'] = pd.to_datetime(df['Date'])


Date             object
Open            float64
High            float64
Low             float64
Close           float64
Volume            int64
Dividends       float64
Stock Splits    float64
Repaired?          bool
dtype: object

In [83]:
pd.to_datetime(df['Date'], utc=True)

0      2007-12-12 23:00:00+00:00
1      2007-12-13 23:00:00+00:00
2      2007-12-16 23:00:00+00:00
3      2007-12-17 23:00:00+00:00
4      2007-12-18 23:00:00+00:00
                  ...           
3928   2023-06-06 22:00:00+00:00
3929   2023-06-07 22:00:00+00:00
3930   2023-06-08 22:00:00+00:00
3931   2023-06-11 22:00:00+00:00
3932   2023-06-12 22:00:00+00:00
Name: Date, Length: 3933, dtype: datetime64[ns, UTC]

In [40]:
5000 * 13

65000

In [42]:
df2 = pd.concat([df, hist])

In [52]:
df[df.index > pd.to_datetime('2023-06-09')]


TypeError: '>' not supported between instances of 'str' and 'Timestamp'

In [45]:
df2[df2.index > pd.to_datetime('2023-06-09')]

TypeError: '>' not supported between instances of 'str' and 'Timestamp'