In [1]:
# Check the updates in the data storage is accurate

In [34]:
import polars as pl
from alpha_utils import get_bucket_name, get_profile_name
from s3io import S3IO
from datetime import datetime

In [35]:
# get the ticker data
ticker_table = "stock_tracker/tickers.parq"
ticker_queue_table = "stock_tracker/tickers_queue.parq"

In [36]:
# create the s3io object
s3io = S3IO(bucket=get_bucket_name(),
            profile=get_profile_name())

# get the ticker table and ticker queue
ticker_queue = s3io.s3_read_parquet(file_path=ticker_queue_table)
tickers = s3io.s3_read_parquet(file_path=ticker_table)

print(ticker_table)
print(ticker_queue)

stock_tracker/tickers.parq
shape: (98, 4)
┌────────┬────────────────────────────┬────────────┬─────────────────┐
│ Symbol ┆ Download_time              ┆ Downloaded ┆ Download_Failed │
│ ---    ┆ ---                        ┆ ---        ┆ ---             │
│ str    ┆ datetime[μs]               ┆ bool       ┆ bool            │
╞════════╪════════════════════════════╪════════════╪═════════════════╡
│ XOM    ┆ 2025-01-28 09:53:42.654783 ┆ false      ┆ false           │
│ CVX    ┆ 2025-01-28 09:53:42.654783 ┆ false      ┆ false           │
│ BE     ┆ 2025-01-28 09:53:42.654783 ┆ false      ┆ false           │
│ BPT    ┆ 2025-01-28 09:53:42.654783 ┆ false      ┆ false           │
│ AR     ┆ 2025-01-28 09:53:42.654783 ┆ false      ┆ false           │
│ …      ┆ …                          ┆ …          ┆ …               │
│ CRK    ┆ 2025-01-28 08:11:46.291942 ┆ false      ┆ false           │
│ SJT    ┆ 2025-01-28 08:11:46.291942 ┆ false      ┆ false           │
│ VAL    ┆ 2025-01-28 08:11:46.2919

In [27]:
# check for any updated tickers
tickers.filter(pl.col("is_current") == False)

Symbol,Name,Market Cap,Country,IPO Year,Sector,Industry,Market Cap Name,is_current,updated_time
str,str,f64,str,i64,str,str,str,bool,datetime[μs]


In [28]:
# check for any duplicates
tickers.select(pl.col("Symbol")).to_series().value_counts().filter(pl.col("count") > 1)

Symbol,count
str,u32


In [29]:
# print the duplicate symbols in the dataframe
duplicate_tickers = (tickers.select(pl.col("Symbol"))
                     .to_series().value_counts()
                     .filter(pl.col("count") > 1)
                     .select(pl.col("Symbol")).to_series())

tickers.filter(pl.col("Symbol").is_in(duplicate_tickers)).unique()

Symbol,Name,Market Cap,Country,IPO Year,Sector,Industry,Market Cap Name,is_current,updated_time
str,str,f64,str,i64,str,str,str,bool,datetime[μs]


In [13]:
# drop duplicates and write back to s3
tickers_dropped = tickers.unique()

(tickers_dropped.select(pl.col("Symbol"))
                     .to_series().value_counts()
                     .filter(pl.col("count") > 1)
                     .select(pl.col("Symbol")).to_series())

Symbol
str


In [14]:
s3io.s3_write_parquet(df=tickers_dropped, file_path=ticker_table)
print("written to s3")

written to s3


In [30]:
(ticker_queue.select(pl.col("Symbol"))
                     .to_series().value_counts()
                     .filter(pl.col("count") > 1)
                     .select(pl.col("Symbol")).to_series())

Symbol
str


In [31]:
# check the number of symbols with what is in the ticker table
ticker_queue.select(pl.col("Symbol")).n_unique() == tickers_dropped.select(pl.col("Symbol")).n_unique()

True

In [32]:
downloaded_time = datetime(year=2025, month=1, day=28, hour=9, minute=11)
# set the download failed to false
ticker_queue_updated = ticker_queue.with_columns(
    Download_Failed=pl.when(pl.col("Downloaded") == False,
                            pl.col("Download_time") > downloaded_time).then(False).otherwise(pl.col("Download_Failed"))
)

ticker_queue_updated.filter(pl.col("Downloaded") == False,
                    pl.col("Download_Failed") == True,
                    pl.col("Download_time") > downloaded_time)

Symbol,Download_time,Downloaded,Download_Failed
str,datetime[μs],bool,bool


In [33]:
# write the que back to test the rerun
s3io.s3_write_parquet(df=ticker_queue_updated, file_path=ticker_queue_table)
print("written to s3")

written to s3


In [37]:
ticker_queue.filter(pl.col("Downloaded") == False,
                    pl.col("Download_Failed") == False)

Symbol,Download_time,Downloaded,Download_Failed
str,datetime[μs],bool,bool
"""XOM""",2025-01-28 09:53:42.654783,false,false
"""CVX""",2025-01-28 09:53:42.654783,false,false
"""BE""",2025-01-28 09:53:42.654783,false,false
"""BPT""",2025-01-28 09:53:42.654783,false,false
"""AR""",2025-01-28 09:53:42.654783,false,false
…,…,…,…
"""CRK""",2025-01-28 08:11:46.291942,false,false
"""SJT""",2025-01-28 08:11:46.291942,false,false
"""VAL""",2025-01-28 08:11:46.291942,false,false
"""SD""",2025-01-28 08:11:46.291942,false,false
