In [None]:
# Imports
import glob
import os

import polars as pl


# Print the working directory so the relative "Data/..." paths used later can be checked.
# NOTE(review): the trailing semicolon does not hide print() output; it only
# suppresses the display of a cell's final expression value.
print(os.getcwd());

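NOTE: a trailing semicolon `;` suppresses the automatic display of a cell's last expression (e.g. `df_test.head();`), which is how potentially sensitive output is hidden in this notebook. It does not hide text written by `print()`, so the working directory above is still shown.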

If the current working directory is different from what you expected, you can change it using the os module

In [None]:
"""
os.chdir(your_absolute_file_path)
print(os.getcwd())
"""

In [None]:
# Single ticker file used for the exploratory steps in this notebook
filename = "Data/Ticker_Data/MMM.csv"

# The first two lines of the CSV are metadata, so parsing starts at the third
# line, which is read as the header row.
df_test = pl.read_csv(
    filename,
    has_header=True,
    skip_rows=2,
)

# Replace the parsed header with the six column names used from here on.
df_test.columns = ["Date", "Close", "High", "Low", "Open", "Volume"]

df_test.head();

In [None]:
# Dimensions of the sample frame as (rows, columns)
df_test.shape

In [None]:
# Inspect the column names and the dtype of each column
df_test.schema

In [None]:
# Number of null values per column
df_test.null_count()

TODO: change the 'Date' column dtype from 'String' to 'Date'

In [None]:
# Parse the date strings (format YYYY-MM-DD) into a Date-typed column,
# replacing the original string column of the same name.
parsed_date = pl.col("Date").str.to_date("%Y-%m-%d")
df_test = df_test.with_columns(parsed_date)
df_test.schema

In [None]:
# Re-check the first rows after the Date conversion (display suppressed by the trailing semicolon)
df_test.head();

TODO: add 'Ticker' column to df_test

In [None]:
# Derive the ticker symbol from the sample file's name (e.g. "MMM.csv" -> "MMM")
# instead of repeating the literal, so the label always matches `filename`.
sample_ticker = os.path.basename(filename).removesuffix(".csv")
df_test = df_test.with_columns(pl.lit(sample_ticker).alias("Ticker"))
df_test.head();

TODO: load in company info

In [None]:
# Load the company list; its Symbol and GICS columns are selected and renamed in the next step
info = pl.read_csv("Data/company_list.csv")
info.head();

TODO: select the ticker, sector, and sub-industry columns and rename them

In [None]:
# Keep only the three identifying columns from the company list and give them
# the shorter names used for the join and later steps.
info = info.select("Symbol", "GICS Sector", "GICS Sub-Industry").rename(
    {
        "Symbol": "Ticker",
        "GICS Sector": "Sector",
        "GICS Sub-Industry": "Sub-Industry",
    }
)
info.head();

TODO: combine company info and ticker df

In [None]:
# Attach sector information to every price row. A left join keeps exactly the
# rows of df_test; a right join would instead add one all-null price row for
# every company in `info` that has no matching ticker in df_test.
big_df = df_test.join(info, on="Ticker", how="left")
big_df.head();

TODO: create validate_df(df), which checks the expected characteristics of a ticker DataFrame with 'assert'

In [None]:
def validate_df(df):
    """Check that a ticker DataFrame has the expected columns and contents.

    Raises AssertionError with a descriptive message when a required column
    is absent, the Ticker column holds no values, the Date column is not
    sorted, or any column contains nulls. Returns None when every check passes.
    """
    required = {"Date", "Open", "High", "Low", "Close", "Volume", "Ticker"}
    missing = required.difference(df.columns)
    assert not missing, "Missing required columns"

    ticker_values = df["Ticker"]
    assert ticker_values.n_unique() >= 1, "Ticker column has no values"

    date_values = df["Date"]
    assert date_values.is_sorted(), "Date column is not sorted"

    # Sum the per-column null counts into a single total for the message.
    null_total = df.null_count().to_numpy().sum()
    assert null_total == 0, f"Found {null_total} null values"

TODO: load in and validate all ticker CSVs

In [None]:
# Load every ticker CSV, keep the ones that pass validate_df, and attach the
# company info. Paths are sorted because glob returns matches in arbitrary
# order; sorting makes the order of `dfs` reproducible across runs.
paths = sorted(glob.glob("Data/Ticker_Data/*.csv"))
if not paths:
    # Usually means the notebook was started from a different working directory.
    print("No CSV files matched Data/Ticker_Data/*.csv - check the working directory")

dfs = []

for path in paths:
    # The file name without ".csv" is used as the ticker symbol, e.g. "MMM.csv" -> "MMM".
    ticker = os.path.basename(path).removesuffix(".csv")

    # Skip the two metadata lines and rename the columns of the header row.
    df = pl.read_csv(path, skip_rows=2, has_header=True, new_columns=["Date", "Close", "High", "Low", "Open", "Volume"])
    df = df.with_columns(pl.lit(ticker).alias("Ticker"))

    # NOTE(review): Date is not converted here the way it is for df_test, so
    # validate_df's sortedness check runs on the column as read - confirm whether
    # the conversion is meant to happen later on the combined data.
    try:
        validate_df(df)
    except AssertionError as e:
        # validate_df reports bad data only through assert; any other exception
        # is a genuine error and is allowed to propagate instead of being skipped.
        print(f"{ticker} is not valid, skipping: {e}")
        continue

    # Left join keeps every price row and adds Sector / Sub-Industry where known.
    df = df.join(info, on="Ticker", how="left")

    dfs.append(df)

    
