In [None]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from upload_overleaf.upload import upload
import tensorflow as tf

In [None]:
# Report the physical devices TensorFlow lists for this process.
visible_devices = tf.config.list_physical_devices()
print(f"TensorFlow has access to the following devices:\n{visible_devices}")

In [None]:
# Lazily scan the price file, add a `date` column, natural-log versions of the
# open/high/low/close prices and a `volume` column, then order the rows by
# ticker and timestamp.
# NOTE(review): `date` is a plain copy of `datetime` (no truncation to a
# calendar day), so any later group-by on `date` groups per timestamp unless
# the file already has daily resolution -- confirm this is intended.
lf_prices = (
    pl.scan_parquet("prices.parquet")
    .with_columns(
        pl.col("ticker"),
        pl.col("datetime").alias("date"),
        *(
            pl.col(raw_name).log().alias(log_name)
            for raw_name, log_name in (
                ("StockOpen", "log_open"),
                ("StockHigh", "log_high"),
                ("StockLow", "log_low"),
                ("StockClose", "log_close"),
            )
        ),
        pl.col("StockVol").alias("volume"),
    )
    .sort(["ticker", "datetime"])
)

In [None]:
# Keep the price/volume columns and add `return`: the change in log close
# versus the previous row, evaluated separately for each ticker.
lf_intraday = lf_prices.select(
    "ticker",
    "date",
    "log_open",
    "log_close",
    "log_high",
    "log_low",
    "volume",
    (pl.col("log_close") - pl.col("log_close").shift(1))
    .over("ticker")
    .alias("return"),
)

In [None]:
# Show the schema of the intraday LazyFrame.
intraday_schema = lf_intraday.schema
print(intraday_schema)

In [None]:
# Print the size of the intraday LazyFrame.
# NOTE(review): the number is read from the first cell of `.count()`; if that
# is a per-column non-null count, it equals the row count only when the first
# column contains no nulls -- confirm.
intraday_counts = lf_intraday.count().collect().to_pandas()
print(intraday_counts.iloc[0, 0])

In [None]:
# Total volume per (ticker, date) group, together with the last timestamp
# seen in each group.
# NOTE(review): `date` is created upstream as an alias of `datetime`; if it is
# not a calendar day, each group is a single timestamp rather than a trading
# day -- confirm.
lf_dvol = (
    lf_prices
    .group_by("ticker", "date")
    .agg([
        pl.col("datetime").last().alias("datetime"),
        pl.col("volume").sum().cast(pl.Float64).alias("daily_volume"),
    ])
    # State the output columns and their order explicitly.
    .select("ticker", "date", "datetime", "daily_volume")
)

In [None]:
# Show the schema of the volume LazyFrame.
dvol_schema = lf_dvol.schema
print(dvol_schema)

In [None]:
# Print the size of the volume LazyFrame.
# NOTE(review): the number is read from the first cell of `.count()`; if that
# is a per-column non-null count, it equals the row count only when the first
# column contains no nulls -- confirm.
dvol_counts = lf_dvol.count().collect().to_pandas()
print(dvol_counts.iloc[0, 0])

In [None]:
# Last observation per (ticker, date) group: the final timestamp and the final
# log open/close/high/low values, each cast to Float64.
# NOTE(review): every price field takes the group's last row; for a daily bar
# one would usually expect first open / max high / min low -- confirm this is
# the intended definition.
lf_last = (
    lf_prices
    .group_by("ticker", "date")
    .agg([
        pl.col("datetime").last().alias("datetime"),
        *(
            pl.col(price_col).last().cast(pl.Float64)
            for price_col in ("log_open", "log_close", "log_high", "log_low")
        ),
    ])
)

In [None]:
# Show the schema of the last-observation LazyFrame.
last_schema = lf_last.schema
print(last_schema)

In [ ]:
# Print the size of the last-observation LazyFrame.
# NOTE(review): the number is read from the first cell of `.count()`; if that
# is a per-column non-null count, it equals the row count only when the first
# column contains no nulls -- confirm.
last_counts = lf_last.count().collect().to_pandas()
print(last_counts.iloc[0, 0])

In [None]:
# Materialise both per-group frames with fetch(n_rows=2000), in the order
# last-observation frame first, volume frame second.
df_last, df_dvol = (lf.fetch(n_rows=2000) for lf in (lf_last, lf_dvol))

In [None]:

# Combine the last-observation prices with the summed volume, matching rows on
# ticker and date.
# NOTE(review): both inputs carry a `datetime` column, so the joined frame will
# presumably hold a suffixed duplicate of it -- confirm that is acceptable.
lf_joined = lf_last.join(lf_dvol, on=["ticker", "date"], how="inner")

# Clearing the inputs is currently disabled.
#del lf_last, lf_dvol

In [None]:
# Materialise the joined (price + volume) frame via fetch() with its default
# row limit. NOTE(review): fetch() is presumably a capped debugging read, not
# a full collect -- confirm a sample is what is wanted here.
df_daily = lf_joined.fetch()

In [None]:
# Materialise the intraday frame via fetch() with its default row limit.
# NOTE(review): fetch() is presumably a capped debugging read, not a full
# collect -- confirm a sample is what is wanted here.
df_intraday = lf_intraday.fetch()

In [None]:

# Per-(ticker, date) frame with log returns computed within each ticker.
# The source is `lf_joined`: `lf_daily` does not exist before this cell, so
# reading it here would raise NameError on a fresh kernel.
lf_daily = (
    lf_joined
    # group_by/join output order is not guaranteed, and shift(1) below needs
    # the rows of each ticker in chronological order.
    .sort(["ticker", "date"])
    .select([
        "ticker",
        "date",
        "log_open",
        "log_close",
        "log_high",
        "log_low",
        # The joined frame carries the summed volume as `daily_volume`; expose
        # it as `volume` so the columns match the intraday frame.
        pl.col("daily_volume").alias("volume"),
        # Log return: change in log close versus the previous row of the same ticker.
        (pl.col("log_close") - pl.col("log_close").shift(1)).over(pl.col("ticker")).alias("return"),
    ])
)

In [None]:
# Store the intraday LazyFrame's schema in `schema`.
schema = lf_intraday.schema

In [None]:
# Collect the distinct ticker symbols of the daily frame into a Python list.
tickers_frame = lf_daily.select("ticker").unique().collect()
unique_tickers = tickers_frame.to_pandas()["ticker"].tolist()

# Show the resulting list.
print(unique_tickers)

In [None]:
# Pull the AAPL rows of the daily frame into a pandas DataFrame.
is_aapl = pl.col("ticker") == "AAPL"
df_aapl = lf_daily.filter(is_aapl).collect().to_pandas()

In [None]:
# AAPL log close (left y-axis) and return (right y-axis) over a shared date axis.
fig, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=df_aapl, x="date", y="log_close", color="cornflowerblue", ax=ax)

# Second y-axis on the right for the return series.
ax2 = ax.twinx()
sns.lineplot(data=df_aapl, x="date", y="return", color="red", alpha=0.5, ax=ax2)

# Turn the grid off on both axes.
for axis in (ax, ax2):
    axis.grid(False)

ax.set(title="AAPL Stock Price and Return", ylabel="Log Price", xlabel="Date")
ax2.set_ylabel("Return")
plt.show()

In [None]:
# Hand pyplot to the project's upload helper with the target "Master's Thesis"
# and the path figures/aapl_test.png (presumably saves and uploads the current
# figure -- helper not visible here).
# NOTE(review): this runs in its own cell after plt.show(); the figure may
# already be closed by then, which could upload a blank image -- confirm.
upload(plt, "Master's Thesis", 'figures/aapl_test.png')

In [None]:
# Log close of every ticker in a single figure, one colour per ticker.
fig, ax = plt.subplots(figsize=(25, 20))
df_all_tickers = lf_daily.collect().to_pandas()
sns.lineplot(data=df_all_tickers, x="date", y="log_close", hue="ticker", palette="tab10", ax=ax)
ax.set(title="Stock Price for All Tickers", ylabel="Log Price", xlabel="Date")
plt.show()
