In [None]:
import polars as pl

from tsfx import (
    DynamicGroupBySettings,
    ExtractionSettings,
    FeatureSetting,
    extract_features,
)

# Lazily scan the price-history CSV and drop every row that contains a null.
lf = pl.scan_csv("../test_data/all_stocks_5yr.csv").drop_nulls()

# Yearly window settings keyed on the "date" column. They are constructed here
# but not handed to the extraction below, which receives dynamic_settings=None.
dyn_opts = DynamicGroupBySettings(
    time_col="date",
    every="1y",
    period="1y",
    offset="0",
    datetime_format="%Y-%m-%d",
)

# Extraction grouped on the "Name" column, using the "close" column only.
# Alternative with every price/volume column:
#   value_cols=["open", "high", "low", "close", "volume"]
opts = ExtractionSettings(
    grouping_col="Name",
    value_cols=["close"],
    feature_setting=FeatureSetting.Efficient,
    dynamic_settings=None,
)

# Selector for every column that is neither a Date nor a string column.
feature_cols = pl.exclude([pl.Date, pl.Utf8])

gdf = (
    extract_features(lf, opts)
    .sort(by=["Name"])
    # Standardise each selected column: subtract its mean, divide by its std.
    .with_columns([(feature_cols - feature_cols.mean()) / feature_cols.std()])
    # Convert NaN to null so the null-count filter below also removes them.
    .fill_nan(None)
)

# Keep only the columns that contain no null at all.
complete_columns = [series.name for series in gdf if series.null_count() < 1]
gdf = gdf[complete_columns]
gdf

In [None]:
# Feature matrix: every non-Date, non-string column of gdf as a NumPy array,
# one row per row of gdf.
feature_frame = gdf.select(pl.exclude([pl.Date, pl.Utf8]))
data = feature_frame.to_numpy()

# Values of the "Name" column, in the same row order as `data`.
names = gdf.get_column("Name").to_list()

# Disabled alternative that builds "<Name>_<date>" identifiers instead:
# ids = gdf.select(
#     pl.concat_str([pl.col("Name"), pl.lit("_"), pl.col("date")])
# ).to_numpy()
data.shape

In [None]:
from qdrant_client import QdrantClient

# ":memory:" selects qdrant-client's in-process local mode; nothing is
# persisted, so the collection has to be rebuilt after a kernel restart.
client = QdrantClient(location=":memory:")

In [None]:
from qdrant_client.http.models import Distance, VectorParams

# Drop any previous "test_collection" first so this cell can be re-run.
client.delete_collection(collection_name="test_collection")

# One vector per row of `data`: dimensionality is the number of feature
# columns, and similarity is measured with the cosine metric.
vector_params = VectorParams(size=data.shape[1], distance=Distance.COSINE)
client.create_collection(
    collection_name="test_collection",
    vectors_config=vector_params,
)

In [None]:
from qdrant_client.http.models import PointStruct

# One point per row of `data`: the point id is the row index and the payload
# carries the matching entry of `names`. strict=True makes zip raise if
# `data` and `names` differ in length.
points = [
    PointStruct(id=row_idx, vector=features.tolist(), payload={"name": ticker})
    for row_idx, (features, ticker) in enumerate(zip(data, names, strict=True))
]

operation_info = client.upsert(
    collection_name="test_collection",
    wait=True,
    points=points,
)

print(operation_info)

In [None]:
# Row of `data` whose feature vector is used as the query.
INDEX = 138

# Ask for the 3 best matches in "test_collection" for that row's vector.
# NOTE(review): recent qdrant-client releases appear to deprecate `search` in
# favour of `query_points` - confirm against the installed version.
query_vector = data[INDEX].tolist()
search_result = client.search(
    collection_name="test_collection",
    query_vector=query_vector,
    limit=3,
)

print(search_result)

In [None]:
import plotly.express as px

def load_ticker_history(ticker):
    """Collect all rows of the lazy frame `lf` whose "Name" equals `ticker`.

    Args:
        ticker: Value to match against the "Name" column.

    Returns:
        The collected polars DataFrame for that ticker.
    """
    return lf.filter(pl.col("Name") == ticker).collect()


# Names stored in the payloads of the three search hits, in result order.
name1 = search_result[0].payload["name"]
print(name1)

name2 = search_result[1].payload["name"]
print(name2)

name3 = search_result[2].payload["name"]
print(name3)

# First hit: create the figure. px.line leaves its single trace unnamed and
# hidden from the legend, so label it explicitly while it is the only trace.
df1 = load_ticker_history(name1)
fig = px.line(x=df1["date"], y=df1["close"])
fig.update_traces(name=name1, showlegend=True)

# Remaining hits: one extra line each, labelled with its ticker instead of
# the default "trace N".
df2 = load_ticker_history(name2)
fig.add_scatter(x=df2["date"], y=df2["close"], mode="lines", name=name2)

df3 = load_ticker_history(name3)
fig.add_scatter(x=df3["date"], y=df3["close"], mode="lines", name=name3)

# Title and axis labels so the figure stands on its own; update_layout returns
# the figure, so as the last expression it is what the cell displays.
fig.update_layout(
    title="Close price of the top-3 search hits",
    xaxis_title="date",
    yaxis_title="close",
    legend_title_text="Name",
)

## Exact Distance Matrix


In [None]:
import numpy as np
import pandas as pd
from numpy.linalg import norm

# Exact pairwise cosine similarity between all rows of `data`:
#     sim(a, b) = dot(a, b) / (||a|| * ||b||)
# Despite the variable name, the entries are similarities (larger = more
# alike), not distances.
#
# Computed in one shot instead of a Python double loop: `data @ data.T` holds
# every pairwise dot product, and the outer product of the row norms holds
# every matching denominator.
row_norms = norm(data, axis=1)
cosine_similarity = (data @ data.T) / np.outer(row_norms, row_norms)

# Rows and columns are both labelled by the row position in `data`.
distance_matrix = pd.DataFrame(
    cosine_similarity,
    index=range(data.shape[0]),
    columns=range(data.shape[0]),
).astype(float)

# Feature matrix as a DataFrame; no longer needed for the computation above,
# but kept in case a later cell still refers to it.
X_df = pd.DataFrame(data, index=range(data.shape[0]), columns=range(data.shape[1]))

distance_matrix

In [None]:
# Rank every (row, column) cell of the matrix by its value.
dm = distance_matrix.to_numpy()
flat_order = np.argsort(dm, axis=None)  # ascending order over the flattened matrix
x_ind, y_ind = np.unravel_index(flat_order, dm.shape)

# Keep only cells strictly above the diagonal (row < column), which drops
# self-pairs and mirrored duplicates, then reverse so the largest values
# come first.
index_pairs = [
    (row, col) for row, col in zip(x_ind, y_ind, strict=True) if row < col
][::-1]
index_pairs[:10]