# Vector Search Example Using Qdrant


## Load some data


In [None]:
import polars as pl

from tsfx import (
    DynamicGroupBySettings,
    ExtractionSettings,
    FeatureSetting,
    extract_features,
)

# Lazily scan the stock price CSV and discard every row that contains a null.
lf = pl.scan_csv("../test_data/all_stocks_5yr.csv").drop_nulls()

# Windowing configuration: "date" is the time column (format "%Y-%m-%d");
# `every` and `period` are both "1y" and the offset is "0".
dyn_opts = DynamicGroupBySettings(
    time_col="date",
    every="1y",
    period="1y",
    offset="0",
    datetime_format="%Y-%m-%d",
)

# Features are computed from the "close" column, grouped by "Name".
opts = ExtractionSettings(
    grouping_col="Name",
    value_cols=["close"],
    feature_setting=FeatureSetting.Efficient,
    dynamic_settings=dyn_opts,
)

# Run the extraction and order the resulting rows by name.
gdf = extract_features(lf, opts).sort(by=["Name"])

gdf

In [None]:
# Standardize every column that is neither a Date nor a string column:
# subtract the column mean and divide by the column standard deviation.
feature_cols = pl.exclude([pl.Date, pl.Utf8])
gdf = gdf.with_columns((feature_cols - feature_cols.mean()) / feature_cols.std())

# Turn NaNs into nulls, then keep only the columns that contain no nulls.
gdf = gdf.fill_nan(None)
complete_columns = [column.name for column in gdf if column.null_count() == 0]
gdf = gdf[complete_columns]
gdf

## Extract raw features


In [None]:
# Feature matrix: every column except the Date and string ones.
feature_frame = gdf.select(pl.exclude([pl.Date, pl.Utf8]))
data = feature_frame.to_numpy()

# Per-row labels, taken from the same frame so they line up with `data`.
names, dates = (gdf[label].to_list() for label in ("Name", "date"))
data.shape

## Set up Qdrant Client


In [None]:
from qdrant_client import QdrantClient

# Create the Qdrant client used by the rest of the notebook.
# NOTE(review): ":memory:" is expected to select a local, in-process
# instance that needs no running server — confirm against the
# qdrant-client documentation.
client = QdrantClient(":memory:")

## Create collection


In [None]:
from qdrant_client.http.models import Distance, VectorParams

# Drop any collection of the same name first so the cell can be re-run.
client.delete_collection(collection_name="test_collection")

# Vector size is the number of feature columns in `data`; vectors are
# compared with the cosine metric.
vector_params = VectorParams(size=data.shape[1], distance=Distance.COSINE)
client.create_collection(
    collection_name="test_collection",
    vectors_config=vector_params,
)

## Add vectors to collection


In [None]:
from datetime import timedelta

from qdrant_client.http.models import PointStruct

def _one_year_later(start):
    """Return the same calendar day one year after *start*.

    Works for both ``date`` and ``datetime`` values. 29 February has no
    counterpart in the following year, so it is mapped to 1 March.
    """
    try:
        return start.replace(year=start.year + 1)
    except ValueError:
        return start.replace(year=start.year + 1, month=3, day=1)


# One point per row of `data`: the row index is the point id, the feature
# row is the vector, and the payload records the stock name and date range.
# `strict=True` makes a length mismatch between the three inputs an error.
points = [
    PointStruct(
        id=i,
        vector=d.tolist(),
        payload={
            "name": name,
            "start_date": date.strftime("%Y-%m-%d"),
            # A fixed 365-day offset is one day short when the window starts
            # in a leap year, so step by a calendar year instead to match
            # the "1y" period used for feature extraction.
            "end_date": _one_year_later(date).strftime("%Y-%m-%d"),
        },
    )
    for i, (d, name, date) in enumerate(zip(data, names, dates, strict=True))
]

operation_info = client.upsert(
    collection_name="test_collection",
    wait=True,
    points=points,
)

print(operation_info)

## Query vectors


In [None]:
INDEX = 432  # row of `data` used as the query vector
K = 5  # maximum number of results requested

query_vector = data[INDEX].tolist()
search_result = client.search(
    collection_name="test_collection",
    query_vector=query_vector,
    limit=K,
)

print(search_result)

## Plot search results


In [None]:
import plotly.express as px

# Collect, for every search hit, the raw price rows of that stock inside
# the date range stored in the hit's payload.
dfs = []
# Kept separate from the global `names` list built alongside `data`:
# overwriting that list would break a re-run of the upsert cell.
result_names = []
for search_res in search_result:
    id_ = search_res.id
    payload = search_res.payload
    name = payload["name"]
    start_date = payload["start_date"]
    end_date = payload["end_date"]
    print(id_, name, start_date, end_date)
    dfs.append(
        lf.filter(
            (pl.col("Name") == name)
            # Include the first day of the range; the end stays exclusive.
            & (pl.col("date") >= start_date)
            & (pl.col("date") < end_date)
        ).collect()
    )
    result_names.append(name)

# The first hit becomes the base figure (and provides the title); every
# further hit is added as an extra line trace named after its stock.
fig = px.line(
    x=dfs[0]["date"],
    y=dfs[0]["close"],
    title="Close price of " + result_names[0],
)

for df in dfs[1:]:
    fig.add_scatter(
        x=df["date"],
        y=df["close"],
        mode="lines",
        name="Close price of " + df["Name"][0],
    )
fig.show()

## (Debug) Compare with exact cosine-similarity matrix


In [None]:
import numpy as np
import pandas as pd
from numpy.linalg import norm

# Exact pairwise cosine similarity between all rows of `data`:
#     sim[i, j] = dot(data[i], data[j]) / (|data[i]| * |data[j]|)
# Computed with one matrix product instead of a Python double loop over
# pandas `.loc` lookups, which does the same arithmetic one scalar at a time.
n_rows = data.shape[0]
row_norms = norm(data, axis=1)
cosine_similarity = (data @ data.T) / np.outer(row_norms, row_norms)

# Same layout as before: integer row positions label both axes.
distance_matrix = pd.DataFrame(
    cosine_similarity,
    index=range(n_rows),
    columns=range(n_rows),
)
X_df = pd.DataFrame(data, index=range(n_rows), columns=range(data.shape[1]))

distance_matrix = distance_matrix.astype(float)
distance_matrix

In [None]:
# Rank all (row, column) pairs of the matrix by value, highest first,
# keeping each unordered pair once (row < column drops the diagonal and
# the mirrored lower triangle).
dm = distance_matrix.to_numpy()
flat_order = np.argsort(dm, axis=None)
x_ind, y_ind = np.unravel_index(flat_order, dm.shape)
index_pairs = [(x, y) for x, y in zip(x_ind, y_ind, strict=True) if x < y]
index_pairs.reverse()
index_pairs[:10]

## K-means clustering

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Fit k-means for every k from 2 to 20 and record the inertia of each fit.
ks = list(range(2, 21))
inertias = []
for k in ks:
    kmeans = KMeans(n_clusters=k, random_state=0, n_init="auto").fit(data)
    inertias.append(kmeans.inertia_)
    print(k, inertias[-1])

# Plot inertia against the number of clusters, one tick per tested k.
fig, ax = plt.subplots(dpi=300)
ax.set_xlabel("Number of clusters")
ax.set_ylabel("Inertia")
ax.plot(ks, inertias)
ax.xaxis.set_ticks(ks)

In [None]:
# Final clustering of `data` into 8 clusters, with the same random_state
# and n_init settings as the sweep over k.
# NOTE(review): 8 was presumably chosen from the inertia plot — confirm.
kmeans = KMeans(n_clusters=8, random_state=0, n_init="auto").fit(data)