Run this notebook in the same virtual environment with superlinked server
to ensure the same version of `superlinked` framework.

In [1]:
!pip freeze | grep superlinked

superlinked==28.8.2
superlinked-server==1.34.0


In [2]:
from pathlib import Path
import sys

# Determine current working directory
cwd = Path.cwd()

# Adapt logic for your specific project structure
if cwd.name == "intelligent-file-search":
    project_dir = cwd
elif cwd.name == "notebooks":
    project_dir = cwd.parent
else:
    # fallback (e.g., if inside a subfolder deeper in notebooks or dataset)
    project_dir = cwd
    while project_dir.name != "intelligent-file-search" and project_dir != project_dir.parent:
        project_dir = project_dir.parent

superlinked_app_dir = project_dir / "superlinked_app"
assert superlinked_app_dir.exists(), (
    f"{superlinked_app_dir} does not exist\n"
    "Are you sure you are in or below the intelligent-file-search directory?"
)

if str(project_dir) not in sys.path:
    sys.path.append(str(project_dir))
    print("project_dir is added to sys.path")
else:
    print("project_dir is already in sys.path")


project_dir is added to sys.path


In [3]:
from superlinked import framework as sl

from superlinked_app.index import index, file_schema
from superlinked_app.query import query
from superlinked_app.api import vector_database

import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


/home/biso/development/my_projects/varie/intelligent-file-search/dataset/categories.json
16:04:40 superlinked.framework.dsl.index.index INFO   initialized index


In [4]:
source = sl.InteractiveSource(file_schema)
executor = sl.InteractiveExecutor(
    sources=[source],
    indices=[index],
    vector_database=vector_database,
)
app = executor.run()

16:04:41 httpx INFO   HTTP Request: GET https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333 "HTTP/1.1 200 OK"
16:04:41 httpx INFO   HTTP Request: GET https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333/collections/default/exists "HTTP/1.1 200 OK"
16:04:41 httpx INFO   HTTP Request: GET https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333/collections/default "HTTP/1.1 200 OK"
16:04:41 httpx INFO   HTTP Request: PUT https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333/collections/default/index?wait=true "HTTP/1.1 200 OK"
16:04:41 httpx INFO   HTTP Request: PUT https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333/collections/default/index?wait=true "HTTP/1.1 200 OK"
16:04:41 httpx INFO   HTTP Request: PUT https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333/collections/default/index?wait=true "HTTP/1.1 200 OK"
16:04

In [15]:
from datetime import datetime
import pandas as pd
def date_to_unix(date_str):
    if date_str is None:
        return None
    return int(datetime.strptime(date_str, "%d/%m/%Y").timestamp())
# Natural language query parameters
params = {
    "natural_query": "Recent PDF files about fashion or sport",
    "limit": 5,
}

# Run the query with natural language interface
result = app.query(query, **params)

# Print the actual interpreted search parameters extracted by NLQ
print("Search Parameters Parsed by NLQ:")
print(result.metadata.search_params)

# Convert the results to a pandas DataFrame
df = sl.PandasConverter.to_pandas(result)

# Convert Unix timestamps to dd/mm/yyyy format in the DataFrame
import pandas as pd
df["Creation_Date"] = pd.to_datetime(df["Creation_Date"], unit="s").dt.strftime("%d/%m/%Y")
df["Last_Modified_Date"] = pd.to_datetime(df["Last_Modified_Date"], unit="s").dt.strftime("%d/%m/%Y")

df

16:09:06 httpx INFO   HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
16:10:40 httpx INFO   HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
16:10:43 superlinked.framework.query.query_dag_evaluator INFO   evaluated query
16:10:43 httpx INFO   HTTP Request: POST https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333/collections/default/points/query "HTTP/1.1 200 OK"
16:10:44 superlinked.framework.dsl.executor.query.query_executor INFO   executed query
Search Parameters Parsed by NLQ:
{'content_query': 'fashion or sport', 'similar_content_weight': 1.0, 'filename_query': 'pdf', 'similar_filename_weight': 1.0, 'limit': 5, 'select_param__': ['Content', 'File_Type', 'Tags', 'Filename', 'Path', 'Size_kB', 'Creation_Date', 'Last_Modified_Date'], 'min_size_kb': None, 'max_size_kb': None, 'min_creation_date': None, 'max_creation_date': None, 'min_modified_date': None, 'max_modified_date': None, 'filet

Unnamed: 0,Content,File_Type,Tags,Filename,Path,Size_kB,Creation_Date,Last_Modified_Date,id,similarity_score
0,Football basketball championship championship ...,[pdf],[sport],football.pdf,/most/football.pdf,6633.0,23/08/2021,22/04/2024,id_80,0.64344
1,Basketball athlete basketball championship. Tr...,[pdf],"[sport, fashion]",basketba.pdf,/ago/basketba.pdf,3346.0,04/08/2015,31/03/2020,id_91,0.569138
2,Football tennis basketball football. Champions...,[pdf],[sport],football.pdf,/marriage/anything/football.pdf,8874.0,17/05/2010,14/03/2017,id_7,0.568932
3,Tennis athlete basketball athlete championship...,[pdf],"[sport, cars, cinema]",tennis.pdf,/nearly/could/whether/tennis.pdf,9540.0,18/07/2006,17/12/2022,id_2,0.563909
4,Runway runway dress trend runway. Race engine ...,[pdf],"[fashion, cars, cinema]",runway.pdf,/successful/runway.pdf,6376.0,17/05/2021,09/10/2021,id_87,0.545738


In [16]:

# These are the names of the semantic/numeric spaces in the same order as in your index.spaces list
space_names = [
    "content_similarity",
    "filename_similarity",
    "size_score",
    "creation_date_score",
    "modified_date_score"
]

rows = []

for entry in result.entries:
    partial_scores = dict(zip(space_names, entry.metadata.partial_scores))
    row = {"id": entry.id, **partial_scores}
    rows.append(row)
    
df = pd.DataFrame(rows)
df


Unnamed: 0,id,content_similarity,filename_similarity,size_score,creation_date_score,modified_date_score
0,id_80,0.147217,0.197754,0.0,0.0,0.29834
1,id_91,0.178345,0.15271,0.0,0.0,0.238159
2,id_7,0.147827,0.197754,0.0,0.0,0.223389
3,id_2,0.125366,0.154541,0.0,0.0,0.283936
4,id_87,0.111694,0.169678,0.0,0.0,0.264404


In [13]:
from datetime import datetime
import pandas as pd

def date_to_unix(date_str):
    if date_str is None:
        return None
    return int(datetime.strptime(date_str, "%d/%m/%Y").timestamp())

# Define your parameters with human-readable date strings or None
params = {
    # Text similarity inputs
    "content_query": "cinema fashion",
    "filename_query": "technical",

    # Weights for semantic spaces (matching what's in query.weights)
    "content_weight": 1.0,
    "filename_weight": 0.8,
    "size_weight": 0.5,
    "creation_date_weight": 0.6,
    "modified_date_weight": 1.0,

    # Weights for similarity matching
    "similar_content_weight": 1.0,
    "similar_filename_weight": 1.0,

    # Size filters (in kilobytes)
    "min_size_kb": 1000,
    "max_size_kb": 10000,

    # Date filters (input as "dd/mm/yyyy" strings or None)
    "min_creation_date": date_to_unix("01/01/2020"),
    "max_creation_date": date_to_unix("31/12/2024"),
    "min_modified_date": date_to_unix(None),  # No lower bound
    "max_modified_date": date_to_unix("30/06/2025"),

    # Categorical filters (must match query.py)
    "filetype_include_any": ["pdf"],
    "tags_include_any": ["cinema", "fashion"],

    # Result limit
    "limit": 5,
}

# Run the query
result = app.query(query, **params)

# Inspect parameters used
print(result.metadata.search_params)

# Convert to pandas DataFrame
df = sl.PandasConverter.to_pandas(result)

# Convert Unix timestamps in the DataFrame to dd/mm/yyyy strings for display
df["Creation_Date"] = pd.to_datetime(df["Creation_Date"], unit="s").dt.strftime("%d/%m/%Y")
df["Last_Modified_Date"] = pd.to_datetime(df["Last_Modified_Date"], unit="s").dt.strftime("%d/%m/%Y")

df


16:08:31 superlinked.framework.query.query_dag_evaluator INFO   evaluated query


16:08:33 httpx INFO   HTTP Request: POST https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333/collections/default/points/query "HTTP/1.1 200 OK"
16:08:33 superlinked.framework.dsl.executor.query.query_executor INFO   executed query
{'content_query': 'cinema fashion', 'similar_content_weight': 1.0, 'filename_query': 'technical', 'similar_filename_weight': 1.0, 'limit': 5, 'select_param__': ['Content', 'File_Type', 'Tags', 'Filename', 'Path', 'Size_kB', 'Creation_Date', 'Last_Modified_Date'], 'min_size_kb': 1000.0, 'max_size_kb': 10000.0, 'min_creation_date': 1577833200, 'max_creation_date': 1735599600, 'min_modified_date': None, 'max_modified_date': 1751234400, 'filetype_include_all': None, 'filetype_include_any': ['pdf'], 'filetype_exclude': None, 'tags_include_all': None, 'tags_include_any': ['cinema', 'fashion'], 'tags_exclude': None, 'natural_query': None, 'system_prompt_param__': "Extract search parameters from the user's natural language query about fi

Unnamed: 0,Content,File_Type,Tags,Filename,Path,Size_kB,Creation_Date,Last_Modified_Date,id,similarity_score
0,Runway runway dress trend runway. Race engine ...,[pdf],"[fashion, cars, cinema]",runway.pdf,/successful/runway.pdf,6376.0,17/05/2021,09/10/2021,id_87,0.566194
1,Movie award award director director.,[pdf],[cinema],movie.pdf,/alone/past/movie.pdf,5791.0,11/09/2021,12/09/2022,id_59,0.562001


In [14]:

# These are the names of the semantic/numeric spaces in the same order as in your index.spaces list
space_names = [
    "content_similarity",
    "filename_similarity",
    "size_score",
    "creation_date_score",
    "modified_date_score"
]

rows = []

for entry in result.entries:
    partial_scores = dict(zip(space_names, entry.metadata.partial_scores))
    row = {"id": entry.id, **partial_scores}
    rows.append(row)
    
df = pd.DataFrame(rows)
df


Unnamed: 0,id,content_similarity,filename_similarity,size_score,creation_date_score,modified_date_score
0,id_87,0.106934,0.039032,0.108826,0.114685,0.196777
1,id_59,0.097839,0.026779,0.11145,0.117432,0.208496
