Run this notebook in the same virtual environment as the Superlinked server
to ensure that both use the same version of the `superlinked` framework.

In [None]:
# List the installed superlinked distributions as seen by the interpreter that
# is running this kernel. A shell-level `pip` can belong to a different
# environment than the kernel, and `grep` is not available on every platform,
# so the lookup is done in Python instead.
from importlib.metadata import distributions

superlinked_packages = sorted(
    {
        f"{dist.metadata.get('Name')}=={dist.version}"
        for dist in distributions()
        # Distributions with broken metadata may have no name; treat as "".
        if "superlinked" in (dist.metadata.get("Name") or "").lower()
    }
)
print("\n".join(superlinked_packages) or "superlinked is not installed in this kernel's environment")

In [6]:
from pathlib import Path
import sys

# Locate the project root relative to wherever the notebook was launched from.
cwd = Path.cwd()

# Adapt the directory names below to your own project layout.
if cwd.name == "notebooks":
    # Launched from the notebooks folder: the project root is its parent.
    project_dir = cwd.parent
else:
    # Take the nearest directory (cwd itself or one of its ancestors) named
    # "intelligent-file-search". If none matches we end up at the filesystem
    # root and the existence check below reports the problem.
    search_path = [cwd, *cwd.parents]
    project_dir = next(
        (folder for folder in search_path if folder.name == "intelligent-file-search"),
        search_path[-1],
    )

# The application package must sit directly under the project root.
superlinked_app_dir = project_dir / "superlinked_app"
missing_app_message = (
    f"{superlinked_app_dir} does not exist\n"
    "Are you sure you are in or below the intelligent-file-search directory?"
)
assert superlinked_app_dir.exists(), missing_app_message

# Make the project importable, without adding a duplicate entry on re-runs.
project_dir_entry = str(project_dir)
if project_dir_entry in sys.path:
    print("project_dir is already in sys.path")
else:
    sys.path.append(project_dir_entry)
    print("project_dir is added to sys.path")


project_dir is already in sys.path


### Set `APP_ID` so that the `filesearch` collection is used (collection_name=filesearch)

In [7]:
import os
# Select the "filesearch" application id for this kernel process.
# NOTE(review): presumably superlinked_app reads APP_ID when it is imported,
# so this has to run before the import cell below — confirm.
os.environ.update(APP_ID="filesearch")


In [8]:
from superlinked import framework as sl

from superlinked_app.index import index, file_schema
from superlinked_app.query import query
from superlinked_app.api import vector_database

import pandas as pd

In [10]:
# Interactive source bound to the file schema.
source = sl.InteractiveSource(file_schema)

# Wire the source, the index and the project's vector database into an
# interactive executor.
executor_settings = {
    "sources": [source],
    "indices": [index],
    "vector_database": vector_database,
}
executor = sl.InteractiveExecutor(**executor_settings)

# `app` is the handle the query cells below call `.query(...)` on.
# Per the recorded output, running the executor issues HTTP requests to the
# vector database.
app = executor.run()

17:44:46 httpx INFO   HTTP Request: GET https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333/collections/filesearch/exists "HTTP/1.1 200 OK"
17:44:46 httpx INFO   HTTP Request: GET https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333/collections/filesearch "HTTP/1.1 200 OK"
17:44:46 httpx INFO   HTTP Request: PUT https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333/collections/filesearch/index?wait=true "HTTP/1.1 200 OK"
17:44:46 httpx INFO   HTTP Request: PUT https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333/collections/filesearch/index?wait=true "HTTP/1.1 200 OK"
17:44:46 httpx INFO   HTTP Request: PUT https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333/collections/filesearch/index?wait=true "HTTP/1.1 200 OK"
17:44:46 httpx INFO   HTTP Request: PUT https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333/collecti

In [13]:
from datetime import datetime
import pandas as pd
def date_to_unix(date_str):
    """Convert a ``dd/mm/yyyy`` date string to a Unix timestamp in seconds.

    The date is interpreted as midnight UTC. A naive ``datetime`` would be
    interpreted in the machine's local timezone, which disagrees with the
    ``pd.to_datetime(..., unit="s")`` conversion this notebook uses to display
    timestamps (that conversion is UTC-based) and can shift dates by one day.

    Args:
        date_str: Date formatted as ``dd/mm/yyyy``, or ``None``.

    Returns:
        The timestamp as an ``int``, or ``None`` when ``date_str`` is ``None``.

    Raises:
        ValueError: If ``date_str`` does not match the ``dd/mm/yyyy`` format.
    """
    # Local import: only `datetime` itself is imported at cell level.
    from datetime import timezone

    if date_str is None:
        return None
    parsed = datetime.strptime(date_str, "%d/%m/%Y").replace(tzinfo=timezone.utc)
    return int(parsed.timestamp())
# Only the free-text request and the result limit are supplied here; per the
# recorded output, the other search parameters are extracted by the NLQ step.
params = dict(
    natural_query="Recent PDF files about fashion or sport",
    limit=5,
)

# Execute the query through the natural-language interface.
result = app.query(query, **params)

# Show the search parameters the query actually ran with.
print("Search Parameters Parsed by NLQ:")
print(result.metadata.search_params)

# Turn the result into a pandas DataFrame.
df = sl.PandasConverter.to_pandas(result)

# Render the Unix-second timestamp columns as dd/mm/yyyy strings for display.
for date_column in ("Creation_Date", "Last_Modified_Date"):
    df[date_column] = pd.to_datetime(df[date_column], unit="s").dt.strftime("%d/%m/%Y")

df

17:48:10 httpx INFO   HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
17:48:13 superlinked.framework.query.query_dag_evaluator INFO   evaluated query
17:48:13 httpx INFO   HTTP Request: POST https://bad13d43-afc8-44b5-8b37-2fa1eb4f0236.eu-west-1-0.aws.cloud.qdrant.io:6333/collections/filesearch/points/query "HTTP/1.1 200 OK"
17:48:14 superlinked.framework.dsl.executor.query.query_executor INFO   executed query
Search Parameters Parsed by NLQ:
{'content_query': 'fashion or sport', 'similar_content_weight': 1.0, 'filename_query': 'pdf', 'similar_filename_weight': 1.0, 'limit': 5, 'select_param__': ['Content', 'File_Type', 'Tags', 'Filename', 'Path', 'Size_kB', 'Creation_Date', 'Last_Modified_Date'], 'min_size_kb': None, 'max_size_kb': None, 'min_creation_date': None, 'max_creation_date': None, 'min_modified_date': None, 'max_modified_date': 1609459200, 'filetype_include_all': None, 'filetype_include_any': ['pdf'], 'filetype_exclude': None, 'tags_include

Unnamed: 0,Content,File_Type,Tags,Filename,Path,Size_kB,Creation_Date,Last_Modified_Date,id,similarity_score
0,Model designer trend dress.,[pdf],[fashion],trend.pdf,/focus/impact/sound/trend.pdf,8776.0,20/12/2001,24/01/2014,id_96,0.534661
1,Basketball athlete tennis football championshi...,[pdf],"[sport, fashion, cinema]",basketba.pdf,/argue/basketba.pdf,6608.0,09/11/2012,17/10/2014,id_58,0.52569
2,Award movie scene director scene movie. Footba...,[pdf],"[cinema, sport, cars]",award.pdf,/financial/under/three/award.pdf,8383.0,24/11/2001,16/12/2011,id_20,0.475729
3,Athlete football athlete football athlete bask...,[pdf],[sport],athlete.pdf,/still/everything/economic/athlete.pdf,1114.0,28/05/2010,16/02/2014,id_24,0.428723


In [None]:

# Labels for the per-space partial scores. They are meant to follow the order
# of the spaces in index.spaces — verify against superlinked_app/index.py.
space_names = [
    "content_similarity",
    "filename_similarity",
    "size_score",
    "creation_date_score",
    "modified_date_score",
]

# One row per result entry: its id plus each partial score keyed by label.
# zip() stops at the shorter sequence, so any scores beyond the listed labels
# (or labels beyond the available scores) are silently left out.
rows = [
    {"id": entry.id, **dict(zip(space_names, entry.metadata.partial_scores))}
    for entry in result.entries
]

df = pd.DataFrame(rows)
df


In [None]:
from datetime import datetime
import pandas as pd

def date_to_unix(date_str):
    """Convert a ``dd/mm/yyyy`` date string to a Unix timestamp in seconds.

    The date is interpreted as midnight UTC. A naive ``datetime`` would be
    interpreted in the machine's local timezone, which disagrees with the
    ``pd.to_datetime(..., unit="s")`` conversion this notebook uses to display
    timestamps (that conversion is UTC-based) and can shift dates by one day.

    Args:
        date_str: Date formatted as ``dd/mm/yyyy``, or ``None``.

    Returns:
        The timestamp as an ``int``, or ``None`` when ``date_str`` is ``None``.

    Raises:
        ValueError: If ``date_str`` does not match the ``dd/mm/yyyy`` format.
    """
    # Local import: only `datetime` itself is imported at cell level.
    from datetime import timezone

    if date_str is None:
        return None
    parsed = datetime.strptime(date_str, "%d/%m/%Y").replace(tzinfo=timezone.utc)
    return int(parsed.timestamp())

# Human-readable date bounds ("dd/mm/yyyy" strings, or None for "no bound").
# They are converted to Unix timestamps when the parameters are assembled.
date_bounds = {
    "min_creation_date": "01/01/2020",
    "max_creation_date": "31/12/2024",
    "min_modified_date": None,  # No lower bound
    "max_modified_date": "30/06/2025",
}

params = {
    # Texts for the similarity searches
    "content_query": "cinema fashion",
    "filename_query": "technical",
    # Per-space weights (intended to mirror query.weights — confirm in query.py)
    "content_weight": 1.0,
    "filename_weight": 0.8,
    "size_weight": 0.5,
    "creation_date_weight": 0.6,
    "modified_date_weight": 1.0,
    # Similarity-matching weights
    "similar_content_weight": 1.0,
    "similar_filename_weight": 1.0,
    # Size range, in kilobytes
    "min_size_kb": 1000,
    "max_size_kb": 10000,
    # Date range, converted from the strings above (None stays None)
    **{bound: date_to_unix(day) for bound, day in date_bounds.items()},
    # Categorical filters (names have to match those defined in query.py)
    "filetype_include_any": ["pdf"],
    "tags_include_any": ["cinema", "fashion"],
    # Maximum number of results
    "limit": 5,
}

# Execute the query with the explicit parameters.
result = app.query(query, **params)

# Show the search parameters the query actually ran with.
print(result.metadata.search_params)

# Turn the result into a pandas DataFrame.
df = sl.PandasConverter.to_pandas(result)

# Render the Unix-second timestamp columns as dd/mm/yyyy strings for display.
for date_column in ("Creation_Date", "Last_Modified_Date"):
    df[date_column] = pd.to_datetime(df[date_column], unit="s").dt.strftime("%d/%m/%Y")

df


In [None]:
# Plain-text look at the current DataFrame: its column index first, then the
# leading rows, each on its own line.
print(df.columns, df.head(), sep="\n")

In [None]:

# Labels for the per-space partial scores. They are meant to follow the order
# of the spaces in index.spaces — verify against superlinked_app/index.py.
space_names = [
    "content_similarity",
    "filename_similarity",
    "size_score",
    "creation_date_score",
    "modified_date_score",
]

# One row per result entry: its id plus each partial score keyed by label.
# zip() stops at the shorter sequence, so any scores beyond the listed labels
# (or labels beyond the available scores) are silently left out.
rows = [
    {"id": entry.id, **dict(zip(space_names, entry.metadata.partial_scores))}
    for entry in result.entries
]

df = pd.DataFrame(rows)
df
