In [None]:
# ====================================================
# 03 Semantic Matching with Multi-Modal Embeddings
# ====================================================

In [None]:
# --- 0) Install dependencies ---
# pip install sentence-transformers faiss-cpu

In [1]:
# --- 1) Imports ---
import os
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
import ast
import requests

In [None]:
# --- 2) Configuration ---
# Filesystem layout: the project root is taken to be the parent of the current working directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
PROC_DIR = os.path.join(DATA_DIR, "processed")
os.makedirs(PROC_DIR, exist_ok=True)  # created up front; the dataset and .npy caches are written here

# Local dataset file and the remote copy fetched when that file is absent.
CSV_PATH = os.path.join(PROC_DIR, "cleaned_project_data.csv")
DOWNLOAD_URL = "https://drive.google.com/uc?id=1A1j9JsYjmD1EuanF9FFWsgi2wGbKQ1bg&export=download"

# DataFrame column names, grouped by side (project / topic).
PROJECT_TEXT_COL, PROJECT_KEYWORDS_COL = "project_text_simple", "keywords_clean"
TOPIC_ID_COL, TOPIC_TEXT_COL = "topics", "topic_text"

# Embedding model settings.
#
# NOTE: 'multi-qa-mpnet-base-dot-v1' was tested first and gave similar evaluation results.
# In practice, however, it raised errors or handled long project texts inefficiently once
# the multi-modal features (keywords + year vectors) were added, so 'all-MiniLM-L6-v2'
# was chosen for robust, scalable processing in this pipeline.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
EVAL_TOP_K = 10    # K used for Top-K accuracy and MRR@K
BATCH_ENCODE = 64  # default batch size for encoding

# Cross-encoder demo settings (optional).
# NOTE: the cross-encoder may look worse on a small sample; it is also much slower and
# therefore not feasible for retrieval over the full dataset.
CROSS_ENCODER_BATCH = 50  # number of projects scored in the demo; adjust to your hardware
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"

In [3]:
# --- 3) Load Data (download if missing) ---
if not os.path.exists(CSV_PATH):
    print(f"{CSV_PATH} not found locally. Downloading from Drive...")
    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = requests.get(DOWNLOAD_URL, timeout=60)
    if response.status_code != 200:
        raise FileNotFoundError(f"Failed to download dataset. HTTP Status: {response.status_code}")
    # NOTE(review): a 200 status does not prove the payload is the CSV (the server may
    # answer with an HTML page instead); reject HTML so it is never cached as the dataset.
    if "text/html" in response.headers.get("Content-Type", "").lower():
        raise FileNotFoundError("Failed to download dataset. Server returned an HTML page instead of a CSV file.")
    # Write under a temporary name and rename afterwards, so an interrupted write
    # cannot leave a truncated file at CSV_PATH that later runs would treat as valid.
    tmp_csv_path = CSV_PATH + ".part"
    with open(tmp_csv_path, 'wb') as f:
        f.write(response.content)
    os.replace(tmp_csv_path, CSV_PATH)
    print(f"Downloaded dataset to {CSV_PATH}")

df = pd.read_csv(CSV_PATH, encoding="utf-8")
print(f"Loaded dataset: {len(df)} projects")

# Validate the schema BEFORE any column is touched, so a wrong file fails with a
# clear message instead of a KeyError further down.
year_cols = [col for col in df.columns if col.startswith('year_')]
required_cols = [PROJECT_TEXT_COL, TOPIC_ID_COL, TOPIC_TEXT_COL, PROJECT_KEYWORDS_COL] + year_cols
missing_cols = [col for col in required_cols if col not in df.columns]
assert not missing_cols, f"Missing columns in DataFrame: {missing_cols}"


def _parse_keywords(value):
    """Turn one raw keyword cell into a Python object.

    Strings are parsed with ``ast.literal_eval`` (safe literal parsing, no code
    execution). Missing cells (``None`` / NaN) become an empty list so that later
    ``' '.join(...)`` calls do not fail. Any other value is returned unchanged.
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return []
    return value


# Safely parse keywords
df[PROJECT_KEYWORDS_COL] = df[PROJECT_KEYWORDS_COL].apply(_parse_keywords)

# Extract unique topics: one row per topic id via groupby(...).first(), re-indexed 0..n-1
df_topics = df.groupby(TOPIC_ID_COL).first().reset_index()
print(f"Unique topics: {len(df_topics)}")

Loaded dataset: 4429 projects
Unique topics: 1873


In [4]:
# --- 4) Load Embedding Model ---
# Instantiate the sentence-embedding model named by EMBED_MODEL_NAME; the same `model`
# object is passed to every encoding call in the following cells.
model = SentenceTransformer(EMBED_MODEL_NAME)
# Embedding dimension as reported by the model itself (printed below for reference).
embedding_dim = model.get_sentence_embedding_dimension()
print(f"Loaded '{EMBED_MODEL_NAME}', embedding dimension: {embedding_dim}")

Loaded 'all-MiniLM-L6-v2', embedding dimension: 384


In [5]:
# --- 5) Helper: Encode & Cache ---
def encode_and_cache(texts, path, model, batch_size=BATCH_ENCODE):
    """Return embeddings for ``texts``, reusing the ``.npy`` file at ``path`` when it fits.

    Args:
        texts: Sequence of strings to embed; ``len(texts)`` must be defined.
        path: Location of the ``.npy`` cache file.
        model: Object exposing ``encode(texts, show_progress_bar=..., batch_size=...)``.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        The cached array when its number of rows equals ``len(texts)``; otherwise the
        freshly computed result of ``model.encode``, which is also saved to ``path``.

    Note:
        The cache is keyed by ``path`` and row count only. A different model or edited
        texts with the same row count are NOT detected; delete the file to force
        re-encoding in that case.
    """
    if os.path.exists(path):
        print(f"Loading embeddings from {path}")
        cached = np.load(path)
        # A cache built from a different dataset would silently misalign embedding rows
        # with DataFrame rows, so only accept it when the row count matches.
        if cached.ndim > 0 and cached.shape[0] == len(texts):
            return cached
        print(f"Cached embeddings at {path} do not match {len(texts)} input texts; re-encoding.")

    print(f"Creating embeddings and saving to {path}")
    embeddings = model.encode(texts, show_progress_bar=True, batch_size=batch_size)
    np.save(path, embeddings)
    return embeddings


# Paths for the .npy files
project_embeddings_path = os.path.join(PROC_DIR, 'project_embeddings.npy')
topic_embeddings_path = os.path.join(PROC_DIR, 'topic_text_embeddings.npy')
project_keyword_embeddings_path = os.path.join(PROC_DIR, 'project_keyword_embeddings.npy')
topic_keyword_embeddings_path = os.path.join(PROC_DIR, 'topic_keyword_embeddings.npy')


In [6]:
# --- 6) Create Embeddings (Unified and Cached for App Use) ---
# Relies on `encode_and_cache` and the four `*_embeddings_path` variables defined in
# section 5; they are intentionally not re-defined here so there is a single definition.

# 1) Project text embeddings (one row per project in `df`)
project_text_embeddings = encode_and_cache(df[PROJECT_TEXT_COL].tolist(), project_embeddings_path, model)

# 2) Topic text embeddings (one row per topic in `df_topics`)
topic_text_embeddings = encode_and_cache(df_topics[TOPIC_TEXT_COL].tolist(), topic_embeddings_path, model)

# 3) Project keyword embeddings: each keyword list is joined into one space-separated string
project_keyword_embeddings = encode_and_cache(
    df[PROJECT_KEYWORDS_COL].apply(' '.join).tolist(),
    project_keyword_embeddings_path,
    model
)

# 4) Topic keyword embeddings (aggregated first)
# Union of the keywords of all projects in a topic. The union is SORTED because plain
# set iteration order for strings varies between interpreter runs, which would make the
# joined text (and therefore its embedding) non-reproducible.
# If a cache file from an earlier run exists, delete it to regenerate with this ordering.
topic_keywords_agg = df.groupby(TOPIC_ID_COL)[PROJECT_KEYWORDS_COL].apply(
    lambda lists: sorted({kw for sublist in lists for kw in sublist})
).reset_index()
# Left-merge onto df_topics so the rows follow exactly the order of `df_topics`.
topic_keywords_agg = pd.merge(df_topics[[TOPIC_ID_COL]], topic_keywords_agg, on=TOPIC_ID_COL, how='left')
topic_keywords_agg[PROJECT_KEYWORDS_COL] = topic_keywords_agg[PROJECT_KEYWORDS_COL].fillna('').apply(lambda x: ' '.join(x))

topic_keyword_embeddings = encode_and_cache(
    topic_keywords_agg[PROJECT_KEYWORDS_COL].tolist(),
    topic_keyword_embeddings_path,
    model
)

# Row i of every embedding array must describe row i of its DataFrame; a stale cache
# file is the usual reason for a mismatch.
assert len(project_text_embeddings) == len(project_keyword_embeddings) == len(df), \
    "Project embeddings are not aligned with df; delete the cached .npy files and re-run."
assert len(topic_text_embeddings) == len(topic_keyword_embeddings) == len(df_topics), \
    "Topic embeddings are not aligned with df_topics; delete the cached .npy files and re-run."

print("All individual embeddings created and saved for application use.")


Creating embeddings and saving to /Users/timschnelzer/Developer/tender-matching-horizon-europe/data/processed/project_embeddings.npy


Batches:   0%|          | 0/70 [00:00<?, ?it/s]

Creating embeddings and saving to /Users/timschnelzer/Developer/tender-matching-horizon-europe/data/processed/topic_text_embeddings.npy


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Creating embeddings and saving to /Users/timschnelzer/Developer/tender-matching-horizon-europe/data/processed/project_keyword_embeddings.npy


Batches:   0%|          | 0/70 [00:00<?, ?it/s]

Creating embeddings and saving to /Users/timschnelzer/Developer/tender-matching-horizon-europe/data/processed/topic_keyword_embeddings.npy


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

All individual embeddings created and saved for application use.


In [7]:
# --- 7) Build FAISS Index & Evaluate ---
# Topic side: [text embedding | keyword embedding | year features], L2-normalised so
# that the inner product used by IndexFlatIP equals cosine similarity.
topic_embeddings_combined = np.hstack([topic_text_embeddings, topic_keyword_embeddings, df_topics[year_cols].values]).astype('float32')
faiss.normalize_L2(topic_embeddings_combined)

index = faiss.IndexFlatIP(topic_embeddings_combined.shape[1])
index.add(topic_embeddings_combined)
print(f"FAISS index created. Entries: {index.ntotal}")

# Project side: built ONCE for all rows (same feature layout as the topic side) instead
# of re-extracting df[year_cols].values and slicing by DataFrame label on every iteration.
project_embeddings_combined = np.hstack([
    project_text_embeddings,
    project_keyword_embeddings,
    df[year_cols].values
]).astype('float32')
faiss.normalize_L2(project_embeddings_combined)

# Topic id -> row position in df_topics, which is also the position inside the index
# because the index was filled in df_topics order.
topic_position = {topic: pos for pos, topic in enumerate(df_topics[TOPIC_ID_COL])}

# NOTE(review): the topic vectors are derived from the same projects that are used as
# queries here (df_topics comes from df, and topic keywords are aggregated from these
# projects), so the scores below are likely optimistic — confirm on a held-out split.

# Evaluate Top-K retrieval on the full dataset (single batched search)
top_k_hits = 0
top_1_hits = 0
mrr_sum = 0.0

D, I = index.search(project_embeddings_combined, EVAL_TOP_K)

for true_topic, ranked_idx in zip(df[TOPIC_ID_COL], I):
    true_idx = topic_position.get(true_topic)
    if true_idx is None:
        # No matching topic row (e.g. missing topic id): still counted in the
        # denominator below, i.e. treated as a miss.
        continue

    hit_positions = np.flatnonzero(ranked_idx == true_idx)
    if hit_positions.size == 0:
        continue

    rank = int(hit_positions[0]) + 1  # 1-based rank of the true topic
    top_k_hits += 1
    if rank == 1:
        top_1_hits += 1
    mrr_sum += 1.0 / rank

top_k_acc = top_k_hits / len(df)
top_1_acc = top_1_hits / len(df)
mrr_at_k = mrr_sum / len(df)

print("\n--- FAISS Bi-Encoder Evaluation ---")
print(f"Top-{EVAL_TOP_K} Accuracy: {top_k_acc:.4f}")
print(f"Top-1 Accuracy: {top_1_acc:.4f}")
print(f"MRR@{EVAL_TOP_K}: {mrr_at_k:.4f}")

FAISS index created. Entries: 1873

--- FAISS Bi-Encoder Evaluation ---
Top-10 Accuracy: 0.9564
Top-1 Accuracy: 0.8905
MRR@10: 0.9148


In [None]:
# --- 8) Cross-Encoder Demo ---
# Re-rank every topic text against each of the first CROSS_ENCODER_BATCH projects and
# compute the same three metrics as the bi-encoder evaluation, on that sample only.
cross_demo_df = df.head(CROSS_ENCODER_BATCH)
cross_model = CrossEncoder(CROSS_ENCODER_MODEL)

top_k_hits_ce = 0
top_1_hits_ce = 0
mrr_ce = 0.0

for _, project in cross_demo_df.iterrows():
    query_text = project[PROJECT_TEXT_COL]
    gold_topic = project[TOPIC_ID_COL]

    # Score the project against every topic text, then keep the EVAL_TOP_K best.
    candidate_pairs = [[query_text, topic_text] for topic_text in df_topics[TOPIC_TEXT_COL]]
    relevance = cross_model.predict(candidate_pairs)
    best_positions = relevance.argsort()[::-1][:EVAL_TOP_K]
    retrieved_topics = df_topics.iloc[best_positions][TOPIC_ID_COL].tolist()

    if retrieved_topics[0] == gold_topic:
        top_1_hits_ce += 1

    if gold_topic not in retrieved_topics:
        continue

    top_k_hits_ce += 1
    mrr_ce += 1.0 / (retrieved_topics.index(gold_topic) + 1)

n_demo_projects = len(cross_demo_df)
top_k_acc_ce = top_k_hits_ce / n_demo_projects
top_1_acc_ce = top_1_hits_ce / n_demo_projects
mrr_ce /= n_demo_projects

print("\n--- Cross-Encoder (Sample) Evaluation ---")
print(f"Top-{EVAL_TOP_K} Accuracy: {top_k_acc_ce:.4f}")
print(f"Top-1 Accuracy: {top_1_acc_ce:.4f}")
print(f"MRR@{EVAL_TOP_K}: {mrr_ce:.4f}")

print("\nNote: Cross-Encoder was applied to a small batch only for demonstration due to computational cost.")
