# Semantic Search in articles using NLP

## Prepare DataFrame


In [263]:
import pandas as pd
data =pd.read_csv("../data/processed/articles.csv")
df=data.copy()

In [264]:
df['category'].value_counts()


category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

In [265]:
import re
import string

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()

df['text'] = data['text'].apply(clean_text)

In [266]:
df.head()

Unnamed: 0,id,category,text
0,0,business,ad sales boost time warner profit quarterly pr...
1,1,business,dollar gains on greenspan speech the dollar ha...
2,2,business,yukos unit buyer faces loan claim the owners o...
3,3,business,high fuel prices hit bas profits british airwa...
4,4,business,pernod takeover talk lifts domecq shares in uk...


## Hot Keywords Extraction

### 1.using YAKE

In [267]:
import yake
def extract_hot_keywords_yake(text, top_n=10):
    """
    Extract hot keywords using YAKE

    Args:
        text: Article text
        top_n: Number of keywords to extract

    Returns:
        List of (keyword, score) tuples
    """
    # Initialize YAKE
    kw_extractor = yake.KeywordExtractor(
        lan="en",                    # Language
        n=3,                         # Max n-gram size (1-3 word phrases)
        dedupLim=0.9,                # Deduplication threshold (0.9 = high)
        dedupFunc='seqm',            # Deduplication function
        windowsSize=1,               # Window size for co-occurrence
        top=top_n,                   # Number of keywords
        features=None
    )

    keywords = kw_extractor.extract_keywords(text)
    return keywords

In [268]:
yake_hot_keywords_df = pd.DataFrame({
    'article_id': df['id'],
    'hot_keywords': df['text'].apply(
        lambda text: [kw[0] for kw in extract_hot_keywords_yake(text, top_n=10)]
    )
})

In [269]:
yake_hot_keywords_df.head()

Unnamed: 0,article_id,hot_keywords
0,0,"[fourth quarter profits, warners fourth quarte..."
1,1,"[current account deficit, federal reserve head..."
2,2,"[owner menatep group, case menatep groups, men..."
3,3,"[high fuel prices, blamed high fuel, fuel cost..."
4,4,"[allied domecq shares, lifts domecq shares, al..."


In [270]:
yake_hot_keywords_df.to_json(
    '../data/hot_keywords/yake_hot_keywords.json', 
    orient='records',
    indent=2,
    force_ascii=False
)

In [271]:
df.head()

Unnamed: 0,id,category,text
0,0,business,ad sales boost time warner profit quarterly pr...
1,1,business,dollar gains on greenspan speech the dollar ha...
2,2,business,yukos unit buyer faces loan claim the owners o...
3,3,business,high fuel prices hit bas profits british airwa...
4,4,business,pernod takeover talk lifts domecq shares in uk...


### 2.using Keybert

In [272]:
from keybert import KeyBERT
kw_model = KeyBERT(model='all-MiniLM-L6-v2')


In [273]:
def extract_hot_keywords_keybert(text, top_n=10):
    """
    Extract keywords using KeyBERT (transformer-based).
    """
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),   # include unigrams, bigrams, trigrams
        stop_words='english',
        top_n=top_n
    )
    # Returns list of tuples (keyword, score)
    return keywords


In [274]:
bert_hot_keywords_df = pd.DataFrame({
    'article_id': df['id'],
    'hot_keywords': df['text'].apply(
        lambda text: [kw[0] for kw in extract_hot_keywords_yake(text, top_n=10)]
    )
})

In [275]:
bert_hot_keywords_df.head()

Unnamed: 0,article_id,hot_keywords
0,0,"[fourth quarter profits, warners fourth quarte..."
1,1,"[current account deficit, federal reserve head..."
2,2,"[owner menatep group, case menatep groups, men..."
3,3,"[high fuel prices, blamed high fuel, fuel cost..."
4,4,"[allied domecq shares, lifts domecq shares, al..."


In [276]:
bert_hot_keywords_df.to_json(
    '../data/hot_keywords/bert_hot_keywords.json', 
    orient='records',
    indent=2,
    force_ascii=False
)

## First: Lexical Search

In [277]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - ------------------------------------- 0.5/12.8 MB 882.6 kB/s eta 0:00:14
     - ------------------------------------- 0.5/12.8 MB 882.6 kB/s eta 0:00:14
     -- ------------------------------------ 0.8/12.8 MB 798.0 kB/s eta 0:00:16
     --- ----------------------------------- 1.0/12.8 MB 825.2 kB/s eta 0:00:15
     --- ----------------------------------- 1.3/12.8 MB 882.6 kB/s eta 0:00:14
     ---- ---------------------------------- 1.6/12.8 MB 902.1 k

In [278]:
import spacy
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
from contractions import fix
import os
import pickle
from rank_bm25 import BM25Okapi
import numpy as np


In [279]:
def preprocess_text(text):
    """
    Complete preprocessing pipeline
    """
    # Lowercasing
    text = text.lower()
    # 2. Contraction expansion
    text = fix(text)

    # 3. Tokenization, cleaning, lemmatization
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop and len(token) > 2]
    return tokens




In [280]:
corpus= df['text'].apply(preprocess_text).tolist()

In [281]:
file_path = os.path.join("../data/processed/", "corpus.pkl")
with open(file_path, "wb") as f:
    pickle.dump(corpus, f)

In [282]:
bm25 = BM25Okapi(corpus)

In [283]:
os.makedirs("../models", exist_ok=True)
with open("../models/bm25_model.pkl", "wb") as f:
    pickle.dump(bm25, f)

In [284]:
def search_articles_bm25(query_sentence,df,top_n=5):
    """
    Retrieve the most relevant BBC articles for a sentence query using BM25.
    """
    query_sentence=clean_text(query_sentence)
    # Preprocess query sentence
    query_tokens = preprocess_text(query_sentence)

    # Compute BM25 scores for all docs
    scores = bm25.get_scores(query_tokens)

    # Sort and select top-N
    top_n_idx = np.argsort(scores)[::-1][:top_n]

    # Build results DataFrame
    results = df.iloc[top_n_idx].copy()
    results["bm25_score"] = scores[top_n_idx]

    return results[["id", "category", "bm25_score", "text"]]


In [285]:
sentence_query="Analyze China's export surge of 35% that propelled its trade surplus to a six-year high, identifying the key export sectors, the role of currency valuation, the impact on global trade balances, and the responses from trading partners."
results_bm25 = search_articles_bm25(sentence_query, df,top_n=5)


In [286]:
results_bm25

Unnamed: 0,id,category,bm25_score,text
426,426,business,59.00927,chinese exports rise 25 in 2004 exports from c...
439,439,business,49.612413,us trade deficit widens sharply the gap betwee...
504,504,business,42.571624,china now top trader with japan china overtook...
23,23,business,33.683889,us trade gap hits record in 2004 the gap betwe...
428,428,business,33.15755,us trade gap ballooned in october the us trade...


## Second: using spacy embeddings

In [287]:
!python -m spacy download en_core_web_lg 

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.3/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.5/400.7 MB 1.2 MB/s eta 0:05:35
     ---------------------------------------- 0.8/400.7 MB 1.3 MB/s eta 0:05:10
     ---------------------------------------- 1.0/400.7 MB 1.2 MB/s eta 0:05:42
     ---------------------------------------- 1.0/400.7 MB 1.2 MB/s eta 0:05:42
     ---------------------------------------- 1.0/400.7 MB 1.2 MB/s eta 0:05:42
     ---------------------------------------- 1.6/400.7 MB 1.1 MB/s eta 0:06:07
     ---------------------------------------- 1.8/400.7 MB 1.2 MB/s eta 0:05:45
     ---------------------------------------

ERROR: Exception:
Traceback (most recent call last):
  File "f:\projects\SemSearch\.venv\Lib\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
    yield
  File "f:\projects\SemSearch\.venv\Lib\site-packages\pip\_vendor\urllib3\response.py", line 561, in read
    data = self._fp_read(amt) if not fp_closed else b""
           ^^^^^^^^^^^^^^^^^^
  File "f:\projects\SemSearch\.venv\Lib\site-packages\pip\_vendor\urllib3\response.py", line 527, in _fp_read
    return self._fp.read(amt) if amt is not None else self._fp.read()
           ^^^^^^^^^^^^^^^^^^
  File "f:\projects\SemSearch\.venv\Lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 102, in read
    self.__buf.write(data)
  File "C:\Program Files\Python312\Lib\tempfile.py", line 499, in func_wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
OSError: [Errno 28] No space left on device

During handling of the above exception, another exception occurred:

Traceback (most rec

In [288]:
import spacy
import numpy as np
import os
import faiss
nlp = spacy.load("en_core_web_lg")

In [289]:
def compute_spacy_doc_vectors(df, text_col="text", save_path="../data/embeddings/spacy_doc_vectors.npy"):
    """
    Compute and cache document embeddings using spaCy (non-transformer model).
    If the file already exists, it loads from cache.
    """
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    if os.path.exists(save_path):
        return np.load(save_path)
    
    doc_vectors = np.vstack([nlp(text).vector for text in df[text_col]])
    np.save(save_path, doc_vectors)

    return doc_vectors

In [290]:
def build_faiss_index(doc_vectors, save_path="../data/embeddings/faiss_index.bin"):
    """
    Build a FAISS index for fast similarity search and save it.
    """
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Ensure float32 dtype for FAISS
    doc_vectors = doc_vectors.astype("float32")

    # Normalize vectors (for cosine similarity)
    faiss.normalize_L2(doc_vectors)

    # Create FAISS index (Inner Product = cosine similarity)
    index = faiss.IndexFlatIP(doc_vectors.shape[1])
    index.add(doc_vectors)

    # Save FAISS index
    faiss.write_index(index, save_path)

    return index

In [291]:
def search_articles_spacy(query, df, index, top_n=5):
    """
    Perform semantic search using FAISS + spaCy.
    Returns top N most similar articles.
    """
    query = clean_text(query)
    query_vec = nlp(query).vector.astype("float32").reshape(1, -1)
    faiss.normalize_L2(query_vec)

    # Perform search
    distances, indices = index.search(query_vec, top_n)

    results = df.iloc[indices[0]].copy()
    results["similarity"] = distances[0]
    return results[["id", "category", "similarity", "text"]]

In [292]:
doc_vectors_spacy = compute_spacy_doc_vectors(df)
index_spacy = build_faiss_index(doc_vectors_spacy,save_path="../data/embeddings/faiss_index_spacy.bin")

In [293]:
sentence_query="Analyze China's export surge of 35% that propelled its trade surplus to a six-year high, identifying the key export sectors, the role of currency valuation, the impact on global trade balances, and the responses from trading partners."
results_spacy = search_articles_spacy(sentence_query, df,index_spacy, top_n=5)


In [294]:
results_spacy

Unnamed: 0,id,category,similarity,text
66,66,business,0.920587,fao warns on impact of subsidies billions of f...
171,171,business,0.911932,newest eu members underpin growth the european...
426,426,business,0.905781,chinese exports rise 25 in 2004 exports from c...
282,282,business,0.905367,stock market eyes japan recovery japanese shar...
339,339,business,0.905277,venezuela and china sign oil deal venezuelan p...


## third: using transformer based embeddings

In [295]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
embedder = SentenceTransformer('all-MiniLM-L6-v2')


In [296]:
def compute_transformer_doc_vectors(df, text_col="text", cache_path="../data/embeddings/transformer_doc_vectors.npy"):
    """
    Compute and cache SentenceTransformer embeddings for each document.
    """
    import os
    if os.path.exists(cache_path):
        embeddings = np.load(cache_path)
    else:
        embeddings = embedder.encode(df[text_col].tolist(), convert_to_numpy=True, show_progress_bar=True)
        np.save(cache_path, embeddings)
    return embeddings


In [297]:
def search_articles_semantic(query, df, index, top_n=5):
    """
    Perform semantic search using SentenceTransformer embeddings + FAISS.
    """
    query=clean_text(query)
    query_vec = embedder.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(query_vec)

    distances, indices = index.search(query_vec, top_n)
    results = df.iloc[indices[0]].copy()
    results["similarity"] = distances[0]
    
    return results[["id", "category", "similarity", "text"]]

In [298]:
doc_vectors_semantic = compute_transformer_doc_vectors(df)
index_semantic = build_faiss_index(doc_vectors_semantic,save_path="../data/embeddings/faiss_index_semantic.bin")

In [299]:
sentence_query="Analyze China's export surge of 35% that propelled its trade surplus to a six-year high, identifying the key export sectors, the role of currency valuation, the impact on global trade balances, and the responses from trading partners."
results_semantic = search_articles_semantic(sentence_query, df,index_semantic, top_n=5)

In [300]:
results_semantic

Unnamed: 0,id,category,similarity,text
426,426,business,0.649032,chinese exports rise 25 in 2004 exports from c...
428,428,business,0.647292,us trade gap ballooned in october the us trade...
15,15,business,0.630536,china keeps tight rein on credit chinas effort...
504,504,business,0.610793,china now top trader with japan china overtook...
23,23,business,0.591622,us trade gap hits record in 2004 the gap betwe...


## Evaluation

In [301]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score


In [302]:
def load_ground_truth(csv_path):
    """
    Load ground truth data from CSV.
    Returns a dictionary mapping query -> list of relevant doc IDs (ordered by relevance)
    """
    df_eval = pd.read_csv(csv_path)
    ground_truth = {}
    
    for _, row in df_eval.iterrows():
        # Extract relevant IDs in order of relevance (id_1 is most relevant)
        relevant_ids = []
        for col in ["id_1", "id_2", "id_3", "id_4", "id_5"]:
            if pd.notna(row[col]):
                relevant_ids.append(int(row[col]))
        ground_truth[row["query"]] = relevant_ids
    
    return ground_truth

In [303]:
def precision_at_k(retrieved_ids, relevant_ids, k=5):
    """
    Calculate Precision@K
    Precision@K = (# of relevant items in top-K) / min(K, len(retrieved_ids))
    """
    if not retrieved_ids:
        return 0.0

    retrieved_k = retrieved_ids[:k]
    relevant_set = set(relevant_ids)
    hits = sum(1 for doc_id in retrieved_k if doc_id in relevant_set)

    return hits / min(k, len(retrieved_k))


In [304]:
from sklearn.metrics import ndcg_score
import numpy as np

def ndcg_at_k(retrieved_ids, relevant_ids, k=5):
    """
    Compute Normalized Discounted Cumulative Gain (NDCG@K)
    Measures how well the top-K retrieved results are ranked
    compared to the ground truth relevance ordering.
    """
    if not retrieved_ids or not relevant_ids:
        return 0.0

    # Assign decreasing relevance scores (id_1 most relevant)
    relevance_map = {doc_id: len(relevant_ids) - i for i, doc_id in enumerate(relevant_ids)}

    # Build true relevance vector for retrieved docs
    retrieved_k = retrieved_ids[:k]
    y_true = np.array([[relevance_map.get(doc_id, 0) for doc_id in retrieved_k]])
    y_score = np.array([[k - i for i in range(len(retrieved_k))]])  # descending scores

    if np.sum(y_true) == 0:
        return 0.0

    return float(ndcg_score(y_true, y_score, k=k))


In [305]:
def average_precision(retrieved_ids, relevant_ids):
    """
    Compute Average Precision (AP)
    AP = Average of precision@k over ranks where a relevant document appears.
    """
    if not relevant_ids:
        return 0.0
    
    relevant_set = set(relevant_ids)
    score = 0.0
    hits = 0

    for i, doc_id in enumerate(retrieved_ids):
        if doc_id in relevant_set:
            hits += 1
            score += hits / (i + 1)

    return score / len(relevant_set)


In [306]:
def evaluate_model(search_function, ground_truth, df, model_name="Model", k=5, **kwargs):
    """
    Evaluate a search model using all metrics.
    
    Args:
        search_function: Function that takes (query, df, top_n, **kwargs) and returns results DataFrame
        ground_truth: Dictionary mapping queries to relevant doc IDs
        df: The articles DataFrame
        model_name: Name of the model for display
        k: Number of results to retrieve and evaluate
        **kwargs: Additional arguments to pass to search_function (e.g., index, bm25)
    
    Returns:
        Dictionary with evaluation metrics
    """
    
    all_retrieved = {}
    precisions = []
    ndcgs = []
    ap=[]
    for query, relevant_ids in ground_truth.items():
        # Retrieve results
        results = search_function(query, df, top_n=k, **kwargs)
        retrieved_ids = results["id"].tolist()
        all_retrieved[query] = retrieved_ids
        
        # Calculate metrics
        prec = precision_at_k(retrieved_ids, relevant_ids, k=k)
        ndcg = ndcg_at_k(retrieved_ids, relevant_ids, k=k)
        
        precisions.append(prec)
        ndcgs.append(ndcg)
        ap.append(average_precision(retrieved_ids, relevant_ids))
    
    # Aggregate results
    results = {
        "Model": model_name,
        f"Precision@{k}": np.mean(precisions),
        f"NDCG@{k}": np.mean(ndcgs),
        "MAP": np.mean(ap)
    }
    
    return results

In [307]:
def run_complete_evaluation(ground_truth, df, bm25, index_spacy, index_semantic, k=5):
    """
    Run evaluation for all three models and compare results.
    """
    results = []
        
    # 1. Evaluate BM25
    bm25_results = evaluate_model(search_articles_bm25,ground_truth,df,model_name="Lexical model",k=k)
    results.append(bm25_results)
    # 2. Evaluate spaCy
    spacy_results = evaluate_model(search_articles_spacy,ground_truth,df,model_name="spaCy model",k=k,index=index_spacy)
    results.append(spacy_results)

    # 3. Evaluate Transformer
    transformer_results = evaluate_model(search_articles_semantic,ground_truth,df,model_name="semantic model",k=k,index=index_semantic)
    results.append(transformer_results)

    # Create comparison DataFrame
    comparison_df = pd.DataFrame(results)
    
    return comparison_df

In [320]:
ground_truth = load_ground_truth("../data/evaluation/evaluation.csv")

In [321]:
ground_truth 


{"What were the major factors behind Time Warner's profit increase in Q4 2004?": [0,
  10,
  34,
  49,
  3],
 "How did high fuel prices impact British Airways' profits in late 2004?": [3,
  27,
  44,
  29,
  25],
 'Which companies were involved in the battle to acquire UFJ Holdings in Japan?': [31,
  36,
  12,
  46,
  34],
 'What were the key economic indicators mentioned for Japan in 2004â€“2005?': [5,
  43,
  45,
  21,
  30],
 'How did the US trade deficit evolve in 2004, and what were the main contributing factors?': [23,
  1,
  6,
  40,
  44],
 'Which global companies made significant investments or acquisitions in China during 2004â€“2005?': [18,
  33,
  38,
  12,
  50],
 'What restructuring plans were reported for Fiat and its luxury brands in early 2005?': [50,
  12,
  48,
  20,
  34],
 'How did oil price surges in early 2005 affect global economic forecasts according to the OECD?': [29,
  27,
  44,
  23,
  40],
 'What deal did Ryanair sign with Boeing, and how many jobs was it 

In [322]:
evaluation_results = run_complete_evaluation(ground_truth=ground_truth,df=df,bm25=bm25,
                                             index_spacy=index_spacy,index_semantic=index_semantic,k=5)

In [323]:
evaluation_results

Unnamed: 0,Model,Precision@5,NDCG@5,MAP
0,Lexical model,0.228571,0.888587,0.202597
1,spaCy model,0.187013,0.612753,0.141515
2,semantic model,0.244156,0.932487,0.217532
