In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

In [None]:
# Raw Excel workbook, addressed relative to the current working directory
raw_data_path = Path("..") / "data" / "raw" / "SG sanctions on Russia.xlsx"

In [None]:
# Read the selected columns of the "Contents" sheet, index rows by "id",
# and keep only the rows whose source is "Online News".
df = (
    pd.read_excel(
        raw_data_path,
        sheet_name="Contents",
        parse_dates=["date"],
        usecols=[
            "id",
            "source",
            "title",
            "content",
            "date",
            "parent",
            "language",
            "url",
            "parent source identifier",
            "domain",
            "topics",
            "image tags",
            "sentiment",
            "sentiment class",
            "visibility",
            "potential impressions",
            "actual impressions",
            "ave",
            "city",
            "country",
            "gender",
            "no. of comments",
            "no. of likes",
            "no. of shares",
            "no. of retweets",
            "no. of views",
            "user name",
        ],
    )
    .set_index("id")
    .loc[lambda frame: frame["source"] == "Online News"]
)
df.head()

In [None]:
# Texts to embed, one entry per DataFrame row and in row order
content = df["content"].tolist()
titles = df["title"].tolist()

## Load title/content embeddings, compute and save if not available

In [None]:
def _load_or_compute_embeddings(cache_path, texts, encoder):
    """Return embeddings for ``texts``, reusing the ``.npy`` cache when it is valid.

    Parameters
    ----------
    cache_path : str or Path
        Location of the ``.npy`` cache file (relative paths resolve against the
        current working directory).
    texts : list[str]
        Texts to embed; row ``i`` of the result corresponds to ``texts[i]``.
    encoder
        Object exposing ``encode(texts, batch_size=..., show_progress_bar=...)``.

    Returns
    -------
    The cached array if it exists and has one row per text, otherwise freshly
    computed embeddings (which are also written to ``cache_path``).
    """
    cache_path = Path(cache_path)
    if cache_path.exists():
        cached = np.load(cache_path)
        # A cache built from a different corpus would silently misalign
        # embedding rows and texts, so only reuse it when the row count matches.
        if cached.shape[0] == len(texts):
            return cached
    embeddings = encoder.encode(texts, batch_size=32, show_progress_bar=True)
    np.save(cache_path, embeddings)
    return embeddings


model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
content_embeddings = _load_or_compute_embeddings("content_embeddings.npy", content, model)
title_embeddings = _load_or_compute_embeddings("title_embeddings.npy", titles, model)

In [None]:
# Use the content embeddings as the vectors passed to the search classes below
sentence_embeddings = content_embeddings
# Show the loaded model, then the shape of the embedding array
display(model)
sentence_embeddings.shape

## Experiment with different search methods, trying to keep a consistent API

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer

class SearchJaccard:
    """Lexical search ranking documents by Jaccard similarity of token sets.

    Every document and every query is tokenized with ``model.tokenizer`` and
    reduced to a set of tokens. Unlike a distance, a *higher* score means a
    closer match.
    """

    def __init__(
        self,
        model,
        corpus: list[str],
    ):
        """Tokenize ``corpus`` once and keep the token sets for later queries.

        Parameters
        ----------
        model
            Object exposing a ``tokenizer`` attribute with a ``tokenize(str)`` method.
        corpus : list[str]
            Documents to search over.
        """
        self.tokenizer = model.tokenizer

        # Save corpus for finding original text
        self.df = pd.DataFrame(corpus, columns=["corpus"])
        self.df["token_set"] = self.df["corpus"].apply(
            lambda doc: set(self.tokenizer.tokenize(doc))
        )

    def __call__(self, text: str, k: int):
        """Return ``(similarities, indexes)`` of the ``k`` best-matching documents.

        Both are arrays ordered from most to least similar. Fewer than ``k``
        entries are returned when the corpus is smaller than ``k``.
        """
        text_token_set = set(self.tokenizer.tokenize(text))
        # Build the score series with an explicit float dtype so that an empty
        # corpus still yields a numeric series that ``nlargest`` accepts.
        similarity = pd.Series(
            [
                self.jaccard_similarity(token_set, B=text_token_set)
                for token_set in self.df["token_set"]
            ],
            index=self.df.index,
            dtype=float,
        )
        top_k = similarity.nlargest(k)

        return top_k.values, top_k.index.values

    def __getitem__(self, ids):
        """Return the original documents for the given row labels."""
        return self.df.loc[ids, "corpus"].to_list()

    # Reduce memory usage by storing corpus only in df, extract when needed
    @property
    def corpus(self):
        """The indexed documents as a plain list."""
        return self.df["corpus"].to_list()

    @staticmethod
    def jaccard_similarity(A, B):
        """Return ``len(A & B) / len(A | B)``, or ``0.0`` when both sets are empty."""
        union_size = len(A.union(B))
        if union_size == 0:
            # Two empty token sets share nothing to compare; report "no match"
            # instead of raising ZeroDivisionError.
            return 0.0

        return len(A.intersection(B)) / union_size

    
# jaccard_searcher = SearchJaccard(model=model, corpus=content)
# d, i = jaccard_searcher("Russian government approved a list of countries and territories that are 'unfriendly'", 20)
# jaccard_searcher[i]

In [None]:
import faiss
from sentence_transformers import SentenceTransformer

class SearchFlatL2:
    """Exhaustive (exact) Euclidean search on a flat FAISS vector index."""

    def __init__(
        self,
        model,
        sentence_embeddings,
        corpus: list[str],
    ):
        """Index ``sentence_embeddings`` and remember the matching ``corpus``.

        Parameters
        ----------
        model
            Encoder exposing ``encode(list[str])``; used to embed queries.
        sentence_embeddings
            2-D array with one row per document, in the same order as ``corpus``.
        corpus : list[str]
            Original texts, used to map result indexes back to documents.

        Raises
        ------
        ValueError
            If the number of embedding rows differs from ``len(corpus)``.
        """
        if sentence_embeddings.shape[0] != len(corpus):
            raise ValueError(
                "sentence_embeddings has "
                f"{sentence_embeddings.shape[0]} rows but corpus has {len(corpus)} items"
            )

        self.model = model

        # Save corpus for finding original text
        self.corpus = corpus

        d = sentence_embeddings.shape[1]

        # IndexFlatL2 config
        self.index = faiss.IndexFlatL2(d)

        # faiss indexing embeddings
        self.index.add(sentence_embeddings)

    def __call__(self, text: str, k: int):
        """Return ``(distances, indexes)`` of up to ``k`` nearest documents.

        FAISS reports squared L2 distances, smallest first. Fewer than ``k``
        results are returned when the index cannot supply ``k`` neighbours.
        """
        xq = self.model.encode([text])
        D, I = self.index.search(xq, k)

        # FAISS pads missing neighbours with label -1; left in place, -1 would
        # be resolved by __getitem__ to the *last* corpus entry.
        found = I[0] >= 0

        return D[0][found], I[0][found]

    def __getitem__(self, ids):
        """Return the original documents at the given positional indexes."""
        return [self.corpus[i] for i in ids]
    
# flatl2_searcher = SearchFlatL2(model=model, sentence_embeddings=sentence_embeddings, corpus=content)
# d, i = flatl2_searcher("Russian government approved a list of countries and territories that are 'unfriendly'", k = 20)
# flatl2_searcher[i]

In [None]:
import faiss
from sentence_transformers import SentenceTransformer

class SearchHNSW:
    """HNSW approximate nearest-neighbour L2 search.

    Trades exactness and extra memory (the graph links are stored on top of the
    full vectors) for much faster queries than an exhaustive flat search.
    """

    def __init__(
        self,
        model,
        sentence_embeddings,
        corpus: list[str],
        M=64,  # number of graph neighbours (links) kept per vertex
        ef_search=32,  # candidate-list size at query time (higher: more accurate, slower)
        ef_construction=64,  # candidate-list size while building the graph
    ):
        """Build the HNSW graph over ``sentence_embeddings``.

        Parameters
        ----------
        model
            Encoder exposing ``encode(list[str])``; used to embed queries.
        sentence_embeddings
            2-D array with one row per document, in the same order as ``corpus``.
        corpus : list[str]
            Original texts, used to map result indexes back to documents.
        M, ef_search, ef_construction
            HNSW tuning parameters, see the inline comments on the signature.

        Raises
        ------
        ValueError
            If the number of embedding rows differs from ``len(corpus)``.
        """
        if sentence_embeddings.shape[0] != len(corpus):
            raise ValueError(
                "sentence_embeddings has "
                f"{sentence_embeddings.shape[0]} rows but corpus has {len(corpus)} items"
            )

        self.model = model

        # Save corpus for finding original text
        self.corpus = corpus

        d = sentence_embeddings.shape[1]

        # HNSW config; efConstruction must be set before vectors are added
        self.index = faiss.IndexHNSWFlat(d, M)
        self.index.hnsw.efConstruction = ef_construction
        self.index.hnsw.efSearch = ef_search

        # faiss indexing embeddings
        self.index.add(sentence_embeddings)

    def __call__(self, text: str, k: int):
        """Return ``(distances, indexes)`` of up to ``k`` approximate neighbours.

        Fewer than ``k`` results are returned when the graph search cannot
        supply ``k`` neighbours.
        """
        xq = self.model.encode([text])
        D, I = self.index.search(xq, k)

        # FAISS pads missing neighbours with label -1; left in place, -1 would
        # be resolved by __getitem__ to the *last* corpus entry.
        found = I[0] >= 0

        return D[0][found], I[0][found]

    def __getitem__(self, ids):
        """Return the original documents at the given positional indexes."""
        return [self.corpus[i] for i in ids]

    
# hnsw_searcher = SearchHNSW(model=model, sentence_embeddings=sentence_embeddings, corpus=content)
# d, i = hnsw_searcher("Russian government approved a list of countries and territories that are 'unfriendly'", k = 20)
# hnsw_searcher[i]

In [None]:
from pynndescent import NNDescent
from sentence_transformers import SentenceTransformer

class SearchPyNN:
    """Approximate nearest-neighbour search backed by a PyNNDescent index."""

    def __init__(
        self,
        model,
        sentence_embeddings,
        corpus: list[str],
        random_state=None,
    ):
        """Build an ``NNDescent`` index over ``sentence_embeddings``.

        Parameters
        ----------
        model
            Encoder exposing ``encode(list[str])``; used to embed queries.
        sentence_embeddings
            2-D array with one row per document, in the same order as ``corpus``.
        corpus : list[str]
            Original texts, used to map result indexes back to documents.
        random_state : optional
            Forwarded to ``NNDescent``; pass a fixed seed for a reproducible
            index. ``None`` (the default) keeps the library default.

        Raises
        ------
        ValueError
            If the number of embedding rows differs from ``len(corpus)``.
        """
        if sentence_embeddings.shape[0] != len(corpus):
            raise ValueError(
                "sentence_embeddings has "
                f"{sentence_embeddings.shape[0]} rows but corpus has {len(corpus)} items"
            )

        self.model = model

        # Save corpus for finding original text
        self.corpus = corpus

        # PyNNDescent indexing embeddings
        self.index = NNDescent(sentence_embeddings, random_state=random_state)

    def __call__(self, text: str, k: int):
        """Return ``(distances, indexes)`` of up to ``k`` approximate neighbours."""
        xq = self.model.encode([text])
        # NNDescent.query returns (indices, distances), the reverse of FAISS
        I, D = self.index.query(xq, k)

        # Drop negative placeholder indices so __getitem__ never resolves them
        # to entries counted from the end of the corpus.
        found = I[0] >= 0

        return D[0][found], I[0][found]

    def __getitem__(self, ids):
        """Return the original documents at the given positional indexes."""
        return [self.corpus[i] for i in ids]

    
# pynn_searcher = SearchPyNN(model=model, sentence_embeddings=sentence_embeddings, corpus=content)
# d, i = pynn_searcher("Russian government approved a list of countries and territories that are 'unfriendly'", k = 20)
# pynn_searcher[i]

## Additional filters, separate from similarity

To filter by metadata, there are three options:
1. Increase `top_k` and post-filter the results. The drawback is that too few results may remain after filtering.
2. If the filter parameter is discrete, create an index for each combination of filter values.
3. Create an index on the fly for each filter configuration, but this is the slowest option and may negate any speed benefits.

Explore filtering by date and by tokenized words.