In [1]:
import faiss
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

In [2]:
# Which summarizer produced the topic summaries; it selects the parquet file to load.
summarizer_type = "openai"

# Build the object path first so the location being read is visible on its own line.
topics_path = "gs://scraped-news-article-data-null/2023-topics-%s.parquet" % summarizer_type
topic_sum_df = pd.read_parquet(topics_path)

# Preview the first rows.
topic_sum_df.head()

Unnamed: 0,topics,summary
0,0,The news articles revolve around the advanceme...
1,1,The news articles cover the fluctuations in oi...
2,2,The news articles revolve around the security ...
3,3,The Federal Reserve's interest rate decisions ...
4,4,The news articles cover various aspects of the...


In [3]:
model = SentenceTransformer('all-MiniLM-L6-v2')


def embed_summary(summary):
    """Embed summary text with the module-level sentence-transformer ``model``.

    Args:
        summary: Whatever ``model.encode`` accepts; this notebook passes either
            a single summary string or a list of summary strings.

    Returns:
        The value returned by ``model.encode(summary)`` unchanged.
    """
    return model.encode(summary)


# Encode every summary in ONE call instead of one call per row via
# Series.apply: the encoder batches a list of sentences internally, which
# avoids ~len(df) separate forward passes. Rows of the returned matrix are in
# the same order as the input list, so positional assignment keeps each vector
# aligned with its summary.
summary_vectors = embed_summary(topic_sum_df["summary"].to_list())

# list(...) splits the 2-D result into one 1-D vector per row, so each cell of
# the "vector" column holds a single embedding, as before.
topic_sum_df["vector"] = list(summary_vectors)
topic_sum_df.head()

Unnamed: 0,topics,summary,vector
0,0,The news articles revolve around the advanceme...,"[-0.07444057, -0.041888542, 0.019037506, 0.036..."
1,1,The news articles cover the fluctuations in oi...,"[-0.02794887, -0.0540836, 0.15111348, 0.131706..."
2,2,The news articles revolve around the security ...,"[-0.07545868, 0.07306916, 0.0052351127, -0.062..."
3,3,The Federal Reserve's interest rate decisions ...,"[-0.008949646, 0.0005092344, -0.064278044, 0.0..."
4,4,The news articles cover various aspects of the...,"[-0.018509006, 0.01599976, 0.02875276, 0.11734..."


In [4]:
# Collect the per-row embeddings into a plain Python list, then pack them into
# a single 2-D array (one row per topic summary).
embedding_rows = topic_sum_df["vector"].tolist()
vectors = np.asarray(embedding_rows)

# Display (n_rows, embedding_dim) as a sanity check.
vectors.shape

(1564, 384)

In [5]:
# Build an exact (flat) FAISS index wrapped in an ID map so that search
# results come back as topic ids rather than row positions.
#
# The dimensionality is read from the matrix that is actually added, so the
# index and the data cannot disagree.
embedding_dim = vectors.shape[1]
index = faiss.index_factory(embedding_dim, "IDMap,Flat")

# FAISS requires int64 ids and C-contiguous float32 vectors. Make both
# explicit instead of relying on the platform-default integer width and on
# whatever dtype the encoder happened to return.
topic_ids = topic_sum_df["topics"].to_numpy(dtype=np.int64)
index.add_with_ids(np.ascontiguousarray(vectors, dtype=np.float32), topic_ids)

In [6]:
# Destination file for the serialized FAISS index.
# NOTE(review): absolute path under /home/jupyter; it will not exist on other
# machines. Consider deriving it from a configurable directory.
index_file = "/home/jupyter/faiss_topic.index"


# Serialize the in-memory index to `index_file`.
faiss.write_index(index, index_file)

In [7]:
# Reload the index from `index_file`; this rebinds `index`, so later cells
# query the deserialized copy rather than the object built earlier.
index = faiss.read_index(index_file)

In [8]:
# Free-text query to look up among the indexed topic summaries.
query = "artificial intelligence"

# Embed the query with the same model used for the summaries, then copy the
# result into a fresh NumPy array.
query_embedding = model.encode(query)
vector_query = np.array(query_embedding)

In [9]:
# The search call takes a 2-D batch of queries, so view the single query
# vector as a (1, dim) matrix before searching for its 3 nearest entries.
query_matrix = vector_query.reshape(1, -1)
D, I = index.search(query_matrix, k=3)

In [10]:
# Row 0 of the id matrix returned by the search: the ids matched for the one
# query submitted (ids are the `topics` values supplied when the index was built).
I[0]

array([  0,  59, 438])

In [11]:
# Map the matched topic ids back to their summaries, keeping the order in which
# the search returned them. Filtering with `isin` would return rows in
# DataFrame order and silently drop the ranking; an inner merge keyed on the
# ranked ids preserves the left (ranked) key order and, like `isin`, simply
# skips any id that has no matching topic row.
ranked_hits = pd.DataFrame({"topics": I[0]})
ranked_summaries = ranked_hits.merge(
    topic_sum_df[["topics", "summary"]], on="topics", how="inner"
)
ranked_summaries["summary"].to_list()

['The news articles revolve around the advancements and competition in the field of Artificial Intelligence (AI) and its integration into various industries. Companies like Google, Microsoft, OpenAI, and others are developing and launching AI-powered products like chatbots, virtual assistants, and generative AI. The articles also discuss the potential risks and concerns associated with AI.',
 'The articles discuss various aspects of the artificial intelligence (AI) industry, including funding for AI startups, the use of AI in investing and aviation, the growth of AI-related jobs, and the potential impact of AI on environmental, social, and governance (ESG) concerns. Some specific companies mentioned include Character.AI, iManage, iLearningEngines, Inflection, and Exotec.',
 "The articles discuss various advancements and partnerships in the field of AI, particularly those related to Google and its competitors. Topics include the release of new AI search technology, partnerships with Int