In [1]:
from preprocess import Preprocessor
import pandas as pd # type: ignore
from gensim.models.doc2vec import Doc2Vec # type: ignore
from sklearn.metrics.pairwise import cosine_similarity
from joblib import dump, load
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Load the previously saved Doc2Vec model from disk (path is relative to the
# notebook's working directory).
model = Doc2Vec.load('models/doc2vec_model.model')

# Project-local text preprocessor; its preprocess_text() is applied to every
# query before it is embedded or vectorised.
preprocessor = Preprocessor()



In [3]:
# Location of the articles CSV, relative to the notebook's working directory.
relative_path = "../resources/data/data.csv"

# `sample_df` and `df` are two names bound to the same DataFrame object
# (no copy is made).
df = sample_df = pd.read_csv(relative_path)

# Preview the first rows as the cell's rich output.
df.head()

Unnamed: 0,Article link,Website source,Article type,Article title,Content,Creation date,Author,Category,Tags,Summary,Temp
0,https://www.theblock.co/post/285730/custodia-i...,https://www.theblock.co,News Article,Custodia is not entitled to a Fed master accou...,The Federal Reserve does not have to give digi...,"March 29, 2024, 7:05PM EDT",Sarah Wynn,Policy,COURT HEARINGS-LAWSUITS,Custodia Bank sued the central bank in 2022 fo...,
1,https://www.theblock.co/post/285724/multicoin-...,https://www.theblock.co,News Article,"Multicoin Capital's hedge fund has grown 9,281...",Multicoin Capital’s crypto-focused hedge fund ...,"March 29, 2024, 7:00PM EDT UPDATED: March 29, ...",Elizabeth Napolitano,Companies,INVESTMENT FIRMS,"Multicoin Capital’s hedge fund has returned 9,...",
2,https://www.theblock.co/post/285702/1kx-raise-...,https://www.theblock.co,News Article,1kx raises $75 million in latest funding round,"1kx has raised $75 million, the latest sign in...","March 29, 2024, 3:20PM EDT",Elizabeth Napolitano,Companies,,Investment firm 1kx has raised $75 million for...,
3,https://www.theblock.co/post/285690/cftc-commi...,https://www.theblock.co,News Article,CFTC Commissioner Pham says agency may be infr...,One of the Commodity Futures Trading Commissio...,"March 29, 2024, 12:06PM EDT",Sarah Wynn,Exchanges,CFTC-SEC,The agency’s complaint “appears to assert that...,
4,https://www.theblock.co/post/285608/bitcoin-fu...,https://www.theblock.co,News Article,Bitcoin futures open interest reaches new high...,Open interest for bitcoin futures on centraliz...,"March 29, 2024, 11:03AM EDT UPDATED: March 29,...",Vishal Chawla,The Block,,Bitcoin futures open interest on centralized e...,


In [4]:
def search(query, topn=10):
    """Rank corpus documents against a free-text query using the Doc2Vec model.

    The query is normalised with ``preprocessor.preprocess_text``, split on
    whitespace, embedded with ``model.infer_vector`` and compared against the
    document vectors held in ``model.dv``.

    Args:
        query: Raw query string.
        topn: Maximum number of hits to request from the model. Defaults to
            10, the value that used to be hard-coded.

    Returns:
        A list of ``(doc_index, similarity, title, content)`` tuples, in the
        order returned by ``model.dv.most_similar``.
    """
    tokens = preprocessor.preprocess_text(query).split()
    inferred_vector = model.infer_vector(tokens)

    sims = model.dv.most_similar([inferred_vector], topn=topn)

    results = []
    for tag, similarity in sims:
        # assumes each document tag is the integer row position in `df` —
        # TODO confirm against how the model was trained
        doc_index = int(tag)
        # Materialise the row once instead of once per column.
        row = df.iloc[doc_index]
        # NOTE(review): the column labels carry a leading space, presumably
        # matching the CSV header — confirm before normalising them.
        results.append((doc_index, similarity, row[' Article title'], row[' Content']))

    return results



In [5]:
query = "elon musk"

# Print each hit as four consecutive lines: row index, similarity score,
# article title, article content.
for doc_index, similarity, title, content in search(query):
    print(doc_index, similarity, title, content, sep="\n")


6351
0.9358048439025879
Bitcoin mining report: Jan. 17
Bitcoin mining stocks tracked by The Block were higher on Tuesday, with 17 gaining and the other two declining. Bitcoin rose slightly to $21,317 by market close. Here is a look at how the individual miners performed today:  
6468
0.9236555695533752
Bitcoin mining report: Jan. 10
Bitcoin mining stocks tracked by The Block were higher on Tuesday, with 16 gaining and the other three declining. Bitcoin rose 1.3% to $17,459 by market close. Here is a look at how the individual miners performed today:  
6424
0.9187068939208984
Bitcoin mining report: Jan. 12
Bitcoin mining stocks tracked by The Block were higher on Thursday, with all 19 gaining. Bitcoin rose 7.5% to $18,857 by market close. Here is a look at how the individual miners performed today: RELATED INDICES See crypto indices 
6397
0.9181467890739441
Bitcoin mining report: Jan. 13
Bitcoin mining stocks tracked by The Block were mostly higher on Friday, with 13 gaining and the oth

In [6]:
# Load the persisted TF-IDF artifacts: presumably the per-document TF-IDF
# matrix and the fitted vectorizer that produced it — TODO confirm they were
# built from the same rows, in the same order, as `df`.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code;
# only load artifacts from a trusted source.
tfidf_matrix = load("models/tfidf/tfidf_matrix.joblib")
vectorizer = load("models/tfidf/vectorizer.joblib")

In [7]:
def tfidf_query(query, topn=10):
    """Print the articles whose TF-IDF vectors are most similar to ``query``.

    The query is normalised with ``preprocessor.preprocess_text``, transformed
    with the loaded ``vectorizer`` and scored against every row of
    ``tfidf_matrix`` with cosine similarity.

    Args:
        query: Raw query string.
        topn: Number of results to print. Defaults to 10, the value that used
            to be hard-coded.

    Returns:
        None. Each result is printed as a ``(title, similarity)`` tuple,
        highest similarity first.
    """
    preprocessed_query = preprocessor.preprocess_text(query)

    # Search: vectorise the query and score it against every document.
    query_vector = vectorizer.transform([preprocessed_query])
    similarities = cosine_similarity(query_vector, tfidf_matrix)

    # Sort first, then look up titles only for the rows that are printed,
    # instead of doing a DataFrame row lookup for every document in the corpus.
    # sorted() is stable, so equal scores keep their document order.
    ranked = sorted(enumerate(similarities[0]), key=lambda pair: pair[1], reverse=True)

    # Display the results.
    # NOTE(review): the column label carries a leading space, presumably
    # matching the CSV header — confirm before normalising it.
    titles = df[' Article title']
    for idx, sim in ranked[:topn]:
        print((titles.iloc[idx], sim))

In [13]:
# Example query against the TF-IDF index; tfidf_query prints its top matches
# as (title, similarity) tuples.
query = "sam altman"
tfidf_query(query)

('Worldcoin price drops 6% after Sam Altman unseated as OpenAI CEO', 0.4922975693228572)
("Worldcoin's Sam Altman tells Joe Rogan that US government is waging war on crypto", 0.47714439868319936)
("Sam Altman remains at Worldcoin's developer while he may return to OpenAI", 0.44508625269701596)
('Worldcoin price swings accompany twists in OpenAI saga', 0.424659368233429)
('Sam Altman says Worldcoin faces huge operational challenges as demand grows', 0.38477343498519045)
('Sam Altman joins investors in $19 million round for crypto life insurance startup', 0.32335432864778835)
('Sam Altman’s Worldcoin targets sovereign funds for latest raise as OpenAI booms', 0.24421760437312118)
('Worldcoin price sets all-time high on back of Sora AI launch, up 100% in a week', 0.18007894779395447)
('Worldcoin launches grant program, outlines decentralization plans', 0.1346294034241786)
("Worldcoin token drops 5% amid Elon Musk's lawsuit against OpenAI", 0.1295889079791297)
