In [1]:
import asyncio
import requests
from datetime import datetime, timedelta
from enum import Enum

NEWS_API_KEY = "3fc1f6647e6b4aaa9fc7aec282a27ea4"

class SortBy(Enum):
    popularity = "popularity"
    relevancy = "relevancy" 
    publishedAt = "publishedAt"

async def _news_api(
    query: str,
    num_articles: int = 50,
    sort_by: SortBy = SortBy.relevancy,
    last_day = 30,
) -> dict:
    # 免费账号限制，只能获取最近1月数据
    today = datetime.now()
    one_month_ago = today - timedelta(days=last_day)
    from_datetime = one_month_ago.strftime("%Y-%m-%d")

    # https://newsapi.org/v2/everything?q=Apple&from=2023-07-29&sortBy=popularity
    response = requests.get(
        "https://newsapi.org/v2/everything",
        params={
            "q": query,
            "apiKey": NEWS_API_KEY,
            "pageSize": num_articles,
            "sortBy": sort_by,
            "from": from_datetime,
        },
    )
    return response.json()['articles']

# await _news_api("OpenAI", 10, sort_by=SortBy.publishedAt, last_day=7), # 1周内最新10条
# await _news_api("OpenAI", 10, sort_by=SortBy.popularity), # 1月内最热10条
# await _news_api("OpenAI", 10, sort_by=SortBy.popularity, last_day=7)

In [2]:
import math
import itertools
from loguru import logger
from tqdm.auto import tqdm

async def search_news(queries, batch_size=4):
    def get_chunks(inputs, size):
        batches = []
        for item in inputs:
            batches.append(item)
            if len(batches) == batch_size:
                yield batches
                batches = [] 
        if batches:
            yield batches

    documents = {}
    chunks = get_chunks(queries, batch_size)
    for qs in tqdm(chunks, total=math.ceil(len(queries)/batch_size)):
        logger.debug(f"Search news: {qs}")
        coros = [ _news_api(query, num_articles=20, last_day=7) for query in qs ]
        results = await asyncio.gather(*coros)

        for articles in results:
            for doc in articles:
                doc_key = doc['title']
                logger.info(f"  Get [{doc_key}]")
                documents[doc_key] = doc

    return documents

# News about LLaMA2
queries = [
    "LLaMA2 news",
    "LLaMA2 latest developments",
    "LLaMA2 updates",
    "LLaMA2 new features",
    "LLaMA2 release",
    "LLaMA2 release date",
    "LLaMA2 beta",
    "LLaMA2 launch",
    "LLaMA2 availability",
    "LLaMA2 rumors",
    "LLaMA2 speculations",
    "LLaMA2 leaks",
    "LLaMA2 product review",
    "LLaMA2 comparison",
    "LLaMA2 vs competitors",
    "LLaMA2 pricing",
    "LLaMA2 sales",
    "LLaMA2 market share",
    "LLaMA2 impact on industry",
    "LLaMA2 customer reviews",
    "LLaMA2 reliability",
    "LLaMA2 security",
    "LLaMA2 performance",
    "LLaMA2 compatibility",
    "LLaMA2 integration",
    "LLaMA2 case studies",
    "LLaMA2 use cases",
    "LLaMA2 benefits",
    "LLaMA2 drawbacks",
    "LLaMA2 future prospects",
    "LLaMA2 innovation",
    "LLaMA2 technology",
    "LLaMA2 advancements",
]

docs = await search_news(queries)
len(docs)

  0%|          | 0/9 [00:00<?, ?it/s]

[32m2023-07-29 21:41:37.383[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36msearch_news[0m:[36m20[0m - [34m[1mSearch news: ['LLaMA2 news', 'LLaMA2 latest developments', 'LLaMA2 updates', 'LLaMA2 new features'][0m
[32m2023-07-29 21:41:42.013[0m | [1mINFO    [0m | [36m__main__[0m:[36msearch_news[0m:[36m27[0m - [1m  Get [Hugging News #0724: Llama 2 登陆 Hugging Face、AI 开源游戏竞赛获奖选手公布！ - HuggingFace][0m
[32m2023-07-29 21:41:42.014[0m | [1mINFO    [0m | [36m__main__[0m:[36msearch_news[0m:[36m27[0m - [1m  Get [Daily Hacker News for 2023-07-23][0m
[32m2023-07-29 21:41:42.014[0m | [1mINFO    [0m | [36m__main__[0m:[36msearch_news[0m:[36m27[0m - [1m  Get [How Meta’s Llama 2 Shifts Marketing’s Relationship With AI][0m
[32m2023-07-29 21:41:42.015[0m | [1mINFO    [0m | [36m__main__[0m:[36msearch_news[0m:[36m27[0m - [1m  Get [Llama2.c: inference llama 2 in one file of pure C | Hacker News][0m
[32m2023-07-29 21:41:42.016[0m | [1mINFO    [0m 

44

In [3]:
import pandas as pd

df = pd.DataFrame(docs).transpose()
df.to_csv('last_documents.csv', encoding="utf-8")
df

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
Hugging News #0724: Llama 2 登陆 Hugging Face、AI 开源游戏竞赛获奖选手公布！ - HuggingFace,"{'id': None, 'name': 'Cnblogs.com'}",HuggingFace,Hugging News #0724: Llama 2 登陆 Hugging Face、AI...,每一周，我们的同事都会向社区的成员们发布一些关于 Hugging Face 相关的更新，包括...,https://www.cnblogs.com/huggingface/p/17578351...,,2023-07-24T12:59:00Z,Hugging Face Hugging News Hugging News \r\n: \...
Daily Hacker News for 2023-07-23,"{'id': None, 'name': 'Daemonology.net'}",,Daily Hacker News for 2023-07-23,The 10 highest-rated articles on\nHacker News\...,https://www.daemonology.net/hn-daily/2023-07-2...,,2023-07-24T00:00:00Z,The 10 highest-rated articles on\r\nHacker New...
How Meta’s Llama 2 Shifts Marketing’s Relationship With AI,"{'id': None, 'name': 'CMSWire'}",pr@cmswire.com (Pierre DeBois),How Meta’s Llama 2 Shifts Marketing’s Relation...,Learn how Meta’s Llama 2 is changing the marke...,https://www.cmswire.com/digital-experience/how...,https://www.cmswire.com/-/media/8be51baf377a40...,2023-07-27T11:02:32Z,The Gist\r\n<ul><li>Llama 2 launch. Meta's Lla...
Llama2.c: inference llama 2 in one file of pure C | Hacker News,"{'id': 'hacker-news', 'name': 'Hacker News'}",tomcam,Llama2.c: inference llama 2 in one file of pur...,,https://news.ycombinator.com/item?id=36838051,,2023-07-25T05:47:01Z,I got the strangest output from your first lin...
Alibaba's cloud unit brings Meta's AI model Llama to its clients,"{'id': 'the-times-of-india', 'name': 'The Time...",Reuters,Alibaba's cloud unit brings Meta's AI model Ll...,Alibaba's cloud computing division said it has...,https://economictimes.indiatimes.com/tech/tech...,"https://img.etimg.com/thumb/msid-102126946,wid...",2023-07-26T05:31:09Z,Alibaba's cloud computing division said it has...
4 big analyst picks: Meta lands a buy rating on AI By,"{'id': None, 'name': 'Biztoc.com'}",investing.com,4 big analyst picks: Meta lands a buy rating o...,Here is your Pro Recap of the biggest analyst ...,https://biztoc.com/x/df335770b748ec0c,https://c.biztoc.com/p/df335770b748ec0c/s.webp,2023-07-26T15:30:08Z,Here is your Pro Recap of the biggest analyst ...
Meta Stock: Facebook Parent Beats Q2 Goals,"{'id': None, 'name': 'Biztoc.com'}",investors.com,Meta Stock: Facebook Parent Beats Q2 Goals,"parent company of Facebook, Instagram and What...",https://biztoc.com/x/a50f66afcc2cf2f2,https://c.biztoc.com/p/a50f66afcc2cf2f2/og.webp,2023-07-27T17:06:13Z,", parent company of Facebook, Instagram and Wh..."
"AWS Week in Review – Redshift+Forecast, CodeCatalyst+GitHub, Lex Analytics, Llama 2, and Much More – July 24, 2023","{'id': None, 'name': 'Amazon.com'}",Jeff Barr,"AWS Week in Review – Redshift+Forecast, CodeCa...",Summer is in full swing here in Seattle and we...,https://aws.amazon.com/blogs/aws/aws-week-in-r...,https://d2908q01vomqb2.cloudfront.net/da4b9237...,2023-07-24T22:58:56Z,Summer is in full swing here in Seattle and we...
Alibaba Cloud to Support Meta AI Model for Chinese Users,"{'id': None, 'name': 'Investopedia'}",Vaidik Trivedi,Alibaba Cloud to Support Meta AI Model for Chi...,ADRs of Alibaba and Meta shares rose after the...,https://www.investopedia.com/alibaba-cloud-to-...,https://www.investopedia.com/thmb/4DQiJTAZpSXt...,2023-07-26T13:55:09Z,The cloud arm of Alibaba Group (BABA) said Wed...
Unsupervised Learning NO. 391,"{'id': None, 'name': 'Danielmiessler.com'}",Daniel Miessler,Unsupervised Learning NO. 391,"AI Manipulation Defenders, .MIL Leak, and the ...",http://danielmiessler.com/p/391,https://beehiiv-images-production.s3.amazonaws...,2023-07-24T16:53:27Z,"Unsupervised Learning is a Security, AI, and M..."


# Embedding做文章排序

In [1]:
import pandas as pd

df = pd.read_csv('last_documents.csv', index_col=0)
df.head(3)

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
Hugging News #0724: Llama 2 登陆 Hugging Face、AI 开源游戏竞赛获奖选手公布！ - HuggingFace,"{'id': None, 'name': 'Cnblogs.com'}",HuggingFace,Hugging News #0724: Llama 2 登陆 Hugging Face、AI...,每一周，我们的同事都会向社区的成员们发布一些关于 Hugging Face 相关的更新，包括...,https://www.cnblogs.com/huggingface/p/17578351...,,2023-07-24T12:59:00Z,Hugging Face Hugging News Hugging News \r\n: \...
Daily Hacker News for 2023-07-23,"{'id': None, 'name': 'Daemonology.net'}",,Daily Hacker News for 2023-07-23,The 10 highest-rated articles on\nHacker News\...,https://www.daemonology.net/hn-daily/2023-07-2...,,2023-07-24T00:00:00Z,The 10 highest-rated articles on\r\nHacker New...
How Meta’s Llama 2 Shifts Marketing’s Relationship With AI,"{'id': None, 'name': 'CMSWire'}",pr@cmswire.com (Pierre DeBois),How Meta’s Llama 2 Shifts Marketing’s Relation...,Learn how Meta’s Llama 2 is changing the marke...,https://www.cmswire.com/digital-experience/how...,https://www.cmswire.com/-/media/8be51baf377a40...,2023-07-27T11:02:32Z,The Gist\r\n<ul><li>Llama 2 launch. Meta's Lla...


In [13]:
from sentence_transformers import SentenceTransformer

instruction = "为这个句子生成表示以用于检索相关文章："
instruction_en = "Represent this sentence for searching relevant passages:"
queries = ["LLaMA RLHF"]
passages = [
    f"{article['title']} {article['description']} {article['content']}"
        for _, article in df.iterrows()
]

model = SentenceTransformer('BAAI/bge-large-en')
q_embeddings = model.encode([instruction_en+q for q in queries], normalize_embeddings=True)
p_embeddings = model.encode(passages, normalize_embeddings=True)
scores = q_embeddings @ p_embeddings.T
scores

array([[0.75337267, 0.6653036 , 0.72978985, 0.7472973 , 0.74269295,
        0.6478652 , 0.65310436, 0.6958818 , 0.6909492 , 0.6593058 ,
        0.642282  , 0.75479937, 0.65342087, 0.7440892 , 0.64307153,
        0.6774725 , 0.7743534 , 0.7009691 , 0.7353648 , 0.7438247 ,
        0.68389887, 0.7672374 , 0.74533165, 0.7418338 , 0.63542855,
        0.7460848 , 0.7522627 , 0.6734979 , 0.6353735 , 0.7735085 ,
        0.758535  , 0.63516665, 0.65505517, 0.65303767, 0.6466215 ,
        0.75210625, 0.74482125, 0.666496  , 0.6433321 , 0.7228894 ,
        0.64142954, 0.612141  , 0.7044281 , 0.6454203 ]], dtype=float32)

In [14]:
xdf = df.copy()
xdf["score"] = scores[0]
xdf.sort_values(by='score', ascending=False)[['title', 'description', 'score']]

Unnamed: 0,title,description,score
LLaMA2 Chat 70B outperformed ChatGPT,LLaMA2 Chat 70B outperformed ChatGPT,Comments,0.774353
Llama2.c: inference llama 2 in one file of pure C,Llama2.c: inference llama 2 in one file of pure C,Inference Llama 2 in one file of pure C. Contr...,0.773508
"Show HN: Goat-7B LLM, a new SOTA among the open-source 7B models","Show HN: Goat-7B LLM, a new SOTA among the ope...","New research results around LLM 'alignment', d...",0.767237
Llama2 论文中译版——开放式基础和微调聊天模型 - 沉睡的木木夕,Llama2 论文中译版——开放式基础和微调聊天模型 - 沉睡的木木夕,# Llama 2：开放式基础和微调聊天模型 ## 写在前头 因为最近一直在使用 LLM 工...,0.758535
Alibaba#39;s cloud unit brings Meta#39;s AI model Llama to its clients,Alibaba#39;s cloud unit brings Meta#39;s AI mo...,"Meta released Llama2, a commercial version of ...",0.754799
Hugging News #0724: Llama 2 登陆 Hugging Face、AI 开源游戏竞赛获奖选手公布！ - HuggingFace,Hugging News #0724: Llama 2 登陆 Hugging Face、AI...,每一周，我们的同事都会向社区的成员们发布一些关于 Hugging Face 相关的更新，包括...,0.753373
What is LLama2? : Dive into Meta AI’s Newest Open Source Language Model,What is LLama2? : Dive into Meta AI’s Newest O...,Meta AI and Microsoft have come together to la...,0.752263
Show HN: LLM Boxing – GPT 3.5 vs. Llama 2,Show HN: LLM Boxing – GPT 3.5 vs. Llama 2,Article URL: https://llmboxing.com\nComments U...,0.752106
Llama2.c: inference llama 2 in one file of pure C | Hacker News,Llama2.c: inference llama 2 in one file of pur...,,0.747297
A Silent New AI Bombshell Launch Nobody Saw Coming,A Silent New AI Bombshell Launch Nobody Saw Co...,A Silent New AI Bombshell Launch Nobody Saw Co...,0.746085


In [5]:
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings

model = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", model_kwargs={"device": "cuda"})

def embeddings(inputs):
    return model.embed_documents(inputs)

load INSTRUCTOR_Transformer
max_seq_length  512


In [6]:
article_embeddings = embeddings(
    [
        f"{article['title']} {article['description']} {article['content']}"
        for _, article in df.iterrows()
    ]
)

# Re-rank

In [7]:
hypothetical_answer = "There have been rumors circulating about LLaMA2 making a major announcement at their upcoming event. Some speculate that NAME, the CEO of LLaMA2, will unveil a groundbreaking new technology. However, nothing has been confirmed yet and we'll have to wait for the event to find out more."

hypothetical_answer_embedding = embeddings(hypothetical_answer)[0]

In [9]:
import numpy as np

# Calculate cosine similarity
cosine_similarities = []
for article_embedding in article_embeddings:
    cosine_similarities.append(np.dot(hypothetical_answer_embedding, article_embedding))

np.max(cosine_similarities)

0.5697349326708971

In [13]:
df['similarity'] = cosine_similarities

xdf = df.sort_values(by='similarity', ascending=False).reset_index()
for i, row in xdf.head(10).iterrows():
    print(f"[{i+1}] Title:", row['title'])
    print("Description:", row["description"])
    print("Content:", row["content"])
    # print("Link:", row["url"])
    # print("Score:", row["similarity"])
    print(" ")

[1/10] Title: Llama2 论文中译版——开放式基础和微调聊天模型 - 沉睡的木木夕
Description: # Llama 2：开放式基础和微调聊天模型 ## 写在前头 因为最近一直在使用 LLM 工具，所以在学习 [Llama 2：开放式基础和微调聊天模型](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fin
Content: LLM Llama 2
ChatGPT3.5DeepLCopilot X
Llama 2LLMs 70 700 LLMs Llama 2-Chat Llama 2-Chat LLMs 
Llama 2-Chat 4k 95% 1% 2% 3.4.2 
GPT- 4 Llama 2-Chat GPT-4 /+ GPT-4 
1 
LLMs
LLMs Auto-regressive t… [+12961 chars]
 
[2/10] Title: This Week In Rust: This Week in Rust 505
Description: Hello and welcome to another issue of This Week in Rust!
Rust is a programming language empowering everyone to build reliable and efficient software.
This is a weekly summary of its progress and community.
Want something mentioned? Tag us at @ThisWeekInRust o…
Content: Hello and welcome to another issue of This Week in Rust!
Rust is a programming language empowering everyone to build reliable and efficient software.
This is a weekly summary of its progress and co… [+11470 chars]
 
[3/10] Title