# Search

In [None]:
import os
import json
import random
import numpy as np
from collections import defaultdict

from pyserini.search.lucene import LuceneSearcher
from pyserini.search import FaissSearcher
from pyserini.search.faiss._searcher import AutoQueryEncoder

from utils import *
import time

In [None]:
# Embedding model identifier. It is reused below for the SentenceTransformer
# reranker, the FAISS query encoder and the FAISS index directory name.
model_id = "infgrad/stella-base-zh-v3-1792d"
# Alternative embedding model (currently disabled).
# model_id = "qihoo360/360Zhinao-search"
# Device string handed to both model constructors below.
# NOTE(review): hard-coded to GPU index 7 -- adjust for the machine in use.
device = "cuda:7"

In [None]:
# Ground truth: maps each query string to the list of its target document IDs.
# Every non-empty line of the file is a JSON object of {query: doc_id} pairs;
# IDs are normalised to strings, and a query seen again overwrites the
# earlier entry.
query2ids: dict[str, list[str]] = {}
# Explicit UTF-8 so the (Chinese) queries do not depend on the platform locale.
with open(QUERY2ID_LIST_FPATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        for query, doc_id in json.loads(line).items():
            query2ids[query] = [str(doc_id)]

# Cut-offs k at which NDCG@k is reported.
KS = [1]

In [None]:
# Pool of all target document IDs, used by the random baseline.
# It is materialised as a list because random.sample() no longer accepts a set
# (deprecated since Python 3.9). Sorting makes the order independent of string
# hash randomisation, so a fixed random seed yields the same samples on every run.
ALL_IDS = sorted({doc_id for doc_ids in query2ids.values() for doc_id in doc_ids})

In [None]:
# Single query used for the qualitative Searcher.test() spot checks below.
TEST_QUERY = "名誉权"


class Searcher:
    """Wrap a pyserini searcher and provide NDCG@k evaluation helpers.

    Depending on the constructor arguments an instance wraps a
    ``LuceneSearcher`` (``index_dir`` only), a ``FaissSearcher``
    (``index_dir`` plus ``encoder``) or no searcher at all. In the last case
    ``get_query2ids`` returns randomly sampled document IDs, which serves as a
    random baseline.
    """

    @staticmethod
    def ndcg_at_k(hit_ids, target_ids, k):
        """
        Compute NDCG@k with binary relevance.

        Args:
            hit_ids (list): Retrieved document IDs in ranked order.
            target_ids (list): IDs of the relevant (target) documents.
            k (int): Number of top-ranked documents to take into account.

        Returns:
            float: NDCG@k. 0.0 is returned when the ideal DCG would be zero
            (no target IDs, or ``k <= 0``) instead of dividing by zero.
        """
        if k <= 0 or len(target_ids) == 0:
            return 0.0

        # Set membership keeps the relevance lookup O(1) per hit.
        targets = set(target_ids)

        # Binary relevance of the top-k hits; missing ranks stay at 0.
        relevances = np.zeros(k)
        for rank, doc_id in enumerate(hit_ids[:k]):
            if doc_id in targets:
                relevances[rank] = 1

        # Positional discount log2(rank + 1) for ranks 1..k.
        discounts = np.log2(np.arange(2, k + 2))
        dcg = np.sum(relevances / discounts)

        # Ideal ranking: all targets (at most k of them) placed first.
        ideal_relevances = np.zeros(k)
        ideal_relevances[: len(target_ids)] = 1
        idcg = np.sum(ideal_relevances / discounts)

        return dcg / idcg

    def __init__(
        self,
        index_dir: str = None,
        encoder: str = None,
        ks: list[int] = None,
        k_max: int = None,
        n_threads: int = 16,
        view_searcher=None,
    ):
        """
        Args:
            index_dir: Index directory (str or path-like). ``None`` creates a
                searcher-less instance used as the random baseline.
            encoder: Query encoder forwarded unchanged to ``FaissSearcher``.
                When ``None`` the index is opened with ``LuceneSearcher``.
            ks: Cut-offs for NDCG@k; defaults to the module-level ``KS``.
            k_max: Number of hits requested per query in batch search.
                Defaults to ``max(ks)``, multiplied by 5 when the index path
                contains ``"seg"``.
            n_threads: Thread count passed to the batch search.
            view_searcher: Searcher used by ``test`` to display documents;
                defaults to the wrapped searcher itself.
        """
        self.index_dir = index_dir
        self.encoder = encoder
        self.k_max = k_max
        self.ks = ks
        self.n_threads = n_threads

        if self.index_dir is not None:
            # Callers pass either a str or a pathlib.Path; normalise to str
            # before handing the path to pyserini.
            index_path = str(self.index_dir)
            if self.encoder is None:
                self.searcher = LuceneSearcher(index_path)
                self.searcher.set_language("zh")
            else:
                self.searcher = FaissSearcher(index_path, self.encoder)
        else:
            self.searcher = None

        if self.ks is None:
            self.ks = KS
        if self.k_max is None:
            self.k_max = max(self.ks)
            # Indexes whose path contains "seg" get five times as many hits.
            # NOTE(review): presumably because several segment hits collapse
            # to the same document after de-duplication -- confirm.
            if self.index_dir is not None and "seg" in str(self.index_dir):
                self.k_max *= 5

        if view_searcher is None:
            self.view_searcher = self.searcher
        else:
            self.view_searcher = view_searcher

    def batch_search(self, queries: list[str]) -> dict[str, list]:
        """Run all ``queries`` through the wrapped searcher and time the call.

        The query strings double as query IDs, so the returned dictionary is
        keyed by query text. An empty query list returns an empty dictionary
        without touching the searcher.
        """
        n_queries = len(queries)
        print(f"Searching for {n_queries} queries...")
        if n_queries == 0:
            # Nothing to search; also avoids dividing by zero below.
            return {}

        search_start = time.time()
        query2hits = self.searcher.batch_search(
            queries, queries, k=self.k_max, threads=self.n_threads
        )
        search_timecost = time.time() - search_start
        print(
            f"Time per query: {search_timecost / n_queries:.4f} s (= {search_timecost:.4f}/{n_queries})"
        )
        return query2hits

    def get_query2ids(self, queries: list[str]) -> list[tuple[str, list[str]]]:
        """Return ``(query, ranked document IDs)`` pairs for ``queries``.

        Without a wrapped searcher the IDs are sampled at random from
        ``ALL_IDS``. Otherwise the hits of a batch search are used; docids
        containing ``"-"`` are cut at the first ``"-"`` and de-duplicated
        while keeping rank order.
        """
        if self.searcher is None:
            # Never request more samples than there are IDs to draw from.
            sample_size = min(self.k_max, len(ALL_IDS))
            return [
                (query, random.sample(ALL_IDS, sample_size)) for query in queries
            ]

        query2hits = self.batch_search(queries)
        query_hit_ids_pairs = []
        for query, hits in query2hits.items():
            hit_ids = [hit.docid for hit in hits]
            if hit_ids and "-" in hit_ids[0]:
                # dict.fromkeys de-duplicates but keeps the first-seen order.
                hit_ids = list(
                    dict.fromkeys(docid.split("-")[0] for docid in hit_ids)
                )
            query_hit_ids_pairs.append((query, hit_ids))

        return query_hit_ids_pairs

    def calc_ndcg_at_k(
        self,
        query_hit_ids_pairs: list[tuple[str, list[str]]],
        targets: dict[str, list[str]] = None,
    ) -> dict[int, list[float]]:
        """Compute and print the mean NDCG@k for every k in ``self.ks``.

        Args:
            query_hit_ids_pairs: ``(query, ranked document IDs)`` pairs.
            targets: Mapping from query to its target IDs. Falls back to the
                module-level ``query2ids`` when omitted.

        Returns:
            Mapping from k to the list of per-query NDCG@k values.
        """
        if targets is None:
            targets = query2ids

        ndcg_at = defaultdict(list)
        for query, hit_ids in query_hit_ids_pairs:
            target_ids = targets[query]
            for k in self.ks:
                ndcg_at[k].append(Searcher.ndcg_at_k(hit_ids, target_ids, k))
        for k, ndcg_list in ndcg_at.items():
            print(f"NDCG@{k:<2}: {np.mean(ndcg_list):.4f}")
        return ndcg_at

    def test_ndcg_at_k(
        self,
        query2ids: dict[str, list[str]],
    ):
        """Search every query in ``query2ids`` and print the mean NDCG@k.

        Returns nothing; the per-k means are printed by ``calc_ndcg_at_k``.
        """
        query_hit_ids_pairs = self.get_query2ids(list(query2ids.keys()))
        # Score against the mapping that was passed in.
        self.calc_ndcg_at_k(query_hit_ids_pairs, query2ids)

    def test(
        self,
        query: str,
        query2ids: dict[str, list[str]] = None,
    ):
        """Print the hits for one query: docid, score and stored document.

        When ``query2ids`` is given, the target IDs for ``query`` are printed
        as well so they can be compared with the hit IDs by eye.

        Raises:
            RuntimeError: If the instance has no wrapped searcher.
        """
        if self.searcher is None:
            raise RuntimeError("Searcher.test() requires an index-backed searcher")

        hit_ids = []

        for hit in self.searcher.search(query):
            hit_id = hit.docid
            hit_ids.append(hit_id)
            print(f"### {hit_id:>5}: {hit.score:.5f}")
            # Documents are displayed through view_searcher, which may be a
            # different searcher than the one that produced the hits.
            print(self.view_searcher.doc(hit_id).raw())

        print(f"Hit IDs: {hit_ids}")
        if query2ids is not None:
            print(f"Target IDs: {query2ids.get(query)}")

## Random

In [None]:
# Fix the RNG seed before the random baseline draws its samples.
random.seed(42)
# No index_dir is given, so the wrapped searcher is None and
# get_query2ids() falls back to sampling IDs from ALL_IDS.
rand_searcher = Searcher()

In [None]:
# Random baseline: mean NDCG@k over all queries in query2ids.
rand_searcher.test_ndcg_at_k(query2ids)

NDCG@1 : 0.0000


## BM25

In [None]:
# No encoder is passed, so Searcher opens this index with LuceneSearcher.
bm25_searcher = Searcher(str(IDX_HOME / "bm25_fulltext_zh_test_idxs"))

In [None]:
# Spot check: print hits (docid, score, stored document) for TEST_QUERY,
# followed by the target IDs recorded for it in query2ids.
bm25_searcher.test(query=TEST_QUERY, query2ids=query2ids)

### 83477: 8.73480
{
  "id" : 83477,
  "contents" : "北京市海淀区人民法院 民事判决书 （2018）京0108民初8145号 原告：张豫冬，男，1971年11月30日出生，汉族，作家，住南京市鼓楼区。 委托诉讼代理人：曹若楠，北京市京师律师事务所律师。 委托诉讼代理人：郭芳汝，北京市京师律师事务所实习律师。 被告：端宏斌，男，1980年3月29日出生，汉族，财经作家，住上海市浦东新区。 委托诉讼代理人：马立国，天津全唐律师事务所律师。 委托诉讼代理人：王星星，天津全唐律师事务所实习律师。 被告：北京搜狐互联网信息服务有限公司，住所地北京市海淀区科学院南路2号院3号楼11层1101。 法定代表人：张朝阳，董事长。 委托诉讼代理人：夏雪，女，该公司员工。 原告张豫冬与被告端宏斌、北京搜狐互联网信息服务有限公司（以下简称搜狐公司）肖像权、名誉权纠纷一案，本院于2018年1月22日立案后，依法适用普通程序，公开开庭进行了审理。原告张豫冬的委托诉讼代理人曹若楠、郭芳汝，被告端宏斌的委托诉讼代理人马立国、王星星及被告搜狐公司的委托诉讼代理人夏雪到庭参加诉讼。本案现已审理终结。 原告向本院提出诉讼请求：1．判令被告一赔偿原告精神损害赔偿金500000元；2．判令被告二赔偿原告精神损害赔偿金100000元；3．判令被告赔偿原告公证费4230元；4．判令被告删除有关侵犯原告肖像的文章和图片，断开涉嫌侵权的页面链接（原告已当庭表示撤回）；5．判令被告在全国公开发行的报纸上赔礼道歉，致歉内容应当包含诉讼案件的判决书案号及内容，侵权图片名称及位置，致歉发布时间应不少于10日；6．请求判令被告承担全部诉讼费用。事实和理由：2017年5月30日，端宏斌在公众号老端的观点上，发布原创文章孔令辉要被叠码仔坑死了，在未经原告本人许可的情况下，端宏斌擅自在该文章中使用了原告的照片作为配图。其后搜狐公司旗下网站搜狐财经、搜狐网转载该文章，原告通知其删除文章，断开链接，但被告仍未采取任何必要措施。原告作为知名作家具有一定的知名度，使用其照片利用了原告的社会和市场影响力，微信公众号的运营本身是一种盈利行为，端宏斌严重侵犯了原告的肖像权。在100000＋的微信公众号上擅自使用原告的照片作为文章配图，造成公众对原告的误解，导致原告的社会评价降低，同时侵犯了原告

In [None]:
# Full-text BM25: mean NDCG@k over all queries in query2ids.
bm25_searcher.test_ndcg_at_k(query2ids)

Searching for 1740 queries...
Time per query: 0.0001 s (= 0.1608/1740)
NDCG@1 : 0.3736


## Seg + BM25

In [None]:
# Lucene searcher over the segment-level index. k_max is given explicitly, so
# the automatic x5 enlargement that Searcher.__init__ applies to "seg" index
# paths (only when k_max is None) is not used here.
bm25_seg_searcher = Searcher(str(IDX_HOME / "bm25_fulltext_test_seg_zh_idxs"), k_max=10)

In [None]:
# Spot check on the segment index; docids here have the form "<doc>-<segment>".
bm25_seg_searcher.test(query=TEST_QUERY, query2ids=query2ids)

### 83477-7: 9.96350
{
  "id" : "83477-7",
  "contents" : "属于人格权，端宏斌侵犯了张豫冬的肖像权，理应向张豫冬赔偿相应的精神损失，但本院同时考虑到整个文章并不针对张豫冬，端宏斌在使用涉案肖像图片时也并未篡改、歪曲肖像或其他侮辱性的使用，只是作为解释文字含义的配图使用，情节并不严重，故本院对精神损失赔偿数额予以综合考虑后酌情判定。 就张豫冬主张涉案图片使用侵害其名誉权一节，本院认为，自然人的名誉乃是公民人格尊严的体现，自然人的名誉权是其依赖自己的名誉参与社会生活的权利。公民享有名誉权，公民的人格尊严受法律保护，禁止使用侮辱、诽谤等方式损害公民的名誉。本案中，涉案文章的主要内容从孔令辉借资赌博事件说起，最后落脚在奉劝民众不要赌博的要旨上，其间使用了诉争肖像图片作为解释叠码仔含义的配图，但从文章使用该肖像图片的直接目的和涉案文章内容来看，都不是意图指向原告张豫冬，甚至通篇都没有提到张豫冬的姓名，张豫冬与该文章内容并没有直接联系。涉案肖像图片使用作为端宏斌公开言论表达及传播的一个部分，其并未公开传播对张豫冬具有诽谤意义的虚假或歪曲事实，亦未通过涉案图片公开表达传播对张豫冬人格具有侮辱意义的观点言论，故端宏斌并未侵犯张豫冬的名誉权，无需承担侵犯名誉权的侵权责任。 就搜狐公司转"
}
### 83477-8: 9.34500
{
  "id" : "83477-8",
  "contents" : "载包含诉争肖像图片的涉案文章是否侵犯张豫冬肖像权及名誉权一节，鉴于前文已分析指出端宏斌并未侵犯张豫冬的名誉权，故搜狐公司的前述行为也不构成对张豫冬名誉权的侵犯。但是，端宏斌使用张豫冬肖像图片的行为已构成对张豫冬肖像权的侵犯，搜狐互联公司未经张豫冬许可擅自转载包含其肖像图片的涉案文章，已构成对张豫冬肖像权的侵犯，应当承担侵犯张豫冬肖像权的相应侵权责任，包括停止侵权、赔礼道歉及赔偿精神损失。鉴于搜狐公司的转载行为属于自己的行为，故并不因是否通知删除而免责，而且张豫冬无相应证据证明搜狐公司与端宏斌之间存在共同侵权的意思联络，搜狐公司与端宏斌分别实施侵权行为造成同一损害后果，应当承担按份的侵权责任。虽然搜狐公司系转发，但鉴于其平台身份的影响力较大，故本院酌定二者责任份额为6：4。 综上，本院依照《中华人民共和国民法通则》第一百条、

In [None]:
# Segment-level BM25: segment docids are collapsed to document IDs inside
# get_query2ids() before NDCG@k is computed.
bm25_seg_searcher.test_ndcg_at_k(query2ids)

Searching for 1740 queries...
Time per query: 0.0002 s (= 0.2913/1740)
NDCG@1 : 0.3914


## Seg + BM25 + Reranking

In [None]:
from sentence_transformers import SentenceTransformer
import torch

In [None]:
# Keep the raw BM25 segment hits (keyed by query text) as reranking candidates.
bm25_seg_query2hits = bm25_seg_searcher.batch_search(list(query2ids.keys()))

Searching for 1740 queries...
Time per query: 0.0002 s (= 0.2776/1740)


In [None]:
# Embedding model used below to encode the queries and the BM25 segment hits
# for reranking.
model = SentenceTransformer(
    model_id,
    device=device,
    model_kwargs={
        # Half precision is available but currently disabled.
        # "torch_dtype": torch.float16,
        "attn_implementation": "sdpa",
    },
)

Some weights of BertModel were not initialized from the model checkpoint at infgrad/stella-base-zh-v3-1792d and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fetch and parse the stored text of every BM25 hit exactly once, grouped per
# query in the iteration order of bm25_seg_query2hits.
seg_hit_texts_list = [
    [
        json.loads(bm25_seg_searcher.view_searcher.doc(hit.docid).raw())["contents"]
        for hit in hits
    ]
    for hits in bm25_seg_query2hits.values()
]

# Flat list of all hit texts (duplicates included).
all_seg_hit_texts = [text for texts in seg_hit_texts_list for text in texts]

# Encode each distinct text only once. dict.fromkeys de-duplicates while
# keeping first-seen order, so the encoding order is deterministic.
all_uniq_seg_texts = list(dict.fromkeys(all_seg_hit_texts))

all_uniq_seg_hit_vecs = model.encode(
    all_uniq_seg_texts, batch_size=16, show_progress_bar=True
)

# Map every hit back to its vector; the outer list stays aligned with
# bm25_seg_query2hits.items().
uniq_seg_text2vec = dict(zip(all_uniq_seg_texts, all_uniq_seg_hit_vecs))
seg_hit_vecs_list = [
    [uniq_seg_text2vec[text] for text in texts] for texts in seg_hit_texts_list
]

Batches:   0%|          | 0/269 [00:00<?, ?it/s]

In [None]:
# Encode all queries; the rows of query_vecs follow the key order of query2ids.
query_vecs = model.encode(list(query2ids.keys()), batch_size=16, show_progress_bar=True)

Batches:   0%|          | 0/109 [00:00<?, ?it/s]

In [None]:
ndcg_at = defaultdict(list)

# query_vecs was encoded in the key order of query2ids, whereas the hit
# dictionary returned by batch_search may iterate in a different order. Look
# each query vector up by its text rather than relying on position.
query2vec = dict(zip(query2ids.keys(), query_vecs))

# seg_hit_vecs_list was built by iterating bm25_seg_query2hits, so these two
# stay aligned position by position.
for hit_vecs, (query, hits) in zip(seg_hit_vecs_list, bm25_seg_query2hits.items()):
    target_ids = query2ids[query]

    hit_ids = [hit.docid for hit in hits]
    if len(hit_ids) > 0:
        # Dot-product similarity between the query and each candidate segment.
        hit_scores = query2vec[query] @ np.array(hit_vecs).T
        # Rerank: the most similar segment must come first (descending score).
        reranked = sorted(
            zip(hit_ids, hit_scores), key=lambda pair: pair[1], reverse=True
        )
        hit_ids = [hit_id for hit_id, _ in reranked]
        if "-" in hit_ids[0]:
            # Collapse segment IDs to document IDs, keeping the first (best)
            # occurrence of every document.
            hit_ids = [hit_id.split("-")[0] for hit_id in hit_ids]
            hit_ids = list(dict.fromkeys(hit_ids))

    for k in KS:
        ndcg_at[k].append(Searcher.ndcg_at_k(hit_ids, target_ids, k))

for k in KS:
    print(f"NDCG@{k:<2}: {np.mean(ndcg_at[k]):.4f}")

NDCG@1 : 0.1592


## BM25+Rocchio

In [None]:
# Separate Lucene searcher over the full-text index, so that enabling Rocchio
# below does not affect bm25_searcher.
bm25_rocchio_searcher = Searcher(str(IDX_HOME / "bm25_fulltext_zh_test_idxs"))

In [None]:
# Enable Rocchio on the wrapped searcher with debug logging, verify that it is
# active, then inspect the hits for TEST_QUERY.
bm25_rocchio_searcher.searcher.set_rocchio(debug=True)
assert bm25_rocchio_searcher.searcher.is_using_rocchio()
bm25_rocchio_searcher.test(query=TEST_QUERY, query2ids=query2ids)

2024-05-29 06:42:47,437 INFO  [main] lib.RocchioReranker (RocchioReranker.java:141) - QID: null
2024-05-29 06:42:47,438 INFO  [main] lib.RocchioReranker (RocchioReranker.java:142) - Original Query: (名誉)^1.0 (誉权)^1.0
2024-05-29 06:42:47,438 INFO  [main] lib.RocchioReranker (RocchioReranker.java:143) - Feedback Query: (姜某)^0.23284954 (张某)^0.32359484 (俊杰)^0.22059405 (文章)^0.17678362 (孟魁)^0.24170865 (毛俊)^0.22059405 (名誉)^0.9546514 (护利)^0.24880524 (科技)^0.17743175 (王护)^0.24880524 (誉权)^0.70710677
2024-05-29 06:42:47,438 INFO  [main] lib.RocchioReranker (RocchioReranker.java:144) - Feedback term: 科技 -> 0.17743175
2024-05-29 06:42:47,438 INFO  [main] lib.RocchioReranker (RocchioReranker.java:144) - Feedback term: 毛俊 -> 0.22059405
2024-05-29 06:42:47,438 INFO  [main] lib.RocchioReranker (RocchioReranker.java:144) - Feedback term: 文章 -> 0.17678362
2024-05-29 06:42:47,438 INFO  [main] lib.RocchioReranker (RocchioReranker.java:144) - Feedback term: 名誉 -> 0.9546514
2024-05-29 06:42:47,438 INFO  [main]

In [None]:
# Re-configure Rocchio without debug logging before the full evaluation run.
bm25_rocchio_searcher.searcher.set_rocchio(debug=False)
bm25_rocchio_searcher.test_ndcg_at_k(query2ids)

Searching for 1740 queries...


Time per query: 0.0010 s (= 1.7782/1740)
NDCG@1 : 0.3736


## BM25+RM3

In [None]:
# Separate Lucene searcher over the full-text index, so that enabling RM3
# below does not affect the other searchers.
bm25_rm3_searcher = Searcher(str(IDX_HOME / "bm25_fulltext_zh_test_idxs"))

In [None]:
# Enable RM3 on the wrapped searcher with debug logging and filter_terms
# switched off, verify that it is active, then inspect the hits for TEST_QUERY.
bm25_rm3_searcher.searcher.set_rm3(debug=True, filter_terms=False)
assert bm25_rm3_searcher.searcher.is_using_rm3()
bm25_rm3_searcher.test(query=TEST_QUERY, query2ids=query2ids)

2024-05-29 06:42:49,699 INFO  [main] lib.Rm3Reranker (Rm3Reranker.java:107) - QID: null
2024-05-29 06:42:49,699 INFO  [main] lib.Rm3Reranker (Rm3Reranker.java:108) - Original Query: (名誉)^1.0 (誉权)^1.0
2024-05-29 06:42:49,700 INFO  [main] lib.Rm3Reranker (Rm3Reranker.java:109) - Feedback Query: (康姿)^0.043518174 (俊杰)^0.05086634 (毛俊)^0.05086634 (名誉)^0.25 (张豫)^0.044858526 (豫冬)^0.044858526 (号码)^0.052709505 (护利)^0.061148014 (王护)^0.061148014 (誉权)^0.25 (标注)^0.04650839 (姿百)^0.043518174
2024-05-29 06:42:49,700 INFO  [main] lib.Rm3Reranker (Rm3Reranker.java:110) - Feedback term: 豫冬 -> 0.044858526
2024-05-29 06:42:49,700 INFO  [main] lib.Rm3Reranker (Rm3Reranker.java:110) - Feedback term: 号码 -> 0.052709505
2024-05-29 06:42:49,700 INFO  [main] lib.Rm3Reranker (Rm3Reranker.java:110) - Feedback term: 毛俊 -> 0.05086634
2024-05-29 06:42:49,700 INFO  [main] lib.Rm3Reranker (Rm3Reranker.java:110) - Feedback term: 康姿 -> 0.043518174
2024-05-29 06:42:49,700 INFO  [main] lib.Rm3Reranker (Rm3Reranker.java:110) 

In [None]:
# Re-configure RM3 without debug logging before the full evaluation run.
# NOTE(review): filter_terms is not passed here, so it reverts to the library
# default rather than the False used in the spot check -- confirm intended.
bm25_rm3_searcher.searcher.set_rm3(debug=False)
bm25_rm3_searcher.test_ndcg_at_k(query2ids)

Searching for 1740 queries...
Time per query: 0.0009 s (= 1.6496/1740)
NDCG@1 : 0.2833


In [None]:
# Second RM3 run with explicit positional parameters.
# NOTE(review): presumably fb_terms=5, fb_docs=5, original_query_weight=0.9 --
# confirm against the pyserini set_rm3 signature.
bm25_rm3_searcher.searcher.set_rm3(5, 5, 0.9, debug=False)
bm25_rm3_searcher.test_ndcg_at_k(query2ids)

Searching for 1740 queries...


Time per query: 0.0007 s (= 1.1355/1740)
NDCG@1 : 0.3454


## Seg + FAISS

In [None]:
# Query encoder handed to the FAISS searcher below: mean pooling without L2
# normalisation. The CLS-pooling / L2-normalised variant is kept disabled.
# NOTE(review): these settings presumably have to match how the FAISS index
# vectors were produced -- confirm against the indexing script.
query_encoder = AutoQueryEncoder(
    model_id,
    device=device,
    pooling="mean",
    l2_norm=False,
    # pooling="cls",
    # l2_norm=True,
)

Some weights of BertModel were not initialized from the model checkpoint at infgrad/stella-base-zh-v3-1792d and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Dense searcher over the segment-level FAISS index. The index directory name
# embeds the model id with "/" replaced by "--". Documents are displayed
# through the BM25 segment searcher (view_searcher) in Searcher.test().
faiss_seg_searcher = Searcher(
    IDX_HOME / f"faiss_fulltext_test_seg_{model_id.replace('/', '--')}_idxs",
    query_encoder,
    view_searcher=bm25_seg_searcher.searcher,
)

In [None]:
# Spot check of the dense searcher on TEST_QUERY.
faiss_seg_searcher.test(query=TEST_QUERY, query2ids=query2ids)

### 61394-1: -139.26257
{
  "id" : "61394-1",
  "contents" : "侵犯其名誉权，应当承担赔偿责任。 被告中国移动通信集团湖南有限公司岳阳市平江县分公司辩称，1、被告没有实施侵权行为，被告没有开展手机号码标注的业务；2、原告的手机是否显示为某殡葬公司的名称，被告不知情。被告为通信运营商，仅承担两通信终端之间的信号传输，对信号内容不干预；3、原告与“平江县佛缘殡葬有限公司”的法定代表人李红卫系夫妻关系，不排除李红卫对原告使用的号码向电话邦公司办理了标注业务。被告不构成侵权，不应承担赔偿责任。 本院经审理认定事实如下：对于原告使用的手机号码（134××××4998）曾被标注“平江县佛缘殡葬有限公司”，起诉后经被告与标注公司“电话邦”反映后，已被取消标注。“平江县佛缘殡葬有限公司”的法定代表人李红卫与原告系夫妻关系的事实双方无争议，本院予以确认。双方争议的焦点是被告是否实施了擅自标注原告手机号码的侵权行为。双方对自己的主张分别举证，原告提交了证据1、电话拨打的截图，证明被告侵犯原告的名誉权；2、被告的答辩状，证明被告承认原告的手机被错误标注。被告质证认为，原告的证据1、2不能达到原告的证明目的。被告提交了证据1、企业登记情况表，证明涉案手机号码（134××××4998）自2010年"
}
### 19005-9: -144.10883
{
  "id" : "19005-9",
  "contents" : "权利，并可以作为当事人进行涉及著作权或者与著作权有关的权利的诉讼、仲裁活动。而本案中，广州东嘉公司系涉案音乐作品的著作权人，其于2012年10月10日将该著作权独家授权给广东播种者公司。后广东播种者公司又将该作品中词曲的著作权、复制权、放映权、广播权以专有的方式授权给上诉人深圳声影公司。根据上述权利流转过程可知，上诉人深圳声影公司并非涉案音乐作品的著作权人，其仅仅经许可获得了该作品的使用权。且在上诉人与广东播种者公司签订的《音像著作权授权合同》中约定，上诉人深圳声影公司可以以自己的名义享有对卡拉OK等公共娱乐场所经营者授权使用的独家管理权，并有权以自己的名义向侵权使用者提起诉讼。以上约定与《著作权集体管理条例》中著作权集体管理组织的管理内容性质相同，而根据该条例规定，除著作权集体管理组织外，任何组织和个人均不得

In [None]:
# Segment-level dense retrieval: mean NDCG@k over all queries in query2ids.
faiss_seg_searcher.test_ndcg_at_k(query2ids)

Searching for 1740 queries...
Time per query: 0.0067 s (= 11.7390/1740)
NDCG@1 : 0.1828
