In [1]:
import numpy as np 
import pandas as pd
from collections import Counter
import re
import os 
os.environ['CUDA_VISIBLE_DEVICES']='0'
import joblib
import json 
import jieba 
import copy

from openai import AzureOpenAI
import matplotlib.pyplot as plt 
from sentence_transformers import SentenceTransformer
from FlagEmbedding import FlagReranker
from transformers import AutoTokenizer
from sklearn.metrics import average_precision_score

In [2]:
# 原始数据处理
def format_model(x):
    model_list = x.split(',')
    model_list = [i.strip().lower() for i in model_list]
    new_list = [model_list[0]]
    i = 1
    while i < len(model_list):
        if (i != len(model_list) - 1) and (model_list[i-1] == model_list[i]):
            new_list.append(model_list[i]+model_list[i+1])
            if i < len(model_list) - 1:
                i += 2
            else:
                break
        elif (i != len(model_list) - 1) and (model_list[i-1] != model_list[i]):
            new_list.append(model_list[i])
            i += 1
        elif (model_list[i] == "上下水") or (model_list[i] == "air"):
            for j in range(len(new_list)):
                if model_list[i-1] == new_list[j]:
                    new_list.pop(j)
                    break
            new_list.append(model_list[i-1]+model_list[i])
            i += 1
        else:
            new_list.append(model_list[i])
            break
    return new_list

def format_all_models(x, dim_df):
    new_list = []
    for i in x:
        if i.find("全型号") >= 0:
            end_idx = i.find("全型号")
            name = i[:end_idx]
            new_list += [j for j in dim_df[dim_df['cat_name'] == name].model.tolist() if j not in x]
        else:
            new_list.append(i)
    return new_list

def format_series(x, dim_df):
    def contains_chinese(s):
        return re.search('[\u4e00-\u9fff]', s) is not None
    new_list = []
    for i in x:
        if i.find("系列") >= 0:
            end_idx = i.find("系列")
            name = i[:end_idx]
            new_list += [j for j in dim_df[(dim_df.model.str.find(name)>=0) & (
                dim_df.model.apply(lambda x: not contains_chinese(x)))].model.tolist() if j not in x]
            new_list += [i]
        else:
            new_list.append(i)
    return new_list

In [3]:
# 拼接openai embedding
def generate_embeddings(text, model="text-embedding-ada-002"): # model = "deployment_name"
    return client.embeddings.create(input=[text], model=model).data[0].embedding

In [4]:
# 测试集处理及计算与正确qa的相似度
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def search_docs(df, user_query, top_n=4, to_print=True):
    embedding = generate_embeddings(
        user_query,
    )
    df["similarities"] = df.ada_002.apply(lambda x: cosine_similarity(x, embedding))

    res = (
        df.sort_values("similarities", ascending=False)
        .head(top_n)
    )[["qa_id", "question", "answer", "similarities"]]
    return res.to_dict(orient='records')

def concat(x):
    return ",".join(x.astype(str).tolist())

def format_gt(x):
    if str(x) == "nan":
        return x
    else:
        return ",".join(x.split("\n"))

def count_gt(x):
    if str(x) == "nan":
        return 0
    else:
        return len(x.split(","))   

In [5]:
# 向量召回
def search_docs_bge(df, user_query, top_n=4, to_print=True):
    embedding = model.encode(user_query, normalize_embeddings=True).tolist()
    df["similarities"] = df.bge_large.apply(lambda x: cosine_similarity(x, embedding))
    output_columns = ["qa_id", "question", "answer", "similarities"]
    if "hit_reason" in df.columns:
        output_columns.append("hit_reason")
    res = (
        df.sort_values("similarities", ascending=False, 
                       kind="mergesort"
                      )
        .head(top_n)
    )[output_columns]
    return res.to_dict(orient='records')

def find_non_chinese_substrings(s):
    # 正则表达式解释：
    # [^\u4e00-\u9fff\W]+ 匹配非中文字符和非ASCII标点的连续字符
    # 但这样会排除空格，所以我们需要允许空格存在
    # 我们使用(?:[^\u4e00-\u9fff\W]| )+ 来实现这一点，(?:) 是非捕获组，用于匹配模式但不作为捕获结果返回
    # [^\u4e00-\u9fff\W] 匹配非中文且非标点的字符，| 表示或，空格 ' ' 被显式允许
    pattern = r'(?:[^\u4e00-\u9fff\W]| )+'
    
    # 使用findall方法查找所有匹配项
    matches = re.findall(pattern, s)
    
    # 过滤掉只包含空格的字符串
    matches = [match for match in matches if not match.isspace()]
    
    return matches

def clean_string(s):
    s = s.replace(" ", "").lower()
    return s

def find_model(x, all_model_list):
    x = x.replace("\n", "") 
    x = find_non_chinese_substrings(x)
    result = [clean_string(s) for s in x]
    return [model for model in all_model_list if model in result]

def find_cat(x, all_cat_list):
    return [name for name in all_cat_list if name in x]   

def filter_model(x, model_list):
    x = x.split(",")
    for model in model_list:
        if model in x:
            return True
    return False

def find_error_with_reason(a):
    # 第一次匹配“错误xxx”
    pattern1 = r"错误\s*\d+"
    matches1 = re.findall(pattern1, a)
    
    # 第二次匹配“错误原因xxx”
    pattern2 = r"错误原因\s*\d+"
    matches2 = re.findall(pattern2, a)

    # 合并两次匹配的结果
    matches = matches1 + matches2
    
    return [name.replace(" ", "").replace("原因", "") for name in matches]

def filter_reason(x, query_reason_list):
    reason_list = find_error_with_reason(x)
    for name in query_reason_list:
        if name in reason_list:
            return True 
    return False

def transform_model_name(x, all_model_list):
    x = x.replace("\n", "") 
    candidates = find_non_chinese_substrings(x)
    for name in candidates:
        cleaned_name = clean_string(name)
        for model in all_model_list:
            if cleaned_name == model:
                x = x.replace(name, model)
                break
    return x 

def remove_model_name(x, all_model_list):
    x = x.replace("\n", "") 
    candidates = find_non_chinese_substrings(x)
    for name in candidates:
        if clean_string(name) in all_model_list:
            x = x.replace(name, "")
    return x 

class BM25_Model(object):
    def __init__(self, documents_list, k1=2, k2=1, b=0.5):
        self.documents_list = documents_list
        self.documents_number = len(documents_list)
        self.avg_documents_len = sum([len(document) for document in documents_list]) / self.documents_number
        self.f = []
        self.idf = {}
        self.k1 = k1
        self.k2 = k2
        self.b = b
        self.init()

    def init(self):
        df = {}
        for document in self.documents_list:
            temp = {}
            for word in document:
                temp[word] = temp.get(word, 0) + 1
            self.f.append(temp)
            for key in temp.keys():
                df[key] = df.get(key, 0) + 1
        for key, value in df.items():
            self.idf[key] = np.log((self.documents_number - value + 0.5) / (value + 0.5))

    def get_score(self, index, query):
        score = 0.0
        document_len = len(self.f[index])
        qf = Counter(query)
        for q in query:
            if q not in self.f[index]:
                continue
            score += self.idf[q] * (self.f[index][q] * (self.k1 + 1) / (
                        self.f[index][q] + self.k1 * (1 - self.b + self.b * document_len / self.avg_documents_len))) * (
                                 qf[q] * (self.k2 + 1) / (qf[q] + self.k2))

        return score

    def get_documents_score(self, query, indices):
        score_list = []
        for i in indices:
            score_list.append(self.get_score(i, query))
        return score_list


class WordCut:
    def __init__(self, all_model_list=None):
        with open('/data/dataset/kefu/hit_stopwords.txt', encoding='utf-8') as f: # 可根据需要打开停用词库，然后加上不想显示的词语
            con = f.readlines()
            stop_words = set()
            for i in con:
                i = i.replace("\n", "")   # 去掉读取每一行数据的\n
                stop_words.add(i)
        self.stop_words = stop_words
        self.all_model_list = all_model_list
        
    def cut(self, mytext):
        # jieba.load_userdict('自定义词典.txt')  # 这里你可以添加jieba库识别不了的网络新词，避免将一些新词拆开
        # jieba.initialize()  # 初始化jieba
        # 文本预处理 ：去除一些无用的字符只提取出中文出来
        # new_data = re.findall('[\u4e00-\u9fa5]+', mytext, re.S)
        # new_data = " ".join(new_data)
        # 匹配中英文标点符号，以及全角和半角符号
        pattern = r'[\u3000-\u303f\uff01-\uff0f\uff1a-\uff20\uff3b-\uff40\uff5b-\uff65\u2018\u2019\u201c\u201d\u2026\u00a0\u2022\u2013\u2014\u2010\u2027\uFE10-\uFE1F\u3001-\u301E]|[\.,!¡?¿\-—_(){}[\]\'\";:/]'
        # 使用 re.sub 替换掉符合模式的字符为空字符
        new_data = re.sub(pattern, '', mytext)
        if self.all_model_list is not None:
            new_data = transform_model_name(new_data, self.all_model_list)
        # 文本分词
        seg_list_exact = jieba.lcut(new_data)
        result_list = []
        # 去除停用词并且去除单字
        for word in seg_list_exact:
            if word not in self.stop_words and len(word) > 1:
                result_list.append(word) 
        return result_list

def search_docs_bm25(df, indices, user_query, top_n=4):
    # document_list = [wc.cut(doc) for doc in df.question]
    # bm25_model = BM25_Model(document_list)
    embedding = wc.cut(user_query)
    df["similarities"] = bm25_model.get_documents_score(embedding, indices)
    output_columns = ["qa_id", "question", "answer", "similarities"]
    if "hit_reason" in df.columns:
        output_columns.append("hit_reason")
    res = (
        df.sort_values("similarities", ascending=False)
        .head(top_n)
    )[output_columns]
    return res.to_dict(orient='records')

In [6]:
# 综合分析
def ranking_metric(x):
    if (x.find("error")>=0) and (x.find("model")>=0):
        return 1 
    elif (x.find("error")>=0) and (x.find("cat")>=0):
        return 2 
    elif (x.find("error")>=0):
        return 3 
    elif (x.find("model")>=0):
        return 4
    elif (x.find("cat")>=0):
        return 5
    else:
        return 6

def mine_hard_negative(x, similarities, reason, result, positive):
    positives = x[positive].split(",")
    df = pd.DataFrame(x[[similarities, reason, result]].to_dict())
    df = df[~df[result].isin(positives)]
    df["ranking"] = df[reason].apply(lambda x: ranking_metric(x))
    df = df.sort_values(["ranking", similarities], ascending=[True, False])
    df = df.drop_duplicates(result)
    df = df.iloc[:10]
    return pd.Series({col+"_hard": df[col].values.tolist() for col in [similarities, reason, result]})

def format_result(x, similarities, reason, result):
    num_result = len(x[result])
    new_set = dict()
    for j in range(num_result):
        result_name = x[result][j]
        sim = x[similarities][j]
        reason_code = x[reason][j]
        if result_name in new_set:
            if (ranking_metric(reason_code) <= ranking_metric(new_set[result_name]["reason"])
               ) & (sim > new_set[result_name]["similarities"]):
                new_set.update({result_name: {"similarities": sim, "reason": reason_code}})
        else:
            new_set.update({result_name: {"similarities": sim, "reason": reason_code}})
    df = pd.DataFrame(new_set).T.reset_index().rename(columns={"index": "result"})
    df["ranking"] = df["reason"].apply(lambda x: ranking_metric(x))
    df = df.sort_values(["ranking", "similarities"], ascending=[True, False])
    return df[['result', 'reason', 'similarities']].to_dict(orient='records')

def merge_recall(x, recall_list, weights):
    pool = {}
    for i in range(len(weights)):
        weight = weights[i] 
        recall_name = recall_list[i]
        num_results = len(x[recall_name])
        for j in range(num_results):
            result_item = x[recall_name][j]
            result = result_item["result"]
            if result in pool:
                if ranking_metric(result_item['reason']) < ranking_metric(pool[result]['reason']):
                    reason = result_item['reason']
                    pool[result]['reason'] = reason
                pool[result]['similarities'] += weight * result_item['similarities'] / sum(weights)
                pool[result]['full_reason'] = pool[result]['full_reason']+","+result_item['reason']+"_"+recall_name
            else:
                pool[result]= {
                    "reason": result_item['reason'],
                    "similarities": weight * result_item['similarities'] / sum(weights),
                    "full_reason": result_item['reason']+"_"+recall_name
                              }
    df = pd.DataFrame(pool).T.reset_index().rename(columns={"index": "result"})
    df["ranking"] = df["reason"].apply(lambda x: ranking_metric(x))
    df = df.sort_values(["ranking", "similarities"], ascending=[True, False])
    return df[['result', 'reason', 'full_reason', 'similarities']].to_dict(orient='records')

def mine_hard_negative2(x, recall, top_n, positive, output_cols):
    df = pd.DataFrame(x[recall])
    positives = x[positive].split(",")
    df = df[~df["result"].isin(positives)]
    df["ranking"] = df["reason"].apply(lambda x: ranking_metric(x))
    df = df.sort_values(["ranking", "similarities"], ascending=[True, False])
    df = df.iloc[:top_n]
    return df[output_cols].to_dict(orient='records')  

def split_recall(x, output_cols, new_cols):
    df = pd.DataFrame(x)
    return pd.Series({new_col: df[col].values.tolist() 
                      for col, new_col in zip(output_cols, new_cols)})

def find_score_limit(x):
    min_all = float("inf")
    max_all = float("-inf")
    for i in range(len(x)):
        min_i = min(x[i])
        max_i = max(x[i])
        min_all = min(min_all, min_i)
        max_all = max(max_all, max_i)
    return min_all, max_all

def convert_limit(x, min_all, max_all):
    return [(i-min_all)/(max_all-min_all) for i in x]

def convert_df_to_jsonl(df, filename, query="question_cleaned", pos_col="question_positive", neg_col="question_bge_hard"):
    with open(filename, 'w') as file:
        for _, row in df.iterrows():
            # Constructing the dictionary for each row
            data = {
                "query": row[query],
                "pos": row[pos_col],
                "neg": row[neg_col]
            }
            # Writing the JSON string followed by a newline character to make it JSONL
            file.write(json.dumps(data) + '\n')

In [69]:
# 排序
def mrr_at_k_score(is_relevant, pred_ranking, k):
    """
    Computes MRR@k score

    Args:
        is_relevant (`List[bool]` of length `num_pos+num_neg`): True if the document is relevant
        pred_ranking (`List[int]` of length `num_pos+num_neg`): Indices of the documents sorted in decreasing order
            of the similarity score

    Returns:
        mrr_score (`float`): MRR@k score
    """
    mrr_score = 0
    for rank, index in enumerate(pred_ranking[:k]):
        if is_relevant[index]:
            mrr_score = 1 / (rank + 1)
            break

    return mrr_score

def recall_at_k_score(is_relevant, pred_ranking, k):
    """
    Computes MRR@k score

    Args:
        is_relevant (`List[bool]` of length `num_pos+num_neg`): True if the document is relevant
        pred_ranking (`List[int]` of length `num_pos+num_neg`): Indices of the documents sorted in decreasing order
            of the similarity score

    Returns:
        mrr_score (`float`): MRR@k score
    """
    recall_score = 0
    for index in pred_ranking[:k]:
        if is_relevant[index]:
            recall_score = 1
            break

    return recall_score

def ap_score(is_relevant, pred_scores):
    """
    Computes AP score

    Args:
        is_relevant (`List[bool]` of length `num_pos+num_neg`): True if the document is relevant
        pred_scores (`List[float]` of length `num_pos+num_neg`): Predicted similarity scores

    Returns:
        ap_score (`float`): AP score
    """
    # preds = np.array(is_relevant)[pred_scores_argsort]
    # precision_at_k = np.mean(preds[:k])
    # ap = np.mean([np.mean(preds[: k + 1]) for k in range(len(preds)) if preds[k]])
    ap = average_precision_score(is_relevant, pred_scores)
    return ap

def compute_recall_score(df, model, query, recall):
    pairs = []
    for i in range(df.shape[0]):
        sample = df.iloc[i]
        for p in sample[recall]:
            pairs.append([sample[query], p])
    all_scores = model.compute_score(pairs)
    result = []
    start_inx = 0
    for i in range(df.shape[0]):
        sample = df.iloc[i]
        pred_scores = all_scores[start_inx:start_inx + len(sample[recall])]
        result.append(pred_scores)
        start_inx += len(sample[recall])
    return result

def compute_metrics_batched_from_crossencoder(df, score, relevant, 
                                              mrr_at_k=10, recall_at_list=[1,2], metrics=["map", "mrr", "recall"]):
    all_mrr_scores = []
    all_ap_scores = []
    all_recall_scores = [[] for _ in range(len(recall_at_list))]

    for i in range(df.shape[0]):
        sample = df.iloc[i]
        is_relevant = sample[relevant]
        pred_scores = np.array(sample[score])

        pred_scores_argsort = np.argsort(-pred_scores)  # Sort in decreasing order
        if "mrr" in metrics:
            mrr = mrr_at_k_score(is_relevant, pred_scores_argsort, mrr_at_k)
            all_mrr_scores.append(mrr)
        if "map" in metrics:
            ap = ap_score(is_relevant, pred_scores)
            all_ap_scores.append(ap)
        if "recall" in metrics:
            for recall_index, recall_at in enumerate(recall_at_list):
                recall_score = recall_at_k_score(is_relevant, pred_scores_argsort, recall_at)
                all_recall_scores[recall_index].append(recall_score)

    result = {}
    if "map" in metrics:
        mean_ap = np.mean(all_ap_scores)
        result["map"] = mean_ap
    if "mrr" in metrics:
        mean_mrr = np.mean(all_mrr_scores)
        result[f"mrr@{mrr_at_k}"] = mean_mrr
    if "recall" in metrics:
        for recall_index, recall_at in enumerate(recall_at_list):
            result[f"recall@{recall_at}"] = np.mean(all_recall_scores[recall_index])
    return result

def find_T_loc(x, relevant, score):
    is_relevant = x[relevant]
    pred_scores = np.array(x[score])
    pred_scores_argsort = np.argsort(-pred_scores)
    for rank, index in enumerate(pred_scores_argsort):
        if is_relevant[index]:
            return rank
    return np.nan

def get_reranking(x, relevant, score, recall, reason, postranking=False):
    is_relevant = x[relevant]
    pred_scores = np.array(x[score])
    pred_scores_argsort = np.argsort(-pred_scores, kind="mergesort")
    recall_list = copy.deepcopy(x[recall])
    for index, i in enumerate(recall_list):
        i.update({"relevant": is_relevant[index], "recall_order": index})
    reranking = []
    for index, i in enumerate(pred_scores_argsort):
        temp = recall_list[i]
        temp.update({"ranking_score": pred_scores[i], "ranking_order": index})
        reranking.append(temp)
    if postranking:
        reranking = pd.DataFrame(reranking)
        reranking["if_special"] = reranking[reason].apply(
            lambda x: (x.find("model") >= 0)|(x.find("error") >= 0)|(len(x.split(","))>1)).astype(int)
        reranking.loc[(reranking["if_special"]==1)&(reranking["similarities"]<=0.75), "if_special"] = 0
        reranking["ranking"] = reranking[reason].apply(lambda x: ranking_metric(x))
        top_list = reranking[reranking["if_special"]==1].sort_values(
            ["if_special", "ranking", "ranking_score"], ascending=[False, True, False], kind="mergesort")["result"].iloc[:2].tolist()
        reranking["if_top"] = reranking["result"].isin(top_list)
        reranking = pd.concat([reranking[reranking["if_top"]==True], reranking[reranking["if_top"]==False]], axis=0).reset_index(drop=True)
        reranking["reranking_score"] = list(range(reranking.shape[0]))[::-1]
        reranking["reranking_order"] = list(range(reranking.shape[0]))
        reranking = reranking.drop("ranking", axis=1)
        reranking = reranking.to_dict(orient="records")
    return reranking

# 向量召回

In [8]:
oot = pd.read_csv("/data/dataset/kefu/oot20240315.csv")

In [9]:
df2 = pd.read_csv("/data/dataset/kefu/database20240506.csv")

In [10]:
# df2["error_list"] = df2.question.apply(lambda x: ",".join(find_error_with_reason(x))).astype(str)

In [11]:
# df2.to_csv("/data/dataset/kefu/database20240506.csv", index=None)

In [12]:
model = SentenceTransformer('/workspace/data/private/zhuxiaohai/models/bge_finetune_emb')
q_embeddings = model.encode(df2.question.tolist(), normalize_embeddings=True, batch_size=32)
df2['bge_large'] = q_embeddings.tolist()

In [13]:
dim_df = pd.read_csv("/data/dataset/kefu/dim_df20240315.csv")
all_model_list = dim_df.model.tolist()
all_cat_list = dim_df.cat_name.unique().tolist()

In [14]:
test = oot
df1 = df2 

In [135]:
# 标签+向量3

In [8]:
# 打标阶段测试数据

In [136]:
results = []
for i in range(test.shape[0]):
    question = test['question'].iloc[i]
    model_list = find_model(question, all_model_list)
    cat_list = find_cat(question, all_cat_list)   
    cat_list += [cat for cat in dim_df.loc[dim_df.model.isin(model_list), 'cat_name'].tolist() if cat not in cat_list]
    reason_list = find_error_with_reason(question)
    question = remove_model_name(question, all_model_list)
    result = {"model": list(set(model_list)),
              "cat": list(set(cat_list)),
              "error": list(set(reason_list)),
              "query_cleaned": question}
    results.append(json.dumps(result, ensure_ascii=False))
test["labeller"] = results 

In [137]:
test[["question", "labeller"]].to_csv("tests/data/data_labeller_test.csv", index=None)

In [16]:
def sub_worker(result, score, reason, top_n):
    if (filter_mask & (reason_indicator.str.find("errorcode")>=0)).sum() > 0:
        aug_mask = filter_mask & (reason_indicator.str.find("errorcode")>=0)
        filtered_df = df1[aug_mask].copy()
        filtered_df["hit_reason"] = reason_indicator[aug_mask].copy()
        res = search_docs_bge(filtered_df, question, top_n=int(top_n/2))
        result += [j["qa_id"] for j in res]
        score += [round(j["similarities"], 2) for j in res]
        reason += [j["hit_reason"] for j in res]
        
        aug_mask = filter_mask & (~(reason_indicator.str.find("errorcode")>=0))
        filtered_df = df1[aug_mask].copy()
        filtered_df["hit_reason"] = reason_indicator[aug_mask].copy()
        res = search_docs_bge(filtered_df, question, top_n=int(top_n/2))
        result += [j["qa_id"] for j in res]
        score += [round(j["similarities"], 2) for j in res]
        reason += [j["hit_reason"] for j in res]
    else:
        aug_mask = filter_mask
        filtered_df = df1[aug_mask].copy()
        filtered_df["hit_reason"] = reason_indicator[aug_mask].copy()
        res = search_docs_bge(filtered_df, question, top_n=int(top_n/2))
        result += [j["qa_id"] for j in res]
        score += [round(j["similarities"], 2) for j in res]
        reason += [j["hit_reason"] for j in res]
        
        aug_mask = (reason_indicator.str.find("errorcode")>=0)
        if aug_mask.sum() > 0:
            filtered_df = df1[aug_mask].copy()
            filtered_df["hit_reason"] = reason_indicator[aug_mask].copy()
            res = search_docs_bge(filtered_df, question, top_n=int(top_n/2))
            result += [j["qa_id"] for j in res]
            score += [round(j["similarities"], 2) for j in res]
            reason += [j["hit_reason"] for j in res]

    aug_mask = (~filter_mask) & (reason_indicator.str.find("cat")>=0)
    if aug_mask.sum() > 0:
        filtered_df = df1[aug_mask].copy()
        filtered_df["hit_reason"] = reason_indicator[aug_mask].copy()
        res = search_docs_bge(filtered_df, question, top_n=int(top_n/2))
        result += [j["qa_id"] for j in res]
        score += [round(j["similarities"], 2) for j in res]
        reason += [j["hit_reason"] for j in res]
    return result, score, reason
    
label = []
result_list = []
infer_list = []
top_n = 10
for i in range(test.shape[0]):
    gt = test['gt_qa_id'].iloc[i].split(",")
    question = test['question'].iloc[i]
    model_list = find_model(question, all_model_list)
    cat_list = find_cat(question, all_cat_list)   
    cat_list += [cat for cat in dim_df.loc[dim_df.model.isin(model_list), 'cat_name'].tolist() if cat not in cat_list]
    reason_list = find_error_with_reason(question)
    model_mask = (df1.model_list.apply(lambda x: filter_model(x, model_list)))
    cat_mask = (df1.cat_name.apply(lambda x: filter_model(x, cat_list)))
    reason_mask = (df1.question.apply(lambda x: filter_reason(x, reason_list)))
    reason_indicator = pd.Series(["none"]*df1.shape[0], index=df1.index)
    reason_indicator[model_mask] = reason_indicator[model_mask].apply(lambda x: x + "|model" if x != "none" else "model")
    reason_indicator[cat_mask] = reason_indicator[cat_mask].apply(lambda x: x + "|cat" if x != "none" else "cat")
    reason_indicator[reason_mask] = reason_indicator[reason_mask].apply(lambda x: x + "|errorcode" if x != "none" else "errorcode")
    result = []
    score = []
    reason = []
    question = remove_model_name(question, all_model_list)
    filter_mask = (reason_indicator.str.find("model")>=0)
    if filter_mask.sum() > 0:
        result, score, reason = sub_worker(result, score, reason, top_n)
    else:
        filter_mask = (reason_indicator.str.find("cat")>=0)   
        if filter_mask.sum() > 0:
            result, score, reason = sub_worker(result, score, reason, top_n)
    if len(result) == 0:
        filter_mask = (reason_indicator.str.find("errorcode")>=0)
        if filter_mask.sum() > 0:
            aug_mask = filter_mask
            filtered_df = df1[aug_mask].copy()
            filtered_df["hit_reason"] = reason_indicator[aug_mask].copy()
            res = search_docs_bge(filtered_df, question, top_n=int(top_n/2))
            result += [j["qa_id"] for j in res]
            score += [round(j["similarities"], 2) for j in res]
            reason += [j["hit_reason"] for j in res]  
            aug_mask = (~filter_mask)
            filtered_df = df1[aug_mask].copy()
            filtered_df["hit_reason"] = reason_indicator[aug_mask].copy()
            res = search_docs_bge(filtered_df, question, top_n=int(top_n/2))
            result += [j["qa_id"] for j in res]
            score += [round(j["similarities"], 2) for j in res]
            reason += [j["hit_reason"] for j in res]  
        else:
            filtered_df = df1.copy()
            filtered_df["hit_reason"] = reason_indicator.copy()
            res = search_docs_bge(filtered_df, question, top_n=top_n)
            result += [j["qa_id"] for j in res]
            score += [round(j["similarities"], 2) for j in res]
            reason += [j["hit_reason"] for j in res]
    
    found = False
    for j in result:
        if j in gt:
            found = True
            break 
    if found:
        label.append(1)
    else:
        label.append(0)
    result_list.append({"qa_id": test['qa_id'].iloc[i], 
                        "result": result, 
                        "similarities": score, 
                        "hit_reason": reason, 
                        "label": int(found)})
    infer_list.append([{"id": result_name, "score": result_score, "info": result_reason} 
                      for result_name, result_score, result_reason in zip(result, score, reason)])

In [17]:
# 向量召回阶段测试数据

In [18]:
result_df = pd.DataFrame(result_list)
result_df["recall_list"] = [json.dumps(i, ensure_ascii=False) for i in infer_list]
test_result = test.merge(result_df, how='left', left_on='qa_id', right_on='qa_id')
test_result["sim_max"] = test_result["similarities"].apply(lambda x: max(x))
test_result["sim_min"] = test_result["similarities"].apply(lambda x: min(x))
test_result["result_num"] = test_result.result.apply(lambda x: len(x))

In [19]:
test_result[["qa_id", "question", "recall_list"]].to_csv("tests/data/data_vector_search.csv", index=None)

In [20]:
test_result.label.mean()

0.9230769230769231

In [9]:
# test_result.label.mean()
0.9487179487179487

0.9487179487179487

In [28]:
# 标签+bm25

In [21]:
wc = WordCut()

In [22]:
document_list = [wc.cut(doc) for doc in df1.question]
bm25_model = BM25_Model(document_list)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.815 seconds.
Prefix dict has been built successfully.


In [23]:
def sub_worker_bm25(result, score, reason, top_n):
    if (filter_mask & (reason_indicator.str.find("errorcode")>=0)).sum() > 0:
        aug_mask = filter_mask & (reason_indicator.str.find("errorcode")>=0)
        filtered_df = df1[aug_mask].copy()
        filtered_indices = np.array(range(df1.shape[0]))[aug_mask.values]
        filtered_df["hit_reason"] = reason_indicator[aug_mask].copy()
        res = search_docs_bm25(filtered_df, filtered_indices, question, top_n=int(top_n/2))
        result += [j["qa_id"] for j in res]
        score += [round(j["similarities"], 2) for j in res]
        reason += [j["hit_reason"] for j in res]
        
        aug_mask = filter_mask & (~(reason_indicator.str.find("errorcode")>=0))
        filtered_df = df1[aug_mask].copy()
        filtered_indices = np.array(range(df1.shape[0]))[aug_mask.values]
        filtered_df["hit_reason"] = reason_indicator[aug_mask].copy()
        res = search_docs_bm25(filtered_df, filtered_indices, question, top_n=int(top_n/2))
        result += [j["qa_id"] for j in res]
        score += [round(j["similarities"], 2) for j in res]
        reason += [j["hit_reason"] for j in res]
    else:
        aug_mask = filter_mask
        filtered_df = df1[aug_mask].copy()
        filtered_indices = np.array(range(df1.shape[0]))[aug_mask.values]
        filtered_df["hit_reason"] = reason_indicator[aug_mask].copy()
        res = search_docs_bm25(filtered_df, filtered_indices, question, top_n=int(top_n/2))
        result += [j["qa_id"] for j in res]
        score += [round(j["similarities"], 2) for j in res]
        reason += [j["hit_reason"] for j in res]
        
        aug_mask = (reason_indicator.str.find("errorcode")>=0)
        if aug_mask.sum() > 0:
            filtered_df = df1[aug_mask].copy()
            filtered_indices = np.array(range(df1.shape[0]))[aug_mask.values]
            filtered_df["hit_reason"] = reason_indicator[aug_mask].copy()
            res = search_docs_bm25(filtered_df, filtered_indices, question, top_n=int(top_n/2))
            result += [j["qa_id"] for j in res]
            score += [round(j["similarities"], 2) for j in res]
            reason += [j["hit_reason"] for j in res]

    aug_mask = (~filter_mask) & (reason_indicator.str.find("cat")>=0)
    # aug_mask = (~filter_mask) & (~(reason_indicator.str.find("errorcode")>=0)) & (reason_indicator.str.find("cat")>=0)
    if aug_mask.sum() > 0:
        filtered_df = df1[aug_mask].copy()
        filtered_indices = np.array(range(df1.shape[0]))[aug_mask.values]
        filtered_df["hit_reason"] = reason_indicator[aug_mask].copy()
        res = search_docs_bm25(filtered_df, filtered_indices, question, top_n=int(top_n/2))
        result += [j["qa_id"] for j in res]
        score += [round(j["similarities"], 2) for j in res]
        reason += [j["hit_reason"] for j in res]

    return result, score, reason

label = []
result_list = []
infer_list = []
top_n = 10
for i in range(test.shape[0]):
    gt = test['gt_qa_id'].iloc[i].split(",")
    question = test['question'].iloc[i]
    model_list = find_model(question, all_model_list)
    cat_list = find_cat(question, all_cat_list)   
    cat_list += [cat for cat in dim_df.loc[dim_df.model.isin(model_list), 'cat_name'].tolist() if cat not in cat_list]
    reason_list = find_error_with_reason(question)
    model_mask = (df1.model_list.apply(lambda x: filter_model(x, model_list)))
    cat_mask = (df1.cat_name.apply(lambda x: filter_model(x, cat_list)))
    reason_mask = (df1.question.apply(lambda x: filter_reason(x, reason_list)))
    reason_indicator = pd.Series(["none"]*df1.shape[0], index=df1.index)
    reason_indicator[model_mask] = reason_indicator[model_mask].apply(lambda x: x + "|model" if x != "none" else "model")
    reason_indicator[cat_mask] = reason_indicator[cat_mask].apply(lambda x: x + "|cat" if x != "none" else "cat")
    reason_indicator[reason_mask] = reason_indicator[reason_mask].apply(lambda x: x + "|errorcode" if x != "none" else "errorcode")
    result = []
    score = []
    reason = []
    question = remove_model_name(question, all_model_list)
    filter_mask = (reason_indicator.str.find("model")>=0)
    if filter_mask.sum() > 0:
        result, score, reason = sub_worker_bm25(result, score, reason, top_n)
    else:
        filter_mask = (reason_indicator.str.find("cat")>=0)   
        if filter_mask.sum() > 0:
            result, score, reason = sub_worker_bm25(result, score, reason, top_n)
    if len(result) == 0:
        filter_mask = (reason_indicator.str.find("errorcode")>=0)
        if filter_mask.sum() > 0:
            aug_mask = filter_mask
            filtered_df = df1[aug_mask].copy()
            filtered_indices = np.array(range(df1.shape[0]))[aug_mask.values]
            filtered_df["hit_reason"] = reason_indicator[aug_mask].copy()
            res = search_docs_bm25(filtered_df, filtered_indices, question, top_n=int(top_n/2))
            result += [j["qa_id"] for j in res]
            score += [round(j["similarities"], 2) for j in res]
            reason += [j["hit_reason"] for j in res]  
            aug_mask = (~filter_mask)
            filtered_df = df1[aug_mask].copy()
            filtered_indices = np.array(range(df1.shape[0]))[aug_mask.values]
            filtered_df["hit_reason"] = reason_indicator[aug_mask].copy()
            res = search_docs_bm25(filtered_df, filtered_indices, question, top_n=int(top_n/2))
            result += [j["qa_id"] for j in res]
            score += [round(j["similarities"], 2) for j in res]
            reason += [j["hit_reason"] for j in res]  
        else:
            filtered_df = df1.copy()
            filtered_df["hit_reason"] = reason_indicator.copy()
            filtered_indices = np.array(range(df1.shape[0]))
            res = search_docs_bm25(filtered_df, filtered_indices, question, top_n=top_n)
            result += [j["qa_id"] for j in res]
            score += [round(j["similarities"], 2) for j in res]
            reason += [j["hit_reason"] for j in res]
    # result_list.append({"qa_id": test['qa_id'].iloc[i], "result": result, "similarities": score, "hit_reason": reason})
    found = False
    for j in result:
        if j in gt:
            found = True
            break 
    if found:
        label.append(1)
    else:
        label.append(0)
    result_list.append({"qa_id": test['qa_id'].iloc[i], "result": result, 
                        "similarities": score, "hit_reason": reason, "label": int(found)})
    infer_list.append([{"id": result_name, "score": result_score, "info": result_reason} 
                  for result_name, result_score, result_reason in zip(result, score, reason)])

In [24]:
result_df = pd.DataFrame(result_list)
result_df["recall_list"] = [json.dumps(i, ensure_ascii=False) for i in infer_list]
test_result2 = test.merge(result_df, how='left', left_on='qa_id', right_on='qa_id')
test_result2["sim_max"] = test_result2["similarities"].apply(lambda x: max(x) if len(x)>0 else np.nan)
test_result2["sim_min"] = test_result2["similarities"].apply(lambda x: min(x) if len(x)>0 else np.nan)
test_result2["result_num"] = test_result2.result.apply(lambda x: len(x))

In [25]:
test_result2.label.mean()

0.8205128205128205

# 综合分析

In [26]:
final_result = pd.merge(left=test_result, right=test_result2[["qa_id",
                                               "result",
                                               "similarities",
                                               "hit_reason",
                                               "recall_list",
                                               "label",
                                               "sim_max",
                                               "sim_min",
                                               "result_num"]], 
                 left_on="qa_id", right_on="qa_id", how="left",
                 suffixes=["_bge", "_bm25"])

In [27]:
final_result['label_all'] = final_result[["label_bge", "label_bm25"]].apply(lambda x: max(x["label_bge"], x["label_bm25"]), axis=1)

In [28]:
final_result.label_all.mean()

0.9487179487179487

In [29]:
min_all, max_all = find_score_limit(final_result["similarities_bm25"].tolist())
final_result["similarities_rescaled_bm25"] = final_result["similarities_bm25"].apply(
    lambda x: convert_limit(x, min_all, max_all))

In [30]:
cols = ["similarities_bge", "hit_reason_bge", "result_bge"]
final_result["recall_bge"] = final_result[cols].apply(lambda x: format_result(x, *cols), axis=1) 
cols = ["similarities_rescaled_bm25", "hit_reason_bm25", "result_bm25"]
final_result["recall_bm25"] = final_result[cols].apply(lambda x: format_result(x, *cols), axis=1) 
final_result["recall_all"] = final_result[["recall_bge", "recall_bm25"]].apply(
    lambda x: merge_recall(x, 
                           recall_list=["recall_bge", "recall_bm25"], 
                           weights=[0.9, 0.84]
                          ), 
    axis=1) 

In [None]:
# 单通道融合， 只取向量

In [27]:
cols = ["similarities_bge", "hit_reason_bge", "result_bge"]
final_result["recall_bge"] = final_result[cols].apply(lambda x: format_result(x, *cols), axis=1) 
final_result["similarities_rescaled_bm25"] = final_result["similarities_rescaled_bm25"].apply(lambda x: [round(i, 2) for i in x])
cols = ["similarities_rescaled_bm25", "hit_reason_bm25", "result_bm25"]
final_result["recall_bm25"] = final_result[cols].apply(lambda x: format_result(x, *cols), axis=1) 
final_result["recall_all"] = final_result[["recall_bge", "recall_bm25"]].apply(
    lambda x: merge_recall(x, 
                           recall_list=["recall_bge"], 
                           weights=[1.0]
                          ), 
    axis=1) 

In [342]:
final_result["recall_bge"] = final_result["recall_bge"].apply(lambda x: json.dumps(x, ensure_ascii=False)) 
final_result["recall_bm25"] = final_result["recall_bm25"].apply(lambda x: json.dumps(x, ensure_ascii=False)) 
final_result["recall_all"] = final_result["recall_all"].apply(lambda x: json.dumps(x, ensure_ascii=False)) 
final_result[["recall_list", "recall_bge", "recall_all"]].rename(
    columns={"recall_list": "raw", "recall_bge": "duplicated", "recall_all": "merged"}
).to_csv("tests/data/data_merge_one_recall_channel.csv", index=None)

In [None]:
# 多通道融合

In [428]:
final_result["similarities_rescaled_bm25"] = final_result["similarities_rescaled_bm25"].apply(lambda x: [round(i, 2) for i in x])

# 去重
cols = ["similarities_bge", "hit_reason_bge", "result_bge"]
final_result["recall_bge"] = final_result[cols].apply(lambda x: format_result(x, *cols), axis=1) 

# 去重
cols = ["similarities_rescaled_bm25", "hit_reason_bm25", "result_bm25"]
final_result["recall_bm25"] = final_result[cols].apply(lambda x: format_result(x, *cols), axis=1) 

# 原始召回
cols = ["similarities_rescaled_bm25", "hit_reason_bm25", "result_bm25"]
final_result["recall_list_bm25"] = final_result[cols].apply(lambda x: [{"id": result_name, "score": result_score, "info": result_reason} 
                  for result_name, result_score, result_reason in 
                  zip(x["result_bm25"], x["similarities_rescaled_bm25"], x["hit_reason_bm25"])], axis=1)
# 融合
final_result["recall_all"] = final_result[["recall_bge", "recall_bm25"]].apply(
    lambda x: merge_recall(x, 
                           recall_list=["recall_bge", "recall_bm25"], 
                           weights=[0.9, 0.84]
                          ), 
    axis=1) 

In [392]:
final_result["recall_list_bm25"] = final_result["recall_list_bm25"].apply(lambda x: json.dumps(x, ensure_ascii=False)) 
final_result["recall_bge"] = final_result["recall_bge"].apply(lambda x: json.dumps(x, ensure_ascii=False)) 
final_result["recall_bm25"] = final_result["recall_bm25"].apply(lambda x: json.dumps(x, ensure_ascii=False)) 
final_result["recall_all"] = final_result["recall_all"].apply(lambda x: json.dumps(x, ensure_ascii=False)) 
final_result[["recall_list_bge", "recall_bge", "recall_list_bm25", "recall_bm25", "recall_all"]].to_csv(
    "tests/data/data_merge_two_recall_channel.csv", index=None)

# 排序

In [31]:
final_result["question_cleaned"] = final_result["question"].apply(lambda x: remove_model_name(x, all_model_list))

In [32]:
model_reranker = FlagReranker("/workspace/data/private/zhuxiaohai/models/bge_finetune_reranker_question_top20", use_fp16=True)

In [10]:
# 打分数据

In [51]:
recall_list = ["recall_bge"]
top_n = [1, 2]
use_sim_score = False
result_all = {}
result_T_loc = {}
result_reranking = {}
preranking = None
target = "question"
postranking = True
for recall in recall_list:
    # 召回特性
    temp = final_result.copy()
    if preranking is not None:
        temp[recall] = temp[recall].apply(lambda x: x[:preranking])
    temp[f"{recall}_all"] = temp[recall].apply(lambda x: [i["result"] for i in x])
    temp["relevant"] = temp[[f"{recall}_all", "gt_qa_id"]].apply(
        lambda x: [True if i in x["gt_qa_id"].split(",") else False for i in x[f"{recall}_all"]], axis=1)
    temp_exploded = temp.explode(f"{recall}_all")[['qa_id', f"{recall}_all"]]
    temp_right = df1[['qa_id', 'question', 'answer']].copy()
    temp_exploded = pd.merge(left=temp_exploded, right=temp_right, 
                             left_on=f"{recall}_all", right_on='qa_id', 
                             how='left', suffixes=["", "_right"])[["qa_id", f"{recall}_all", "question", "answer"]]
    temp_exploded = temp_exploded.groupby("qa_id")[["question", "answer"]].apply(
        lambda x: pd.Series({col: x[col].tolist() for col in x.columns}))
    temp = pd.merge(left=temp[["qa_id", "question_cleaned", f"{recall}_all", "relevant", recall]], right=temp_exploded,
                    left_on='qa_id', right_on='qa_id', how='left')
    if use_sim_score:
        temp["score"] = temp[f"{recall}_all"].apply(lambda x: list(range(len(x)))[::-1])
    else:
        temp["score"] = compute_recall_score(temp, model_reranker, "question_cleaned", target)

Compute Scores: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  7.39it/s]


In [53]:
temp["recall_bge"] = temp["recall_bge"].apply(lambda x: json.dumps(x, ensure_ascii=False)) 
temp["recall_question"] = temp["question"].apply(lambda x: json.dumps(x, ensure_ascii=False)) 
temp["score"] = temp["score"].apply(lambda x: json.dumps(x, ensure_ascii=False)) 
temp[["question_cleaned", "recall_bge", "recall_question", "score"]].to_csv("tests/data/data_rank.csv", index=None)

In [11]:
# 重排

In [82]:
recall_list = ["recall_bge"]
top_n = [1, 2]
use_sim_score = False
result_all = {}
result_T_loc = {}
result_reranking = {}
preranking = None
target = "question"
postranking = True
for recall in recall_list:
    # 召回特性
    temp = final_result.copy()
    if preranking is not None:
        temp[recall] = temp[recall].apply(lambda x: x[:preranking])
    temp[f"{recall}_all"] = temp[recall].apply(lambda x: [i["result"] for i in x])
    temp["relevant"] = temp[[f"{recall}_all", "gt_qa_id"]].apply(
        lambda x: [True if i in x["gt_qa_id"].split(",") else False for i in x[f"{recall}_all"]], axis=1)
    temp_exploded = temp.explode(f"{recall}_all")[['qa_id', f"{recall}_all"]]
    temp_right = df1[['qa_id', 'question', 'answer']].copy()
    temp_exploded = pd.merge(left=temp_exploded, right=temp_right, 
                             left_on=f"{recall}_all", right_on='qa_id', 
                             how='left', suffixes=["", "_right"])[["qa_id", f"{recall}_all", "question", "answer"]]
    temp_exploded = temp_exploded.groupby("qa_id")[["question", "answer"]].apply(
        lambda x: pd.Series({col: x[col].tolist() for col in x.columns}))
    temp = pd.merge(left=temp[["qa_id", "question_cleaned", f"{recall}_all", "relevant", recall]], right=temp_exploded,
                    left_on='qa_id', right_on='qa_id', how='left')
    if use_sim_score:
        temp["score"] = temp[f"{recall}_all"].apply(lambda x: list(range(len(x)))[::-1])
    else:
        temp["score"] = compute_recall_score(temp, model_reranker, "question_cleaned", target)
    # T_loc = temp[["relevant", "score"]].apply(lambda x: find_T_loc(x, "relevant", "score"), axis=1)
    if recall.find("_all") >= 0:
        reason = "full_reason"
    else:
        reason = "reason"
    result_reranking[recall] = temp[["relevant", "score", recall]].apply(
        lambda x: get_reranking(x, "relevant", "score", recall, reason, postranking), axis=1)

Compute Scores: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  7.70it/s]


In [None]:
# 拉通测试数据

In [83]:
temp["reranking"] = result_reranking[recall]
temp["raw_question"] = final_result.question.tolist()

In [84]:
temp["reranking"] = temp["reranking"].apply(lambda x: json.dumps(x, ensure_ascii=False)) 
temp[["raw_question", "reranking"]].to_csv("tests/data/data_pipeline.csv", index=None)

In [None]:
# 重排测试数据

In [76]:
temp["reranking"] = result_reranking[recall]
temp["recall_bge"] = temp["recall_bge"].apply(lambda x: json.dumps(x, ensure_ascii=False)) 
temp["score"] = temp["score"].apply(lambda x: json.dumps(x, ensure_ascii=False)) 
temp["reranking"] = temp["reranking"].apply(lambda x: json.dumps(x, ensure_ascii=False)) 
temp[["recall_bge", "score", "reranking"]].to_csv("tests/data/data_rerank.csv", index=None)

In [None]:
# 总体效果

In [72]:
recall_list = ["recall_bge", "recall_bm25", "recall_all"]
top_n = [1, 2]
use_sim_score = False
result_all = {}
result_T_loc = {}
result_reranking = {}
preranking = None
target = "question"
postranking = True
for recall in recall_list:
    # 召回特性
    temp = final_result.copy()
    if preranking is not None:
        temp[recall] = temp[recall].apply(lambda x: x[:preranking])
    temp[f"{recall}_all"] = temp[recall].apply(lambda x: [i["result"] for i in x])
    temp["relevant"] = temp[[f"{recall}_all", "gt_qa_id"]].apply(
        lambda x: [True if i in x["gt_qa_id"].split(",") else False for i in x[f"{recall}_all"]], axis=1)
    temp_exploded = temp.explode(f"{recall}_all")[['qa_id', f"{recall}_all"]]
    temp_right = df1[['qa_id', 'question', 'answer']].copy()
    temp_exploded = pd.merge(left=temp_exploded, right=temp_right, 
                             left_on=f"{recall}_all", right_on='qa_id', 
                             how='left', suffixes=["", "_right"])[["qa_id", f"{recall}_all", "question", "answer"]]
    temp_exploded = temp_exploded.groupby("qa_id")[["question", "answer"]].apply(
        lambda x: pd.Series({col: x[col].tolist() for col in x.columns}))
    temp = pd.merge(left=temp[["qa_id", "question_cleaned", f"{recall}_all", "relevant", recall]], right=temp_exploded,
                    left_on='qa_id', right_on='qa_id', how='left')
    if use_sim_score:
        temp["score"] = temp[f"{recall}_all"].apply(lambda x: list(range(len(x)))[::-1])
    else:
        temp["score"] = compute_recall_score(temp, model_reranker, "question_cleaned", target)
    # T_loc = temp[["relevant", "score"]].apply(lambda x: find_T_loc(x, "relevant", "score"), axis=1)
    if recall.find("_all") >= 0:
        reason = "full_reason"
    else:
        reason = "reason"
    result_reranking[recall] = temp[["relevant", "score", recall]].apply(
        lambda x: get_reranking(x, "relevant", "score", recall, reason, postranking), axis=1)
    # result_T_loc[recall] = T_loc
    if postranking:
        temp["score"] = [[j["reranking_score"] for j in i] for i in result_reranking[recall]] 
        temp["relevant"] = [[j["relevant"] for j in i] for i in result_reranking[recall]]
    T_loc = temp[["relevant", "score"]].apply(lambda x: find_T_loc(x, "relevant", "score"), axis=1)
    result_T_loc[recall] = T_loc
    result_all[recall] = compute_metrics_batched_from_crossencoder(temp, "score", "relevant", metrics=["recall"], recall_at_list=top_n)

    # 排序特性
    temp = final_result.copy()
    if preranking is not None:
        temp[recall] = temp[recall].apply(lambda x: x[:preranking])
    temp[f"{recall}_all"] = temp[recall].apply(lambda x: [i["result"] for i in x])
    temp[f"{recall}_all"] = temp[[f"{recall}_all", "gt_qa_id"]].apply(lambda x: x[f"{recall}_all"] + [
        i for i in x["gt_qa_id"].split(",") if i not in x[f"{recall}_all"]], axis=1)
    temp["relevant"] = temp[[f"{recall}_all", "gt_qa_id"]].apply(
        lambda x: [True if i in x["gt_qa_id"].split(",") else False for i in x[f"{recall}_all"]], axis=1)
    temp_exploded = temp.explode(f"{recall}_all")[['qa_id', f"{recall}_all"]]
    temp_right = df1[['qa_id', 'question', 'answer']].copy()
    temp_exploded = pd.merge(left=temp_exploded, right=temp_right, 
                             left_on=f"{recall}_all", right_on='qa_id', 
                             how='left', suffixes=["", "_right"])[["qa_id", f"{recall}_all", "question", "answer"]]
    temp_exploded = temp_exploded.groupby("qa_id")[["question", "answer"]].apply(
        lambda x: pd.Series({col: x[col].tolist() for col in x.columns}))
    temp = pd.merge(left=temp[["qa_id", "question_cleaned", f"{recall}_all", "relevant"]], right=temp_exploded,
                    left_on='qa_id', right_on='qa_id', how='left')
    if use_sim_score:
        temp["score"] = temp[f"{recall}_all"].apply(lambda x: list(range(len(x)))[::-1])
    else:
        temp["score"] = compute_recall_score(temp, model_reranker, "question_cleaned", target)
    result_all[recall].update(compute_metrics_batched_from_crossencoder(temp, "score", "relevant", metrics=["map", "mrr"]))

Compute Scores: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  7.50it/s]
Compute Scores: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  9.36it/s]
Compute Scores: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  8.74it/s]
Compute Scores: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  8.56it/s]
Compute Scores: 100%|███████████████████████████████████████████████████████████████████████████████

In [62]:
# bge ranker finetune过后
a = pd.DataFrame(result_T_loc)
cols = a.columns.tolist()
len_cols = [a[col].unique().shape[0] for col in cols]
anchor = cols[np.argmax(len_cols)]
stat = a[anchor].fillna(100).value_counts("mean").sort_index().reset_index()
for col in [col for col in cols if col != anchor]:
    temp = a[col].fillna(100).value_counts("mean").sort_index().reset_index()
    stat = pd.merge(left=stat, right=temp, left_on=anchor, right_on=col, how="outer")
stat

Unnamed: 0,recall_all,proportion_x,recall_bge,proportion_y,recall_bm25,proportion
0,0.0,0.769231,0.0,0.820513,0.0,0.769231
1,1.0,0.025641,1.0,0.051282,,
2,2.0,0.025641,,,2.0,0.025641
3,3.0,0.051282,,,3.0,0.025641
4,4.0,0.025641,,,,
5,6.0,0.025641,6.0,0.051282,,
6,7.0,0.025641,,,,
7,100.0,0.051282,100.0,0.076923,100.0,0.179487


In [63]:
# bge ranker finetune过后
pd.DataFrame(result_all)

Unnamed: 0,recall_bge,recall_bm25,recall_all
recall@1,0.820513,0.769231,0.769231
recall@2,0.871795,0.769231,0.794872
map,0.816545,0.863095,0.810481
mrr@10,0.835775,0.880189,0.825244


In [307]:
# bge ranker finetune过后
a = pd.DataFrame(result_T_loc)
cols = a.columns.tolist()
len_cols = [a[col].unique().shape[0] for col in cols]
anchor = cols[np.argmax(len_cols)]
stat = a[anchor].fillna(100).value_counts("mean").sort_index().reset_index()
for col in [col for col in cols if col != anchor]:
    temp = a[col].fillna(100).value_counts("mean").sort_index().reset_index()
    stat = pd.merge(left=stat, right=temp, left_on=anchor, right_on=col, how="outer")
stat

Unnamed: 0,recall_bge,proportion_x,recall_bm25,proportion_y,recall_all,proportion
0,0.0,0.820513,0.0,0.769231,0.0,0.74359
1,1.0,0.025641,,,1.0,0.051282
2,2.0,0.025641,2.0,0.025641,2.0,0.025641
3,,,,,3.0,0.076923
4,4.0,0.025641,,,,
5,6.0,0.051282,,,6.0,0.051282
6,100.0,0.051282,100.0,0.179487,100.0,0.051282
7,,,3.0,0.025641,,


In [45]:
# bge ranker finetune过后
pd.DataFrame(result_all)

Unnamed: 0,recall_bge,recall_bm25,recall_all
recall@1,0.820513,0.769231,0.74359
recall@2,0.846154,0.769231,0.794872
map,0.818681,0.863095,0.810939
mrr@10,0.835775,0.880189,0.827839


In [46]:
final_result["reranking_bge"] = pd.DataFrame(result_reranking)["recall_bge"].tolist()

In [47]:
final_result["GT_rank_bge"] = pd.DataFrame(result_T_loc)["recall_bge"].tolist()

In [48]:
final_result["GT_rank_bge"].value_counts()

GT_rank_bge
0.0    32
6.0     2
1.0     1
2.0     1
4.0     1
Name: count, dtype: int64

In [278]:
final_result[final_result["GT_rank_bge"]>=3][["question", 
                                              "question_positive", 
                                              "answer_positive", 
                                              "GT_rank_bge", 
                                              "recall_bge",
                                              "reranking_bge"]]

Unnamed: 0,question,question_positive,answer_positive,GT_rank_bge,recall_bge,reranking_bge
6,T7S PLUS 错误一,[扫地机器人机器人报错误1激光头遮挡],"[1,引导客户提供报错时照片或视频，进一步确认；\n*有贴膜，优先引导客户取下贴膜后再关机重...",3.0,"[{'result': 'ICWIKI202307243886', 'reason': 'm...","[{'result': 'ICWIKI202402061673', 'reason': 'c..."
18,G10S不集尘,[为什么扫地机清扫结束没有自动集尘？],[您好，若扫地机不能自动集尘，请按照以下操作排查：\n（1）扫地机在勿扰模式期间不会主动集尘...,6.0,"[{'result': 'ICWIKI202307243982', 'reason': 'm...","[{'result': 'ICWIKI202307243982', 'reason': 'm..."
32,G20不集尘,[为什么扫地机清扫结束没有自动集尘？],[您好，若扫地机不能自动集尘，请按照以下操作排查：\n（1）扫地机在勿扰模式期间不会主动集尘...,6.0,"[{'result': 'ICWIKI202307243982', 'reason': 'm...","[{'result': 'ICWIKI202307243982', 'reason': 'm..."
34,G10无法清洗拖布，请清理基座附近障碍物,[无法清洗拖布/回充失败/无法回充/不回基站],[您好，基座未通电会出现上述情况，确认基座指示灯是否亮起；\n（1）如不亮，参考话术：\n关...,3.0,"[{'result': 'ICWIKI202307243284', 'reason': 'm...","[{'result': 'ICWIKI202307243284', 'reason': 'm..."


In [279]:
final_result[final_result["GT_rank_bge"]>=3].iloc[0].gt_qa_id

'ICWIKI202307243886'

In [280]:
final_result[final_result["GT_rank_bge"]>=3].iloc[0].reranking_bge

[{'result': 'ICWIKI202402061673',
  'reason': 'cat',
  'similarities': 0.78,
  'relevant': False,
  'recall_order': 6,
  'ranking_score': 9.6875,
  'ranking_order': 0,
  'if_special': 0,
  'if_top': False,
  'reranking_score': 9,
  'reranking_order': 0},
 {'result': 'ICWIKI202309040294',
  'reason': 'cat',
  'similarities': 0.78,
  'relevant': False,
  'recall_order': 5,
  'ranking_score': 9.6875,
  'ranking_order': 1,
  'if_special': 0,
  'if_top': False,
  'reranking_score': 8,
  'reranking_order': 1},
 {'result': 'ICWIKI202307243887',
  'reason': 'cat',
  'similarities': 0.78,
  'relevant': False,
  'recall_order': 7,
  'ranking_score': 9.6796875,
  'ranking_order': 2,
  'if_special': 0,
  'if_top': False,
  'reranking_score': 7,
  'reranking_order': 2},
 {'result': 'ICWIKI202307243886',
  'reason': 'model|cat',
  'similarities': 0.77,
  'relevant': True,
  'recall_order': 0,
  'ranking_score': 6.08203125,
  'ranking_order': 3,
  'if_special': 0,
  'if_top': False,
  'reranking_scor

In [247]:
df1[df1.qa_id=="ICWIKI202307243916"]

Unnamed: 0,qa_id,qa_type,question,answer,model,effective,update_by,update_time,model_list,model_num,model_id,cat_name,ada_002,bge_large
743,ICWIKI202307243916,故障问题,扫地机拖地出水少不出水,1，引导客户查看拖布是否安装到位并正常工作，拖地时完全打湿拖布，安装好后使用观察\n2，取出...,"G10S, P10, G20, T7S, T7SPlus, G10, G10Plus, T8...",2023-09-11 11:24:01 已生效,王鹏程,2023-09-11 11:24:00.000000,"g10s,p10,g20,t7s,t7splus,g10,g10plus,t8,t8plus...",12,"ICMU025,ICMU028,ICMU030,ICMU017,ICMU018,ICMU01...","扫地机,扫地机,扫地机,扫地机,扫地机,扫地机,扫地机,扫地机,扫地机,扫地机,扫地机,扫地机","[-0.018422875553369522, -0.010261740535497665,...","[0.017396926879882812, -0.017865771427750587, ..."
