In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import StratifiedKFold,KFold
from datasets import Dataset 
from datasets import Features, ClassLabel, Sequence, Value
from datasets import load_from_disk, load_dataset, DatasetDict
from pandasql import sqldf
from transformers import AutoTokenizer, DataCollatorWithPadding

from utils import find_model_with_pos, find_error_with_reason_with_pos, find_error_with_reason

In [2]:
import numpy as np 
import pandas as pd
from collections import Counter
from pandasql import sqldf
import re
import os 
os.environ['CUDA_VISIBLE_DEVICES']='1'
import joblib
import json 
import jieba 
import copy

from openai import AzureOpenAI
import matplotlib.pyplot as plt 
from sentence_transformers import SentenceTransformer
from FlagEmbedding import FlagReranker
from transformers import AutoTokenizer
from sklearn.metrics import average_precision_score

In [3]:
# pandasql查询函数需要的环境
pysqldf = lambda q: sqldf(q, globals())

In [4]:
# 原始数据处理
def format_model2(x):
    model_list = x.split(',')
    model_list = [i.strip().lower() for i in model_list]
    new_list = [model_list[0]]
    i = 1
    while i < len(model_list):
        if (i != len(model_list) - 1) and (model_list[i-1] == model_list[i]):
            new_list.append(model_list[i]+model_list[i+1])
            if i < len(model_list) - 1:
                i += 2
            else:
                break
        elif (model_list[i][:3] == "上下水") or (model_list[i][:3] == "air") or (model_list[i][:3] == "pro") or (model_list[i][:4] == "pure"):
            for j in range(len(new_list)-1, -1, -1):
                if model_list[i-1] == new_list[j]:
                    new_list.pop(j)
                    break
            new_list.append(model_list[i-1]+model_list[i])
            i += 1
        elif (i != len(model_list) - 1) and (model_list[i-1] != model_list[i]):
            new_list.append(model_list[i])
            i += 1
        else:
            new_list.append(model_list[i])
            break
    return new_list

def format_all_models(x, dim_df):
    new_list = []
    for i in x:
        if i.find("全型号") >= 0:
            end_idx = i.find("全型号")
            name = i[:end_idx]
            new_list += [j for j in dim_df[dim_df['cat_name'] == name].model.tolist() if j not in x]
        else:
            new_list.append(i)
    return new_list

def format_series(x, dim_df):
    def contains_chinese(s):
        return re.search('[\u4e00-\u9fff]', s) is not None
    new_list = []
    for i in x:
        if i.find("系列") >= 0:
            end_idx = i.find("系列")
            name = i[:end_idx]
            new_list += [j for j in dim_df[(dim_df.model.str.find(name)>=0) & (
                dim_df.model.apply(lambda x: not contains_chinese(x)))].model.tolist() if j not in x]
            new_list += [i]
        else:
            new_list.append(i)
    return new_list

In [5]:
dim_df = pd.read_excel("/data/dataset/kefu/国内客服助手（生产环境）_品类与型号关系表_表格.xlsx")
dim_df = dim_df.rename(columns={"编号": "model_id", 
                                "品类": "cat_name", 
                                "型号": "model", 
                                "是否已生效": "effective"})
dim_df["model"] = dim_df["model"].str.lower()
dim_df['model'] = dim_df['model'].apply(lambda x: ''.join(x.split(' ')))

In [6]:
df1 = pd.read_excel("/data/dataset/kefu/国内客服助手（生产环境）_知识库数据_知识库条目.xlsx")
# df1 = pd.read_excel("/data/dataset/kefu/最新的生产环境国内客服数据/最新的生产环境国内客服数据/国内客服助手（生产环境）_知识库数据.xlsx")
# df1 = pd.read_excel("/data/dataset/kefu/上线前的评估测试数据/上线前的评估测试数据/上线前的客服售后知识库数据集.xlsx")
df1 = df1[df1["问题"].notnull()]
query = f"""
select 
    *
    ,row_number() over (partition by 适配机型, 问题 order by 变更日期) as count_no
    ,count(*) over (partition by 适配机型, 问题) as count_unique
from 
    df1
"""

# 使用pysqldf执行SQL查询
df1 = pysqldf(query)
df1 = df1[(df1['count_unique']==df1['count_no'])]
# df1 = df1.drop(["附件", "聚合的适配机型", "重复值"], axis=1)
df1 = df1.drop(["count_no", "count_unique"], axis=1)
rename = {'编号': "qa_id", 
          '类型': "qa_type", 
          '问题': "question", 
          '回复': "answer", 
          '适配机型': "model", 
          '是否已生效': "effective", 
          '更新人': "update_by", 
          '变更日期': "update_time"}
df1 = df1.rename(columns=rename)
df1['question'] = df1['question'].apply(lambda x: x.strip('\r'))

# df1["model_list"] = df1["model"].apply(lambda x: format_model(x))
df1["model_list"] = df1["model"].apply(lambda x: format_model2(x))
df1["model_list"] = df1['model_list'].apply(lambda x: format_all_models(x, dim_df))
df1["model_list"] = df1['model_list'].apply(lambda x: format_series(x, dim_df))
df1["model_num"] = df1["model_list"].apply(lambda x: len(x))
df1["model_list"] = df1["model_list"].apply(lambda x: ','.join(x))

In [7]:
temp = df1.copy()
temp["model_list"] = temp["model_list"].apply(lambda x: x.split(','))
temp_exploded = temp.explode("model_list")

query = f"""
select 
    a.*
    ,b.model_id
    ,b.cat_name
from 
    temp_exploded a 
left join 
    dim_df b
on 
    a.model_list = b.model
"""

# 使用pysqldf执行SQL查询
temp_exploded = pysqldf(query)

query = f"""
select 
    qa_id
    ,group_concat(model_id) as model_id
    ,group_concat(cat_name) as cat_name
from 
    temp_exploded
group by 
    qa_id
"""

# 使用pysqldf执行SQL查询
temp_exploded = pysqldf(query)

query = f"""
select 
    a.*
    ,b.model_id
    ,b.cat_name
from 
    df1 a 
left join 
    temp_exploded b
on
    a.qa_id = b.qa_id
"""

# 使用pysqldf执行SQL查询
df1 = pysqldf(query)



In [8]:
model_list = dim_df.model.unique().tolist()
for i in range(df1.shape[0]):
    for model in df1.model_list.iloc[i].split(","):
        if model not in model_list:
            print(df1.qa_id.iloc[i], model)

In [9]:
dim_df.model = dim_df.model.apply(lambda x: x.replace("版本", "").replace("版", ""))
all_model_list = dim_df.model.tolist()
all_cat_list = dim_df.cat_name.unique().tolist()

In [10]:
df1["error_list"] = df1["question"].apply(lambda x: ",".join(find_error_with_reason(x)))

In [11]:
df1["answer"] = df1["answer"].apply(lambda x: x.replace("\r", ""))

In [12]:
df1.to_csv("temp_data.csv", index=None)

In [13]:
df1.shape

(3976, 16)

In [14]:
dim_df.to_csv("temp_dim.csv", index=None)

In [15]:
df = pd.read_excel("/data/dataset/kefu/国内客服助手（生产环境）_中转栈_问答总明细_标注.xlsx")

In [16]:
df["date"] = df["提问日期"].apply(lambda x: "-".join([str(x.year), str(x.month)]))

In [17]:
df["date"].value_counts("mean").sort_index()

date
2023-10    0.112364
2023-11    0.145791
2023-12    0.149953
2023-7     0.024164
2023-8     0.081219
2023-9     0.081219
2024-1     0.080548
2024-2     0.030340
2024-3     0.066989
2024-4     0.120687
2024-5     0.106726
Name: proportion, dtype: float64

In [18]:
df = df[~(df["匹配的标准问题编号"].isnull() & df["备注"].isnull())]
df = df[df["问题"].notnull()]

In [19]:
df["date"].value_counts("mean").sort_index()

date
2023-10    0.030303
2023-11    0.034965
2023-12    0.046620
2023-7     0.012432
2023-8     0.029526
2023-9     0.026807
2024-1     0.031080
2024-2     0.007770
2024-3     0.123543
2024-4     0.348485
2024-5     0.308469
Name: proportion, dtype: float64

In [20]:
df.shape

(2574, 19)

In [21]:
df["index"] = range(df.shape[0])

In [22]:
query = f"""
select 
    *
    ,row_number() over (partition by 问题 order by 编号) as count_no
    ,count(*) over (partition by 问题) as count_unique
from 
    df
"""

# 使用pysqldf执行SQL查询
df1 = pysqldf(query)

In [23]:
df1[df1['count_unique']>1].sort_values(["问题", "提问日期"])

Unnamed: 0,编号,问题,回复1,回复1附件,回复1标题,回复2,回复2附件,回复2标题,是否解决,提问者,...,类型,正确回复,提问日期(供统计用),匹配的标准问题编号,备注,父记录,date,index,count_no,count_unique
70,ICASK202405227738,A20序列号位置？,score: 0\n类型:\n常见问题\n问题:\nA20序列号位置？\n回复:\n1.机器...,image.png,A20序列号位置？,score: 0.188311100006\n类型:\n日常使用\n问题:\n每个档位都可以...,,每个档位都可以智能检测脏污度？,,衡晶晶,...,,,2024-05-22 00:00:00.000000,ICWIKI202405222688,,,2024-5,115,1,2
71,ICASK202405227739,A20序列号位置？,score: 0\n类型:\n常见问题\n问题:\nA20序列号位置？\n回复:\n1.机器...,img_v3_02b4_2d5ce0de-690c-4058-946c-29b9ca6b40...,A20序列号位置？,score: 0.188311100006\n类型:\n日常使用\n问题:\n每个档位都可以...,,每个档位都可以智能检测脏污度？,,衡晶晶,...,,,2024-05-22 00:00:00.000000,ICWIKI202405222688,,,2024-5,114,2,2
80,ICASK202405067282,A20续航,score: 0\n类型:\n产品参数\n问题:\nA20 air 的电池续航时间？\n回复...,,A20 air 的电池续航时间？,score: 0\n类型:\n产品参数\n问题:\nA20 air的电池续航时间\n回复:\...,,A20 air的电池续航时间,,苑倓姿,...,,,2024-05-06 00:00:00.000000,,没有匹配答案,,2024-5,570,1,2
81,ICASK202405167610,A20续航,score: 0\n类型:\n产品参数\n问题:\nA20 air 的电池续航时间？\n回复...,,A20 air 的电池续航时间？,score: 0\n类型:\n产品参数\n问题:\nA20 air的电池续航时间\n回复:\...,,A20 air的电池续航时间,,田翠萍,...,,,2024-05-16 00:00:00.000000,,问题机型无法匹配问题编号,,2024-5,243,2,2
82,ICASK202404276949,A20自清洁,score: 0\n类型:\n日常使用\n问题:\n怎样进行自清洁？\n回复:\n您好，将机...,,怎样进行自清洁？,score: 0\n类型:\n日常使用\n问题:\n什么是智能自清洁\n回复:\n智能自清洁...,,什么是智能自清洁,,田翠萍,...,,,2024-04-27 00:00:00.000000,ICWIKI202401031509，ICWIKI202401031511，ICWIKI20...,问题模糊，匹配答案较多,,2024-4,902,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2547,ICASK202309081327,集尘电压异常,score: 0\n类型:\n故障问题\n问题:\n错误35-集尘电压异常\n回复:\n3....,img_v2_d77347d6-2b35-481e-9a7a-7284b5c5ba8g.jpg,错误35-集尘电压异常,score: 0.137186169624\n类型:\n日常使用\n问题:\n集尘时为什么会...,,集尘时为什么会有异味？,已解决,田永强,...,,,2023-09-08 11:13:00.000000,ICWIKI202308220097,不确定机型,,2023-9,2453,3,5
2548,ICASK202309221629,集尘电压异常,score: 0\n类型:\n故障问题\n问题:\n错误35-集尘电压异常\n回复:\n3....,img_v2_d77347d6-2b35-481e-9a7a-7284b5c5ba8g.jpg,错误35-集尘电压异常,score: 0.137169599533\n类型:\n故障问题\n问题:\n集尘效果差或集...,,集尘效果差或集尘不干净,已解决,陈蕾,...,,,2023-09-22 19:07:00.000000,ICWIKI202308220097,不确定机型,,2023-9,2412,4,5
2549,ICASK202312174374,集尘电压异常,score: 0\n类型:\n故障问题\n问题:\n错误35-集尘电压异常\n回复:\n3....,img_v2_d77347d6-2b35-481e-9a7a-7284b5c5ba8g.jpg,错误35-集尘电压异常,score: 0.137169599533\n类型:\n故障问题\n问题:\n集尘效果差或集...,,集尘效果差或集尘不干净,,李国亮,...,,,2023-12-17 00:00:00.000000,ICWIKI202308220097,不确定机型,,2023-12,2170,5,5
2566,ICASK202403296048,风机异常,score: 0\n类型:\n故障问题\n问题:\n洗衣机故障代码E18，故障描述烘干再生风...,,洗衣机故障代码E18，故障描述烘干再生风机异常,score: 0\n类型:\n故障问题\n问题:\n洗衣机故障代码E16，故障描述烘干循环风...,,洗衣机故障代码E16，故障描述烘干循环风机异常,,蒋盼盼,...,,,2024-03-29 00:00:00.000000,ICWIKI202307243866，ICWIKI202307243868，ICWIKI20...,不确定机型,,2024-3,1802,1,2


In [24]:
df1 = df1[(df1['count_unique']==df1['count_no'])]

In [25]:
df1.shape

(1733, 22)

In [26]:
df["问题"].duplicated().sum()

841

In [27]:
df.shape

(2574, 20)

In [28]:
1733+841

2574

In [29]:
df = df.set_index("index").loc[df1["index"].tolist()]

In [30]:
df["model_list"] = df["问题"].apply(lambda x: find_model_with_pos(x, all_model_list))

In [31]:
df["error_list"] = df["问题"].apply(lambda x: find_error_with_reason_with_pos(x))

In [32]:
df["has_model"] = df["model_list"].apply(lambda x: len(x)>0)
df["has_error"] = df["error_list"].apply(lambda x: len(x)>0)
df["gt_isnull"] = df["匹配的标准问题编号"].isnull()
df["gt_notnull"] = df["匹配的标准问题编号"].notnull()

In [33]:
df["has_error"].value_counts()

has_error
False    1601
True      132
Name: count, dtype: int64

In [34]:
df["has_model"].value_counts()

has_model
False    952
True     781
Name: count, dtype: int64

In [35]:
def split_answer(x):
    if str(x) == "nan":
        return np.nan
    else:
        candidate = x.split("，")
        if len(candidate)==1:
            candidate = x.split(",")
        candidate = [j.strip() for j in candidate]
        return candidate

def get_list_num(x):
    if str(x) == "nan":
        return np.nan
    else:
        return len(x)

In [36]:
df["gt_list"] = df["匹配的标准问题编号"].apply(lambda x: split_answer(x))
df["gt_num"] = df["gt_list"].apply(lambda x: get_list_num(x))

In [37]:
uncertainty = ["不太确定",
"不确定",
"不确定机型",
"不确定机型，问题不明确",
"不确定问题具体方向",
"不确定问题意思及机型",
"匹配度不高",
"无法判断问题具体意思，匹配相似答案多个",
"机型不明确，导致匹配到的标准问题有多个",
"由于机型不明确，导致匹配到的标准问题有多个",
"由于机型不明确，导致匹配答案过多\n\n\n\n\n\n\n\n\n",
"由于机型问题不明确，导致匹配到的标准问题有多个",
"部确定机型",
"问题不明确",
"问题模糊，匹配答案较多"]
df["if_t"] = 1
for type_name in uncertainty:
    if type_name == "不确定机型":
        df.loc[(df["备注"]==type_name)&(df["has_model"]==False)&(df["has_error"]==False)&(df["gt_num"]>3), "if_t"] = 0
    else:
        df.loc[df["备注"]==type_name, "if_t"] = 0

In [38]:
uncertainty = ["不太确定",
"不确定",
# "不确定机型",
# "不确定机型，问题不明确",
"不确定问题具体方向",
# "不确定问题意思及机型",
"匹配度不高",
"无法判断问题具体意思，匹配相似答案多个",
# "机型不明确，导致匹配到的标准问题有多个",
# "由于机型不明确，导致匹配到的标准问题有多个",
# "由于机型不明确，导致匹配答案过多\n\n\n\n\n\n\n\n\n",
# "由于机型问题不明确，导致匹配到的标准问题有多个",
# "部确定机型",
"问题不明确",
"问题模糊，匹配答案较多"]
df["if_t2"] = 1
for type_name in uncertainty:
    df.loc[(df["备注"]==type_name)&(~((df["has_model"]==False)&(df["has_error"]==False))), "if_t2"] = 0

In [39]:
df["备注2"] = df["备注"]
df.loc[df["备注"].isnull(), "备注2"] = 'nan'

In [40]:
df.groupby(["备注2", "has_model", "has_error"]).agg({"gt_isnull": ["sum", "mean"], "gt_notnull": "sum", "gt_num": ["max", "min"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gt_isnull,gt_isnull,gt_notnull,gt_num,gt_num
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean,sum,max,min
备注2,has_model,has_error,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
,False,False,0,0.0,75,11.0,1.0
,False,True,0,0.0,2,1.0,1.0
,True,False,0,0.0,403,11.0,1.0
,True,True,0,0.0,50,2.0,1.0
不太确定,False,False,0,0.0,2,1.0,1.0
不太确定,True,False,0,0.0,4,1.0,1.0
不确定,False,False,0,0.0,25,10.0,1.0
不确定,True,False,0,0.0,17,10.0,1.0
不确定机型,False,False,2,0.004706,423,11.0,1.0
不确定机型,False,True,0,0.0,55,10.0,1.0


In [41]:
df["label"] = 0
df.loc[df["匹配的标准问题编号"].isnull(), "label"] = 1

In [42]:
# hard_type = ["由于机型不明确，导致匹配到的标准问题有多个",
#              "机型不明确，导致匹配到的标准问题有多个",
#              "由于机型不明确，导致匹配答案过多\n\n\n\n\n\n\n\n\n",
#              "由于机型问题不明确，导致匹配到的标准问题有多个",
#              "问题机型无法匹配问题编号",
#              "问题模糊，匹配答案较多"]
# for type_name in hard_type:
#     df.loc[df["备注"]==type_name, "label"] = 0

In [43]:
df["index"] = range(df.shape[0])
df["index"] = df["index"].astype(str)
df["question"] = df["问题"].apply(lambda x: x.lower())

In [44]:
train_all_df = df.copy()

In [45]:
train_all_df["label"].value_counts()

label
0    1089
1     644
Name: count, dtype: int64

In [46]:
train_all_df["label"].value_counts("mean")

label
0    0.62839
1    0.37161
Name: proportion, dtype: float64

In [47]:
#数据集分成N份,训练集DATAPARTS是1、2、3，评测样本为9
KFOLD = 3
y = "label"
folds = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=2020)
ar = list(folds.split(train_all_df.index, 
                      train_all_df[y].values))
dp = [train_all_df.index.values[i[1]] for i in ar]
for i,j in enumerate(dp):
    if i in [0, 1]:
        train_all_df.loc[j, 'set']='1train'
    elif i == 2:
        train_all_df.loc[j, 'set']='2test'

In [48]:
train_all_df["label2"] = train_all_df["label"]
train_all_df.loc[train_all_df["if_t"]==0, "label2"] = 1

In [49]:
train_all_df["label2_2"] = train_all_df["label"]
train_all_df.loc[train_all_df["if_t2"]==0, "label2_2"] = 1

In [50]:
train_all_df.groupby("set").agg({"label2": [len, "sum","mean"]})

Unnamed: 0_level_0,label2,label2,label2
Unnamed: 0_level_1,len,sum,mean
set,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1train,1156,608,0.525952
2test,577,298,0.516464


In [51]:
train_all_df.groupby("set").agg({"label2_2": [len, "sum","mean"]})

Unnamed: 0_level_0,label2_2,label2_2,label2_2
Unnamed: 0_level_1,len,sum,mean
set,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1train,1156,454,0.392734
2test,577,227,0.393414


In [52]:
train_all_df.groupby("set").agg({y: [len, "sum","mean"]})

Unnamed: 0_level_0,label,label,label
Unnamed: 0_level_1,len,sum,mean
set,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1train,1156,430,0.371972
2test,577,214,0.370884


In [53]:
features = Features({'sentence': Value(dtype='string'),
                     'label': Value(dtype='int32'),
                     "set": Value(dtype='string'),
                     "index": Value(dtype='string'),
                     "if_t": Value(dtype='int32')})

raw_datasets = Dataset.from_pandas(train_all_df[["question", "label", "set", "index", "if_t"]].rename(columns={"question": "sentence"}),
                    features=features, preserve_index=False)

In [105]:
raw_datasets.save_to_disk("/data/dataset/kefu/router_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/1733 [00:00<?, ? examples/s]

In [106]:
# raw_datasets = load_from_disk("/data/dataset/kefu/router_dataset")

In [54]:
train_dataset = raw_datasets.filter(lambda example: (example['set'] == '1train'))
test_dataset = raw_datasets.filter(lambda example: (example['set'] == '2test'))

Filter:   0%|          | 0/1733 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1733 [00:00<?, ? examples/s]

In [55]:
new_datasets = DatasetDict({
    'train': train_dataset,  
    'test': test_dataset    
})

In [56]:
# checkpoint = "/data/dataset/huggingface/hub/bert-base-chinese/"
checkpoint = "/workspace/data/private/zhuxiaohai/models/bert-finetuned-router/"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [57]:
def tokenize_function(example):
    return tokenizer(example["sentence"], truncation=True)

tokenized_datasets = new_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/1156 [00:00<?, ? examples/s]

Map:   0%|          | 0/577 [00:00<?, ? examples/s]

In [58]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'set', 'index', 'if_t', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1156
    })
    test: Dataset({
        features: ['sentence', 'label', 'set', 'index', 'if_t', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 577
    })
})

In [59]:
raw_tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/1733 [00:00<?, ? examples/s]

In [60]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [61]:
from transformers import AutoModelForSequenceClassification
# checkpoint = "/workspace/data/private/zhuxiaohai/models/bert-router/checkpoint-30"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [62]:
from transformers import TrainingArguments

In [63]:
training_args = TrainingArguments(
    "/workspace/data/private/zhuxiaohai/models/bert-router2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    # lr_scheduler_type="cosine",
    per_device_train_batch_size=64,
    push_to_hub=False,
)

In [64]:
import evaluate
from scipy.stats import ks_2samp

In [65]:
metric = evaluate.load("roc_auc")

In [66]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = logits[:, 1]
    result = metric.compute(prediction_scores=predictions, references=labels)
    ks = ks_2samp(predictions[labels==1],predictions[labels==0]).statistic
    result.update({"ks": ks})
    return result

In [67]:
from transformers import Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [56]:
trainer.train()



Epoch,Training Loss,Validation Loss,Roc Auc,Ks
1,No log,0.683267,0.509805,0.08942
2,No log,0.654957,0.553887,0.102521
3,No log,0.647329,0.619763,0.202324
4,No log,0.630108,0.659951,0.301322
5,No log,0.626844,0.671981,0.302558
6,No log,0.625573,0.674741,0.325547
7,No log,0.626143,0.676781,0.329873
8,No log,0.627241,0.68053,0.341367
9,No log,0.626315,0.684547,0.338462
10,No log,0.626681,0.683805,0.338462




TrainOutput(global_step=20, training_loss=0.6002439498901367, metrics={'train_runtime': 47.8049, 'train_samples_per_second': 187.01, 'train_steps_per_second': 0.418, 'total_flos': 146782052231400.0, 'train_loss': 0.6002439498901367, 'epoch': 10.0})

In [370]:
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 0.6229077577590942,
 'eval_roc_auc': 0.7131896707087871,
 'eval_ks': 0.3146288715532556,
 'eval_runtime': 1.0902,
 'eval_samples_per_second': 529.248,
 'eval_steps_per_second': 66.959}

In [71]:
a = trainer.predict(raw_tokenized_datasets)

In [72]:
train_all_df["pred"] = a.predictions[:, 1]

In [73]:
train_all_df["pred"].mean()

-0.36177596

In [68]:
from find_keywords import LabellerByRules
from recall import RecallBySearchEngine
from search_engine import QASearchEngine, VectorSim
import json

In [69]:
config_router = {
    "dim_df_path": "temp_dim.csv",
    "model_col": ("model", "model"),
    "cat_col": ("cat_name", "cat"),
    "error_col": ("error", "error"),
}
router = LabellerByRules(config_router)

config = {
    "search_engine": {
        "class": QASearchEngine,
        "database_path": "temp_data.csv",
        "id_col": "qa_id",
        "index_columns": [("model_list", "model"), ("cat_name", "cat"), ("error_list", "error")],
        "score_model": {
            "type": "vector",
            "class": VectorSim,
            "embedding_col": "question",
            "embedding_model_path": "/workspace/data/private/zhuxiaohai/models/bge_finetune_emb"
        },
    },
    "top_n": 10,
}
top_n = config.pop("top_n")
vector_search = RecallBySearchEngine(config)

In [74]:
test = train_all_df.copy()

In [75]:
all = []
for i in range(test.shape[0]):
    if i%10==0:
        print(i)
    query = test["question"].iloc[i]
    query_body = router.extract_keywords(query)
    query_cleaned = query_body["query_cleaned"]
    search_body = {
        "query": query_cleaned,
        "top_n": top_n,
        "labels": query_body["labels"]
    }
    results = vector_search.query_recalls(search_body)
    all.append(results)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860
870
880
890
900
910
920
930
940
950
960
970
980
990
1000
1010
1020
1030
1040
1050
1060
1070
1080
1090
1100
1110
1120
1130
1140
1150
1160
1170
1180
1190
1200
1210
1220
1230
1240
1250
1260
1270
1280
1290
1300
1310
1320
1330
1340
1350
1360
1370
1380
1390
1400
1410
1420
1430
1440
1450
1460
1470
1480
1490
1500
1510
1520
1530
1540
1550
1560
1570
1580
1590
1600
1610
1620
1630
1640
1650
1660
1670
1680
1690
1700
1710
1720
1730


In [78]:
len(all)

1733

In [79]:
all = [json.dumps(all[i]) for i in range(len(all))]

In [81]:
train_all_df["recall"] = all

In [None]:
train_all_df[["编号", "label", "pred", "set", "if_t", "label2", "label2_2", "if_t2",
              "recall"]].to_csv("validate3.csv", index=None)

In [84]:
# 将DataFrame转换为Dataset
new_field_dataset = Dataset.from_pandas(train_all_df[["pred"]])

# 将新字段添加到现有dataset中
raw_tokenized_datasets = raw_tokenized_datasets.add_column('new_field_name', new_field_dataset['pred'])

In [85]:
# 将DataFrame转换为Dataset
new_field_dataset = Dataset.from_pandas(train_all_df[["label2"]])

# 将新字段添加到现有dataset中
raw_tokenized_datasets = raw_tokenized_datasets.add_column('label2', new_field_dataset['label2'])

In [86]:
# 将DataFrame转换为Dataset
new_field_dataset = Dataset.from_pandas(train_all_df[["label2_2"]])

# 将新字段添加到现有dataset中
raw_tokenized_datasets = raw_tokenized_datasets.add_column('label2_2', new_field_dataset['label2_2'])

In [87]:
train_dataset = raw_tokenized_datasets.filter(lambda example: (example['set'] == '1train')&(example['new_field_name']<=0.115))
test_dataset = raw_tokenized_datasets.filter(lambda example: (example['set'] == '2test')&(example['new_field_name']<=0.115))

Filter:   0%|          | 0/1733 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1733 [00:00<?, ? examples/s]

In [88]:
train_dataset

Dataset({
    features: ['sentence', 'label', 'set', 'index', 'if_t', 'input_ids', 'token_type_ids', 'attention_mask', 'new_field_name', 'label2', 'label2_2'],
    num_rows: 780
})

In [89]:
test_dataset

Dataset({
    features: ['sentence', 'label', 'set', 'index', 'if_t', 'input_ids', 'token_type_ids', 'attention_mask', 'new_field_name', 'label2', 'label2_2'],
    num_rows: 420
})

In [90]:
# 去掉一个字段
train_dataset = train_dataset.remove_columns('label')
# 重命名一个字段
train_dataset = train_dataset.rename_column('label2_2', 'label')
# 去掉一个字段
test_dataset = test_dataset.remove_columns('label')
# 重命名一个字段
test_dataset = test_dataset.rename_column('label2_2', 'label')

In [91]:
train_dataset

Dataset({
    features: ['sentence', 'set', 'index', 'if_t', 'input_ids', 'token_type_ids', 'attention_mask', 'new_field_name', 'label2', 'label'],
    num_rows: 780
})

In [92]:
test_dataset

Dataset({
    features: ['sentence', 'set', 'index', 'if_t', 'input_ids', 'token_type_ids', 'attention_mask', 'new_field_name', 'label2', 'label'],
    num_rows: 420
})

In [93]:
checkpoint = "/data/dataset/huggingface/hub/bert-base-chinese/"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
from transformers import AutoModelForSequenceClassification
# checkpoint = "/workspace/data/private/zhuxiaohai/models/bert-router2/checkpoint-39"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /data/dataset/huggingface/hub/bert-base-chinese/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [94]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [95]:
new_datasets = DatasetDict({
    'train': train_dataset,  
    'test': test_dataset    
})

In [96]:
training_args = TrainingArguments(
    "/workspace/data/private/zhuxiaohai/models/bert-router3",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    # lr_scheduler_type="cosine",
    per_device_train_batch_size=64,
    push_to_hub=False,
)

In [97]:
from transformers import Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=new_datasets["train"],
    eval_dataset=new_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [98]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc,Ks
1,No log,0.639568,0.522549,0.095186
2,No log,0.712873,0.553464,0.102373
3,No log,0.75067,0.580176,0.162441
4,No log,0.780728,0.586305,0.149695
5,No log,0.91485,0.586658,0.163254
6,No log,0.914972,0.567051,0.103864
7,No log,0.975367,0.565288,0.135322
8,No log,1.014985,0.570658,0.119864
9,No log,1.085411,0.578793,0.137356
10,No log,1.085337,0.585817,0.15322


Checkpoint destination directory /workspace/data/private/zhuxiaohai/models/bert-router3/checkpoint-13 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory /workspace/data/private/zhuxiaohai/models/bert-router3/checkpoint-26 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory /workspace/data/private/zhuxiaohai/models/bert-router3/checkpoint-39 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory /workspace/data/private/zhuxiaohai/models/bert-router3/checkpoint-52 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory /workspace/data/private/zhuxiaohai/models/bert-router3/checkpoint-65 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory /workspace/data/private/zhu

TrainOutput(global_step=130, training_loss=0.26006642855130707, metrics={'train_runtime': 54.8728, 'train_samples_per_second': 142.147, 'train_steps_per_second': 2.369, 'total_flos': 87299425946400.0, 'train_loss': 0.26006642855130707, 'epoch': 10.0})

In [99]:
trainer.evaluate(new_datasets["test"])

{'eval_loss': 1.0853369235992432,
 'eval_roc_auc': 0.5858169491525423,
 'eval_ks': 0.15322033898305085,
 'eval_runtime': 0.7918,
 'eval_samples_per_second': 530.407,
 'eval_steps_per_second': 66.932,
 'epoch': 10.0}