In [1]:
import pandas as pd 
import datetime
import numpy as np 
import joblib
import re  
import jieba 
import json
import time
from openai import AzureOpenAI
from pandasql import sqldf

In [2]:
# pandasql查询函数需要的环境
pysqldf = lambda q: sqldf(q, globals())

In [2]:
# 原始数据处理
def format_model(x):
    model_list = x.split(',')
    model_list = [i.strip().lower().replace(" ", "") for i in model_list]
    new_list = [model_list[0]]
    i = 1
    while i < len(model_list):
        if (i != len(model_list) - 1) and (model_list[i-1] == model_list[i]):
            new_list.append(model_list[i]+model_list[i+1])
            if i < len(model_list) - 1:
                i += 2
            else:
                break
        elif (i != len(model_list) - 1) and (model_list[i-1] != model_list[i]):
            new_list.append(model_list[i])
            i += 1
        elif (model_list[i] == "上下水") or (model_list[i] == "air"):
            for j in range(len(new_list)):
                if model_list[i-1] == new_list[j]:
                    new_list.pop(j)
                    break
            new_list.append(model_list[i-1]+model_list[i])
            i += 1
        else:
            new_list.append(model_list[i])
            break
    return new_list

def format_all_models(x, dim_df):
    new_list = []
    for i in x:
        if i.find("全型号") >= 0:
            end_idx = i.find("全型号")
            name = i[:end_idx]
            new_list += [j for j in dim_df[dim_df['cat_name'] == name].model.tolist() if j not in x]
        else:
            new_list.append(i)
    return new_list

def format_series(x, dim_df):
    def contains_chinese(s):
        return re.search('[\u4e00-\u9fff]', s) is not None
    new_list = []
    for i in x:
        if i.find("系列") >= 0:
            end_idx = i.find("系列")
            name = i[:end_idx]
            new_list += [j for j in dim_df[(dim_df.model.str.find(name)>=0) & (
                dim_df.model.apply(lambda x: not contains_chinese(x)))].model.tolist() if j not in x]
            new_list += [i]
        else:
            new_list.append(i)
    return new_list

In [3]:
def count_gt(x):
    if str(x) == "nan":
        return 0
    else:
        return len(x.split(","))  
    
def find_non_chinese_substrings(s):
    # 正则表达式解释：
    # [^\u4e00-\u9fff\W]+ 匹配非中文字符和非ASCII标点的连续字符
    # 但这样会排除空格，所以我们需要允许空格存在
    # 我们使用(?:[^\u4e00-\u9fff\W]| )+ 来实现这一点，(?:) 是非捕获组，用于匹配模式但不作为捕获结果返回
    # [^\u4e00-\u9fff\W] 匹配非中文且非标点的字符，| 表示或，空格 ' ' 被显式允许
    pattern = r'(?:[^\u4e00-\u9fff\W]| )+'
    
    # 使用findall方法查找所有匹配项
    matches = re.findall(pattern, s)
    
    # 过滤掉只包含空格的字符串
    matches = [match for match in matches if not match.isspace()]
    
    return matches

def clean_string(s):
    s = s.replace(" ", "").lower()
    return s

def find_model(x, all_model_list):
    x = x.replace("\n", "") 
    x = find_non_chinese_substrings(x)
    result = [clean_string(s) for s in x]
    return [model for model in all_model_list if model in result]

def find_cat(x, all_cat_list):
    return [name for name in all_cat_list if name in x]   

def filter_model(x, model_list):
    x = x.split(",")
    for model in model_list:
        if model in x:
            return True
    return False

def find_error_with_reason(a):
    # 第一次匹配“错误xxx”
    pattern1 = r"错误\s*\d+"
    matches1 = re.findall(pattern1, a)
    
    # 第二次匹配“错误原因xxx”
    pattern2 = r"错误原因\s*\d+"
    matches2 = re.findall(pattern2, a)

    # 合并两次匹配的结果
    matches = matches1 + matches2
    
    return [name.replace(" ", "").replace("原因", "") for name in matches]

def filter_reason(x, query_reason_list):
    reason_list = find_error_with_reason(x)
    for name in query_reason_list:
        if name in reason_list:
            return True 
    return False

def transform_model_name(x, all_model_list):
    x = x.replace("\n", "") 
    candidates = find_non_chinese_substrings(x)
    for name in candidates:
        cleaned_name = clean_string(name)
        for model in all_model_list:
            if cleaned_name == model:
                x = x.replace(name, model)
                break
    return x 

def remove_model_name(x, all_model_list):
    x = x.replace("\n", "") 
    candidates = find_non_chinese_substrings(x)
    for name in candidates:
        if clean_string(name) in all_model_list:
            x = x.replace(name, "")
    return x 

In [5]:
df2 = pd.read_csv("/data/dataset/kefu/database_with_emb20240315.csv")

In [6]:
oot = pd.read_excel("/data/dataset/kefu/国内客服助手（生产环境）_中转栈.xlsx")
oot = oot.rename(columns={"编号": "qa_id",
 "问题": "question",
 "回复1": "answer1_all",
 "回复1标题": "answer1",
 "回复2": "answer2_all",
 "回复2标题": "answer2",
 "是否解决": "if_solved",
 "提问者": "requester",
 "提问者所在组别": "requester_group",
 "提问日期": "request_time",
 "类型": "data_type", 
 "正确回复": "gt_answer"})
oot = oot.drop(["回复1附件", "回复2附件", "提问日期(供统计用)"], axis=1)

In [7]:
# oot = oot[oot.if_solved.notnull()]
# oot = oot[oot.answer1_all.notnull()]
oot["if_solved"] = oot["if_solved"].map({"已解决": 1, "未解决": 0})
oot.loc[oot.qa_id=="ICASK202308010583", "gt_answer"] = "ICWIKI202307243975"
oot.loc[oot.qa_id=="ICASK202308010582", "gt_answer"] = "ICWIKI202308210081"
# oot = oot[oot['gt_answer'].str.find("ICW")>=0]
oot = oot.rename(columns={"gt_answer": "gt_qa_id"})
oot = oot[oot.question.notnull()]

temp = oot.copy()
temp["gt_qa_id"] = temp["gt_qa_id"].astype(str).apply(lambda x: x.split(','))
temp_exploded = temp.explode("gt_qa_id")
temp_right = df2[['qa_id', 
                               'question', 
                               'answer', 
                               'model', 
                               'qa_type', 
                               'model_list', 
                               'cat_name']].copy()
query = f"""
select 
    a.*
    ,b.question as question_kg
    ,b.answer as answer_kg
    ,b.model as model
    ,b.qa_type
    ,b.model_list
    ,b.cat_name
from 
    temp_exploded a 
left join 
    temp_right b
on 
    a.gt_qa_id = b.qa_id
"""

# 使用pysqldf执行SQL查询
temp_exploded = pysqldf(query)

In [8]:
query = f"""
select 
    qa_id
    ,group_concat(question_kg) as question_kg
    ,group_concat(answer_kg) as answer_kg
    ,group_concat(model) as model
    ,group_concat(qa_type) as qa_type
    ,group_concat(model_list) as model_list
    ,group_concat(cat_name) as cat_name
from 
    temp_exploded
group by 
    qa_id
"""

# 使用pysqldf执行SQL查询
temp_exploded = pysqldf(query)

In [9]:
query = f"""
select 
    a.*
    ,b.question_kg
    ,b.answer_kg
    ,b.model
    ,b.qa_type
    ,b.model_list
    ,b.cat_name
from 
    oot a 
left join 
    temp_exploded b
on
    a.qa_id = b.qa_id
"""

# 使用pysqldf执行SQL查询
oot = pysqldf(query)

oot['gt_num'] = oot['gt_qa_id'].astype(str).apply(lambda x: count_gt(x))

In [12]:
joblib.dump(oot[["qa_id", 
     "question",	
     "gt_qa_id",
     "gt_num",
     "question_kg",
     "answer_kg",
     "model",
     "qa_type",
     "model_list",
     "cat_name",
     ]], "/data/dataset/kefu/oot20240422.json")

['/data/dataset/kefu/oot20240422.json']

In [11]:
oot = joblib.load("/data/dataset/kefu/oot20240422.json")

In [11]:
oot.iloc[:2].to_json(force_ascii=False)

'{"qa_id":{"0":"ICASK202403145752","1":"ICASK202403145751"},"question":{"0":"H1 neo 羊毛洗烘干温度","1":"p10烘干"},"gt_qa_id":{"0":null,"1":null},"gt_num":{"0":1,"1":1},"question_kg":{"0":null,"1":null},"answer_kg":{"0":null,"1":null},"model":{"0":null,"1":null},"qa_type":{"0":null,"1":null},"model_list":{"0":null,"1":null},"cat_name":{"0":null,"1":null}}'

In [5]:
def find_non_chinese_substrings(s):
    # 正则表达式解释：
    # [^\u4e00-\u9fff\W]+ 匹配非中文字符和非ASCII标点的连续字符
    # 但这样会排除空格，所以我们需要允许空格存在
    # 我们使用(?:[^\u4e00-\u9fff\W]| )+ 来实现这一点，(?:) 是非捕获组，用于匹配模式但不作为捕获结果返回
    # [^\u4e00-\u9fff\W] 匹配非中文且非标点的字符，| 表示或，空格 ' ' 被显式允许
    pattern = r'(?:[^\u4e00-\u9fff\W]| )+'
    
    # 使用findall方法查找所有匹配项
    matches = re.findall(pattern, s)
    
    # 过滤掉只包含空格的字符串
    matches = [match for match in matches if not match.isspace()]
    
    return matches

def clean_string(s):
    s = s.replace(" ", "").lower()
    return s

def transform_model_name(x, all_model_list):
    x = x.replace("\n", "") 
    candidates = find_non_chinese_substrings(x)
    for name in candidates:
        cleaned_name = clean_string(name)
        for model in all_model_list:
            if cleaned_name == model:
                x = x.replace(name, model)
                break
    return x 

In [294]:
dim_df = pd.read_csv("/data/dataset/kefu/dim_df20240315.csv")
all_model_list = dim_df.model.tolist()
all_cat_list = dim_df.cat_name.unique().tolist()

In [6]:
from preprocessing import extract_versions, extract_models, WordCut

def extract_keywords(question, all_model_list, wc):
    original_question = question
    model_list, question = extract_models(question, all_model_list)
    version_list, question = extract_versions(question)
    key_words = wc.cut(question)
    key_words = [i for i in key_words if ((i.find("model")<0) and (i.find("version")<0))]
    return {"question": original_question, "model": model_list, "version": version_list, "keywords": key_words}


In [520]:
wc = WordCut()

In [521]:
oot["keywords"] = oot["question"].apply(
    lambda x: extract_keywords(x, all_model_list, wc))

In [63]:
oot[oot.version_keywords.apply(lambda x: len(x)>0)][["question", "keywords"]] 

Unnamed: 0,question,keywords
516,P10 Pro上下水版清洁液盒多少毫升,"{'model': ['p10pro'], 'version': ['上下水版'], 'ke..."
517,P10 Pro上下水版清洁液盒多大,"{'model': ['p10pro'], 'version': ['上下水版'], 'ke..."
745,G10S Pure上下水版本有银离子吗,"{'model': ['g10spure'], 'version': ['上下水版'], '..."
805,P10上下水版本怎么排水,"{'model': ['p10'], 'version': ['上下水版'], 'keywo..."
929,p10 pro上下水版 清洁液用完会提醒吗,"{'model': ['p10pro'], 'version': ['上下水版'], 'ke..."
1269,上下水版本挤不上水,"{'model': [], 'version': ['上下水版'], 'keywords':..."
1577,p10上下水版本清洁液加满用多久,"{'model': ['p10'], 'version': ['上下水版'], 'keywo..."
2366,G20上下水版本的产品的执行标准,"{'model': ['g20'], 'version': ['上下水版'], 'keywo..."
2517,P10PRO上下水版排水异常,"{'model': ['p10pro'], 'version': ['上下水版'], 'ke..."
2548,g20上下水版水箱怎么补水,"{'model': ['g20'], 'version': ['上下水版'], 'keywo..."


In [36]:
df_sweeping = pd.read_excel("/data/dataset/kefu/产品知识整理资料.xlsx", sheet_name="扫地机")

In [37]:
df_sweeping.loc[df_sweeping["上市时间"]==45323, "上市时间"] = datetime.datetime(2024, 2, 1, 0, 0)

In [38]:
df_mopping = pd.read_excel("/data/dataset/kefu/产品知识整理资料.xlsx", sheet_name="洗地机")

In [39]:
df_washing = pd.read_excel("/data/dataset/kefu/产品知识整理资料.xlsx", sheet_name="洗衣机")
df_washing = df_washing.iloc[:3]

In [40]:
for col in ["商品编码", "平台ID", "商品id"]:
    df_sweeping[col] = df_sweeping[col].astype(str)
    df_sweeping.loc[df_sweeping[col]=='nan', col] = np.nan
    df_mopping[col] = df_mopping[col].astype(str)
    df_mopping.loc[df_mopping[col]=='nan', col] = np.nan
    df_washing[col] = df_washing[col].astype(np.int64).astype(str)
    df_washing.loc[df_washing[col]=='nan', col] = np.nan

In [41]:
print(df_washing.shape)
print(df_mopping.shape)
print(df_sweeping.shape)

(3, 73)
(7, 87)
(14, 91)


In [42]:
df_washing.to_csv("/data/dataset/kefu/washing.csv", index=None)
df_mopping.to_csv("/data/dataset/kefu/mopping.csv", index=None)
df_sweeping.to_csv("/data/dataset/kefu/sweeping.csv", index=None)

In [43]:
df = pd.concat([df_washing, df_mopping, df_sweeping], axis=0).reset_index(drop=True)

In [44]:
df.head()

Unnamed: 0,商品编码,平台ID,商品id,商品型号,商品名字,商品分类,商品链接,平台,店铺名称,服务别名,...,主刷转速,有无虚拟墙,电源线长,是否支持银离子抑菌,功能,清扫模式,有无地毯自动增压模式,清扫路线,是否支持自动回洗拖布,保修期
0,701135838300,701135838300,701135838300,H1,石头12公斤分子筛洗烘一体H1家用烘干全自动滚筒洗衣机大容量除菌,洗衣机,https://detail.tmall.com/item.htm?id=701135838...,淘宝,石头电器旗舰店,,...,,,,,,,,,,
1,735903284970,735903284970,735903284970,H1 Neo,【新品】石头分子筛12KG洗烘一体家用全自动滚筒洗衣机H1Neo 除菌,洗衣机,https://detail.tmall.com/item.htm?id=735903284...,淘宝,石头电器旗舰店,,...,,,,,,,,,,
2,739010441482,739010441482,739010441482,M1,【新品】石头1kg内衣裤迷你分子筛洗烘一体全自动滚筒洗衣机M1,洗衣机,https://detail.tmall.com/item.htm?id=739010441...,淘宝,石头电器旗舰店,,...,,,,,,,,,,
3,5014561169968,714853608871,5014561169968,A10 Ultra,【全屋清洁】石头洗地机A10 Ultra家用除菌除螨贴边吸拖洗一体机,洗地机,https://detail.tmall.com/item.htm?id=714853608...,淘宝,石头电器旗舰店,,...,,,,,,,,,,
4,5213854695645,725413870652,5213854695645,A10 UltraE,【一机多用】石头洗地机A10 UltraE家用除菌除螨贴边吸拖洗一体机,洗地机,https://detail.tmall.com/item.htm?id=725413870...,淘宝,石头电器旗舰店,,...,,,,,,,,,,


In [45]:
df.to_csv("/data/dataset/kefu/knowledge.csv", index=None)

In [199]:
def read_json(json_file='/data/dataset/kefu/gen_different_keywords_different_models.jsonl'):
    df = []
    # 打开文件进行读取
    with open(json_file, 'r') as file:
        # 逐行读取
        for line in file:
            # 将每行的内容从JSON字符串转换为Python字典
            data = json.loads(line.strip())
            # 现在可以处理这个字典了
            df.append(data)
    return df 

def extract_json(x):
    result = {"template": x["template"]}
    result["replace"] = x["gen"]["replace"]
    return json.dumps(result, ensure_ascii=False)

In [200]:
file_list = ['/data/dataset/kefu/gen_same_keywords_for_models.jsonl',
             '/data/dataset/kefu/gen_different_keywords_different_models.jsonl']
df = []
for file in file_list:
    temp = pd.DataFrame(read_json(file))
    temp['type'] = file.split('/')[-1].split(".")[0]
    df.append(temp)
df = pd.concat(df, axis=0).reset_index(drop=True)

In [201]:
df["final_prompt"] = df.apply(lambda x: extract_json(x), axis=1)

In [185]:
prompt = """
输入json含义如下：
1. template是一个问句，问句里面用[]符号括起来的部分是命名实体
2. replace是上述每个命名实体的具体值

请你按以下步骤操作：
1. 将template里面的每个命名实体替换成replace里面对应的值
2. 把这个句子改写得语言通顺或者完全更改template的原始句式且通顺，但是请保持“问询词”开头的命名实体不变
3. 在新生成的句子中，提取与原始“关键词”开头的所有命名实体对应的新实体，并用括号[]括起来
例如：
输入
{{"template": "[问询词0]的[关键词0]是多少？", "replace": {{"[问询词0]": "g20", "[关键词0]": "是否支持上下水"}}}},
输出
{{"sentence": "请问g20是否有[上下水功能]？", "replace": {{"是否支持上下水": "上下水功能"}}}}

请对每个输入给出三种完全不同的说法（包括句式更改、命名实体转义），最后放在一个列表里面返回, 除此之外不要给出任务其他的信息，即
[json0, json1, json2]

以下是我的输入：
{}
"""

In [706]:
list(all_dict.keys())

['商品链接',
 '平台',
 '店铺名称',
 '服务别名',
 '产品重量',
 '产品毛重',
 '充电时长',
 '主机额定功率',
 '上下水套件额定功率',
 '额定输入(电压)',
 '额定输入功率（充电 + 烘干状态）',
 '额定输入功率（集尘状态）',
 '额定输入功率（热水洗布）',
 '最大进水压力',
 '热水洗布',
 '动态复洗拖布',
 '耗材更换周期',
 '如何关闭/开启语音助手',
 '最大吸力',
 '吸力大小',
 '导航类型',
 '机身维护',
 '机载水箱容量',
 '地毯模式',
 '是否支持自动集尘',
 '基座水箱容量',
 '清洁液储存盒',
 '套餐类型',
 '机身尺寸',
 '整体尺寸',
 '包装尺寸',
 '电池容量',
 '电池续航',
 '水箱续航',
 '机载尘盒容量',
 '尘袋容量',
 '标配清单',
 '支持APP',
 '回洗方式',
 '清洗模式',
 '热水洗布.1',
 '集尘模式',
 '集尘频率',
 '烘干时长',
 '烘干温度',
 '扫拖模式',
 '充电模式',
 '是否有定时预约功能',
 '最大噪音',
 '拖地方式',
 '适用面积',
 '电器基站功能',
 '避障方式',
 '软件算法',
 '是否支持上下水',
 '烘干方式',
 '最高高度',
 '水箱类型',
 '是否支持烘干',
 '悬崖传感器',
 '污水箱容量',
 '升降模组',
 '清水箱容量',
 '扫地机类型',
 '采购地',
 '型号',
 '颜色分类',
 '附加功能',
 '实时视频',
 '质保年限',
 '是否带遥控器',
 '生产企业',
 '主刷转速',
 '有无虚拟墙',
 '电源线长',
 '是否支持银离子抑菌',
 '功能',
 '清扫模式',
 '品牌',
 '有无地毯自动增压模式',
 '清扫路线',
 '是否支持自动回洗拖布',
 '上市时间',
 '保修期',
 '烘干类型',
 '颜色分类(销售属性)',
 '自清洁需要用水量',
 '墙边漏扫',
 '工作噪音',
 '烘干噪音',
 '电源线',
 'Wi-Fi连接',
 '整体高度',
 '刷头尺寸',
 '地板刷尺寸',
 '床刷尺寸',
 '缝隙刷尺寸',
 '延长杆长度',
 '充电座尺寸',

In [719]:
prompt = """
我要做命名实体识别模型，请你在输入的句子中提取关键字（实体），要提取的关键字应该与以下这些家庭清洁机器人的常用参数有关，如果没有相关就不要提取任何东西，常用参数有：
{}

请将结果放在一个列表里面返回，除此之外不要输出其他任何信息，举例如下：
输入句子：
P20标准版的额定输入(电压)是多少？
输出：
["额定输入(电压)"]

输入句子：
请问a10ultra的清洁功能设定及a10ultrae的地面清洁调整是如何选择的？
输出：
["清洁功能设定", "地面清洁调整"]

输入句子：
A10报错？
输出：
[]

输入句子：
{}
""".format(json.dumps(list(all_dict.keys()), ensure_ascii=False), '我想知道g20标准版具有何种服务标识？')

In [720]:
from FlagEmbedding import FlagModel

RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
libpython3.10.so.1.0: cannot open shared object file: No such file or directory

In [721]:
print(prompt)


我要做命名实体识别模型，请你在输入的句子中提取关键字（实体），要提取的关键字应该与以下这些家庭清洁机器人的常用参数有关，如果没有相关就不要提取任何东西，常用参数有：
["商品链接", "平台", "店铺名称", "服务别名", "产品重量", "产品毛重", "充电时长", "主机额定功率", "上下水套件额定功率", "额定输入(电压)", "额定输入功率（充电 + 烘干状态）", "额定输入功率（集尘状态）", "额定输入功率（热水洗布）", "最大进水压力", "热水洗布", "动态复洗拖布", "耗材更换周期", "如何关闭/开启语音助手", "最大吸力", "吸力大小", "导航类型", "机身维护", "机载水箱容量", "地毯模式", "是否支持自动集尘", "基座水箱容量", "清洁液储存盒", "套餐类型", "机身尺寸", "整体尺寸", "包装尺寸", "电池容量", "电池续航", "水箱续航", "机载尘盒容量", "尘袋容量", "标配清单", "支持APP", "回洗方式", "清洗模式", "热水洗布.1", "集尘模式", "集尘频率", "烘干时长", "烘干温度", "扫拖模式", "充电模式", "是否有定时预约功能", "最大噪音", "拖地方式", "适用面积", "电器基站功能", "避障方式", "软件算法", "是否支持上下水", "烘干方式", "最高高度", "水箱类型", "是否支持烘干", "悬崖传感器", "污水箱容量", "升降模组", "清水箱容量", "扫地机类型", "采购地", "型号", "颜色分类", "附加功能", "实时视频", "质保年限", "是否带遥控器", "生产企业", "主刷转速", "有无虚拟墙", "电源线长", "是否支持银离子抑菌", "功能", "清扫模式", "品牌", "有无地毯自动增压模式", "清扫路线", "是否支持自动回洗拖布", "上市时间", "保修期", "烘干类型", "颜色分类(销售属性)", "自清洁需要用水量", "墙边漏扫", "工作噪音", "烘干噪音", "电源线", "Wi-Fi连接", "整体高度", "刷头尺寸", "地板刷尺寸", "床刷尺寸", "缝隙刷尺寸", "延长杆长度", "充电座尺寸", "洗地风机电功率", "吸

In [643]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('/workspace/data/private/zhuxiaohai/models/bge_finetune_emb')


In [698]:
dd = pd.read_csv("/data/dataset/kefu/emb.csv")
dd.columns = ['question', 'emb']

In [699]:
emb = []
for i in range(dd.shape[0]):
    a = json.loads(dd["emb"].iloc[i])
    emb.append(a)
dd['emb'] = emb

In [700]:
dd2 = pd.read_csv("/data/dataset/kefu/emb2.csv")
dd2.columns = ['question', 'emb']
emb = []
for i in range(dd.shape[0]):
    a = json.loads(dd2["emb"].iloc[i])
    emb.append(a)
dd2['emb'] = emb

In [688]:
q_embeddings = model.encode(dd.question.tolist(), normalize_embeddings=True, batch_size=32)

In [701]:
for i in range(dd.shape[0]):
    np.testing.assert_array_equal(np.array(dd['emb'].iloc[i]), np.array(dd2['emb'].iloc[i]))

In [682]:
q_embeddings[0].dtype

dtype('float32')

In [689]:
np.linalg.norm(q_embeddings[20], ord=2)

1.0

In [685]:
joblib.dump(q_embeddings, 'fff.json')

['fff.json']

In [687]:
joblib.load('fff.json').dtype

dtype('float32')

In [718]:
df_exploded.iloc[11]

cat                                                       sweeping
gen              {'question': 'g20标准版的服务别名是什么？', 'prompt': [{'p...
template                                         [问询词0]的[关键词0]是什么？
type                                  gen_same_keywords_for_models
final_prompt     {"template": "[问询词0]的[关键词0]是什么？", "replace": {...
augment          {'sentence': '我想知道g20标准版具有何种[服务标识]？', 'replace...
question                                       我想知道g20标准版具有何种服务标识？
keywords         {'question': '我想知道g20标准版具有何种服务标识？', 'model': [...
entity                                                      [服务别名]
simple_entity                                               [服务别名]
Name: 11, dtype: object

In [203]:

client = AzureOpenAI(
    api_version="2024-02-15-preview",
    azure_endpoint="https://csagent.openai.azure.com/",
    api_key="346ac6661e314a9d8b91b6a99202ba42"
)

In [204]:
def generate_answer(prompt, input_json, model="gpt-4-8k"): # model = "deployment_name"
    response = client.chat.completions.create(
    model=model, # model = "deployment_name".
    messages=[
        {"role": "user", "content": prompt.format(input_json)},
    ]
    )
    return response.choices[0].message.content

In [211]:
result = []
prev_time = time.time()
for i in range(df.shape[0]):
    if i % 10 == 0:
        print(i)
        cur_time = time.time()
        print(cur_time-prev_time)
        prev_time = cur_time
    item = generate_answer(prompt, df["final_prompt"].iloc[i])
    result.append(item)
    joblib.dump(item, "/data/dataset/kefu/temp/{}.jsonl".format(i))

0
0.0010311603546142578
10
54.72938394546509
20
74.10766887664795
30
66.59176898002625
40
53.349692821502686
50
72.52705764770508
60
195.05380153656006
70
58.798354625701904
80
66.6578299999237
90
56.18673753738403
100
59.12020993232727
110
68.89034223556519
120
65.12829995155334
130
72.94213485717773
140
67.03654384613037
150
73.078693151474
160
61.22945499420166
170
66.5701413154602
180
48.48047065734863
190
54.813912868499756
200
68.8300507068634
210
51.10998058319092
220
63.24893808364868
230
58.92753338813782
240
103.40141534805298
250
92.78640294075012
260
108.46890020370483
270
101.64776706695557
280
98.2324366569519
290
348.35827112197876
300
92.04819536209106
310
98.41790628433228
320
101.120445728302
330
85.83840441703796
340
94.09415578842163
350
108.66513323783875
360
99.65352845191956
370
103.49295353889465
380
88.19819498062134
390
96.7720844745636
400
104.1633551120758
410
64.83920764923096
420
79.10374331474304
430
83.6205108165741
440
83.25881052017212
450
83.564756870

In [252]:
result = []
for i in range(df.shape[0]):
    item = joblib.load("/data/dataset/kefu/temp/{}.jsonl".format(i))
    result.append(item)

In [270]:
new_result = []
for i in range(len(result)):
    print(i)
    try:
        item = re.search('：(.*)', result[i], re.DOTALL).group(1)
    except:
        item = result[i]
    
    try:
        item = json.loads(item)
    except:
        p = re.compile(r'\{(.*?)\}')
        matches = p.findall(item)
        item = [json.loads('{'+m+'}}') for m in matches]
    new_result.append(item)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [274]:
df["augment"] = new_result

In [393]:
joblib.dump(df, "/data/dataset/kefu/gen_gpt4_qa.json")

['/data/dataset/kefu/gen_gpt4_qa.json']

In [2]:
df = joblib.load("/data/dataset/kefu/gen_gpt4_qa.json")

In [3]:
df.augment.iloc[0]

[{'sentence': '你能提供一下g20标准版的[商品访问链接]吗？', 'replace': {'商品链接': '商品访问链接'}},
 {'sentence': 'g20标准版的[购买链接]可以获取吗？', 'replace': {'商品链接': '购买链接'}},
 {'sentence': '你知道g20标准版在哪里可以找到[在线购买页面链接]嘛？', 'replace': {'商品链接': '在线购买页面链接'}}]

In [4]:
df_exploded = df.explode("augment")

In [5]:
df_exploded = df_exploded.reset_index(drop=True)

In [6]:
df_exploded['question'] =  df_exploded['augment'].apply(lambda x: x["sentence"].replace("[", "").replace("]", ""))

In [7]:
error_indices = []
for i in range(df_exploded.shape[0]):
    replace = df_exploded['augment'].iloc[i]["replace"]
    for key in replace:
        if key.find("[")>=0:
            error_indices.append(i)
            break 

In [8]:
df_exploded.shape

(1419, 7)

In [9]:
df_exploded = df_exploded.drop([df_exploded.index[i] for i in error_indices])

In [10]:
df_exploded.shape

(1412, 7)

In [11]:
df_exploded.gen.iloc[0]

{'question': 'g20标准版的商品链接是什么？',
 'prompt': [{'primary_value': 'g20标准版',
   'key': '商品链接',
   '商品链接': 'https://detail.tmall.com/item.htm?id=707235140054&skuId=4969634692760&spm=a21dvs.23580594.0.0.3f063d0d9Hg2Xc'}],
 'replace': {'[问询词0]': 'g20标准版', '[关键词0]': '商品链接'}}

In [12]:
ner_list = []
for i in range(df_exploded.shape[0]):
    replace = df_exploded['augment'].iloc[i]["replace"]
    primaries = [j['primary_value'] for j in df_exploded['gen'].iloc[i]['prompt']]
    flag = False
    entity = []
    for key in replace:
        if replace[key] in primaries:
            print(i)
            flag = True
            break 
        entity.append(replace[key])
    if flag:
        print(i, 'type1 error')
        ner_list.append(np.nan)
        continue
    question =  df_exploded['question'].iloc[i]
    ner = {}
    for item in entity:
        matches = list(re.finditer(item, question))
        loc = [[j.start(), j.end()-1] for j in matches]
        ner[item] = loc 
    ner_list.append({"name": ner})

954
954 type1 error


In [13]:
df_exploded.iloc[1000]

cat                                                       mopping
gen             {'question': 'a10ultra的Wi-Fi连接和a10ultrae的机身尺寸是...
template                  [问询词0-0]的[关键词0-0]和[问询词1-0]的[关键词1-0]是什么？
type                      gen_different_keywords_different_models
final_prompt    {"template": "[问询词0-0]的[关键词0-0]和[问询词1-0]的[关键词1...
augment         {'sentence': '请问a10ultra的[WiFi灵敏度]与a10ultrae的[...
question                  请问a10ultra的WiFi灵敏度与a10ultrae的设备尺寸各是怎样的？
Name: 1000, dtype: object

In [14]:
ner_list[1000]

{'name': {'WiFi灵敏度': [[11, 17]], '设备尺寸': [[29, 32]]}}

In [15]:
df_exploded["ner_list"] = ner_list

In [16]:
def bio_tagging(texts, labels):
    bio_words = []
    bio_tags = []
    texts = texts.lower()
    for i, char in enumerate(texts):
        tag = "O"  # 默认为 Outside

        for entity_type, spans in labels.items():
            for span in spans.values():
                for ind in span:
                    if i == int(ind[0]):
                        tag = "B-" + entity_type
                        break
                    elif int(ind[0]) < i <= int(ind[1]):
                        tag = "I-" + entity_type
                        break

        bio_words.append(char)
        bio_tags.append(tag)

    return bio_words, bio_tags

In [17]:
df_exploded = df_exploded[df_exploded["ner_list"].notnull()]

In [18]:
 df_exploded[["question", "ner_list"]].head()

Unnamed: 0,question,ner_list
0,你能提供一下g20标准版的商品访问链接吗？,"{'name': {'商品访问链接': [[13, 18]]}}"
1,g20标准版的购买链接可以获取吗？,"{'name': {'购买链接': [[7, 10]]}}"
2,你知道g20标准版在哪里可以找到在线购买页面链接嘛？,"{'name': {'在线购买页面链接': [[16, 23]]}}"
3,您能告诉我g20标准版的平台是什么吗？,"{'name': {'平台': [[12, 13]]}}"
4,你知道g20标准版所使用的系统平台是什么吗？,"{'name': {'系统平台': [[13, 16]]}}"


In [19]:
df_exploded["bio"] = df_exploded[["question", "ner_list"]].apply(lambda x: bio_tagging(x["question"], x["ner_list"]), axis=1)

In [20]:
df_exploded["bio_words"] = df_exploded["bio"].apply(lambda x: x[0])
df_exploded["bio_tags"] = df_exploded["bio"].apply(lambda x: x[1])
df_exploded = df_exploded.drop("bio", axis=1)

In [21]:
tag2id = {'O': 0, 'B-name': 1, 'I-name': 2}

In [22]:
def tagging(x, tag2id):
    new_list = []
    for i in x:
        new_list.append(tag2id[i])
    return new_list

In [23]:
df_exploded["bio_tags_id"] = df_exploded["bio_tags"].apply(lambda x:tagging(x, tag2id))

In [24]:
from datasets import Dataset 

In [25]:
from datasets import Features, ClassLabel, Sequence, Value
features = Features({'ner_tags': Sequence(ClassLabel(num_classes=3, names=['O', 'B-name', 'I-name'])),
                     'tokens': Sequence(Value(dtype='string'))})
features

{'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-name', 'I-name'], id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [26]:
raw_datasets = Dataset.from_pandas(df_exploded[["bio_words", "bio_tags_id"]].rename(columns={"bio_words": "tokens", "bio_tags_id": "ner_tags"}),
                    features=features, preserve_index=False)

In [27]:
raw_datasets = raw_datasets.train_test_split(test_size=0.2, seed=42, shuffle=True)

In [30]:
raw_datasets.save_to_disk("/data/dataset/kefu/ner_from_template_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/1128 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/283 [00:00<?, ? examples/s]

In [3]:
from datasets import load_from_disk
raw_datasets = load_from_disk("/data/dataset/kefu/ner_from_template_dataset")

In [4]:
ner_feature = raw_datasets['train'].features["ner_tags"]
label_names = ner_feature.feature.names
label_names

['O', 'B-name', 'I-name']

In [5]:
import os 
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [6]:
words = raw_datasets['train'][0]["tokens"]
labels = raw_datasets['train'][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

g 2 0 标 准 版 的 进      水      压      力      最      大      值      以 及 p 1 0 s 标 准 版 的 洗      衣      热      水      功      能      分 别 是 什 么 ？ 
O O O O O O O B-name I-name I-name I-name I-name I-name I-name O O O O O O O O O O B-name I-name I-name I-name I-name I-name O O O O O O 


In [7]:
from transformers import AutoTokenizer
#tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-bert-wwm')
tokenizer = AutoTokenizer.from_pretrained('/data/dataset/huggingface/hub/bert-base-chinese')

In [10]:
tokenizer.is_fast

True

In [11]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'g',
 '2',
 '0',
 '标',
 '准',
 '版',
 '的',
 '进',
 '水',
 '压',
 '力',
 '最',
 '大',
 '值',
 '以',
 '及',
 'p',
 '1',
 '0',
 's',
 '标',
 '准',
 '版',
 '的',
 '洗',
 '衣',
 '热',
 '水',
 '功',
 '能',
 '分',
 '别',
 '是',
 '什',
 '么',
 '？',
 '[SEP]']

In [12]:
inputs.word_ids()

[None,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 None]

In [13]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [14]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, -100]


In [15]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [16]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/1128 [00:00<?, ? examples/s]

Map:   0%|          | 0/283 [00:00<?, ? examples/s]

In [217]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1128
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 283
    })
})

In [17]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [18]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    1,    2,    2,    2,
            2,    2,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    1,    2,    2,    2,    2,    2,    0,    0,    0,    0,    0,
            0, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    1,    2,
            2,    2,    2,    2,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    1,    2,    2,    2,    2,    2,    0,    0,    0,    0,    0,
         -100, -100]])

In [220]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, -100]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, -100]


In [222]:
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-name',
 'I-name',
 'I-name',
 'I-name',
 'I-name',
 'I-name',
 'I-name',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-name',
 'I-name',
 'I-name',
 'I-name',
 'I-name',
 'I-name',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [19]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [20]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    '/data/dataset/huggingface/hub/bert-base-chinese',
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at /data/dataset/huggingface/hub/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
model.config.num_labels

3

In [29]:
import evaluate

metric = evaluate.load("seqeval")

In [30]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [64]:
from transformers import TrainingArguments
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=8,
    # weight_decay=0.01,
    lr_scheduler_type="cosine",
    push_to_hub=False,
    per_device_train_batch_size=32,
)

In [65]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)


In [66]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.200838,0.822323,0.843458,0.832757,0.969541
2,No log,0.213762,0.804825,0.857477,0.830317,0.968637
3,No log,0.186009,0.833713,0.85514,0.844291,0.971477
4,No log,0.197802,0.806306,0.836449,0.821101,0.967863
5,No log,0.206852,0.809735,0.85514,0.831818,0.96967
6,No log,0.207046,0.812775,0.86215,0.836735,0.970057
7,No log,0.210048,0.828508,0.869159,0.848347,0.970702
8,No log,0.210318,0.836689,0.873832,0.854857,0.97096


Checkpoint destination directory bert-finetuned-ner/checkpoint-36 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory bert-finetuned-ner/checkpoint-72 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory bert-finetuned-ner/checkpoint-108 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory bert-finetuned-ner/checkpoint-144 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory bert-finetuned-ner/checkpoint-180 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=288, training_loss=0.0014795924847324689, metrics={'train_runtime': 76.7225, 'train_samples_per_second': 117.619, 'train_steps_per_second': 3.754, 'total_flos': 291740466479616.0, 'train_loss': 0.0014795924847324689, 'epoch': 8.0})

In [34]:
trainer.evaluate()

{'eval_loss': 0.10490080714225769,
 'eval_precision': 0.7758620689655172,
 'eval_recall': 0.8411214953271028,
 'eval_f1': 0.8071748878923767,
 'eval_accuracy': 0.9636035105833763,
 'eval_runtime': 0.8291,
 'eval_samples_per_second': 341.343,
 'eval_steps_per_second': 43.422,
 'epoch': 3.0}

In [55]:
"".join(raw_datasets['test'][11]["tokens"])

'你能告诉我g20标准版的升降设备和p10s标准版水箱的大小吗？'

In [43]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "bert-finetuned-ner/checkpoint-180"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

[{'entity_group': 'name',
  'score': 0.99766546,
  'word': '进 水 压 力 最 大 值',
  'start': 7,
  'end': 14},
 {'entity_group': 'name',
  'score': 0.9992973,
  'word': '洗 衣 热 水 功 能',
  'start': 24,
  'end': 30}]

In [56]:
token_classifier("".join(raw_datasets['test'][11]["tokens"]))

[{'entity_group': 'name',
  'score': 0.9992218,
  'word': '升 降 设 备',
  'start': 12,
  'end': 16},
 {'entity_group': 'name',
  'score': 0.91837674,
  'word': '水 箱 的 大 小',
  'start': 24,
  'end': 29}]

In [89]:
train_encodings = tokenizer([df_exploded["bio_words"].iloc[0]],is_split_into_words=True,return_offsets_mapping=True, padding=True, truncation=True)

In [91]:
train_tags = [df_exploded["bio_tags"].iloc[0]]

In [97]:
len(train_encodings.offset_mapping[0])

23

In [95]:
train_tags[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-name',
 'I-name',
 'I-name',
 'I-name',
 'I-name',
 'I-name',
 'O',
 'O']

In [93]:
train_labels

[[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 1, 1, -100]]

In [None]:

train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [120]:
tokenizer([["H", "2", "0", "P", "r", "o"], ["h", "2", "0"]], is_split_into_words=True, padding=True, truncation=True)

{'input_ids': [[101, 100, 123, 121, 100, 160, 157, 102], [101, 150, 123, 121, 102, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 0, 0, 0]]}

In [128]:
"H".lower()

'h'

In [597]:
all_dict = {}
for i in range(df_exploded.shape[0]):
    replace = df_exploded['augment'].iloc[i]["replace"]
    primaries = [j['primary_value'] for j in df_exploded['gen'].iloc[i]['prompt']]
    for key in replace:
        if replace[key] in primaries:
            print(i)
            continue
        if key in all_dict:
            all_dict[key].add(replace[key])
        else:
            all_dict[key] = set([replace[key]])

954


In [598]:
for key in all_dict:
    all_dict[key] = list(all_dict[key])
    if key not in all_dict[key]:
        all_dict[key] = all_dict[key] + [key]

In [599]:
def cherry_pick(key, value):
    if key not in all_dict:
        print(key, 'oops')
        return
    u_list = all_dict[key]
    for i in range(len(u_list)-1, -1, -1):
        if u_list[i] == value:
            u_list.pop(i)

In [600]:
cherry_pick_list = []
alias_inv_dict = {}
for k, v in all_dict.items():
    for u in v:
        if u in alias_inv_dict:
            cherry_pick_list.append((k, u, alias_inv_dict[u]))
        alias_inv_dict[u] = k

In [601]:
for i in range(len(cherry_pick_list)):
    cherry_pick(cherry_pick_list[i][0], cherry_pick_list[i][1])
    cherry_pick(cherry_pick_list[i][2], cherry_pick_list[i][1])

In [602]:
cherry_pick_list = []
alias_inv_dict = {}
for k, v in all_dict.items():
    for u in v:
        if u in alias_inv_dict:
            cherry_pick_list.append((k, u, alias_inv_dict[u]))
        alias_inv_dict[u] = k

In [603]:
cherry_pick_list

[]

In [604]:
len(alias_inv_dict)

1796

In [605]:
joblib.dump(all_dict, "/data/dataset/kefu/all_entity.json")

['/data/dataset/kefu/all_entity.json']

In [606]:
df_exploded["keywords"] = df_exploded["question"].apply(
    lambda x: extract_keywords(x, all_model_list, wc))

In [607]:
import importlib 
import database
import preprocessing
import entity_extractor
importlib.reload(database)
importlib.reload(preprocessing)
importlib.reload(entity_extractor)

<module 'entity_extractor' from '/root/PycharmProjects/chatbot/entity_extractor.py'>

In [608]:
ee = entity_extractor.EntityExtractor()

In [609]:
df_exploded["simple_entity"] = df_exploded["question"].apply(lambda x: ee.simple_match(x))

In [610]:
df_exploded["entity"] = df_exploded["keywords"].apply(lambda x: ee.query_entity(x))

In [611]:
df_exploded.head()

Unnamed: 0,cat,gen,template,type,final_prompt,augment,question,keywords,entity,simple_entity
0,sweeping,"{'question': 'g20标准版的商品链接是什么？', 'prompt': [{'p...",[问询词0]的[关键词0]是什么？,gen_same_keywords_for_models,"{""template"": ""[问询词0]的[关键词0]是什么？"", ""replace"": {...","{'sentence': '你能提供一下g20标准版的[商品访问链接]吗？', 'repla...",你能提供一下g20标准版的商品访问链接吗？,"{'question': '你能提供一下g20标准版的商品访问链接吗？', 'model':...",[商品链接],[商品链接]
1,sweeping,"{'question': 'g20标准版的商品链接是什么？', 'prompt': [{'p...",[问询词0]的[关键词0]是什么？,gen_same_keywords_for_models,"{""template"": ""[问询词0]的[关键词0]是什么？"", ""replace"": {...","{'sentence': 'g20标准版的[购买链接]可以获取吗？', 'replace':...",g20标准版的购买链接可以获取吗？,"{'question': 'g20标准版的购买链接可以获取吗？', 'model': ['g...","[商品链接, 采购地]","[商品链接, 采购地]"
2,sweeping,"{'question': 'g20标准版的商品链接是什么？', 'prompt': [{'p...",[问询词0]的[关键词0]是什么？,gen_same_keywords_for_models,"{""template"": ""[问询词0]的[关键词0]是什么？"", ""replace"": {...","{'sentence': '你知道g20标准版在哪里可以找到[在线购买页面链接]嘛？', '...",你知道g20标准版在哪里可以找到在线购买页面链接嘛？,"{'question': '你知道g20标准版在哪里可以找到在线购买页面链接嘛？', 'mo...","[商品链接, 采购地]","[商品链接, 采购地]"
3,sweeping,"{'question': 'g20标准版的平台是什么？', 'prompt': [{'pri...",[问询词0]的[关键词0]是什么？,gen_same_keywords_for_models,"{""template"": ""[问询词0]的[关键词0]是什么？"", ""replace"": {...","{'sentence': '您能告诉我g20标准版的平台是什么吗？', 'replace':...",您能告诉我g20标准版的平台是什么吗？,"{'question': '您能告诉我g20标准版的平台是什么吗？', 'model': [...",[平台],[平台]
4,sweeping,"{'question': 'g20标准版的平台是什么？', 'prompt': [{'pri...",[问询词0]的[关键词0]是什么？,gen_same_keywords_for_models,"{""template"": ""[问询词0]的[关键词0]是什么？"", ""replace"": {...","{'sentence': '你知道g20标准版所使用的[系统平台]是什么吗？', 'repl...",你知道g20标准版所使用的系统平台是什么吗？,"{'question': '你知道g20标准版所使用的系统平台是什么吗？', 'model'...",[平台],[平台]


In [587]:
query = "你知道g20标准版所使用的系统平台是什么吗？"
query_result = []
for name in ee.all_entity:
    if (name == query) or (query in name) or (name in query):
        query_result.append(name)
query_result

['平台']

In [588]:
for alias in sorted(ee.entity_inv.keys(), key=lambda k: len(k), reverse=True):
    if (alias == query) or (query in alias) or (alias in query):
        print(alias, ee.entity_inv[alias])
        query_result.append(ee.entity_inv[alias])

g20标准版 清扫路线
系统平台 平台
平台 平台


In [589]:
"g20标准版" in ee.entity_inv

True

In [310]:
from fuzzywuzzy import process
import copy
from database import DataQuery

In [311]:
def row_to_sentence_simple_query(row):
    other_data = {k: v for k, v in row.items() if k not in [
        '版本', '商品型号']}
    details = []
    for col, value in other_data.items():
        if pd.notna(value):
            detail = f"{col}为{value}"
            details.append(detail)
    return '; '.join(details)

<module 'entity_extractor' from '/root/PycharmProjects/chatbot/entity_extractor.py'>

In [365]:
df_exploded.iloc[0]

cat                                                      sweeping
gen             {'question': 'g20标准版的商品链接是什么？', 'prompt': [{'p...
template                                        [问询词0]的[关键词0]是什么？
type                                 gen_same_keywords_for_models
final_prompt    {"template": "[问询词0]的[关键词0]是什么？", "replace": {...
augment         {'sentence': '你能提供一下g20标准版的[商品访问链接]吗？', 'repla...
question                                    你能提供一下g20标准版的商品访问链接吗？
keywords        {'model': ['g20'], 'version': ['标准版'], 'keywor...
Name: 0, dtype: object

In [355]:
# 创建 DataQuery 的实例
dq = database.DataQuery()

In [356]:
def handler_simple_query(question_obj):
    model = question_obj['model'][0]
    keywords = question_obj['keywords']
    try:
        version = question_obj['version'][0]
    except:
        version = ""

    all_data = dq.query_data(model)
    answer_list = []
    find_list = []

    for keyword in keywords:
        ori_keyword = keyword
        if keyword not in all_data.columns:
            keyword, _ = find_best_match_new(
                keyword, all_data.columns, threshold=65)

        if keyword in all_data.columns and keyword not in ['版本', '商品型号']:
            find_list.append(keyword)
            tmp_df = copy.deepcopy(all_data[['商品型号', '版本', keyword]])
            tmp_df = tmp_df.loc[(tmp_df['商品型号'] == model)]
            if version:
                tmp_df = tmp_df.loc[(tmp_df['版本'] == version)]
            if tmp_df.empty:
                return ''
            sentences = tmp_df.apply(row_to_sentence_simple_query, axis=1)
            sentences = '\n'.join(sentences)
            if ori_keyword != keyword:
                sentences = sentences.replace(keyword, f'{ori_keyword}({keyword})')
            if len(sentences) > 0:
                answer_list.append(sentences)
    
    details = '，'.join(answer_list)
    answer = f'{model}{version}的{details}'
    
    return answer, find_list

In [357]:
def find_best_match_new(question, mapping_list, threshold=65):
    matches = process.extract(question, mapping_list)
    best_matches = [match for match in matches if match[1] >= threshold]
    best_matches = sorted(best_matches, key=lambda x: (-x[1], -len(x[0])))
    # print(best_matches)
    match_score = 0
    total_q = ""
    if len(best_matches) > 0:
        total_q = best_matches[0][0]
        match_score = best_matches[0][1]

    return total_q, match_score

In [121]:
question_obj = oot[oot.version_keywords.apply(lambda x: len(x)>0)]["keywords"].iloc[0]
question_obj

{'model': ['p10pro'], 'version': ['上下水版'], 'keywords': ['清洁液', '毫升']}

In [122]:
handler_simple_query(question_obj)

'p10pro上下水版的清洁液(清洁液储存盒)为1000ml'

In [342]:
"平台" in dq.all_data.columns

True

In [361]:
handler_simple_query(df_exploded["keywords"].iloc[0])

('g20标准版的商品(商品编码)为4969634692760，链接(商品链接)为https://detail.tmall.com/item.htm?id=707235140054&skuId=4969634692760&spm=a21dvs.23580594.0.0.3f063d0d9Hg2Xc',
 ['商品编码', '商品链接'])

In [359]:
df_exploded["keywords"].iloc[0]

{'model': ['h1'], 'version': [], 'keywords': ['使用', '平台', '请问']}

In [362]:
df_exploded.iloc[0]

cat                                                      sweeping
gen             {'question': 'g20标准版的商品链接是什么？', 'prompt': [{'p...
template                                        [问询词0]的[关键词0]是什么？
type                                 gen_same_keywords_for_models
final_prompt    {"template": "[问询词0]的[关键词0]是什么？", "replace": {...
augment         {'sentence': '你能提供一下g20标准版的[商品访问链接]吗？', 'repla...
question                                    你能提供一下g20标准版的商品访问链接吗？
keywords        {'model': ['g20'], 'version': ['标准版'], 'keywor...
Name: 0, dtype: object

In [352]:
question_obj = df_exploded["keywords"].iloc[500]

In [353]:
question_obj

{'model': ['h1'], 'version': [], 'keywords': ['使用', '平台', '请问']}

In [350]:
model = question_obj['model'][0]
keywords = question_obj['keywords']
try:
    version = question_obj['version'][0]
except:
    version = ""

all_data = dq.query_data(model)

In [351]:
all_data.head()

Unnamed: 0,版本,商品编码,平台ID,商品id,商品型号,商品名字,商品分类,商品链接,平台,店铺名称,...,内筒照明,断电记忆,童锁功能,中途添衣,紧急开门,机门锁定/解锁,额定洗涤输入功率,额定脱水输入功率,额定加热输入功率,防水等级
0,标准版,4969634692760,,4969634692760,G20,石头自清洁扫地机器人G20系列扫拖地全自动上下水家用清洗一体机,扫地机,https://detail.tmall.com/item.htm?id=707235140...,淘宝,石头电器旗舰店,...,,,,,,,,,,
1,标准版,5268670697517,,5268670697517,P10S,【新品上市】石头自清洁扫地机器人P10S系列全自动扫拖地清洗一体,扫地机,https://detail.tmall.com/item.htm?id=766171501...,淘宝,石头电器旗舰店,...,,,,,,,,,,
2,上下水版,5439727679635,,5439727679635,P10S Pro,【新品上市】石头自清洁扫地机器人P10S Pro系列扫拖一体全自动,扫地机,https://detail.tmall.com/item.htm?id=766696298...,淘宝,石头电器旗舰店,...,,,,,,,,,,
3,标准版,512525013970,,512525013970,G10S Pure,石头自清洁扫地机器人G10S Pure系列扫拖一体机全自动上下水家用,扫地机,https://detail.tmall.com/item.htm?id=670141190...,淘宝,石头电器旗舰店,...,,,,,,,,,,
4,标准版,4952229349955,,4952229349955,G10S Pro,石头自清洁扫拖机器人G10S系列全自动家用扫地拖地吸尘三合一体机,扫地机,https://detail.tmall.com/item.htm?id=672078527...,淘宝,石头电器旗舰店,...,,,,,,,,,,


In [None]:
model = question_obj['model'][0]
keywords = question_obj['keywords']
try:
    version = question_obj['version'][0]
except:
    version = ""

all_data = dq.query_data(model)
answer_list = []
find_list = []

for keyword in keywords:
    ori_keyword = keyword
    if keyword not in all_data.columns:
        keyword, _ = find_best_match_new(
            keyword, all_data.columns, threshold=65)

    if keyword in all_data.columns and keyword not in ['版本', '商品型号']:
        find_list.append(keyword)
        tmp_df = copy.deepcopy(all_data[['商品型号', '版本', keyword]])
        tmp_df = tmp_df.loc[(tmp_df['商品型号'] == model)]
        if version:
            tmp_df = tmp_df.loc[(tmp_df['版本'] == version)]
        sentences = tmp_df.apply(row_to_sentence_simple_query, axis=1)
        sentences = '\n'.join(sentences)
        if ori_keyword != keyword:
            sentences = sentences.replace(keyword, f'{ori_keyword}({keyword})')
        if len(sentences) > 0:
            answer_list.append(sentences)
details = '，'.join(answer_list)
answer = f'{model}{version}的{details}'

In [334]:
for i in range(df_exploded.shape[0]):
    print(i)
    a = handler_simple_query(df_exploded["keywords"].iloc[i])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [316]:
df_exploded['result'] = df_exploded["keywords"].apply(lambda x: handler_simple_query(x)[1])

IndexError: string index out of range