In [1]:
#导入必要的库
import random
import re
import json
import pandas as pd
import csv

In [27]:
#训练数据读取
def load_data(file_path):
    """
    Read the training-data workbook into a DataFrame.

    :param file_path: path of the Excel file holding the training data to be recombined
    :return: the DataFrame returned by ``pandas.read_excel`` for that file
    """
    return pd.read_excel(file_path)
#从标注结果中拆解单个属性
def extract_brand(result,key):
    """
    Pull a single attribute out of one annotation result.

    :param result: annotation result, either a dict or a JSON string encoding a dict
    :param key: name of the attribute to extract
    :return: the attribute value, or '' when the key is absent or ``result``
             cannot be interpreted as a dict
    """
    if isinstance(result, str):
        try:
            result = json.loads(result)
        except json.JSONDecodeError:
            return ''
    # Valid JSON is not necessarily an object ("[1]", "3", "null" all parse fine),
    # and only dicts support .get, so check the type after parsing as well.
    if isinstance(result, dict):
        return result.get(key, '')
    return ''
#计算标注结果中非空属性个数
def count_non_empty_pairs(result):
    """
    Count the attributes of one annotation result that carry a value.

    :param result: annotation result, either a dict or a JSON string
    :return: number of entries whose value is not None, '', [], {} or ();
             0 when ``result`` is not a dict and does not parse to one
    """
    if isinstance(result, str):
        try:
            result = json.loads(result)
        except json.JSONDecodeError:
            return 0
    # Covers both non-dict inputs and JSON strings that decode to a non-object
    if not isinstance(result, dict):
        return 0
    empty_values = (None, '', [], {}, ())
    return sum(1 for value in result.values() if value not in empty_values)
def clean_df(file_path,n):
    """
    Load the training data and keep only the rows usable for recombination.

    Adds three columns derived from '标注结果': '品牌' (passed through
    ``str_processed``), '名称' and '有效属性数' (count of non-empty attributes).
    Blank or whitespace-only cells are turned into NA, rows lacking a brand or a
    name are dropped, and only rows with at least ``n`` non-empty attributes remain.

    :param file_path: path of the training-data Excel file
    :param n: minimum number of non-empty attributes a row must have to be kept
    :return: the filtered DataFrame
    """
    frame = load_data(file_path)
    annotations = frame['标注结果']
    frame['品牌'] = annotations.apply(extract_brand, args=('品牌',)).apply(str_processed)
    frame['名称'] = annotations.apply(extract_brand, args=('名称',))
    frame['有效属性数'] = annotations.apply(count_non_empty_pairs)
    # Empty and whitespace-only cells become NA so dropna can remove them
    blanks_as_na = frame.replace(r'^\s*$', pd.NA, regex=True)
    with_brand_and_name = blanks_as_na.dropna(subset=["品牌", "名称"])
    enough_attributes = with_brand_and_name['有效属性数'] >= n
    return with_brand_and_name[enough_attributes]
    

In [26]:
#品牌池
def str_processed(data):
    """
    Reduce a brand string to its Chinese characters.

    :param data: brand text; non-string values (None, NaN, numbers) are passed through
    :return: the characters of ``data`` in the range U+4E00..U+9FA5 joined together,
             or ``data`` unchanged when it contains none of them or is not a string
    """
    # Empty spreadsheet cells and JSON nulls arrive as NaN/None; re.findall would
    # raise TypeError on them, so hand them back for the caller's NA handling.
    if not isinstance(data, str):
        return data
    chinese_part = ''.join(re.findall(r'[\u4e00-\u9fa5]', data))
    return chinese_part or data
def get_brand_pool(file_path):
    """
    Build the brand pool from the brand-library workbook.

    Each key is a standard word reduced by ``str_processed``; each value is the list of
    spellings usable for that brand: the original standard word followed by its
    comma-separated extension words, stripped, without blanks and without repeats.
    Rows whose standard words reduce to the same key are merged instead of
    overwriting each other.

    :param file_path: path of the brand-library Excel file with the columns '标准词'
                      (standard word) and '扩展词' (comma-separated extension words)
    :return: brand_pool, a dict mapping normalised standard word -> list of spellings
    """
    brand_df = pd.read_excel(file_path)
    # A row without a standard word cannot yield a usable key
    brand_df = brand_df.dropna(subset=["标准词"])
    # Cells holding numbers are read as numbers; cast so the string handling below is safe
    standard_words = brand_df["标准词"].astype(str)
    extension_words = brand_df["扩展词"].fillna("").astype(str)

    brand_pool = {}
    for standard, extension in zip(standard_words, extension_words):
        if not standard.strip():
            continue
        spellings = brand_pool.setdefault(str_processed(standard), [])
        # The standard word itself is always a candidate, ahead of its extension words
        for word in f"{standard},{extension}".split(","):
            word = word.strip()
            if word and word not in spellings:
                spellings.append(word)
    return brand_pool

In [15]:
#名称池&属性池&生成新标注结果
def split_dict(data):
    """
    Return an annotation result without its brand ('品牌') and name ('名称') attributes.

    :param data: annotation result, either a dict or a JSON string encoding a dict
    :return: a new dict holding every other attribute; the argument itself is never
             modified. Input that is neither a dict nor a JSON object string is
             returned unchanged so the caller can detect and report it.
    """
    removed_keys = ('品牌', '名称')
    if isinstance(data, str):
        try:
            parsed = json.loads(data)
        except json.JSONDecodeError:
            return data
        if not isinstance(parsed, dict):
            return data
        data = parsed
    elif not isinstance(data, dict):
        return data
    # Build a fresh dict: deleting keys in place would corrupt the caller's data,
    # and each key is dropped independently of whether the other one is present.
    return {key: value for key, value in data.items() if key not in removed_keys}
def combiantion_pool(df_cleaned,brand_pool,seed=None):
    """
    Recombine the cleaned data into new annotation results.

    Rows are grouped by ('类目名称', '品牌'). Within a group, every distinct set of
    remaining attributes (the annotation without brand and name) is paired with a name
    drawn at random from the group's names and a brand spelling drawn at random from
    the brand pool.

    :param df_cleaned: cleaned data set with the columns '类目名称', '品牌', '商品编码'
                       and '标注结果'
    :param brand_pool: dict mapping a brand to the list of its spellings
    :param seed: optional seed for reproducible draws; None (default) uses the global
                 ``random`` state, as before
    :return: result_df with the columns '类目名称', '品牌', '组合标注结果', '商品编码'
    """
    # A private generator makes seeded runs repeatable without touching global state
    rng = random.Random(seed) if seed is not None else random
    has_brand = df_cleaned['品牌'].notna() & (df_cleaned['品牌'].str.strip() != "")
    rows = []
    for (category, brand), group in df_cleaned[has_brand].groupby(['类目名称', '品牌']):
        # Unknown brand or empty pool: fall back to the brand itself, because
        # choosing from an empty list raises IndexError.
        brand_candidates = brand_pool.get(brand) or [brand]
        name_candidates = (
            group['标注结果'].apply(extract_brand, args=('名称',)).drop_duplicates().to_list()
        )
        seen_attribute_sets = set()
        # Walk the rows directly; keying by item code would drop rows sharing a code
        for item_code, annotation in zip(group['商品编码'], group['标注结果']):
            # Pass a shallow copy so the caller's annotation dicts are never modified
            source = dict(annotation) if isinstance(annotation, dict) else annotation
            attributes = split_dict(source)
            if not isinstance(attributes, dict):
                print("数据类型错误")
                continue
            # Dicts are unhashable, so duplicates are detected on a canonical JSON form
            fingerprint = json.dumps(attributes, sort_keys=True, ensure_ascii=False, default=str)
            if fingerprint in seen_attribute_sets:
                continue
            seen_attribute_sets.add(fingerprint)
            new_name = rng.choice(name_candidates)
            new_brand = rng.choice(brand_candidates)
            rows.append({
                '类目名称': category,
                '品牌': brand,
                '组合标注结果': {"品牌": new_brand, "名称": new_name, **attributes},
                '商品编码': item_code,
            })
    # Explicit columns keep the schema stable even when no row was produced
    result_df = pd.DataFrame(rows, columns=['类目名称', '品牌', '组合标注结果', '商品编码'])
    return result_df

In [29]:
    #主流程
if __name__ == '__main__':
    # Inputs and the filter threshold are named once so they are easy to retarget
    TRAIN_DATA_PATH = r"D:\工作文档\生成\属性识别数据0910.xlsx"
    BRAND_LIBRARY_PATH = r"D:\工作文档\大模型\品牌_标准词_20250820.xlsx"
    MIN_VALID_ATTRIBUTES = 4  # rows with fewer non-empty attributes are discarded

    # Raw training data, kept alongside the cleaned frame
    df=load_data(TRAIN_DATA_PATH)
    df_cleaned=clean_df(TRAIN_DATA_PATH,MIN_VALID_ATTRIBUTES)
    brand_pool=get_brand_pool(BRAND_LIBRARY_PATH)
    result_df=combiantion_pool(df_cleaned,brand_pool)

  warn("Workbook contains no default style, apply openpyxl's default")


In [31]:
result_df

Unnamed: 0,类目名称,品牌,组合标注结果,商品编码
0,3D打印机,GODEX,"{'品牌': 'GODEX', '名称': '工业打印机', '型号': 'BPH830i+...",241205014156874472600
1,3D打印机,创想三维,"{'品牌': '创想三维', '名称': '3D打印机', '型号': 'K1 MAX', ...",250321009002986243517
2,GPS定位器,途强,"{'品牌': '途强', '名称': 'GPS定位器', '型号': 'G410', '款式...",250709004911338222816
3,HDMI切换器,绿联,"{'品牌': '绿联', '名称': '切换器', '型号': '30346', '接口配置...",250115001947890921114
4,KVM切换器,绿联,"{'品牌': '绿联', '名称': 'KVM切换器', '型号': '30357', '类...",250508007941237789603
...,...,...,...,...
3601,麦克风,索爱,"{'品牌': '索爱 soaiy', '名称': '麦克风', '型号': 'WS16', ...",241231020198339257300
3602,麦克风,飞利浦,"{'品牌': '飞利浦/PHILIPS', '名称': '麦克风', '型号': 'DLM3...",241231003300092477046
3603,麦克风,飞利浦,"{'品牌': 'PHILIPS飞利浦', '名称': '麦克风', '型号': 'DLM35...",241231008572442102587
3604,（皮肤）冲洗液,迪福特,"{'品牌': '迪福特', '名称': '应急冲洗液', '型号': 'GJXBP', '净...",250603001878989082107


Unnamed: 0,商品编码,标准类目编码,类目名称,商品标题,标注结果,属性列表,品牌,名称,有效属性数
0,250520001331213038209,C150101,硒鼓,佳能 CRG069 硒鼓 标准装，约1900页 青色 单位:个,"{""品牌"":""佳能"",""名称"":""硒鼓"",""型号"":""CRG069"",""打印量"":""1900...","品牌,名称,型号,打印量,是否带芯片,颜色,适用机型,包装规格,计量单位",佳能,硒鼓,6
1,250520001359110301186,C150101,硒鼓,佳能 CRG069 硒鼓 标准装，约2100页，适用LBP673Cdn/LBP673Cdw/...,"{""品牌"":""佳能"",""名称"":""硒鼓"",""型号"":""CRG069"",""打印量"":""2100...","品牌,名称,型号,打印量,是否带芯片,颜色,适用机型,包装规格,计量单位",佳能,硒鼓,7
2,250523002273112445746,C150101,硒鼓,京昇佳能LBP673CDN/CDW/674CX/MF752CDW 756打印机CRG-069...,"{""品牌"":""京昇"",""名称"":""硒鼓"",""型号"":""CRG-069"",""打印量"":""250...","品牌,名称,型号,打印量,是否带芯片,颜色,适用机型,包装规格,计量单位",京昇,硒鼓,7
3,250523010675557981234,C150101,硒鼓,佳能（Canon）硒鼓CRG069 BK（适用LBP673Cdn/LBP673Cdw/LBP...,"{""品牌"":""佳能"",""名称"":""硒鼓"",""型号"":""CRG069"",""打印量"":"""",""是...","品牌,名称,型号,打印量,是否带芯片,颜色,适用机型,包装规格,计量单位",佳能,硒鼓,5
4,250524002474636907245,C150101,硒鼓,佳能（Canon）CRG-069BK硒鼓四色套装适用机型：MF756Cx/MF752Cdw/...,"{""品牌"":""佳能"",""名称"":""硒鼓"",""型号"":""CRG-069"",""打印量"":"""",""...","品牌,名称,型号,打印量,是否带芯片,颜色,适用机型,包装规格,计量单位",佳能,硒鼓,6
...,...,...,...,...,...,...,...,...,...
4895,241226006553522326572,C040602,周转箱,箱大王 Xlj-15 塑料分格周转箱 零件整理盒 多格零件盒 350四格箱372*276*80mm,"{""品牌"":""箱大王"",""名称"":""周转箱"",""型号"":""Xlj-15"",""尺寸规格"":""3...","品牌,名称,型号,尺寸规格,容量,是否分隔,是否带盖,颜色,材质,包装规格,计量单位",箱大王,周转箱,7
4896,241231002380560802647,C020103,竹竿,闻早 训练竹竿 跳高杆 菜架长竹竿 搭架毛竹杆粗细竹子 1.5cm*1.5m长,"{""品牌"":""闻早"",""名称"":""训练竹竿"",""直径"":""1.5cm"",""长度"":""1.5m...","品牌,名称,直径,长度,工艺,包装规格,计量单位",闻早,训练竹竿,4
4897,250214008462125231277,C13070114,竹签,美厨（maxcook）竹签烧烤签 烧烤针羊肉串 穿肉签烤针配件 250*3mm 250支MC...,"{""品牌"":""美厨（maxcook）"",""名称"":""竹签"",""型号"":""MCPJ7200"",...","品牌,名称,型号,规格,材质,包装规格,计量单位",美厨（maxcook）,竹签,6
4898,241231017976009465030,C250803,自拍杆,大疆 如影SC 云台稳定器 单位:个,"{""品牌"":""大疆"",""名称"":""云台稳定器"",""型号"":""如影SC"",""款式"":"""",""是...","品牌,名称,型号,款式,是否带补光灯,材质,功能,计量单位",大疆,云台稳定器,4
