# step1 - 对于图片主体的box数量统计

In [23]:
import os
import pandas as pd
import re

def process_excel(input_file, output_file):
    # 读取Excel文件
    df = pd.read_excel(input_file)

    # 定义一个函数来移除后缀
    def remove_suffix(name):
        return re.sub(r'_\d+$', '', name)

    # 应用函数到'Image Name'列
    df['Image Name'] = df['Image Name'].apply(remove_suffix)

    # 计算每个Image Name的出现次数
    name_counts = df['Image Name'].value_counts()

    # 创建一个新的'box_no'列，并填充对应的计数
    df['box_no'] = df['Image Name'].map(name_counts)

    # 保存修改后的DataFrame到新的Excel文件
    df.to_excel(output_file, index=False)

    print(f"处理完成，结果已保存到 {output_file}")

def process_all_folders(base_path):
    for root, dirs, files in os.walk(base_path):
        if 'grounding_output' in dirs:
            grounding_output_path = os.path.join(root, 'grounding_output')
            input_file = os.path.join(grounding_output_path, 'grounding_results.xlsx')
            if os.path.exists(input_file):
                output_file = os.path.join(grounding_output_path, 'grounding_results_processed.xlsx')
                process_excel(input_file, output_file)

# 设置基础路径
base_path = 'D://code//data//Lv2期结论//京喜_from_0501//筛选'

# 处理所有文件夹
process_all_folders(base_path)

print("所有文件夹处理完成")




处理完成，结果已保存到 D://code//data//Lv2期结论//京喜_from_0501//筛选\1047\grounding_output\grounding_results_processed.xlsx
处理完成，结果已保存到 D://code//data//Lv2期结论//京喜_from_0501//筛选\12010\grounding_output\grounding_results_processed.xlsx
处理完成，结果已保存到 D://code//data//Lv2期结论//京喜_from_0501//筛选\12811\grounding_output\grounding_results_processed.xlsx
处理完成，结果已保存到 D://code//data//Lv2期结论//京喜_from_0501//筛选\1349\grounding_output\grounding_results_processed.xlsx
处理完成，结果已保存到 D://code//data//Lv2期结论//京喜_from_0501//筛选\1355\grounding_output\grounding_results_processed.xlsx
处理完成，结果已保存到 D://code//data//Lv2期结论//京喜_from_0501//筛选\13661\grounding_output\grounding_results_processed.xlsx
处理完成，结果已保存到 D://code//data//Lv2期结论//京喜_from_0501//筛选\1476\grounding_output\grounding_results_processed.xlsx
处理完成，结果已保存到 D://code//data//Lv2期结论//京喜_from_0501//筛选\15908\grounding_output\grounding_results_processed.xlsx
处理完成，结果已保存到 D://code//data//Lv2期结论//京喜_from_0501//筛选\1656\grounding_output\grounding_results_processed.xlsx
处理完成，结果已保存到 D://code//da

# step2 - 文本框识别, 并合并相邻的文本框

In [24]:
# 修改后的代码, 先从图片中识别出文本, 然后分两步
# ① 对文本框进行阈值下的合并; 同时也保留原文本框
# ② 对文本进行高度和关键词的分类



import os
import glob
from tqdm import tqdm
import pandas as pd
from paddleocr import PaddleOCR
from PIL import Image
import math
import re

# 设置输入和输出路径
input_folder_path = 'D://code//data//Lv2期结论//京喜_from_0501//筛选'
output_file_path = 'D://code//data//Lv2期结论//京喜_from_0501//筛选//txt_info.xlsx'

# 加载 OCR 模型
ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False)

def calculate_shortest_distance(point_a, points_bcd):
    shortest_distance = float('inf')
    for point_bcd in points_bcd:
        distance = ((point_bcd[0] - point_a[0]) ** 2 + (point_bcd[1] - point_a[1]) ** 2) ** 0.5
        if distance < shortest_distance:
            shortest_distance = distance
    return shortest_distance

def merge_text_boxes(img_path, style):
    result = ocr.ocr(img_path, cls=True)
    img = Image.open(img_path)
    img_width, img_height = img.size

    if not result or not result[0]:
        print(f"No text detected in the image: {img_path}")
        return None, None

    rectangles_with_text = result[0]

    original_text_box_info = []
    for rectangle in rectangles_with_text:
        points = rectangle[0]
        original_text_box_info.append({
            'File Name': os.path.basename(img_path),
            'Style': style,
            'x1': points[0][0],
            'y1': points[0][1],
            'x2': points[2][0],
            'y2': points[2][1],
            'text': rectangle[1][0]
        })

    merged_text_boxes = []

    for index, row in pd.DataFrame(original_text_box_info).iterrows():
        if not merged_text_boxes:
            merged_text_boxes.append(row.to_dict())
        else:
            last_merged_box = merged_text_boxes[-1]

            if calculate_shortest_distance((row['x1'], row['y1']), [(last_merged_box['x1'], last_merged_box['y1']), (last_merged_box['x2'], last_merged_box['y1']), (last_merged_box['x2'], last_merged_box['y2']), (last_merged_box['x1'], last_merged_box['y2'])]) < 100:
                last_merged_box['text'] += ' ' + row['text']
                last_merged_box['x1'] = min(last_merged_box['x1'], row['x1'])
                last_merged_box['y1'] = min(last_merged_box['y1'], row['y1'])
                last_merged_box['x2'] = max(last_merged_box['x2'], row['x2'])
                last_merged_box['y2'] = max(last_merged_box['y2'], row['y2'])
            else:
                merged_text_boxes.append(row.to_dict())

    original_text_box_df = pd.DataFrame(original_text_box_info)
    merged_text_box_df = pd.DataFrame(merged_text_boxes)

    for i, box in original_text_box_df.iterrows():
        if box['y1'] < img_height / 2 and box['y2'] < img_height / 2:
            region = '上半'
        elif box['y1'] >= img_height / 2 and box['y2'] >= img_height / 2:
            region = '下半'
        elif box['x1'] < img_width / 2 and box['x2'] < img_width / 2:
            region = '左半'
        else:
            region = '右半'
        original_text_box_df.at[i, 'Region'] = region

        box_area = (box['x2'] - box['x1']) * (box['y2'] - box['y1'])
        box_per = box_area / (img_width * img_height)
        original_text_box_df.at[i, 'txt_Area'] = box_area
        original_text_box_df.at[i, 'txt_Per'] = box_per

    for i, box in merged_text_box_df.iterrows():
        if box['y1'] < img_height / 2 and box['y2'] < img_height / 2:
            region = '上半'
        elif box['y1'] >= img_height / 2 and box['y2'] >= img_height / 2:
            region = '下半'
        elif box['x1'] < img_width / 2 and box['x2'] < img_width / 2:
            region = '左半'
        else:
            region = '右半'
        merged_text_box_df.at[i, 'Region'] = region

        merge_area = (box['x2'] - box['x1']) * (box['y2'] - box['y1'])
        merge_per = merge_area / (img_width * img_height)
        merged_text_box_df.at[i, 'Area'] = merge_area
        merged_text_box_df.at[i, 'Per'] = merge_per

    return merged_text_box_df, original_text_box_df

keyword_groups = {
    '通用': ['以旧换新', '只换不修', '包邮', '无理由退', '先用后付', '京东白条', '期免息', '送货上门', '保修'],
    '价保': ['价保', '保价'],
    '纯价格': ['¥', '夫', '￥', r'\b价\b', '到手价', '活动价'],
    '直降': ['立减', '直降', '降', '立省', r'^(?!.*升降).*$', r'^(?!.*降温).*$', r'^(?!.*降噪).*$', r'^(?!.*降低).*$'],
    '折扣': ['折', r'^(?!.*折叠).*$', r'^(?!.*翻折).*$'],
    '满减': [r'.*满.*减.*', r'.*满.*-.*', r'.*满.*免.*'],
    '用券': ['用券', '领券', '券'],
    '返券': ['返券', '京豆', '返现', r'.*返.*E卡.*', r'.*返.*红包.*'],
    '限时': ['.*小时$', '.*天$', '时间', 'time', 'TIME', '限时', r'.*月.*日.*', r'.*日.*点.*', r'.*:.*', r'.*:.*', r'.*：.*', r'\b\d{1,2}\.\d{1,2}-\d{1,2}\b'],
    'xx元任选': [r'.*元.*件.*'],
    '赠品': [r'.*满.*赠.*', r'.*满.*送.*', '送', '抽', '奖励', '赠', r'^(?!.*送货).*$', r'^(?!.*送礼).*$', r'^(?!.*送装).*$', r'^(?!.*配送).*$', r'^(?!.*送达).*$'],
    '节日名称': ['节', '出游季', '购物季', '毕业季', '开学季', '黑五', '周年庆', '儿童节', '父亲节', '端午节', '七夕', '中秋节', '国庆', '万圣节', '感恩节', '元旦', '圣诞', '情人节', '春节', '元宵节', '38节', '3.8节', '清明节', '母亲节', '618', '购物季', '开学季', '11.11', '黑五', '12.12', '女神节', '出游季', '放价季', '吃货节', '家装节'],
    '是否限购': ['限购', '限量']
}

def keyword_analysis(text):
    results = {}
    for key, words in keyword_groups.items():
        results[key] = any(re.search(word, text) for word in words)
    return results

def height_analysis(x1, y1, x2, y2):
    height = abs(y2 - y1)
    return height

def process_images(folder_path, subfolder_name):
    image_files = []
    for root, dirs, files in os.walk(folder_path):
        if 'grounding_output' in root and ('price' in root or 'txt' in root):
            for file in files:
                if file.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
                    image_files.append(os.path.join(root, file))
    
    combined_results = []
    for img_path in tqdm(image_files, desc=f'Processing images in {subfolder_name}'):
        style = 'price' if 'price' in img_path else 'txt'
        merged_df, original_df = merge_text_boxes(img_path, style)
        if merged_df is not None and original_df is not None:
            merged_df['Subfolder'] = subfolder_name
            original_df['Subfolder'] = subfolder_name
            combined_results.append({
                'original': original_df,
                'merged': merged_df
            })
    
    return combined_results

# 主程序
all_results = []
for folder in os.listdir(input_folder_path):
    if folder.isdigit():
        folder_path = os.path.join(input_folder_path, folder)
        if os.path.isdir(folder_path):
            print(f"正在处理文件夹: {folder_path}")
            results = process_images(folder_path, folder)
            all_results.extend(results)

final_combined_data = []
for result in all_results:
    result['original']['Type'] = 'Original'
    result['merged']['Type'] = 'Merged'
    combined = pd.concat([result['original'], result['merged']], ignore_index=True)
    final_combined_data.append(combined)

final_combined_df = pd.concat(final_combined_data, ignore_index=True)
final_combined_df.sort_values(by=['Subfolder', 'File Name', 'Type'], inplace=True)

for index, row in tqdm(final_combined_df.iterrows(), total=final_combined_df.shape[0], desc="Analyzing text"):
    keyword_results = keyword_analysis(row['text'])
    for key, value in keyword_results.items():
        final_combined_df.at[index, key] = value
    
    height = height_analysis(row['x1'], row['y1'], row['x2'], row['y2'])
    final_combined_df.at[index, 'Height'] = height
    final_combined_df.at[index, 'Height_Category'] = (
        'Height_<18' if height < 18 else
        'Height_18-29' if 18 <= height < 29 else
        'Height_29-38' if 29 <= height < 38 else
        'Height_>38'
    )

final_combined_df.to_excel(output_file_path, index=False)

print('处理完成')

import datetime
current_time = datetime.datetime.now()
formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
print(f"完成时间: {formatted_time}")




正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\1047


Processing images in 1047: 100%|██████████| 34/34 [00:07<00:00,  4.65it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\12010


Processing images in 12010: 100%|██████████| 81/81 [00:23<00:00,  3.47it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\12811


Processing images in 12811: 100%|██████████| 30/30 [00:07<00:00,  3.87it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\1349


Processing images in 1349: 100%|██████████| 140/140 [00:27<00:00,  5.08it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\1355


Processing images in 1355: 100%|██████████| 132/132 [00:17<00:00,  7.39it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\13661


Processing images in 13661: 100%|██████████| 51/51 [00:16<00:00,  3.11it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\1476


Processing images in 1476: 100%|██████████| 16/16 [00:04<00:00,  3.45it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\15908


Processing images in 15908: 100%|██████████| 100/100 [00:51<00:00,  1.94it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\1656


Processing images in 1656: 100%|██████████| 55/55 [00:19<00:00,  2.76it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\1657


Processing images in 1657: 100%|██████████| 108/108 [00:43<00:00,  2.51it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\16777


Processing images in 16777: 100%|██████████| 119/119 [00:43<00:00,  2.76it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\34919


Processing images in 34919: 100%|██████████| 63/63 [00:33<00:00,  1.88it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\35404


Processing images in 35404: 100%|██████████| 108/108 [00:45<00:00,  2.36it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\6191


Processing images in 6191: 100%|██████████| 20/20 [00:08<00:00,  2.35it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\6221


Processing images in 6221: 100%|██████████| 50/50 [00:23<00:00,  2.10it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\739


Processing images in 739: 100%|██████████| 30/30 [00:18<00:00,  1.61it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\753


Processing images in 753: 100%|██████████| 9/9 [00:06<00:00,  1.50it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\760


Processing images in 760: 100%|██████████| 10/10 [00:07<00:00,  1.35it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\9435


Processing images in 9435: 100%|██████████| 45/45 [00:30<00:00,  1.50it/s]


正在处理文件夹: D://code//data//Lv2期结论//京喜_from_0501//筛选\9775


Processing images in 9775: 100%|██████████| 98/98 [00:46<00:00,  2.12it/s]
Analyzing text: 100%|██████████| 12730/12730 [00:02<00:00, 5381.84it/s]


处理完成
完成时间: 2024-10-18 10:54:04


In [None]:
# # 自动文件名进行处理的方法, 但是还没有验证


# import os
# import glob
# from tqdm import tqdm
# import pandas as pd
# from paddleocr import PaddleOCR
# from PIL import Image
# import math
# import re

# # 设置输入和输出路径
# input_folder_path = 'D://code//data//Lv2期结论//京喜_from_0501//筛选'
# output_file_path = 'D://code//data//Lv2期结论//京喜_from_0501//筛选//txt_info.xlsx'

# # 加载 OCR 模型
# ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False)

# def calculate_shortest_distance(point_a, points_bcd):
#     shortest_distance = float('inf')
#     for point_bcd in points_bcd:
#         distance = ((point_bcd[0] - point_a[0]) ** 2 + (point_bcd[1] - point_a[1]) ** 2) ** 0.5
#         if distance < shortest_distance:
#             shortest_distance = distance
#     return shortest_distance

# def merge_text_boxes(img_path, style):
#     result = ocr.ocr(img_path, cls=True)
#     img = Image.open(img_path)
#     img_width, img_height = img.size

#     if not result or not result[0]:
#         print(f"No text detected in the image: {img_path}")
#         return None, None

#     rectangles_with_text = result[0]

#     # 提取文件名并处理
#     file_name = os.path.basename(img_path)
#     # 移除扩展名
#     file_name = os.path.splitext(file_name)[0]
#     # 移除可能的 'txt_' 或 'price_' 前缀
#     file_name = re.sub(r'^(txt_|price_)', '', file_name)
#     # 确保文件名格式为 XXXXXXXXXXXXXXXX_XXXXXXXXXXXXXXXX
#     if '_' in file_name:
#         parts = file_name.split('_')
#         if len(parts) >= 2:
#             file_name = f"{parts[-2]}_{parts[-1]}"

#     original_text_box_info = []
#     for rectangle in rectangles_with_text:
#         points = rectangle[0]
#         original_text_box_info.append({
#             'File Name': file_name,  # 使用处理后的文件名
#             'Style': style,
#             'x1': points[0][0],
#             'y1': points[0][1],
#             'x2': points[2][0],
#             'y2': points[2][1],
#             'text': rectangle[1][0]
#         })

#     merged_text_boxes = []

#     for index, row in pd.DataFrame(original_text_box_info).iterrows():
#         if not merged_text_boxes:
#             merged_text_boxes.append(row.to_dict())
#         else:
#             last_merged_box = merged_text_boxes[-1]

#             if calculate_shortest_distance((row['x1'], row['y1']), [(last_merged_box['x1'], last_merged_box['y1']), (last_merged_box['x2'], last_merged_box['y1']), (last_merged_box['x2'], last_merged_box['y2']), (last_merged_box['x1'], last_merged_box['y2'])]) < 100:
#                 last_merged_box['text'] += ' ' + row['text']
#                 last_merged_box['x1'] = min(last_merged_box['x1'], row['x1'])
#                 last_merged_box['y1'] = min(last_merged_box['y1'], row['y1'])
#                 last_merged_box['x2'] = max(last_merged_box['x2'], row['x2'])
#                 last_merged_box['y2'] = max(last_merged_box['y2'], row['y2'])
#             else:
#                 merged_text_boxes.append(row.to_dict())

#     original_text_box_df = pd.DataFrame(original_text_box_info)
#     merged_text_box_df = pd.DataFrame(merged_text_boxes)

#     for i, box in original_text_box_df.iterrows():
#         if box['y1'] < img_height / 2 and box['y2'] < img_height / 2:
#             region = '上半'
#         elif box['y1'] >= img_height / 2 and box['y2'] >= img_height / 2:
#             region = '下半'
#         elif box['x1'] < img_width / 2 and box['x2'] < img_width / 2:
#             region = '左半'
#         else:
#             region = '右半'
#         original_text_box_df.at[i, 'Region'] = region

#         box_area = (box['x2'] - box['x1']) * (box['y2'] - box['y1'])
#         box_per = box_area / (img_width * img_height)
#         original_text_box_df.at[i, 'txt_Area'] = box_area
#         original_text_box_df.at[i, 'txt_Per'] = box_per

#     for i, box in merged_text_box_df.iterrows():
#         if box['y1'] < img_height / 2 and box['y2'] < img_height / 2:
#             region = '上半'
#         elif box['y1'] >= img_height / 2 and box['y2'] >= img_height / 2:
#             region = '下半'
#         elif box['x1'] < img_width / 2 and box['x2'] < img_width / 2:
#             region = '左半'
#         else:
#             region = '右半'
#         merged_text_box_df.at[i, 'Region'] = region

#         merge_area = (box['x2'] - box['x1']) * (box['y2'] - box['y1'])
#         merge_per = merge_area / (img_width * img_height)
#         merged_text_box_df.at[i, 'Area'] = merge_area
#         merged_text_box_df.at[i, 'Per'] = merge_per

#     return merged_text_box_df, original_text_box_df

# keyword_groups = {
#     '通用': ['以旧换新', '只换不修', '包邮', '无理由退', '先用后付', '京东白条', '期免息', '送货上门', '保修'],
#     '价保': ['价保', '保价'],
#     '纯价格': ['¥', '夫', '￥', r'\b价\b', '到手价', '活动价'],
#     '直降': ['立减', '直降', '降', '立省', r'^(?!.*升降).*$', r'^(?!.*降温).*$', r'^(?!.*降噪).*$', r'^(?!.*降低).*$'],
#     '折扣': ['折', r'^(?!.*折叠).*$', r'^(?!.*翻折).*$'],
#     '满减': [r'.*满.*减.*', r'.*满.*-.*', r'.*满.*免.*'],
#     '用券': ['用券', '领券', '券'],
#     '返券': ['返券', '京豆', '返现', r'.*返.*E卡.*', r'.*返.*红包.*'],
#     '限时': ['.*小时$', '.*天$', '时间', 'time', 'TIME', '限时', r'.*月.*日.*', r'.*日.*点.*', r'.*:.*', r'.*:.*', r'.*：.*', r'\b\d{1,2}\.\d{1,2}-\d{1,2}\b'],
#     'xx元任选': [r'.*元.*件.*'],
#     '赠品': [r'.*满.*赠.*', r'.*满.*送.*', '送', '抽', '奖励', '赠', r'^(?!.*送货).*$', r'^(?!.*送礼).*$', r'^(?!.*送装).*$', r'^(?!.*配送).*$', r'^(?!.*送达).*$'],
#     '节日名称': ['节', '出游季', '购物季', '毕业季', '开学季', '黑五', '周年庆', '儿童节', '父亲节', '端午节', '七夕', '中秋节', '国庆', '万圣节', '感恩节', '元旦', '圣诞', '情人节', '春节', '元宵节', '38节', '3.8节', '清明节', '母亲节', '618', '购物季', '开学季', '11.11', '黑五', '12.12', '女神节', '出游季', '放价季', '吃货节', '家装节'],
#     '是否限购': ['限购', '限量']
# }

# def keyword_analysis(text):
#     results = {}
#     for key, words in keyword_groups.items():
#         results[key] = any(re.search(word, text) for word in words)
#     return results

# def height_analysis(x1, y1, x2, y2):
#     height = abs(y2 - y1)
#     return height

# def process_images(folder_path, subfolder_name):
#     image_files = []
#     for root, dirs, files in os.walk(folder_path):
#         if 'grounding_output' in root and ('price' in root or 'txt' in root):
#             for file in files:
#                 if file.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
#                     image_files.append(os.path.join(root, file))
    
#     combined_results = []
#     for img_path in tqdm(image_files, desc=f'Processing images in {subfolder_name}'):
#         style = 'price' if 'price' in img_path else 'txt'
#         merged_df, original_df = merge_text_boxes(img_path, style)
#         if merged_df is not None and original_df is not None:
#             merged_df['Subfolder'] = subfolder_name
#             original_df['Subfolder'] = subfolder_name
#             combined_results.append({
#                 'original': original_df,
#                 'merged': merged_df
#             })
    
#     return combined_results

# # 主程序
# all_results = []
# for folder in os.listdir(input_folder_path):
#     if folder.isdigit():
#         folder_path = os.path.join(input_folder_path, folder)
#         if os.path.isdir(folder_path):
#             print(f"正在处理文件夹: {folder_path}")
#             results = process_images(folder_path, folder)
#             all_results.extend(results)

# final_combined_data = []
# for result in all_results:
#     result['original']['Type'] = 'Original'
#     result['merged']['Type'] = 'Merged'
#     combined = pd.concat([result['original'], result['merged']], ignore_index=True)
#     final_combined_data.append(combined)

# final_combined_df = pd.concat(final_combined_data, ignore_index=True)
# final_combined_df.sort_values(by=['Subfolder', 'File Name', 'Type'], inplace=True)

# for index, row in tqdm(final_combined_df.iterrows(), total=final_combined_df.shape[0], desc="Analyzing text"):
#     keyword_results = keyword_analysis(row['text'])
#     for key, value in keyword_results.items():
#         final_combined_df.at[index, key] = value
    
#     height = height_analysis(row['x1'], row['y1'], row['x2'], row['y2'])
#     final_combined_df.at[index, 'Height'] = height
#     final_combined_df.at[index, 'Height_Category'] = (
#         'Height_<18' if height < 18 else
#         'Height_18-29' if 18 <= height < 29 else
#         'Height_29-38' if 29 <= height < 38 else
#         'Height_>38'
#     )

# final_combined_df.to_excel(output_file_path, index=False)

# print('处理完成')

# import datetime
# current_time = datetime.datetime.now()
# formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
# print(f"完成时间: {formatted_time}")




# step3 - 将分散在各个grounding_output文件夹中的grounding_results_processed.xlsx合并起来

In [25]:
import os
import pandas as pd
import openpyxl

def merge_excel_files(base_path):
    # 用于存储所有数据框的列表
    all_dataframes = []

    # 遍历基础路径下的所有文件夹
    for root, dirs, files in os.walk(base_path):
        if 'grounding_output' in dirs:
            grounding_output_path = os.path.join(root, 'grounding_output')
            excel_file = os.path.join(grounding_output_path, 'grounding_results_processed.xlsx')
            
            if os.path.exists(excel_file):
                # 读取Excel文件
                df = pd.read_excel(excel_file)
                
                # 添加新列，值为当前子文件夹的名称
                subfolder_name = os.path.basename(os.path.dirname(grounding_output_path))
                df['Subfolder'] = subfolder_name
                
                # 将数据框添加到列表中
                all_dataframes.append(df)

    # 合并所有数据框
    if all_dataframes:
        merged_df = pd.concat(all_dataframes, ignore_index=True)
        
        # 保存合并后的数据框到新的Excel文件
        output_file = os.path.join(base_path, 'img_info.xlsx')
        merged_df.to_excel(output_file, index=False)
        print(f"合并完成，结果保存在: {output_file}")
    else:
        print("没有找到符合条件的Excel文件")

# 使用示例
base_path = "D://code//data//Lv2期结论//京喜_from_0501//筛选"
merge_excel_files(base_path)




合并完成，结果保存在: D://code//data//Lv2期结论//京喜_from_0501//筛选\img_info.xlsx


# step-4 将布局和ctr参数进行合并

In [41]:
import pandas as pd
import cv2
import numpy as np
from PIL import Image
import os



# def merge_excel_files(txt_box_info_file, img_box_info_file, output_file):
#     # 读取 txt_box_info 文件
#     txt_df = pd.read_excel(txt_box_info_file)
#     txt_df = txt_df.rename(columns={'File Name':'Image Name', 'x1': 'txt_x1', 'y1': 'txt_y1', 'x2': 'txt_x2', 'y2': 'txt_y2'})

#     # 读取 img_box_info 文件
#     img_df = pd.read_excel(img_box_info_file)

#     # 重命名列
#     img_df = img_df.rename(columns={'x1': 'img_x1', 'y1': 'img_y1', 'x3': 'img_x2', 'y3':'img_y2', 'Subfolder':'Style'})
#     img_df = img_df.loc[:, ['Image Name', 'Style','img_x1', 'img_y1', 'img_x2', 'img_y2', 'box_no', ]]
    
#     # 合并两个 DataFrame，使用 txt_box_info 的表头作为准
#     merged_df = pd.concat([txt_df, img_df], ignore_index=True)
    
#     # 将 img_box_info 中缺少的数据设置为空
#     merged_df = merged_df.fillna("")
    
#     # 将合并后的 DataFrame 写入新的 Excel 文件
#     with pd.ExcelWriter(output_file) as writer:
#         merged_df.to_excel(writer, index=False)


# if __name__ == '__main__':
#     txt_box_info_file = f"D://code//data//Lv2期结论//京喜_from_0501//筛选//txt_info.xlsx"
#     img_box_info_file = f"D://code//data//Lv2期结论//京喜_from_0501//筛选//img_info.xlsx"
#     output_file = f"D://code//data//Lv2期结论//京喜_from_0501//筛选//merged_info.xlsx"
    
#     merge_excel_files(txt_box_info_file, img_box_info_file, output_file)
    
#     print("Excel files merged successfully!")




import os
import pandas as pd
import re

# 定义路径
data_1 = f"D://code//data//Lv2期结论//京喜_from_0501//筛选//merged_info.xlsx"
data_2 = "D://code//data//Lv2期结论//京喜_from_0501//京喜数据_from_0501_筛选.csv"
# data_3 = 'D://code//data//howtodo_from_0401//服饰鞋靴箱包//品类聚类-服饰鞋靴箱包.csv'
output_path = f"D://code//data//Lv2期结论//京喜_from_0501//筛选//merged_info_ctr.xlsx"

# 读取df1
df1 = pd.read_excel(data_1)

# 读取df2
df2 = pd.read_csv(data_2)

aggregated_data = df2.groupby('img_url').agg({
    'cid2': 'first',  # 使用 'first' 函数来选择分组中的第一个值
    'cid3': 'first',
    'uv': 'sum',
    'click_uv': 'sum',
    'gmv_cj':'sum',
    'sale_qtty_cj':'sum'
#     'folder_path': 'first'  # 同样使用 'first' 函数选择第一个值
}).reset_index()  # 重置索引

df2 = aggregated_data

# 计算ctr字段
df2['ctr'] = df2['click_uv'] / df2['uv']


def extract_filename(x):
    # 分割路径，取倒数第二部分和最后一部分（文件名部分）
    parts = x.split('/')
    return f"{parts[-2]}_{os.path.splitext(parts[-1])[0]}"  # 保留原文件扩展名
    # return f"{os.path.splitext(parts[-1])[0]}.jpg"  # 保留原文件扩展名

# 应用函数
df2['only_2'] = df2['img_url'].apply(extract_filename)

# 初始化结果列表
results = []

# 遍历df1的每一行
for index, row1 in df1.iterrows():
    # 查找df2中匹配的行
    matching_rows_df2 = df2[df2['only_2'] == row1['Image Name']]
    
    # 如果没有找到匹配的行，则只添加df1的当前行
    if matching_rows_df2.empty:
        results.append(row1.to_dict())
    else:
        # 对于找到的每个匹配行，先添加df1的当前行，然后添加匹配的df2行
        results.append(row1.to_dict())
        for _, row2 in matching_rows_df2.iterrows():
            # 可能需要添加额外的逻辑来处理多个匹配的情况
            # 这里假设每个df1的行在df2中最多只有一个匹配
            merged_row = {**row1.to_dict(), **row2.to_dict()}
            results.append(merged_row)

# 将结果列表转换为DataFrame
result_df = pd.DataFrame(results)

result_df_drop = result_df.dropna(subset=['uv'])

# 保存到指定路径
result_df_drop.to_excel(output_path, index=False)

print(f"Merged file saved to {output_path}")


import datetime

current_time = datetime.datetime.now()
formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")

print(formatted_time)
print(formatted_time)
print(formatted_time)




Merged file saved to D://code//data//Lv2期结论//京喜_from_0501//筛选//merged_info_ctr.xlsx
2024-10-18 11:20:19
2024-10-18 11:20:19
2024-10-18 11:20:19


# step4 - 将图片按照比例进行分类
### x<0.77 / 0.77<x<1.3 / x>1.3

In [42]:
import pandas as pd
import numpy as np
import os
import shutil
from PIL import Image



# 读取Excel文件
df = pd.read_excel(f'D://code//data//Lv2期结论//京喜_from_0501//筛选//img_info.xlsx')

# # 图片路径前缀
# path_to_your_images = f'D://code//data//background_color//服饰鞋靴箱包//{x}//grounding_output//{y}'

# # 去掉File Name列中的后缀
# df['File Name'] = df['File Name'].str.split('_').str[0]

# # 定义一个函数来计算新的矩形框坐标
# def calculate_new_coordinates(group):
#     x_coords = group['main_box_x'] + group['main_box_width']
#     y_coords = group['main_box_y'] + group['main_box_height']
    
#     min_x = group['main_box_x'].min()
#     min_y = group['main_box_y'].min()
#     max_x = x_coords.max()
#     max_y = y_coords.max()
    
#     return pd.Series({
#         'merge_x1': min_x,
#         'merge_y1': min_y,
#         'merge_x2': max_x,
#         'merge_y2': max_y
#     })


# 定义一个函数来计算新的矩形框坐标
def calculate_new_coordinates(group):

    min_x = group['x1'].min()
    min_y = group['y1'].min()
    max_x = group['x3'].max()
    max_y = group['y3'].max()
    
    return pd.Series({
        'merge_x1': min_x,
        'merge_y1': min_y,
        'merge_x2': max_x,
        'merge_y2': max_y
    })


# 按File Name分组并计算新坐标
df = df.groupby('Image Name').apply(calculate_new_coordinates).reset_index()

# 计算矩形框的横纵比
df['aspect_ratio'] = (df['merge_x2'] - df['merge_x1']) / (df['merge_y2'] - df['merge_y1'])

# # 在图片路径前缀下创建新文件夹
# os.makedirs(os.path.join(path_to_your_images, '小于0.77'), exist_ok=True)
# os.makedirs(os.path.join(path_to_your_images, '0.77到1.3'), exist_ok=True)
# os.makedirs(os.path.join(path_to_your_images, '大于1.3'), exist_ok=True)

# 定义一个函数来分类图片并复制到相应文件夹
def classify_and_copy_image(row):
    # image_path = os.path.join(path_to_your_images, f"{row['File Name']}")
    if row['aspect_ratio'] < 0.77:
        # shutil.copy(image_path, os.path.join(path_to_your_images, '小于0.77', f"{row['File Name']}.jpg"))
        return '小于0.77'
    elif 0.77 <= row['aspect_ratio'] <= 1.3:
        # shutil.copy(image_path, os.path.join(path_to_your_images, '0.77到1.3', f"{row['File Name']}.jpg"))
        return '0.77到1.3'
    else:
        # shutil.copy(image_path, os.path.join(path_to_your_images, '大于1.3', f"{row['File Name']}.jpg"))
        return '大于1.3'

# 应用分类函数并添加结果列
df['classification'] = df.apply(classify_and_copy_image, axis=1)

# 保存结果到Excel
output_file = os.path.join('D://code//data//Lv2期结论//京喜_from_0501//筛选//0.77-1.3.xlsx')
df.to_excel(output_file, index=False)

print(f"处理完成，结果已保存到 {output_file}")



处理完成，结果已保存到 D://code//data//Lv2期结论//京喜_from_0501//筛选//0.77-1.3.xlsx


# step5 - 文本:布局分类&热力图生成

In [130]:
# 识别文本框是在3x3网格中，并将图片复制到相应的分类目录中，并保存可视化结果
# 这段代码是包含左上角的,即还是对可能的logo进行了统计



import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import shutil
import os
from tqdm import tqdm
import concurrent.futures
import datetime
import numpy as np
import seaborn as sns


x = '1355'
y = 'txt'

# 1. 读取和预处理数据
def normalize_coordinates(row):
    width = 616
    height = 616
    row['left_norm'] = max(0, min(row['txt_x1'] / width, 1))
    row['top_norm'] = max(0, min(row['txt_y1'] / height, 1))
    row['right_norm'] = max(0, min(row['txt_x2'] / width, 1))
    row['bottom_norm'] = max(0, min(row['txt_y2'] / height, 1))
    return row

# 2. 绘制矩形和网格
def draw_rectangles(group):
    fig, ax = plt.subplots(figsize=(5, 5), dpi=100)
    
    for _, row in group.iterrows():
        rect = Rectangle((row['left_norm'], 1 - row['bottom_norm']), 
                         row['right_norm'] - row['left_norm'], 
                         row['bottom_norm'] - row['top_norm'],
                         fill=False, edgecolor='r')
        ax.add_patch(rect)
    
    for i in range(3):
        for j in range(3):
            rect = Rectangle((j/3, 1 - (i+1)/3), 1/3, 1/3, fill=False, edgecolor='b')
            ax.add_patch(rect)
    
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')
    return fig, ax

# 3. 判断重叠和分类
def check_overlap(rect, grid_cell):
    return not (rect[2] < grid_cell[0] or rect[0] > grid_cell[2] or
                rect[3] < grid_cell[1] or rect[1] > grid_cell[3])

def classify_image(group):
    overlaps = [0] * 9
    
    for _, row in group.iterrows():
        rect = (row['left_norm'], row['top_norm'], row['right_norm'], row['bottom_norm'])
        for i in range(3):
            for j in range(3):
                grid_cell = (j/3, i/3, (j+1)/3, (i+1)/3)
                if check_overlap(rect, grid_cell):
                    overlaps[i*3 + j] = 1
    return ''.join(map(str, overlaps))  # 直接使用join方法生成字符串

# 4. 处理单个图像
def process_image(name, group, x, y):
    classification = classify_image(group)
    
    # 复制图片
    source = os.path.join(f'D://code//data//Lv2期结论//京喜_from_0501//筛选//{x}//grounding_output//{y}', name)
    destination = os.path.join(f'D://code//data//Lv2期结论//京喜_from_0501//筛选//{x}//grounding_output//{y}//50%_txt_classified_images', classification.zfill(9), name)  # 使用zfill方法填充前导零
    os.makedirs(os.path.dirname(destination), exist_ok=True)
    shutil.copy2(source, destination)
    
    # 保存可视化结果
    fig, ax = draw_rectangles(group)
    visualization_name = f"{name.split('.')[0]}_visualization.png"  # 保留原文件名的前导零
    fig.savefig(os.path.join(f'D://code//data//Lv2期结论//京喜_from_0501//筛选//{x}//grounding_output//{y}//50%_txt_visualizations', visualization_name), bbox_inches='tight', pad_inches=0)
    plt.close(fig)
    
    return name, classification


# 主处理函数
def main(x, y):
    # 读取CSV文件
    print("读取并预处理数据...")

    # 在读取df时添加筛选条件
    def filter_by_rectangle(row):
        right, bottom = 616 * 0.3, 616 * 0.2
        if row['txt_x2'] < right and row['txt_y2'] < bottom:
            return False
        return True

    df = pd.read_excel(os.path.join(f'D://code//data//Lv2期结论//京喜_from_0501//筛选//merged_info_ctr.xlsx'))
    # df = df[df['img_x1'].isna()]
    # df = df[df['Subfolder'] == 9775]
    # df = df[df['Style'] == 'txt']
    # df = df[df['Type'] == 'Original']
    # df = df.sort_values('ctr', ascending=False)
    # df['File Name1'] = df['File Name1'] + '.jpg'

    # df = df[(df['img_x1'].isna()) & (df['Subfolder'] == 9775) & (df['Style'] == 'txt') & (df['Type'] == 'Original')]
    # df = df.sort_values('ctr', ascending=False)
    # df['File Name1'] = df['File Name1'] + '.jpg'

    df = df[(df['img_x1'].isna()) & (df['Subfolder'] == 1355) & (df['Style'] == 'txt') & (df['Type'] == 'Original')]
    df = df.sort_values('ctr', ascending=False)
    df['Image Name'] = df['Image Name'] + '.jpg'


    # 应用左上角的筛选条件
    df = df[df.apply(filter_by_rectangle, axis=1)]

    rows_to_keep = int(len(df) * 0.5)
    df = df.head(rows_to_keep)

    # 应用normalize_coordinates函数
    df = df.apply(normalize_coordinates, axis=1)

    # 创建输出目录
    classified_images_dir = os.path.join(f'D://code//data//Lv2期结论//京喜_from_0501//筛选//{x}//grounding_output//{y}', '50%_txt_classified_images')
    visualizations_dir = os.path.join(f'D://code//data//Lv2期结论//京喜_from_0501//筛选//{x}//grounding_output//{y}', '50%_txt_visualizations')
    os.makedirs(classified_images_dir, exist_ok=True)
    os.makedirs(visualizations_dir, exist_ok=True)

    # 按File name分组并处理
    grouped = df.groupby('Image Name')
    print("处理图像...")

    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        futures = [executor.submit(process_image, name, group, x, y) for name, group in grouped]
        
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            name, classification = future.result()
            results.append((name, classification))
    
    # 在原Excel文件中新增一列，保存分类结果
    classification_df = pd.DataFrame(results, columns=['Image Name', 'Classification'])
    df = pd.merge(df, classification_df, on='Image Name', how='left')
    df.to_excel(os.path.join(f'D://code//data//Lv2期结论//京喜_from_0501//筛选//{x}//grounding_output//{y}//50%_txt_info_with_classification-{x}_{y}.xlsx'), index=False)
    
    print("处理完成！")

if __name__ == "__main__":

    main(x, y)

current_time = datetime.datetime.now()
formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")

print(formatted_time)
print(formatted_time)
print(formatted_time)
print(x, y)








读取并预处理数据...
处理图像...


100%|██████████| 63/63 [00:01<00:00, 43.37it/s]


处理完成！
2024-10-18 13:05:12
2024-10-18 13:05:12
2024-10-18 13:05:12
1355 txt


In [131]:
# 针对分类结果, 绘制每个类别的文本框热力图
#

# 创建保存结果的文件夹
output_folder = os.path.join(f'D://code//data//Lv2期结论//京喜_from_0501//筛选//{x}//grounding_output//{y}', '50%_txt_output_heatmaps')
os.makedirs(output_folder, exist_ok=True)

# 读取Excel文件
df = pd.read_excel(os.path.join(f'D://code//data//Lv2期结论//京喜_from_0501//筛选//{x}//grounding_output//{y}//50%_txt_info_with_classification-{x}_{y}.xlsx'))

# 按Classification列进行分组
grouped = df.groupby('Classification')

# 遍历每个分组
for name, group in grouped:
    # 创建一个新的图形
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
    
    # 绘制矩形框
    ax1.set_xlim(0, 616)
    ax1.set_ylim(616, 0)
    for _, row in group.iterrows():
        x1, y1, x2, y2 = row['txt_x1'], row['txt_y1'], row['txt_x2'], row['txt_y2']
        # 排除左上角的文本框
        if x1 <= 616 * 0.3 and y1 <= 616 * 0.2:
            continue
        width = x2 - x1
        height = y2 - y1
        rect = plt.Rectangle((x1, y1), width, height, fill=False, edgecolor='r')
        ax1.add_patch(rect)
    ax1.set_title(f'Bounding Boxes for {name}')
    ax1.set_xlabel('X')
    ax1.set_ylabel('Y')

    # 创建热力图
    heatmap = np.zeros((616, 616))
    for _, row in group.iterrows():
        x1, y1, x2, y2 = row['txt_x1'], row['txt_y1'], row['txt_x2'], row['txt_y2']
        # 排除左上角的文本框
        if x1 <= 616 * 0.3 and y1 <= 616 * 0.2:
            continue
        x1, y1 = max(0, min(x1, 615)), max(0, min(y1, 615))
        x2, y2 = max(0, min(x2, 615)), max(0, min(y2, 615))
        heatmap[int(y1):int(y2)+1, int(x1):int(x2)+1] += 1

    # 绘制热力图
    sns.heatmap(heatmap, ax=ax2, cmap='YlOrRd', cbar=True)
    ax2.set_title(f'Heatmap for {name}')
    ax2.set_xlabel('X')
    ax2.set_ylabel('Y')

    # 调整子图之间的间距
    plt.tight_layout()

    # 保存图像
    plt.savefig(os.path.join(output_folder, f'{name}_heatmap.png'), dpi=100, bbox_inches='tight')
    plt.close()  # 关闭图形，释放内存

    print(f"已保存 {name} 的热力图")

print("所有热力图已保存在 output_heatmaps 文件夹中")

import datetime

current_time = datetime.datetime.now()
formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")

print(formatted_time)
print(formatted_time)
print(formatted_time)
print(x, y)




已保存 100 的热力图
已保存 111 的热力图
已保存 1000100 的热力图
已保存 1000111 的热力图
已保存 1001101 的热力图
已保存 1001111 的热力图
已保存 100000100 的热力图
已保存 100100111 的热力图
已保存 100110111 的热力图
所有热力图已保存在 output_heatmaps 文件夹中
2024-10-18 13:06:38
2024-10-18 13:06:38
2024-10-18 13:06:38
1355 txt


# 文本大小的总结

In [132]:
# 新流程, 可以通过list方式, 来合并读取


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os


# x = ['女士春夏上装', '女士春夏下装', '女士休闲鞋']
# y = ['price', 'txt']

x = ['1355']
y = ['txt']


def filter_by_rectangle(row):
    right, bottom = 616 * 0.3, 616 * 0.2
    if row['txt_x2'] < right and row['txt_y2'] < bottom:
        return False
    return True


for item_x in x:
    for item_y in y:
        # 读取Excel文件
        df = pd.read_excel(f'D://code//data//Lv2期结论//京喜_from_0501//筛选//{item_x}//grounding_output//{item_y}//50%_txt_info_with_classification-{item_x}_{item_y}.xlsx')

        # 应用 filter_by_rectangle 函数来过滤数据
        df = df[df.apply(filter_by_rectangle, axis=1)]

        # 替换 Height_Category 列的值
        df['Height_Category'] = df['Height_Category'].replace({
            'Height_>38': 'Height大于38',
            'Height_18-29': 'Height18到29',
            'Height_29-38': 'Height29到38',
            'Height_<18': 'Height小于18'
        })

        # 删除 structure 为空值的行
        # df = df.dropna(subset=['structure'])
        df = df.dropna(subset=['Height_Category'])

        # 确保必要的列存在
        required_columns = ['Height_Category', 'txt_x1', 'txt_y1', 'txt_x2', 'txt_y2']
        if not all(col in df.columns for col in required_columns):
            raise ValueError(f"DataFrame must contain all of these columns: {required_columns}")

        # 创建输出目录
        output_dir = f"D://code//data//Lv2期结论//京喜_from_0501//筛选//{item_x}//grounding_output//{item_y}//{item_x}_{item_y}_wordsize_heatmaps"
        os.makedirs(output_dir, exist_ok=True)

        # 按 structure 和 box_no 分组
        grouped = df.groupby(['Height_Category'])

        # 遍历每个分组
        for (box_no), group in grouped:
            # 创建一个空的 2D numpy 数组来存储热力图数据，大小为 616x616
            heatmap_data = np.zeros((616, 616))

            # 对每个矩形框增加热度值
            for _, row in group.iterrows():
                x1, y1, x2, y2 = map(int, [row['txt_x1'], row['txt_y1'], row['txt_x2'], row['txt_y2']])
                # 确保坐标不超出边界
                x1, x2 = max(0, min(x1, 616)), max(0, min(x2, 616))
                y1, y2 = max(0, min(y1, 616)), max(0, min(y2, 616))
                heatmap_data[y1:y2, x1:x2] += 1

            # 创建图形，设置大小为正方形
            plt.figure(figsize=(10, 10))

            # 使用 seaborn 绘制热力图
            sns.heatmap(heatmap_data, cmap='YlOrRd', cbar_kws={'label': 'Frequency'}, square=True)

            # 设置标题和轴标签
            # plt.title(f'Bounding Box Heatmap - Structure: {structure}, word size: {Height_Category}')
            plt.xlabel('X coordinate')
            plt.ylabel('Y coordinate')

            # 调整图形以保持正方形比例
            plt.gca().set_aspect('equal', adjustable='box')

            # 保存图形
            output_path = os.path.join(output_dir, f"heatmap_wordsize_{box_no}.png")
            plt.savefig(output_path, dpi=300, bbox_inches='tight')
            plt.close()

            print(f"Heatmap saved to: {output_path}")

print("All heatmaps have been generated.")




Heatmap saved to: D://code//data//Lv2期结论//京喜_from_0501//筛选//1355//grounding_output//txt//1355_txt_wordsize_heatmaps\heatmap_wordsize_('Height18到29',).png
Heatmap saved to: D://code//data//Lv2期结论//京喜_from_0501//筛选//1355//grounding_output//txt//1355_txt_wordsize_heatmaps\heatmap_wordsize_('Height29到38',).png
Heatmap saved to: D://code//data//Lv2期结论//京喜_from_0501//筛选//1355//grounding_output//txt//1355_txt_wordsize_heatmaps\heatmap_wordsize_('Height大于38',).png
Heatmap saved to: D://code//data//Lv2期结论//京喜_from_0501//筛选//1355//grounding_output//txt//1355_txt_wordsize_heatmaps\heatmap_wordsize_('Height小于18',).png
All heatmaps have been generated.


In [133]:
'''
这里是通过读取list形式, 来简化输入的
'''


import pandas as pd
import os
from tqdm import tqdm
from openai import OpenAI

# 定义 x 和 y 列表
x_list = ['1355']  # 示例值，请根据实际需求修改
y_list = ['txt']  # 示例值，请根据实际需求修改

# 设置API密钥和基础URL
os.environ["OPENAI_API_KEY"] = "35f54cc4-be7a-4414-808e-f5f9f0194d4f"
os.environ["OPENAI_API_BASE"] = "http://gpt-proxy.jd.com/gateway/azure"

client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    base_url=os.environ["OPENAI_API_BASE"],
)

def filter_by_rectangle(row):
    right, bottom = 616 * 0.3, 616 * 0.2
    if row['txt_x2'] < right and row['txt_y2'] < bottom:
        return False
    return True

# 定义一个函数，使用GPT-4模型对文本进行自动摘要
def summarize_with_gpt4(text):
    response = client.chat.completions.create(
        model="gpt-4o",  # 使用正确的模型名称，如果需要请更改
        messages=[
            {"role": "system", "content": f"你是一个非常优秀的电商分析师,现在有这样一个数据集data:\n\n{text}"},
            {"role": "user", "content": """
                # Role 
                    角色: 电商数据分析师。
                # Profile 
                    简介: 我是一名电商数据分析师，我的工作是通过收集和整理各种信息，找出影响商品销量的因素，并形成报告。
                ## Background 
                    背景: 现在有一些关于同一类商品的描述信息，我需要从中进行归纳总结，找出这些描述信息都是从哪些维度切入的， 并且在这个维度上统计出下钻的一个细分维度。
                ## Goals 
                    目标: 基于我给到的商品描述信息数据集，归纳总结出描述的方向维度，需要特别关注与细化商品本身的特性，并统计这些维度出现的频率。
                ## Constrains 
                    约束条件: 1、时刻保持自己是电商数据分析师的角色, 2、可以进行适当的联想和猜测, 3、举例的时候禁止出现"", 4、统计频率的时候请仔细仔细再仔细
                ## Tone 
                    语气风格: 正式的，客观的，科学的。
                ## Skills 
                    技能: 1、你有出色的文本理解能力,能够理解输入数据的含义 2、你有出色的归纳总结能力,能够归纳总结出数据的描述维度 3、你也有出色的数据统计能力,能够精确的统计出各个维度出现的频次。
                ## OutputFormat 
                    输出格式:以文字方式输出，按照维度，细分维度，细分维度下具体内容举例，细分维度出现频次呈现"""}
        ],
    )
    return response.choices[0].message.content.strip()

# 遍历 x 和 y 的所有组合
for x in x_list:
    for y in y_list:
        print(f"Processing: {x} - {y}")
        
        # 读取Excel文件
        file_path = f'D://code//data//Lv2期结论//京喜_from_0501//筛选//{x}//grounding_output//{y}//50%_txt_info_with_classification-{x}_{y}.xlsx'
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue
        
        df = pd.read_excel(file_path)
        # df = df.dropna(subset=['structure'])

        # 确保 'text' 列中的所有值都是字符串
        df['text'] = df['text'].astype(str)
        
        df = df[df.apply(filter_by_rectangle, axis=1)]
        df_grouped = df.groupby(['Height_Category'])
        
        # 遍历每个分组，合并文本并进行总结
        summaries = []
        
        # 遍历每个分组
        for (height_category), group in tqdm(df_grouped):
            # 合并该组的所有文本
            all_text = " ".join(group['text'].dropna())
            # print(f"Structure: {structure}")
            print(f"Height Category: {height_category}")
            print(f"Text: {all_text[:100]}...")  # 只打印前100个字符
            
            # 使用 GPT-4 进行总结
            try:
                summary = summarize_with_gpt4(all_text)
                print(f"Summary: {summary[:100]}...")  # 只打印前100个字符
            except Exception as e:
                print(f"Error in summarization: {str(e)}")
                summary = "Error in summarization"
            
            # 将结果添加到列表中
            summaries.append({
                # 'structure': structure,
                'Height_Category': height_category,
                'text': all_text,
                'summary': summary
            })
        
        # 创建一个新的DataFrame来存储结果
        result_df = pd.DataFrame(summaries)
        
        # 保存结果到Excel文件
        output_file = f"D://code//data//Lv2期结论//京喜_from_0501//筛选//{x}//grounding_output//{y}//{x}_{y}_文本分类总结.xlsx"
        result_df.to_excel(output_file, index=False)
        print(f"Results saved to: {output_file}")

print("All processing completed.")





Processing: 1355 - txt


  0%|          | 0/4 [00:00<?, ?it/s]

Height Category: ('Height_18-29',)
Text: CHUNYUFENG WaUmnao SANDER Wailnlao CHUNYUFENG...


 25%|██▌       | 1/4 [00:19<00:57, 19.25s/it]

Summary: ## 商品描述信息的数据分析总结

根据提供的数据集 "CHUNYUFENG WaUmnao SANDER Wailnlao CHUNYUFENG"，我们可以得出以下分析结果：

### 维度 1: ...
Height Category: ('Height_29-38',)
Text: 休闲百搭时尚，亲肤舒适 京喜自营 京喜自营 休闲百搭时尚，亲肤舒适 京喜自营 夏季新款热卖 京喜自营 京喜自营 夏季新款热卖 京喜自营 京喜自营 夏季新款热卖 京喜自营 夏季新款热卖 京喜自营 时尚优...


 50%|█████     | 2/4 [00:27<00:25, 12.78s/it]

Summary: ### 电商商品描述信息维度总结报告

#### 1. 商品类别
- **细分维度**: T恤类型
  - 具体内容: 休闲百搭时尚, 时尚优质百搭女士T恤
  - 出现频次: 4

#### 2. ...
Height Category: ('Height_<18',)
Text: CHUNYUFENG...


 75%|███████▌  | 3/4 [00:59<00:21, 21.64s/it]

Summary: ## 商品描述维度分析报告

经过对所提供的商品描述信息数据集的详细分析与总结，以下归纳出商品描述的主要方向维度、细分维度及其具体内容，并统计了各个维度出现的频次。

### 1. 产品特性

###...
Height Category: ('Height_>38',)
Text: 纯欲风 包邮 包邮 包邮 包邮 包邮 包邮 包邮 包邮 包邮 包邮 四季百搭 包邮 包邮 精梳纯棉 包邮 纯欲风 包邮 4EM 包邮 包邮 包邮 包邮 包邮 包邮 包邮 包邮 包邮 包邮 包邮 包邮 ...


100%|██████████| 4/4 [01:07<00:00, 16.95s/it]

Summary: ## 商品描述信息的数据分析报告

### 1. 物流方式
#### 细分维度:
- 包邮

#### 频次统计:
- 包邮: 61 次

### 2. 商品风格
#### 细分维度:
- 纯欲风
-...
Results saved to: D://code//data//Lv2期结论//京喜_from_0501//筛选//1355//grounding_output//txt//1355_txt_文本分类总结.xlsx
All processing completed.





In [22]:
'''
这里是通过读取list形式, 来简化输入的
'''



import pandas as pd
import os
from tqdm import tqdm
from openai import OpenAI

# 定义 x 和 y 列表
x_list = ['1657']  # 示例值，请根据实际需求修改
y_list = ['txt']  # 示例值，请根据实际需求修改

# 设置API密钥和基础URL
os.environ["OPENAI_API_KEY"] = "35f54cc4-be7a-4414-808e-f5f9f0194d4f"
os.environ["OPENAI_API_BASE"] = "http://gpt-proxy.jd.com/gateway/azure"

client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    base_url=os.environ["OPENAI_API_BASE"],
)

def filter_by_rectangle(row):
    right, bottom = 616 * 0.3, 616 * 0.2
    if row['txt_x2'] < right and row['txt_y2'] < bottom:
        return False
    return True

# 定义一个函数，使用GPT-4模型对文本进行自动摘要
def summarize_with_gpt4(text):
    response = client.chat.completions.create(
        model="gpt-4o",  # 使用正确的模型名称，如果需要请更改
        messages=[
            {"role": "system", "content": f"你是一个非常优秀的电商分析师,现在有这样一个数据集data:\n\n{text}"},
            {"role": "user", "content": """
                # Role 
                角色: 电商数据分析师。
                # Profile 
                简介: 我是一名电商数据分析师，我的工作是通过收集和整理各种信息，找出影响商品销量的因素，并形成报告。
                ## Background 
                背景: 现在有一些关于同一类商品的描述信息，我需要从中进行归纳总结，并基于一些前置的定义，找出这些描述信息都是从哪些维度切入的。
                ## Goals 
                目标: 基于我给到的商品描述信息数据集和前置的维度定义，归纳总结出描述的方向维度，需要特别关注与细化商品本身的卖点特性，并统计这些维度出现的频率。
                ## Definitions
                定义：
                1. 直接展示价格：直接展示价格信息，到手价，预估到手价，会员价等，通常包含上述前缀，¥+具体的价格数字或者具体的价格数字+元。
                2. 折扣信息：描述商品的折扣，通常包含具体的折扣数字+折。
                3. 直降信息：描述商品相较原价进行了大幅降价，通常包含直降、立减。
                4. 满减信息：描述若购买到一定金额，可以在此基础上进行金额优惠，通常包含满+具体的金额+减+具体的金额
                5. 赠品信息：描述若购买商品则会赠送服务或商品，通常包含赠、送
                6. 限时：描述商品促销的时间，通常包含活动时间段、活动开始时间、活动结束时间
                7. 品牌名称：描述商品的品牌名称
                8. 代言人信息：描述商品的代言人信息
                9. 价保：价格保护，通常包含价保
                10. 店铺背书：描述店铺的信息，通常包含旗舰店、自营
                11. 物流服务：描述商品所包含的物流服务，通常包含物流时效、运费险、物流名称、仓库名称、包邮
                12. 直接展示价格属于价格信息一级维度，折扣信息、直降信息、满减信息、赠品信息、限时属于价促活动一级维度，品牌名称、代言人信息属于品牌信息一级维度，价保、店铺背书、物流服务属于服务保障一级维度
                ## Constrains 
                约束条件: 1、时刻保持自己是电商数据分析师的角色，2、可以进行适当的联想和猜测，3、举例的时候禁止出现""，4、统计频率的时候请仔细仔细再仔细，5、若识别到的内容不在上述定义的维度中，可自行命名并统计，请不要忽视未被定义的维度，特别是关于商品本身的卖点信息描述
                ## Tone 
                语气风格: 正式的，客观的，科学的。
                ## Skills 
                技能: 1、你有出色的文本理解能力，能够理解输入数据的含义 2、你有出色的归纳总结能力，能够归纳总结出数据的描述维度 3、你也有出色的数据统计能力，能够精确的统计出各个维度出现的频次。
                ## OutputFormat 
                输出格式:以文字方式输出，一级维度，一级维度下具体内容和举例和频次，输出顺序按照价格信息、价促活动、品牌信息、服务保障、商品卖点进行输出，商品卖点为未定义维度，请你依照自己的知识库信息进行汇总输出，需要特别注意，是关于商品本身的描述，输出格式为1.价格信息 总频次 直接展示价格 频次 举例 以此类推,注意输出要精简，减少不必要的换行
                    """}
        ],
    )
    return response.choices[0].message.content.strip()




# 遍历 x 和 y 的所有组合
for x in x_list:
    for y in y_list:
        print(f"Processing: {x} - {y}")
        
        # 读取Excel文件
        file_path = f'D://code//data//Lv2期结论//京喜//筛选//{x}//grounding_output//{y}//50%_txt_info_with_classification-{x}_{y}.xlsx'
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue
        
        df = pd.read_excel(file_path)
        # df = df.dropna(subset=['structure'])

        # 确保 'text' 列中的所有值都是字符串
        df['text'] = df['text'].astype(str)
        
        df = df[df.apply(filter_by_rectangle, axis=1)]
        df_grouped = df.groupby(['Height_Category'])
        
        # 遍历每个分组，合并文本并进行总结
        summaries = []
        
        # 遍历每个分组
        for (height_category), group in tqdm(df_grouped):
            # 合并该组的所有文本
            all_text = " ".join(group['text'].dropna())
            # print(f"Structure: {structure}")
            print(f"Height Category: {height_category}")
            print(f"Text: {all_text[:100]}...")  # 只打印前100个字符
            
            # 使用 GPT-4 进行总结
            try:
                summary = summarize_with_gpt4(all_text)
                print(f"Summary: {summary[:100]}...")  # 只打印前100个字符
            except Exception as e:
                print(f"Error in summarization: {str(e)}")
                summary = "Error in summarization"
            
            # 将结果添加到列表中
            summaries.append({
                # 'structure': structure,
                'Height_Category': height_category,
                'text': all_text,
                'summary': summary
            })
        
        # 创建一个新的DataFrame来存储结果
        result_df = pd.DataFrame(summaries)
        
        # 保存结果到Excel文件
        output_file = f"D://code//data//Lv2期结论//京喜//筛选//{x}//grounding_output//{y}//{x}_{y}_文本分类总结.xlsx"
        result_df.to_excel(output_file, index=False)
        print(f"Results saved to: {output_file}")

print("All processing completed.")





Processing: 1657 - txt


  0%|          | 0/4 [00:00<?, ?it/s]

Height Category: ('Height_18-29',)
Text: 卡扣设计拆装方便 不易扯下 20条装 20*20cm 柔软亲肤·粘合力强·不易缩水 铲刷两件套 BRUSHDUSTPANSUIT 贴心好用每天安心相伴 WORLDLIFE洁牙擦 特殊纳米密胺绵深层清洁...


 25%|██▌       | 1/4 [00:16<00:50, 16.78s/it]

Summary: 好的，让我们开始根据提供的商品描述信息进行归纳总结和频次统计。

### 1. 价格信息
总频次：0

- 直接展示价格
    - 频次：0
    - 举例：无

### 2. 价促活动
总频次：...
Height Category: ('Height_29-38',)
Text: 通用型拉链头/简单更耐用 京喜自营 直接安装 无需缝纫 京喜自营 舒适按摩 京喜自营 弹性梳齿 发型师级 镂空按摩梳 镂空设计 京喜自营 全包防护/加厚涤纶 京喜自营 双层 双层 20*20cm20条...


 50%|█████     | 2/4 [00:51<00:55, 27.58s/it]

Summary: 基于提供的数据集和定义的维度，以下是对描述信息的归纳总结以及频次统计：

1. **价格信息** (总频次: 0)
    - 直接展示价格 (频次: 0)

2. **价促活动** (总频次: 1)...
Height Category: ('Height_<18',)
Text: 更方便更轻松 着色污 CoolSumr 秒出美照! 自带反光板 PC CLASSICFLOSSRODS 50 柔精恒滑洁牙护边 DENTALFLOSS强韧不伤牙齿 DENTALFLOSS强韧不伤牙齿 ...


 75%|███████▌  | 3/4 [01:01<00:19, 19.48s/it]

Summary: 以下是基于提供的商品描述信息数据集归纳总结出的各维度方向及其出现频次的统计结果：

### 1. 价格信息
总频次：0  
- 直接展示价格：0次
  - 举例：无  

### 2. 价促活动
总频...
Height Category: ('Height_>38',)
Text: 包邮 拉链头 可拆卸式 包邮 牙膏の开挂收纳 清爽一整夜） 要你爱不释口 包邮 按摩头皮 全包裹 包邮 背面卡扣 包邮 强力去渍 不伤表面 包邮 免缝·裤脚修改 包邮 刷子簸箕套装 安睡整晚 包邮 创...


100%|██████████| 4/4 [01:14<00:00, 18.60s/it]

Summary: 根据你提供的数据集和定义，我归纳总结了描述信息的方向维度并统计了每个维度的频次。以下是具体的分析结果：

1. **价格信息** 总频次：0
   - 直接展示价格：0
     - 举例：无

2....
Results saved to: D://code//data//Lv2期结论//京喜//筛选//1657//grounding_output//txt//1657_txt_文本分类总结.xlsx
All processing completed.



