# step1 - 对图片进行分类(price/txt/white/scene)

In [None]:
import os
import shutil
from PIL import Image
from paddleocr import PaddleOCR
from tqdm import tqdm
from datetime import datetime
import math
import time
import numpy as np
import pandas as pd



z = '男鞋_from_0501'



%config IPCompleter.greedy=True


def process_folder(root_folder):
    # 初始化 PaddleOCR
    ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False)

    # 遍历根文件夹下的所有子文件夹
    for dirpath, dirnames, filenames in os.walk(root_folder):
        if os.path.basename(dirpath) == 'grounding_output':
            price_folder = os.path.join(dirpath, 'price')
            txt_folder = os.path.join(dirpath, 'txt')
            scene_folder = os.path.join(dirpath, 'scene')
            white_folder = os.path.join(dirpath, 'white')

            # 如果文件夹不存在，则创建
            for folder in [price_folder, txt_folder, scene_folder, white_folder]:
                if not os.path.exists(folder):
                    os.makedirs(folder)

            '''
            step1 - 筛选价促卖点图
            '''
            print('Step 1: 筛选价促卖点图')

            image_files = [os.path.join(dirpath, filename) for filename in os.listdir(dirpath) if filename.endswith(('.jpg', '.png'))]

            # 使用 tqdm 创建进度条
            with tqdm(total=len(image_files), desc="Processing images") as pbar:
                # 遍历源文件夹中的所有图片文件
                for filename in image_files:
                    img_path = os.path.join(dirpath, filename)

                    # 使用 PaddleOCR 进行文字识别
                    result = ocr.ocr(img_path, cls=True)

                    if not result:
                        # 如果识别结果为空,则跳过这张图片,不进行移动操作
                        pbar.write(f"Image '{filename}' skipped due to empty OCR result.")
                        pbar.update(1)
                        continue

                    data_list = result[0]

                    # 检查识别结果是否有关键词
                    contains_keyword = False

                    if data_list:
                        # 定义关键词列表
                        keywords = ['满', '减', '折', '到手价', '送', '免息', '活动价', '包邮价', '参考价',
                                    r'.*满.*减.*', r'.*满.*-.*', r'.*满.*赠.*', r'.*满.*送.*', r'.*价.*',
                                    '券', '优惠', '用券', '领券', '券', '送', '低至', '立减', '直降', '免息',
                                    '¥', '夫', '￥']

                        # 遍历识别结果中的文本
                        for data in data_list:
                            text = data[1][0]  # 获取文本内容
                            # 检查当前文本是否包含关键词
                            if any(keyword in text for keyword in keywords):
                                contains_keyword = True
                                break

                    # 根据检查结果移动文件
                    if contains_keyword:
                        try:
                            shutil.move(img_path, price_folder)
                            # pbar.write(f"Image '{filename}' moved to price_folder.")
                        except Exception as e:
                            pbar.write(f"Failed to move image '{filename}': {e}")
                            continue
                    # else:
                    #     # 如果不包含关键词，移动到其他文件夹
                    #     # 你可以根据需要修改这里的逻辑
                    #     try:
                    #         shutil.move(img_path, txt_folder)
                    #         pbar.write(f"Image '{filename}' moved to scene_folder.")
                    #     except Exception as e:
                    #         pbar.write(f"Failed to move image '{filename}': {e}")

                    # 重置 contains_keyword 变量
                    contains_keyword = False

                    # 更新进度条
                    pbar.update(1)

            print("卖点图 classification completed.")

            # 获取当前时间
            current_time = datetime.now()

            # 格式化输出当前时间
            print("完成时间:", current_time)

            '''
            step2 - 筛选白底图
            '''

            print('Step 2: 筛选白底图')

            def move_images_with_white_pixels(base_path, white_folder, threshold=0.40):
                # 获取源文件夹中的图片文件列表
                image_files = [filename for filename in os.listdir(base_path) if filename.endswith(('.jpg', '.png'))]

                # 使用 tqdm 创建进度条
                with tqdm(total=len(image_files), desc=f"Moving images from {base_path}") as pbar:
                    # 遍历源文件夹中的所有图片文件
                    for filename in image_files:
                        img_path = os.path.join(base_path, filename)

                        # 打开图片并获取像素信息
                        with Image.open(img_path) as img:
                            # 确认图片是 RGB 模式
                            if img.mode!= 'RGB':
                                img = img.convert('RGB')
                            # 获取图片的宽度和高度
                            width, height = img.size

                            # 统计白色像素点的数量
                            white_pixels = 0
                            for x in range(width):
                                for y in range(height):
                                    # 获取像素点的 RGB 值
                                    pixel_value = img.getpixel((x, y))
                                    # 如果是 RGB 图像，解包为三个值，否则为四个值
                                    if len(pixel_value) == 3:
                                        r, g, b = pixel_value
                                    else:  # 处理带有透明度的图像
                                        r, g, b, a = pixel_value
                                    # 如果 RGB 值都大于 230，则认为是白色像素点
                                    if r > 230 and g > 230 and b > 230:
                                        white_pixels += 1

                            # 计算白色像素点占比
                            white_ratio = white_pixels / (width * height)

                            # 如果白色像素点占比超过阈值，则将图片移动到目标文件夹
                            if white_ratio > threshold:
                                target_path = os.path.join(white_folder, filename)
                                try:
                                    shutil.move(img_path, white_folder)
                                    # pbar.write(f"Image '{filename}' moved to white folder.")
                                except Exception as e:
                                    pbar.write(f"Error moving image '{filename}': {e}")
                                    continue

                        # 更新进度条
                        pbar.update(1)

            # 调用函数，将白色像素点占比超过 30%的图片从源文件夹列表中移动到目标文件夹
            move_images_with_white_pixels(dirpath, white_folder, threshold=0.40)

            # 获取当前时间
            current_time = datetime.now()

            # 格式化输出当前时间
            print("完成时间:", current_time)

            '''
            step3 - 筛选功能卖点图
            '''

            print('Step 3: 筛选功能卖点图')

            image_files = [filename for filename in os.listdir(dirpath) if filename.endswith(('.jpg', '.png'))]
            for filename in image_files:

                img_path = os.path.join(dirpath, filename)
                # print(f"Processing image: {img_path}")

                img = Image.open(img_path)
                width, height = img.size

                # 读取图片下部分 5/6
                cropped_img = img.crop((0, height // 6, width, height))

                # 将 PIL 图像对象转换为 numpy 数组
                img_np = np.array(cropped_img)

                # OCR 处理
                result = ocr.ocr(img_np, cls=True)

                if result is None:
                    continue

                rectangles_with_text = result[0]

                if rectangles_with_text is None:
                    continue

                line_count = len(rectangles_with_text)

                # 如果文本行数大于等于 3, 将图片移动到目标文件夹
                if line_count >= 2:
                    target_path = os.path.join(txt_folder, filename)
                    shutil.move(img_path, txt_folder)
                    # print(f"'{filename}' moved to txt folder.")

                else:
                    # print(f"'{filename}' remains in source folder.")
                    continue

            '''
            step4 - 将剩余图片归类到 scene
            '''

            print('Step 4: 归类图片到 scene')

            # 检查目标文件夹是否存在，如果不存在则创建
            if not os.path.exists(scene_folder):
                os.makedirs(scene_folder)

            # 获取源文件夹中的所有图片文件列表
            image_files = [filename for filename in os.listdir(dirpath) if filename.endswith(('.jpg', '.png'))]

            # 移动图片到 txt_folder
            for filename in image_files:
                img_path = os.path.join(dirpath, filename)
                target_path = os.path.join(scene_folder, filename)
                shutil.move(img_path, scene_folder)
                # print(f"Image '{filename}' moved to scene_folder.")

            print("Image moving completed.")

            # 获取当前时间
            current_time = datetime.now()

            # 格式化输出当前时间
            print("完成时间:", current_time)

            '''
            step5 - 将 white 中有文本的图片移到 txt_folder
            '''

            # 获取源文件夹中的所有图片文件列表
            image_files = [filename for filename in os.listdir(white_folder) if filename.endswith(('.jpg', '.png'))]

            # 使用 tqdm 创建进度条
            with tqdm(total=len(image_files), desc="Processing images") as pbar:

                # 遍历源文件夹中的所有图片文件
                for filename in image_files:
                    img_path = os.path.join(white_folder, filename)

                    img = Image.open(img_path)
                    width, height = img.size

                    # 读取图片下部分 3/4
                    cropped_img = img.crop((0, height // 6, width, height))

                    # 将 PIL 图像对象转换为 numpy 数组
                    img_np = np.array(cropped_img)

                    # OCR 处理
                    result = ocr.ocr(img_np, cls=True)

                    if result is None:
                        continue

                    rectangles_with_text = result[0]

                    if rectangles_with_text is None:
                        continue

                    # 统计文本框高度大于 30 像素的行数
                    lines_above_30_count = 0
                    for rectangle in rectangles_with_text:
                        # 计算文本框的高度和宽度
                        points = rectangle[0]
                        x_A, y_A = points[0]

                        # 计算 A 点到 BCD 的距离
                        distances = []
                        for point in points[1:]:
                            x_B, y_B = point
                            distance = math.sqrt((x_B - x_A) ** 2 + (y_B - y_A) ** 2)
                            distances.append(distance)

                        # 获取最短的距离作为文本框的大小
                        text_size = min(distances)

                        # 统计高度大于 x 像素的行数
                        if text_size > 30:
                            lines_above_30_count += 1

                    # 如果大于 30 像素的行数大于等于 2，将图片移动到目标文件夹
                    if lines_above_30_count >= 2:
                        # 如果识别结果不为空且行数小于等于 3，则将图片复制到 txt_folder
                        target_path = os.path.join(txt_folder, filename)
                        shutil.move(img_path, target_path)
                        # pbar.write(f"Image '{filename}' copied to txt_folder.")
                    else:
                        # pbar.write(f"Image '{filename}' remains in source folder.")
                        continue

                    # 更新进度条
                    pbar.update(1)

            print("Image classification completed.")

            print('okkk')

root_folder = f"D://code//data//Lv2期结论//{z}"
process_folder(root_folder)


# 获取当前时间
current_time = datetime.now()

# 打印当前时间
print("当前系统时间是:", current_time)
print("当前系统时间是:", current_time)
print("当前系统时间是:", current_time)




Step 1: 筛选价促卖点图


Processing images: 100%|██████████| 3/3 [00:01<00:00,  2.06it/s]


卖点图 classification completed.
完成时间: 2024-10-25 10:44:21.967672
Step 2: 筛选白底图


Moving images from D://code//data//Lv2期结论//男鞋_from_0501\12066\grounding_output: 100%|██████████| 3/3 [00:00<00:00,  3.93it/s]


完成时间: 2024-10-25 10:44:22.743692
Step 3: 筛选功能卖点图
Step 4: 归类图片到 scene
Image moving completed.
完成时间: 2024-10-25 10:44:23.433699


Processing images: 0it [00:00, ?it/s]


Image classification completed.
okkk
当前系统时间是: 2024-10-25 10:44:23.445910
当前系统时间是: 2024-10-25 10:44:23.445910
当前系统时间是: 2024-10-25 10:44:23.445910
Step 1: 筛选价促卖点图


Processing images: 100%|██████████| 1549/1549 [15:30<00:00,  1.66it/s]


卖点图 classification completed.
完成时间: 2024-10-25 10:59:54.293389
Step 2: 筛选白底图


Moving images from D://code//data//Lv2期结论//男鞋_from_0501\6908\grounding_output: 100%|██████████| 1087/1087 [04:32<00:00,  3.99it/s]


完成时间: 2024-10-25 11:04:26.425259
Step 3: 筛选功能卖点图
Step 4: 归类图片到 scene
Image moving completed.
完成时间: 2024-10-25 11:08:16.630374


Processing images:   4%|▍         | 11/257 [00:58<21:55,  5.35s/it]


Image classification completed.
okkk
当前系统时间是: 2024-10-25 11:09:15.468869
当前系统时间是: 2024-10-25 11:09:15.468869
当前系统时间是: 2024-10-25 11:09:15.468869
Step 1: 筛选价促卖点图


Processing images: 100%|██████████| 1540/1540 [28:48<00:00,  1.12s/it]


卖点图 classification completed.
完成时间: 2024-10-25 11:38:03.716722
Step 2: 筛选白底图


Moving images from D://code//data//Lv2期结论//男鞋_from_0501\6909\grounding_output: 100%|██████████| 1094/1094 [07:11<00:00,  2.54it/s]


完成时间: 2024-10-25 11:45:15.048260
Step 3: 筛选功能卖点图
Step 4: 归类图片到 scene
Image moving completed.
完成时间: 2024-10-25 11:53:35.780953


Processing images:   6%|▋         | 18/281 [02:26<35:43,  8.15s/it] 


Image classification completed.
okkk
当前系统时间是: 2024-10-25 11:56:02.506917
当前系统时间是: 2024-10-25 11:56:02.506917
当前系统时间是: 2024-10-25 11:56:02.506917
Step 1: 筛选价促卖点图


Processing images: 100%|██████████| 735/735 [14:11<00:00,  1.16s/it]


卖点图 classification completed.
完成时间: 2024-10-25 12:10:13.706556
Step 2: 筛选白底图


Moving images from D://code//data//Lv2期结论//男鞋_from_0501\6910\grounding_output: 100%|██████████| 672/672 [04:22<00:00,  2.56it/s]


完成时间: 2024-10-25 12:14:35.746579
Step 3: 筛选功能卖点图
Step 4: 归类图片到 scene
Image moving completed.
完成时间: 2024-10-25 12:20:36.395838


Processing images:   3%|▎         | 7/268 [02:40<1:40:00, 22.99s/it]


Image classification completed.
okkk
当前系统时间是: 2024-10-25 12:23:17.356484
当前系统时间是: 2024-10-25 12:23:17.356484
当前系统时间是: 2024-10-25 12:23:17.356484
Step 1: 筛选价促卖点图


Processing images: 100%|██████████| 894/894 [20:19<00:00,  1.36s/it]


卖点图 classification completed.
完成时间: 2024-10-25 12:43:36.926607
Step 2: 筛选白底图


Moving images from D://code//data//Lv2期结论//男鞋_from_0501\6911\grounding_output: 100%|██████████| 844/844 [05:29<00:00,  2.56it/s]


完成时间: 2024-10-25 12:49:06.722476
Step 3: 筛选功能卖点图
Step 4: 归类图片到 scene
Image moving completed.
完成时间: 2024-10-25 13:02:00.284146


Processing images:  19%|█▊        | 42/227 [03:16<14:27,  4.69s/it]


Image classification completed.
okkk
当前系统时间是: 2024-10-25 13:05:17.216087
当前系统时间是: 2024-10-25 13:05:17.216087
当前系统时间是: 2024-10-25 13:05:17.216087
Step 1: 筛选价促卖点图


Processing images: 100%|██████████| 1911/1911 [51:16<00:00,  1.61s/it]  


卖点图 classification completed.
完成时间: 2024-10-25 13:56:33.485396
Step 2: 筛选白底图


Moving images from D://code//data//Lv2期结论//男鞋_from_0501\6912\grounding_output: 100%|██████████| 1156/1156 [05:07<00:00,  3.76it/s]


完成时间: 2024-10-25 14:01:40.752562
Step 3: 筛选功能卖点图
Step 4: 归类图片到 scene
Image moving completed.
完成时间: 2024-10-25 14:07:44.823355


Processing images:   5%|▌         | 19/364 [02:24<43:52,  7.63s/it]  


Image classification completed.
okkk
当前系统时间是: 2024-10-25 14:10:09.812357
当前系统时间是: 2024-10-25 14:10:09.812357
当前系统时间是: 2024-10-25 14:10:09.812357
Step 1: 筛选价促卖点图


Processing images: 100%|██████████| 1080/1080 [29:13<00:00,  1.62s/it] 


卖点图 classification completed.
完成时间: 2024-10-25 14:39:23.615116
Step 2: 筛选白底图


Moving images from D://code//data//Lv2期结论//男鞋_from_0501\6913\grounding_output: 100%|██████████| 871/871 [03:57<00:00,  3.67it/s]


完成时间: 2024-10-25 14:43:20.774983
Step 3: 筛选功能卖点图
Step 4: 归类图片到 scene
Image moving completed.
完成时间: 2024-10-25 15:00:25.032667


Processing images:  15%|█▌        | 32/210 [04:04<22:37,  7.63s/it]


Image classification completed.
okkk
当前系统时间是: 2024-10-25 15:04:29.138892
当前系统时间是: 2024-10-25 15:04:29.138892
当前系统时间是: 2024-10-25 15:04:29.138892
Step 1: 筛选价促卖点图


Processing images: 100%|██████████| 7/7 [00:14<00:00,  2.11s/it]


卖点图 classification completed.
完成时间: 2024-10-25 15:04:43.964532
Step 2: 筛选白底图


Moving images from D://code//data//Lv2期结论//男鞋_from_0501\9783\grounding_output: 100%|██████████| 7/7 [00:01<00:00,  3.73it/s]


完成时间: 2024-10-25 15:04:45.905748
Step 3: 筛选功能卖点图
Step 4: 归类图片到 scene
Image moving completed.
完成时间: 2024-10-25 15:04:49.633090


Processing images:   0%|          | 0/2 [00:02<?, ?it/s]


Image classification completed.
okkk
当前系统时间是: 2024-10-25 15:04:52.229769
当前系统时间是: 2024-10-25 15:04:52.229769
当前系统时间是: 2024-10-25 15:04:52.229769


# step2.1 - 对四分类的数据总结
### 针对整体

In [13]:
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import openpyxl

# 所有的通用变量都放在这里,方便管理
x_list = ['6908','6909','6910','6911','6912','6913','9783','12066']
y_list = ['txt', 'price']
path = 'Lv2期结论'
z = '男鞋_from_0501'

# 定义基础路径和CSV文件路径
base_path = f'D://code//data//Lv2期结论//{z}'
csv_file_path = f'D://code//data//Lv2期结论//{z}//{z}.csv'

# 读取CSV文件
df = pd.read_csv(csv_file_path)

def extract_matching_part(img_url):
    if pd.isna(img_url):
        return None
    img_url = img_url.split('?')[0]
    img_url = os.path.splitext(img_url)[0]
    parts = img_url.split('/')
    if len(parts) >= 2:
        return f"{parts[-2]}_{parts[-1]}"
    return None

df['matching_part'] = df['img_url'].apply(extract_matching_part)

def process_grounding_folder(grounding_path):
    folder_stats = {
        'price': {'count': 0, 'uv': 0, 'click_uv': 0},
        'txt': {'count': 0, 'uv': 0, 'click_uv': 0},
        'scene': {'count': 0, 'uv': 0, 'click_uv': 0},
        'white': {'count': 0, 'uv': 0, 'click_uv': 0}
    }

    for folder_name in ['price', 'txt', 'scene', 'white']:
        folder_path = os.path.join(grounding_path, folder_name)
        if os.path.exists(folder_path):
            for filename in os.listdir(folder_path):
                if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
                    filename_without_ext = os.path.splitext(filename)[0]
                    filtered_df = df[df['matching_part'] == filename_without_ext]
                    if not filtered_df.empty:
                        folder_stats[folder_name]['count'] += 1
                        folder_stats[folder_name]['uv'] += filtered_df['uv'].sum()
                        folder_stats[folder_name]['click_uv'] += filtered_df['click_uv'].sum()

    # 计算每个文件夹的CTR
    for folder in folder_stats:
        if folder_stats[folder]['uv'] > 0:
            folder_stats[folder]['ctr'] = folder_stats[folder]['click_uv'] / folder_stats[folder]['uv']
        else:
            folder_stats[folder]['ctr'] = 0

    return folder_stats

all_stats = {}

for root, dirs, files in os.walk(base_path):
    if 'grounding_output' in dirs:
        grounding_path = os.path.join(root, 'grounding_output')
        folder_name = os.path.basename(root)
        stats = process_grounding_folder(grounding_path)
        all_stats[folder_name] = stats

# 创建一个Excel工作簿来保存结果
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.title = "Image Distribution Stats"

# 写入表头
headers = ["Folder", "Subfolder", "Count", "UV", "Click UV", "CTR", "Brand"]
for col, header in enumerate(headers, start=1):
    sheet.cell(row=1, column=col, value=header)

row = 2  # 从第二行开始写入数据

# 检查并创建url_1文件夹
output_folder = os.path.join(base_path, 'url_1')
os.makedirs(output_folder, exist_ok=True)

# 输出统计结果
for folder, stats in all_stats.items():
    print(f"\n统计结果 for {folder}:")
    for subfolder, data in stats.items():
        if data['count'] > 0:
            ctr = data['click_uv'] / data['uv'] if data['uv'] > 0 else 0
            print(f"  {subfolder}: 图片数量 = {data['count']}, UV = {data['uv']}, Click UV = {data['click_uv']}, CTR = {ctr:.4f}")

# 定义颜色映射
color_map = {
    'price': '#FF6B6B',  # 柔和的红色
    'txt': '#4ECDC4',    # 青绿色
    'scene': '#7986CB',  # 淡紫色
    'white': '#FFD93D'   # 明亮的黄色
}

# 绘制每个grounding_output文件夹的饼图并保存数据
for folder, stats in all_stats.items():
    counts = [data['count'] for data in stats.values() if data['count'] > 0]
    labels = [subfolder for subfolder, data in stats.items() if data['count'] > 0]
    ctrs = [data['ctr'] for data in stats.values() if data['count'] > 0]
    colors = [color_map.get(label, 'gray') for label in labels]  # 使用颜色映射，如果没有指定则默认为灰色
    
    if counts:  # 只有当有数据时才绘图
        plt.figure(figsize=(12, 9))
        wedges, texts, autotexts = plt.pie(counts, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors)
        
        # 添加数量和CTR标签
        for i, (autotext, ctr) in enumerate(zip(autotexts, ctrs)):
            autotext.set_text(f'{autotext.get_text()}\n({counts[i]})\nCTR: {ctr:.4f}')
        
        plt.title(f"Image Distribution in {folder}")
        plt.axis('equal')

        # 添加图例
        plt.legend(wedges, [f"{label} ({count}, CTR: {ctr:.4f})" for label, count, ctr in zip(labels, counts, ctrs)],
                    title="Categories", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
        
        output_chart_path = os.path.join(output_folder, f'{folder}_brand_all_url_1_chart.png')
        plt.savefig(output_chart_path, bbox_inches='tight')
        plt.close()
        print(f"饼图已保存至: {output_chart_path}")

        # 写入数据到Excel, 添加品牌列 "brand" 的值为 "all"
        for subfolder, data in stats.items():
            if data['count'] > 0:
                sheet.cell(row=row, column=1, value=folder)
                sheet.cell(row=row, column=2, value=subfolder)
                sheet.cell(row=row, column=3, value=data['count'])
                sheet.cell(row=row, column=4, value=data['uv'])
                sheet.cell(row=row, column=5, value=data['click_uv'])
                sheet.cell(row=row, column=6, value=data['ctr'])
                sheet.cell(row=row, column=7, value="all")  # 添加品牌列，值为 "all"
                row += 1

# 保存Excel文件
excel_output_path = os.path.join(output_folder, f'brand_all_url_1_chart.xlsx')
workbook.save(excel_output_path)
print(f"统计结果已保存至Excel文件: {excel_output_path}")

import datetime
current_time = datetime.datetime.now()
formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
print(f"完成时间: {formatted_time}")
print(f"完成时间: {formatted_time}")
print(f"完成时间: {formatted_time}")





统计结果 for 12066:
  scene: 图片数量 = 3, UV = 119, Click UV = 9, CTR = 0.0756

统计结果 for 6908:
  price: 图片数量 = 462, UV = 1997399, Click UV = 75119, CTR = 0.0376
  txt: 图片数量 = 120, UV = 2777440, Click UV = 90451, CTR = 0.0326
  scene: 图片数量 = 721, UV = 14113428, Click UV = 461185, CTR = 0.0327
  white: 图片数量 = 246, UV = 2132073, Click UV = 76381, CTR = 0.0358

统计结果 for 6909:
  price: 图片数量 = 446, UV = 787151, Click UV = 28230, CTR = 0.0359
  txt: 图片数量 = 275, UV = 2535197, Click UV = 100304, CTR = 0.0396
  scene: 图片数量 = 556, UV = 6398186, Click UV = 275823, CTR = 0.0431
  white: 图片数量 = 263, UV = 1562568, Click UV = 60597, CTR = 0.0388

统计结果 for 6910:
  price: 图片数量 = 63, UV = 50103, Click UV = 2934, CTR = 0.0586
  txt: 图片数量 = 169, UV = 844799, Click UV = 27781, CTR = 0.0329
  scene: 图片数量 = 242, UV = 1738848, Click UV = 74195, CTR = 0.0427
  white: 图片数量 = 261, UV = 497952, Click UV = 20120, CTR = 0.0404

统计结果 for 6911:
  price: 图片数量 = 50, UV = 1135608, Click UV = 54777, CTR = 0.0482
  txt: 图片数量 = 2

# step2.2 - 对四分类的数据总结
### 针对brand进行了分成的处理

In [14]:
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import openpyxl

# 定义基础路径和CSV文件路径
# z = '男士春夏下装_from_0501'
base_path = f'D://code//data//Lv2期结论//{z}'
csv_file_path = f'D://code//data//Lv2期结论//{z}//{z}.csv'
brand_path = f'D://code//data//Lv2期结论//{z}//男鞋品牌分层.xlsx'

# filter_layer_cases = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
filter_layer_cases = [[1.0, 2.0],[3.0, 4.0], [5.0, 6.0]]


for filter_layers in filter_layer_cases:
    # 格式化 filter_layers
    filter_layers_str = "_".join(map(str, filter_layers))

    # 读取CSV文件和品牌分类文件
    df = pd.read_csv(csv_file_path)
    df_brand = pd.read_excel(brand_path)

    # 合并品牌信息
    df = pd.merge(df, df_brand, on='main_brand_code', how='left')

    # 筛选数据
    filtered_df = df[df['最终分层'].isin(filter_layers)]

    def extract_matching_part(img_url):
        if pd.isna(img_url):
            return None
        img_url = img_url.split('?')[0]
        img_url = os.path.splitext(img_url)[0]
        parts = img_url.split('/')
        if len(parts) >= 2:
            return f"{parts[-2]}_{parts[-1]}"
        return None

    filtered_df['matching_part'] = filtered_df['img_url'].apply(extract_matching_part)

    def process_grounding_folder(grounding_path):
        folder_stats = {
            'price': {'count': 0, 'uv': 0, 'click_uv': 0},
            'txt': {'count': 0, 'uv': 0, 'click_uv': 0},
            'scene': {'count': 0, 'uv': 0, 'click_uv': 0},
            'white': {'count': 0, 'uv': 0, 'click_uv': 0}
        }

        for folder_name in ['price', 'txt', 'scene', 'white']:
            folder_path = os.path.join(grounding_path, folder_name)
            if os.path.exists(folder_path):
                for filename in os.listdir(folder_path):
                    if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
                        filename_without_ext = os.path.splitext(filename)[0]
                        folder_filtered_df = filtered_df[filtered_df['matching_part'] == filename_without_ext]
                        if not folder_filtered_df.empty:
                            folder_stats[folder_name]['count'] += 1
                            folder_stats[folder_name]['uv'] += folder_filtered_df['uv'].sum()
                            folder_stats[folder_name]['click_uv'] += folder_filtered_df['click_uv'].sum()

        # 计算每个文件夹的CTR
        for folder in folder_stats:
            if folder_stats[folder]['uv'] > 0:
                folder_stats[folder]['ctr'] = folder_stats[folder]['click_uv'] / folder_stats[folder]['uv']
            else:
                folder_stats[folder]['ctr'] = 0

        return folder_stats

    all_stats = {}

    for root, dirs, files in os.walk(base_path):
        if 'grounding_output' in dirs:
            grounding_path = os.path.join(root, 'grounding_output')
            folder_name = os.path.basename(root)
            stats = process_grounding_folder(grounding_path)
            all_stats[folder_name] = stats

    # 定义颜色映射
    color_map = {
        'price': '#FF6B6B',  # 柔和的红色
        'txt': '#4ECDC4',    # 青绿色
        'scene': '#7986CB',  # 淡紫色
        'white': '#FFD93D'   # 明亮的黄色
    }

    # 绘制饼图
    for folder, stats in all_stats.items():
        counts = [data['count'] for data in stats.values() if data['count'] > 0]
        labels = [subfolder for subfolder, data in stats.items() if data['count'] > 0]
        ctrs = [data['ctr'] for data in stats.values() if data['count'] > 0]
        colors = [color_map.get(label, 'gray') for label in labels]  # 使用颜色映射，如果没有指定则默认为灰色
        
        if counts:  # 只有当有数据时才绘图
            plt.figure(figsize=(12, 9))
            wedges, texts, autotexts = plt.pie(counts, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors)
            
            # 添加数量和CTR标签
            for i, (autotext, ctr) in enumerate(zip(autotexts, ctrs)):
                autotext.set_text(f'{autotext.get_text()}\n({counts[i]})\nCTR: {ctr:.4f}')
            
            plt.title(f"Image Distribution in {folder} ({filter_layers_str})")
            plt.axis('equal')
            
            # 添加图例
            plt.legend(wedges, [f"{label} ({count}, CTR: {ctr:.4f})" for label, count, ctr in zip(labels, counts, ctrs)],
                       title="Categories", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
            
            output_chart_path = os.path.join(base_path, f'url_1//{folder}_brand_{filter_layers_str}_url_1_chart.png')
            plt.savefig(output_chart_path, bbox_inches='tight')
            plt.close()
            print(f"饼图已保存至: {output_chart_path}")

    # 创建一个Excel工作簿来保存结果
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = f"Stats ({filter_layers_str})"

    # 写入表头
    headers = ["Folder", "Subfolder", "Count", "UV", "Click UV", "CTR", "Brand"]
    for col, header in enumerate(headers, start=1):
        sheet.cell(row=1, column=col, value=header)

    row = 2  # 从第二行开始写入数据

    # 写入统计数据
    for folder, stats in all_stats.items():
        for subfolder, data in stats.items():
            if data['count'] > 0:
                sheet.cell(row=row, column=1, value=folder)
                sheet.cell(row=row, column=2, value=subfolder)
                sheet.cell(row=row, column=3, value=data['count'])
                sheet.cell(row=row, column=4, value=data['uv'])
                sheet.cell(row=row, column=5, value=data['click_uv'])
                sheet.cell(row=row, column=6, value=data['ctr'])
                sheet.cell(row=row, column=7, value=filter_layers_str)  # 添加品牌列，值为格式化后的筛选条件
                row += 1

    # 保存Excel文件
    output_folder = os.path.join(base_path)
    os.makedirs(output_folder, exist_ok=True)
    excel_output_path = os.path.join(output_folder, f'url_1//brand_{filter_layers_str}_url_1_chart.xlsx')
    workbook.save(excel_output_path)
    print(f"统计结果已保存至Excel文件: {excel_output_path}")

import datetime
current_time = datetime.datetime.now()
formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
print(f"完成时间: {formatted_time}")
print(f"完成时间: {formatted_time}")
print(f"完成时间: {formatted_time}")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['matching_part'] = filtered_df['img_url'].apply(extract_matching_part)


饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6908_brand_1.0_2.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6909_brand_1.0_2.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6910_brand_1.0_2.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6911_brand_1.0_2.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6912_brand_1.0_2.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6913_brand_1.0_2.0_url_1_chart.png
统计结果已保存至Excel文件: D://code//data//Lv2期结论//男鞋_from_0501\url_1//brand_1.0_2.0_url_1_chart.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['matching_part'] = filtered_df['img_url'].apply(extract_matching_part)


饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6908_brand_3.0_4.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6909_brand_3.0_4.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6910_brand_3.0_4.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6911_brand_3.0_4.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6912_brand_3.0_4.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6913_brand_3.0_4.0_url_1_chart.png
统计结果已保存至Excel文件: D://code//data//Lv2期结论//男鞋_from_0501\url_1//brand_3.0_4.0_url_1_chart.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['matching_part'] = filtered_df['img_url'].apply(extract_matching_part)


饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//12066_brand_5.0_6.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6908_brand_5.0_6.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6909_brand_5.0_6.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6910_brand_5.0_6.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6911_brand_5.0_6.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6912_brand_5.0_6.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//6913_brand_5.0_6.0_url_1_chart.png
饼图已保存至: D://code//data//Lv2期结论//男鞋_from_0501\url_1//9783_brand_5.0_6.0_url_1_chart.png
统计结果已保存至Excel文件: D://code//data//Lv2期结论//男鞋_from_0501\url_1//brand_5.0_6.0_url_1_chart.xlsx


## 将分类结果统计为excel文件

In [10]:
import pandas as pd

# 指定要拼接的Excel文件路径列表
excel_file_paths = [
    f"D://code//data//Lv2期结论//{z}//url_1//brand_all_url_1_chart.xlsx",
    f"D://code//data//Lv2期结论//{z}//url_1//brand_1.0_2.0_3.0_url_1_chart.xlsx",
    f"D://code//data//Lv2期结论//{z}//url_1//brand_1.0_2.0_3.0_url_1_chart.xlsx",
    f"D://code//data//Lv2期结论//{z}//url_1//brand_4.0_5.0_6.0_url_1_chart.xlsx"
]

# 用于存储读取的每个Excel文件的数据框
dataframes = []

# 逐个读取指定的Excel文件并添加到dataframes列表中
for file_path in excel_file_paths:
    df = pd.read_excel(file_path)
    dataframes.append(df)

# 将所有数据框上下拼接在一起
merged_df = pd.concat(dataframes, axis=0, ignore_index=True)

# 可以根据需要将拼接后的结果保存为新的Excel文件
merged_df.to_excel(f"D://code//data//Lv2期结论//{z}//url_1//all_url_1_chart.xlsx", index=False)





In [None]:
# import os
# import pandas as pd

# # 存储图片信息的列表
# image_info = []


# # x = '京喜_from_0501'


# # 一级文件夹路径
# root_folder = f'D://code//data//Lv2期结论//{x}//筛选'



# # 遍历一级文件夹下的所有二级文件夹
# for sub_folder in os.listdir(root_folder):
#     if sub_folder.isdigit():  # 只处理数字命名的二级文件夹
#         grounding_folder = os.path.join(root_folder, sub_folder, 'grounding_output')
#         if os.path.exists(grounding_folder):
#             for sub_sub_folder in ['price', 'txt', 'white', 'scene']:
#                 sub_sub_folder_path = os.path.join(grounding_folder, sub_sub_folder)
#                 if os.path.exists(sub_sub_folder_path):
#                     for image_file in os.listdir(sub_sub_folder_path):
#                         if "txt_" in image_file:
#                             image_file = image_file.replace("txt_", "")
#                         elif "price_" in image_file:
#                             image_file = image_file.replace("price_", "")
#                         image_info.append([image_file, sub_sub_folder, sub_folder])

# # 创建DataFrame并保存为Excel
# df = pd.DataFrame(image_info, columns=['图片名', '分类', 'cid3'])
# # df.to_excel('D://code//data//Lv2期结论//京喜_from_0501//筛选//分类数据image_info.xlsx', index=False)

# # 读取Excel文件
# # data = pd.read_csv('D://code//data//Lv2期结论//京喜_from_0501//京喜数据_from_0501_筛选.csv')
# # df2 = pd.read_csv('D://code//data//Lv2期结论//京喜_from_0501//京喜数据_from_0501_筛选.csv')

# df2 = pd.read_csv(csv_file_path)

# # 按照img_url列进行聚合，并对指定列进行求和
# aggregated_data = df2.groupby('img_url').agg({
#     'uv':'sum',
#     'click_uv':'sum',
#     'gmv_cj':'sum',
#     'sale_qtty_cj':'sum'
# }).reset_index()

# # 将原始数据中与聚合相关的列合并到聚合后的数据中
# aggregated_data = pd.merge(aggregated_data, df2[['sku', 'img_url','img_type', 'bu_id', 'cid1', 'cid2', 'cid3', 'main_brand_code','shop_id']], on='img_url', how='left')

# # 保存为新的Excel文件
# # aggregated_data.to_excel('D://code//data//Lv2期结论//京喜_from_0501//url加总原始数据.xlsx', index=False)
# # csv_file_path = 'D://code//data//Lv2期结论//京喜_from_0501//url加总原始数据.xlsx'
# # df = pd.read_excel(csv_file_path)

# def extract_matching_part(img_url):
#     if pd.isna(img_url):
#         return None
#     img_url = img_url.split('?')[0]
#     img_url = os.path.splitext(img_url)[0]
#     parts = img_url.split('/')
#     if len(parts) >= 2:
#         return f"{parts[-2]}_{parts[-1]}"
#     return None

# aggregated_data['matching_part'] = aggregated_data['img_url'].apply(extract_matching_part)

# # df_x = pd.read_excel('D://code//data//Lv2期结论//京喜_from_0501//筛选//分类数据image_info.xlsx')

# # 确保列数据类型匹配（如果需要）
# aggregated_data['matching_part'] = aggregated_data['matching_part'].astype(str)
# df['图片名'] = df['图片名'].apply(lambda x: str(x).replace('.jpg', '') if isinstance(x, (str, bytes)) else x)

# # 根据条件进行拼接
# merged_df = pd.merge(aggregated_data, df, left_on='matching_part', right_on='图片名', how='right')
# merged_df.to_excel(f'D://code//data//Lv2期结论//{x}//sku分类数据表.xlsx', index=False)


