# step1 - 下载图片
这里通过修改文件保存名,改为了两段式,主要是为了避免一段式命名时出现的大量重复问题

filename_parts = url.split('/')

filename = f"{filename_parts[-2]}_{filename_parts[-1]}"

In [6]:
import os
import pandas as pd
import requests
from tqdm import tqdm

def download_images(df, img_url_column, cid3_column, save_folder):

    # 获取链接列数据
    urls = df[img_url_column]

    total_images = len(df)  # 总图片数量

    # 遍历每个链接并下载内容
    with tqdm(total=total_images, desc='Downloading images', unit='image') as pbar:
        for url, cid3 in zip(urls, df[cid3_column]):
            # 在每个链接前面拼接指定的字符串
            modified_url = 'https://img20.360buyimg.com/ling/' + url
            response = requests.get(modified_url)
            if response.status_code == 200:

                # 提取文件名,这里保存的是二段式的文件名
                filename_parts = url.split('/')
                filename = f"{filename_parts[-2]}_{filename_parts[-1]}"

                # 按照 cid3 创建保存图片的文件夹（如果不存在）
                folder_path = os.path.join(save_folder, str(cid3))
                if not os.path.exists(folder_path):
                    os.makedirs(folder_path)
                    
                # 保存文件
                save_path = os.path.join(folder_path, filename)
                with open(save_path, 'wb') as f:
                    f.write(response.content)
            else:
                # print(f'{url} 下载失败')
                continue

            pbar.update(1)  # 更新进度条

    print('下载完成')

# 读取 CSV 文件
df = pd.read_csv('D://code//data//howtodo_Lv2//京喜_from_0501//京喜数据_from_0501_筛选.csv')

# 按照img_url字段进行聚合
df = df.drop_duplicates(subset=['img_url'])
# df.shape

# 在京喜项目中会出现大量的重复sku,所以这里再针对sku进行一次去重
df = df.drop_duplicates(subset=['sku'])
df.shape

# 调用函数进行图片下载
download_images(df, 'img_url', 'cid3', 'D://code//data//howtodo_Lv2//京喜_from_0501')




Downloading images:  89%|████████▉ | 3352/3753 [03:11<00:22, 17.48image/s]  

下载完成





# step2 - 按照新分类聚合
在现在的流程中,对cid3进行了聚合,避免太多分散的cid3,在聚合后会删除原来的cid3文件夹

In [None]:
import pandas as pd
import os
import shutil
from tqdm import tqdm  # 导入tqdm库，用于显示进度条

# 读取CSV文件
df = pd.read_csv('D://code//data//howtodo_from_0401//服饰鞋靴箱包//品类聚类-服饰鞋靴箱包.csv')

# 处理缺失值，将缺失值填充为0
df['三级类目ID'] = df['三级类目ID'].fillna(0)

# 将浮点数转换为整数
df['三级类目ID'] = df['三级类目ID'].astype(int)

# 处理合并单元格，使用ffill()方法填充合并单元格中的值
df['聚类名称'] = df['聚类名称'].fillna(method='ffill')

# 创建目标文件夹
target_folder = 'D://code//data//howtodo_from_0401//服饰鞋靴箱包//'
os.makedirs(target_folder, exist_ok=True)

# 定义图片文件的扩展名列表
image_extensions = ['.jpg', '.jpeg', '.png', '.gif']

# 遍历每个分组
for group_name, group_df in tqdm(df.groupby('聚类名称'), desc="Processing groups"):
    
    # 创建以A列值命名的文件夹
    new_folder_path = os.path.join(target_folder, str(group_name))
    os.makedirs(new_folder_path, exist_ok=True)
    
    # 遍历B列中的文件夹名
    for folder_name in tqdm(group_df['三级类目ID'], desc="Processing folders", leave=False):
        
        # 将文件夹名转换为字符串类型
        folder_name_str = str(folder_name)
        
        # 拼接文件夹的完整路径
        folder_path = os.path.join(target_folder, folder_name_str)
        
        # 检查文件夹是否存在
        if not os.path.exists(folder_path):
            print(f"Folder '{folder_path}' does not exist. Skipping...")
            continue
        
        # 遍历文件夹中的所有文件
        for file_name in os.listdir(folder_path):
            
            # 拼接文件的完整路径
            file_path = os.path.join(folder_path, file_name)
            
            # 判断文件是否为图片文件
            if any(file_name.lower().endswith(ext) for ext in image_extensions):
                # 拼接目标文件的完整路径
                target_file_path = os.path.join(new_folder_path, file_name)
                
                # 如果目标文件已经存在，则跳过移动该文件
                if os.path.exists(target_file_path):
                    print(f"File '{target_file_path}' already exists. Skipping...")
                else:
                    # 确保目标文件夹存在
                    os.makedirs(new_folder_path, exist_ok=True)
                    
                    # 移动图片文件到新建的文件夹中
                    shutil.move(file_path, new_folder_path)

  

'''这里会删除所有为数字的文件夹'''
'''这里会删除所有为数字的文件夹'''
'''这里会删除所有为数字的文件夹'''


# 获取目标文件夹中的所有子文件夹
subfolders = [f.path for f in os.scandir(target_folder) if f.is_dir()]

# 遍历每个子文件夹
for folder in subfolders:
    # 获取子文件夹名称
    folder_name = os.path.basename(folder)
    
    # 检查子文件夹名称是否为数字
    if folder_name.isdigit():
        # 强制删除子文件夹及其内容
        shutil.rmtree(folder)
        



# step3 - 分拣包含关键词的图片
这里会先对图片进行ocr,然后将包含"京喜"关键词图片单独筛选出来


In [7]:
import os
import shutil
from paddleocr import PaddleOCR
from tqdm import tqdm

ocr = PaddleOCR(use_angle_cls=True, lang="ch", show_log=False)


def ocr_image(image_path, ocr, pbar):
    result = ocr.ocr(image_path, cls=True)

    if not result:
        pbar.write(f"Image '{os.path.basename(image_path)}' skipped due to empty OCR result.")
        pbar.update(1)
        return ""

    data_list = result[0]

    # 检查识别结果是否有关键词
    contains_keyword = False

    if data_list:
        keywords = ['京喜自营']

        # 遍历识别结果中的文本
        for data in data_list:
            text = data[1][0]  # 获取文本内容
            # 检查当前文本是否包含关键词
            if any(keyword.lower() in text.lower() for keyword in keywords):
                contains_keyword = True
                break

    if not contains_keyword:
        # pbar.write(f"Image '{os.path.basename(image_path)}' skipped due to no matching keyword.")
        pbar.update(1)
        return ""

    # 返回识别结果
    return " ".join([data[1][0] for data in data_list])


def process_folder(source_folder, destination_folder, keywords):
    # 获取所有图片文件
    image_files = []
    for root, dirs, files in os.walk(source_folder):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
                image_files.append((root, file))

    # 使用tqdm创建进度条
    with tqdm(total=len(image_files), desc="Processing images") as pbar:
        for root, file in image_files:
            file_path = os.path.join(root, file)

            try:
                text = ocr_image(file_path, ocr, pbar)
                if not text:
                    continue

                # print(text)

                if any(keyword.lower() in text.lower() for keyword in keywords):
                    # 创建目标子文件夹
                    relative_path = os.path.relpath(root, source_folder)
                    new_folder = os.path.join(destination_folder, relative_path)
                    os.makedirs(new_folder, exist_ok=True)

                    # 移动文件
                    shutil.move(file_path, os.path.join(new_folder, file))
                    # print(f"Moved {file} to {new_folder}")

            except Exception as e:
                # print(f"Error processing {file}: {str(e)}")
                continue

            pbar.update(1)

# 使用示例
source_folder = "D://code//data//howtodo_Lv2//京喜_from_0501"
destination_folder = "D://code//data//howtodo_Lv2//京喜_from_0501//筛选"
keywords = ["京喜"]

process_folder(source_folder, destination_folder, keywords)




Processing images: 100%|██████████| 3352/3352 [34:37<00:00,  1.61it/s]


# step4 - 蓝色贴片筛选

In [2]:
# 方法是通过计算蓝色贴片的像素占比来实现的
# 因为贴片的面积大小基本固定,影响占比的只是里面的文字多少,所以它会比较稳定在一个值的范围,是一个可行的研判方法



import os
from PIL import Image
import numpy as np
import pandas as pd
from tqdm import tqdm

def calculate_blue_ratio(image_path, rgb_lower, rgb_upper):
    try:
        with Image.open(image_path) as img:
            img = img.convert('RGB')
            img_array = np.array(img)

        mask = np.all((img_array >= rgb_lower) & (img_array <= rgb_upper), axis=-1)
        blue_pixels = np.sum(mask)
        total_pixels = mask.size
        blue_ratio = blue_pixels / total_pixels

        return blue_ratio

    except Exception as e:
        print(f"无法处理图像 {image_path}: {str(e)}")
        return None

def process_folder(folder_path, rgb_lower, rgb_upper):
    results = []
    
    for subfolder in os.listdir(folder_path):
        subfolder_path = os.path.join(folder_path, subfolder)
        if os.path.isdir(subfolder_path):
            for file in tqdm(os.listdir(subfolder_path), desc=f"Processing {subfolder}"):
                if file.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
                    image_path = os.path.join(subfolder_path, file)
                    blue_ratio = calculate_blue_ratio(image_path, rgb_lower, rgb_upper)
                    if blue_ratio is not None:
                        results.append((subfolder, file, blue_ratio))
    
    return results

def save_to_excel(results, output_file):
    df = pd.DataFrame(results, columns=['cid3', 'Image', 'Blue Ratio'])
    df.to_excel(output_file, index=False)

# 主程序
folder_path = 'd://code//data//Lv2期结论//京喜_from_0501//筛选'  # 替换为您的文件夹路径
output_file = 'd://code//data//Lv2期结论//京喜_from_0501//筛选//blue_ratio_results.xlsx'

# 定义蓝色范围（RGB值）
rgb_lower = np.array([0, 130, 250])  # 较暗的蓝色
rgb_upper = np.array([0, 140, 255])  # 较亮的蓝色

results = process_folder(folder_path, rgb_lower, rgb_upper)
save_to_excel(results, output_file)

print(f"Results have been saved to {output_file}")



Processing 1047:   0%|          | 0/51 [00:00<?, ?it/s]

Processing 1047: 100%|██████████| 51/51 [00:00<00:00, 51.23it/s]
Processing 12010: 100%|██████████| 86/86 [00:01<00:00, 49.97it/s]
Processing 12811: 100%|██████████| 45/45 [00:00<00:00, 61.50it/s]
Processing 1349: 100%|██████████| 265/265 [00:04<00:00, 59.74it/s]
Processing 1355: 100%|██████████| 305/305 [00:06<00:00, 47.54it/s]
Processing 13661: 100%|██████████| 75/75 [00:01<00:00, 57.06it/s]
Processing 1476: 100%|██████████| 21/21 [00:00<00:00, 64.00it/s]
Processing 15908: 100%|██████████| 193/193 [00:03<00:00, 59.04it/s]
Processing 1656: 100%|██████████| 76/76 [00:01<00:00, 59.73it/s]
Processing 1657: 100%|██████████| 148/148 [00:02<00:00, 58.59it/s]
Processing 16777: 100%|██████████| 200/200 [00:03<00:00, 63.70it/s]
Processing 34919: 100%|██████████| 92/92 [00:01<00:00, 56.83it/s]
Processing 35404: 100%|██████████| 173/173 [00:02<00:00, 61.74it/s]
Processing 6191: 100%|██████████| 39/39 [00:00<00:00, 56.35it/s]
Processing 6221: 100%|██████████| 87/87 [00:01<00:00, 61.53it/s]
Proces

Results have been saved to d://code//data//Lv2期结论//京喜_from_0501//筛选//blue_ratio_results.xlsx


In [4]:
import pandas as pd

# 读取第一个 Excel 文件
blue_ratio_results = pd.read_excel('d://code//data//Lv2期结论//京喜_from_0501//筛选//blue_ratio_results.xlsx')

# 读取第二个 Excel 文件
merged_info_ctr = pd.read_excel('d://code//data//Lv2期结论//京喜_from_0501//筛选//merged_info_ctr.xlsx')

# 对 merged_info_ctr.xlsx 表中的 Image Name 列的所有值的结尾添加.jpg
merged_info_ctr['Image Name'] = merged_info_ctr['Image Name'] + '.jpg'

# 创建一个空列表来存储匹配的 Blue Ratio 值
matched_blue_ratios = []

# 遍历 blue_ratio_results.xlsx 表中的每一行
for index, row in blue_ratio_results.iterrows():
    # 检查 Image Name 列的值是否在 merged_info_ctr.xlsx 表中
    if row['Image'] in merged_info_ctr['Image Name'].values:

        # 如果存在匹配项，找到对应的行并将 Blue Ratio 值添加到该行的后面
        matched_row_index = merged_info_ctr[merged_info_ctr['Image Name'] == row['Image']].index[0]
        merged_info_ctr.loc[matched_row_index, 'Blue Ratio'] = row['Blue Ratio']

# 输出结果
print(merged_info_ctr)


                                   Image Name  \
0      667e8c77Fea571c39_828c9e4956e6f12f.jpg   
1      667e8c77Fea571c39_828c9e4956e6f12f.jpg   
2      667e8c77Fea571c39_828c9e4956e6f12f.jpg   
3      667e8c77Fea571c39_828c9e4956e6f12f.jpg   
4      667e8c77Fea571c39_828c9e4956e6f12f.jpg   
...                                       ...   
16266  6704a50cF06ce0cfc_43c2c54e7e428b2d.jpg   
16267  6704a50cF06ce0cfc_43c2c54e7e428b2d.jpg   
16268  6704a50cF06ce0cfc_43c2c54e7e428b2d.jpg   
16269  6705813dF8615d213_41dd92e9b4972afc.jpg   
16270  6705813dF8615d213_41dd92e9b4972afc.jpg   

                               File Name1 Style  txt_x1  txt_y1  txt_x2  \
0      667e8c77Fea571c39_828c9e4956e6f12f   txt    41.0   133.0   206.0   
1      667e8c77Fea571c39_828c9e4956e6f12f   txt    23.0   517.0   600.0   
2      667e8c77Fea571c39_828c9e4956e6f12f   txt    73.0   133.0   173.0   
3      667e8c77Fea571c39_828c9e4956e6f12f   txt    71.0   189.0   174.0   
4      667e8c77Fea571c39_828c9e4956e

In [84]:
import pandas as pd

# 读取第一个 Excel 文件
blue_ratio_results = pd.read_excel('d://code//data//Lv2期结论//京喜_from_0501//筛选//blue_ratio_results.xlsx')

# 读取第二个 Excel 文件
merged_info_ctr = pd.read_excel('d://code//data//Lv2期结论//京喜_from_0501//筛选//merged_info_ctr.xlsx')




In [85]:
merged_info_ctr = merged_info_ctr.drop_duplicates(subset=['Image Name'])
# 去掉 Image 列中每个值结尾的.xxx
blue_ratio_results['Image Name'] = blue_ratio_results['Image Name'].str.split('.').str[0]

In [87]:
# 选择要拼接的列
selected_cols = ['Image Name','Style','Subfolder','uv', 'click_uv', 'gmv_cj', 'sale_qtty_cj', 'ctr']
selected_cols_df = merged_info_ctr[selected_cols].copy()
selected_cols_df.head()
selected_cols_df.shape

(1852, 8)

In [88]:
blue_ratio_results

Unnamed: 0,cid3,Image Name,Blue Ratio
0,1047,667e8c77Fea571c39_828c9e4956e6f12f,0.050491
1,1047,667e8d8cFd1e12884_b77cc342352a7b11,0.060959
2,1047,66877b7dFb066d343_2da59a4b5837ac83,0.000000
3,1047,66877b80Fda4a35ae_991a57f13a39c1b3,0.000000
4,1047,66877caeFf14d8e7e_c0a0ece89a85deba,0.052869
...,...,...,...
2130,饼图数据,753_distribution,0.000000
2131,饼图数据,760_distribution,0.000000
2132,饼图数据,9435_distribution,0.000000
2133,饼图数据,9775_distribution,0.000000


In [89]:
selected_cols_df

Unnamed: 0,Image Name,Style,Subfolder,uv,click_uv,gmv_cj,sale_qtty_cj,ctr
0,667e8c77Fea571c39_828c9e4956e6f12f,txt,1047.0,1229,32,50.96,5,0.026037
9,667e8d8cFd1e12884_b77cc342352a7b11,txt,1047.0,961,13,8.80,2,0.013528
23,66877caeFf14d8e7e_c0a0ece89a85deba,txt,1047.0,2445,89,75.40,6,0.036401
32,66877cafFc20262c7_c62ca9bf5e70fba6,txt,1047.0,4431,145,125.84,11,0.032724
41,66877cb5F6bf88754_29dd99cc0caf06eb,txt,1047.0,18,2,0.00,0,0.111111
...,...,...,...,...,...,...,...,...
16257,66fc129dFcb1bf0fe_23c21f6889982fb2,9775,,98,2,0.00,0,0.020408
16259,66fc18d1Ff2654e34_aa2f561bc5484e9f,9775,,113,6,0.00,0,0.053097
16264,6704a0b9F9fac46cf_a7a56937f89abcb9,9775,,128,4,0.00,0,0.031250
16266,6704a50cF06ce0cfc_43c2c54e7e428b2d,9775,,96,2,0.00,0,0.020833


In [90]:
merged_table = pd.merge(blue_ratio_results, selected_cols_df, left_on='Image Name', right_on='Image Name', how='left')

In [91]:
merged_table.head()

Unnamed: 0,cid3,Image Name,Blue Ratio,Style,Subfolder,uv,click_uv,gmv_cj,sale_qtty_cj,ctr
0,1047,667e8c77Fea571c39_828c9e4956e6f12f,0.050491,txt,1047.0,1229.0,32.0,50.96,5.0,0.026037
1,1047,667e8d8cFd1e12884_b77cc342352a7b11,0.060959,txt,1047.0,961.0,13.0,8.8,2.0,0.013528
2,1047,66877b7dFb066d343_2da59a4b5837ac83,0.0,1047,,88.0,7.0,21.9,1.0,0.079545
3,1047,66877b80Fda4a35ae_991a57f13a39c1b3,0.0,txt,1047.0,2513.0,39.0,19.9,1.0,0.015519
4,1047,66877caeFf14d8e7e_c0a0ece89a85deba,0.052869,txt,1047.0,2445.0,89.0,75.4,6.0,0.036401


In [92]:
# 保存修改后的表格为新的 Excel 文件
merged_table.to_excel('d://code//data//Lv2期结论//京喜_from_0501//筛选//m123.xlsx', index=False)

# 输出结果
print("保存成功！")



保存成功！
