# 获取cos中的所有文件

In [59]:
from qcloud_cos import CosConfig
from qcloud_cos import CosS3Client
import sys
import os
import logging
import re
from urllib.parse import unquote

In [60]:
# Initialise the COS client.

# 1. Connection settings
scheme = 'https'
token = None
region = 'ap-nanjing'
secret_id = 'XXXXX'
secret_key = 'XXXXX'

# Only errors are logged to stdout here. Per the original note, INFO is the
# usual level and DEBUG can be used while troubleshooting, at which point the
# SDK is said to print its communication with the server.
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

# 2. Build the configuration object and the client from the settings above.
config = CosConfig(
    Region=region,
    SecretId=secret_id,
    SecretKey=secret_key,
    Token=token,
    Scheme=scheme,
)
client = CosS3Client(config)

In [61]:
# Page through the bucket listing and collect the key of every object found
# under the 'uPic/' prefix.
pics = []
marker = ""
while True:
    response = client.list_objects(
        Bucket='这里填bucket的名字',
        Prefix='uPic/',
        Marker=marker,
        MaxKeys=10,
    )
    # A page may come back without a 'Contents' entry; skip it in that case.
    if 'Contents' in response:
        pics.extend(entry['Key'] for entry in response['Contents'])
    if response['IsTruncated'] != 'false':
        # The listing is not finished: continue from the marker of this page.
        marker = response['NextMarker']
        continue
    break

In [62]:
# Drop folder placeholders (keys ending in '/') so that only real files remain.
pics = list(filter(lambda key: not key.endswith('/'), pics))

## 读取本地文件中的图片

In [63]:
# Root directory of the local notes.
base_dir = '/Users/zcy/workspace/gitee/note'

# Collect every regular file below base_dir as a path relative to base_dir.
# Hidden entries (names starting with '.') are skipped at every level, so
# hidden directories are never descended into.
#
# The directories still to be scanned live in an explicit queue instead of
# being appended to the very list that is being iterated; the breadth-first
# order of the result is the same as before.
from collections import deque

files = []
pending_dirs = deque([''])  # '' stands for base_dir itself
while pending_dirs:
    rel_dir = pending_dirs.popleft()
    for name in os.listdir(os.path.join(base_dir, rel_dir)):
        if name.startswith('.'):
            continue
        rel_path = os.path.join(rel_dir, name)
        abs_path = os.path.join(base_dir, rel_path)
        if os.path.isdir(abs_path):
            pending_dirs.append(rel_path)
        elif os.path.isfile(abs_path):
            # Anything that is neither a directory nor a regular file
            # (e.g. a broken symlink) is ignored.
            files.append(rel_path)

# Scan every note line by line and keep the lines that mention the image
# bucket marker ('note-1252548816'); links are extracted from them later.
#
# Each file is tried with several encodings in turn. The matching lines of a
# file are only committed once the whole file has decoded successfully, so a
# decode error half-way through a file can no longer leave partial results
# behind or duplicate lines on the next attempt.
encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'big5']
typora_pics = []
for f in files:
    path = os.path.join(base_dir, f)
    try:
        for encoding in encodings:
            try:
                with open(path, 'r', encoding=encoding) as file:
                    matched_lines = [line for line in file if 'note-1252548816' in line]
            except UnicodeDecodeError:
                # Wrong guess: discard this attempt and try the next encoding.
                continue
            typora_pics.extend(matched_lines)
            break
        else:
            # No encoding worked. Report it instead of skipping silently:
            # links inside this file are NOT collected, so the images it
            # references could wrongly look unused.
            print(f"文件 {f} 无法解码，已跳过（尝试过的编码: {', '.join(encodings)}）")
    except Exception as e:
        # Best effort: report the failure and carry on with the other files.
        print(f"处理文件 {f} 时出错: {str(e)}")

In [64]:
# Matches Markdown images `![alt](target)` and HTML `<img ... src=...>` tags
# whose src value is wrapped in double or single quotes.
_IMAGE_LINK_RE = re.compile(
    r'!\[.*?\]\((.*?)\)|<img[^>]+src=(?:"([^"]+)"|\'([^\']+)\')'
)
# Matches an optional quoted Markdown title trailing the image target,
# e.g. the ` "caption"` part of `![alt](pic.png "caption")`.
_MARKDOWN_TITLE_RE = re.compile(r'\s+(?:"[^"]*"|\'[^\']*\')\s*$')


def extract_image_links(markdown_texts):
    """Extract the image targets referenced in Markdown/HTML text.

    Two notations are recognised: Markdown images (``![alt](target)``,
    optionally followed by a quoted title) and HTML ``<img>`` tags whose
    ``src`` attribute is quoted with either ``"`` or ``'``.

    Each target has the ``https://oss的路径前缀/`` prefix removed and is then
    percent-decoded, so it can be compared with object keys.

    Args:
        markdown_texts: Iterable of strings (for example, lines of a note).

    Returns:
        A list with every extracted link, in order of appearance. Duplicates
        are kept. An empty input yields an empty list.
    """
    all_image_links = []

    for markdown_text in markdown_texts:
        for md_target, dq_src, sq_src in _IMAGE_LINK_RE.findall(markdown_text):
            if md_target:
                # Strip the title before decoding, so that a decoded %22
                # cannot be mistaken for a title delimiter.
                link = _MARKDOWN_TITLE_RE.sub('', md_target)
            else:
                link = dq_src or sq_src
            # Remove the "https://oss的路径前缀/" prefix.
            link = link.replace('https://oss的路径前缀/', '')
            # Decode percent-escapes (e.g. encoded Chinese characters).
            all_image_links.append(unquote(link))

    return all_image_links

In [65]:
# Extract the image links from the note lines collected in typora_pics.
clear_pics = extract_image_links(typora_pics)

In [66]:
# Work out which bucket objects should be deleted: every key in pics that is
# not among the extracted links. The links are put into a set first so each
# membership test is O(1) instead of a scan over the whole list.
referenced_pics = set(clear_pics)
needDeletePics = [pic for pic in pics if pic not in referenced_pics]
print(len(needDeletePics))
needDeletePics


0


[]

In [None]:
# Delete the unreferenced objects from the bucket, one request per key.
for object_key in needDeletePics:
    client.delete_object(Key=object_key, Bucket='这里是bucket名称')
