In [17]:
import os

# 定义路径
processed_folder = "/content/output/output"  # 原始文件夹路径
final_output_folder = "/content/drive/My Drive/output_final"  # 最终保存的文件夹

# 创建目标文件夹（如果不存在）
if not os.path.exists(final_output_folder):
    os.makedirs(final_output_folder)

# 遍历所有 .txt 文件
for file_name in os.listdir(processed_folder):
    source_file_path = os.path.join(processed_folder, file_name)

    # 仅处理 .txt 文件
    if file_name.endswith(".txt"):
        print(f"Processing file: {file_name}")

        try:
            # 打开文件并读取内容
            with open(source_file_path, "r", encoding="utf-8") as file:
                lines = file.readlines()  # 读取所有行

            # 如果文件没有内容，跳过
            if not lines:
                print(f"Skipping empty file: {file_name}")
                continue

            # 初始化存储清理后的行的列表
            cleaned_lines = []

            for line in lines:
                line = line.strip()  # 去除行首尾的空格和换行符

                # 跳过所有等号分隔线
                if line == "==================================================":
                    continue

                # 跳过以 "Video: xxx.mp4" 开头的行
                if line.startswith("Video:"):
                    continue

                # 删除 "Classification Output: "
                line = line.replace("Classification Output: ", "")

                # 如果行非空，添加到清理后的列表
                if line:
                    cleaned_lines.append(line)

            # 修改文件名：删除 "_output" 后缀
            if "_output" in file_name:
                new_file_name = file_name.replace("_output", "")
            else:
                new_file_name = file_name
            destination_file_path = os.path.join(final_output_folder, new_file_name)

            # 保存清理后的内容到目标文件夹
            with open(destination_file_path, "w", encoding="utf-8") as file:
                file.write("\n".join(cleaned_lines))  # 按行写入清理后的内容

            print(f"Finished processing: {file_name}. Saved as {destination_file_path}")

        except Exception as e:
            print(f"Error processing file {file_name}: {e}")

print("\nProcessing complete.")

Processing file: subject_101_randompphb93l6wy_vid_0_39_output.txt
Finished processing: subject_101_randompphb93l6wy_vid_0_39_output.txt. Saved as /content/drive/My Drive/output_final/subject_101_randompphb93l6wy_vid_0_39.txt
Processing file: subject_100_jqsyphj1sj_vid_0_11_output.txt
Finished processing: subject_100_jqsyphj1sj_vid_0_11_output.txt. Saved as /content/drive/My Drive/output_final/subject_100_jqsyphj1sj_vid_0_11.txt
Processing file: subject_101_randompphb93l6wy_vid_2_25_output.txt
Finished processing: subject_101_randompphb93l6wy_vid_2_25_output.txt. Saved as /content/drive/My Drive/output_final/subject_101_randompphb93l6wy_vid_2_25.txt
Processing file: subject_100_jqsyphj1sj_vid_1_10_output.txt
Finished processing: subject_100_jqsyphj1sj_vid_1_10_output.txt. Saved as /content/drive/My Drive/output_final/subject_100_jqsyphj1sj_vid_1_10.txt
Processing file: subject_0_2msdhgqawh_vid_0_13_output.txt
Finished processing: subject_0_2msdhgqawh_vid_0_13_output.txt. Saved as /conte

In [18]:
import os
import zipfile
from google.colab import drive

# Step 1: 挂载 Google Drive
drive.mount('/content/drive')

# Step 2: 定义路径
uploaded_zip = "/content/drive/My Drive/output.zip"  # Google Drive 中的 output.zip 路径
source_folder = "/content/output"  # 解压后的文件夹路径
destination_folder = "/content/drive/My Drive/output_processed"  # 保存去重后的结果到 Google Drive

# Step 3: 解压文件
if not os.path.exists(source_folder):
    os.makedirs(source_folder)

# 检查文件是否存在
if not os.path.exists(uploaded_zip):
    raise FileNotFoundError(f"The file {uploaded_zip} does not exist. Please check the file path.")

with zipfile.ZipFile(uploaded_zip, 'r') as zip_ref:
    zip_ref.extractall(source_folder)
print(f"Files have been extracted to: {source_folder}")

# 检查解压后的文件夹内容
print(f"Checking extracted files in: {source_folder}")
print(os.listdir(source_folder))

# Step 4: 创建目标文件夹
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Step 5: 递归查找所有 .txt 文件
def find_txt_files(folder):
    """递归查找所有 .txt 文件"""
    txt_files = []
    for root, dirs, files in os.walk(folder):
        for file in files:
            if file.endswith(".txt"):
                txt_files.append(os.path.join(root, file))
    return txt_files

# 找到所有 .txt 文件
txt_files = find_txt_files(source_folder)
print(f"Found .txt files: {txt_files}")

# Step 6: 去重并保存文件
for source_file_path in txt_files:
    file_name = os.path.basename(source_file_path)  # 获取文件名
    print(f"Processing file: {file_name}")

    # 使用集合去重
    try:
        with open(source_file_path, "r", encoding="utf-8") as file:
            lines = file.readlines()
            unique_lines = set(lines)  # 使用 set 去重

        # 保存去重后的内容到目标文件夹
        destination_file_path = os.path.join(destination_folder, file_name)
        with open(destination_file_path, "w", encoding="utf-8") as file:
            file.writelines(sorted(unique_lines))  # 排序并写入文件

        print(f"Finished processing: {file_name}. Saved to {destination_file_path}")
    except Exception as e:
        print(f"Error processing file {file_name}: {e}")

print("All files have been processed. Unique lines have been saved in Google Drive.")

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
Finished processing: subject_101_randompphb93l6wy_vid_2_37_output.txt. Saved to /content/drive/My Drive/output_processed/subject_101_randompphb93l6wy_vid_2_37_output.txt
Processing file: subject_10_n5yny96tpq_vid_1_21_output.txt
Finished processing: subject_10_n5yny96tpq_vid_1_21_output.txt. Saved to /content/drive/My Drive/output_processed/subject_10_n5yny96tpq_vid_1_21_output.txt
Processing file: subject_104_06lc4xlb48_vid_2_41_output.txt
Finished processing: subject_104_06lc4xlb48_vid_2_41_output.txt. Saved to /content/drive/My Drive/output_processed/subject_104_06lc4xlb48_vid_2_41_output.txt
Processing file: subject_19_ah2mcee837_vid_1_18_output.txt
Finished processing: subject_19_ah2mcee837_vid_1_18_output.txt. Saved to /content/drive/My Drive/output_processed/subject_19_ah2mcee837_vid_1_18_output.txt
Processing file: subject_103_xcjreth6q9_vid_2_30_output.txt
Finished processing: subject_103_xcjreth6q9_vid_2_30_output.txt. Saved to /conten

In [19]:
import os

# 定义路径
processed_folder = "/content/output/output"  # 原始文件夹路径
final_output_folder = "/content/drive/My Drive/output_final"  # 最终保存的文件夹

# 创建目标文件夹（如果不存在）
if not os.path.exists(final_output_folder):
    os.makedirs(final_output_folder)

# 遍历所有 .txt 文件
for file_name in os.listdir(processed_folder):
    source_file_path = os.path.join(processed_folder, file_name)

    # 仅处理 .txt 文件
    if file_name.endswith(".txt"):
        print(f"Processing file: {file_name}")

        try:
            # 打开文件并读取内容
            with open(source_file_path, "r", encoding="utf-8") as file:
                lines = file.readlines()  # 读取所有行

            # 如果文件没有内容，跳过
            if not lines:
                print(f"Skipping empty file: {file_name}")
                continue

            # 初始化存储清理后的行的列表
            cleaned_lines = []

            for line in lines:
                line = line.strip()  # 去除行首尾的空格和换行符

                # 跳过所有等号分隔线
                if line == "==================================================":
                    continue

                # 跳过以 "Video: xxx.mp4" 开头的行
                if line.startswith("Video:"):
                    continue

                # 删除 "Classification Output: "
                line = line.replace("Classification Output: ", "")

                # 如果行非空，添加到清理后的列表
                if line:
                    cleaned_lines.append(line)

            # 修改文件名：删除 "_output" 后缀
            if "_output" in file_name:
                new_file_name = file_name.replace("_output", "")
            else:
                new_file_name = file_name
            destination_file_path = os.path.join(final_output_folder, new_file_name)

            # 保存清理后的内容到目标文件夹
            with open(destination_file_path, "w", encoding="utf-8") as file:
                file.write("\n".join(cleaned_lines))  # 按行写入清理后的内容

            print(f"Finished processing: {file_name}. Saved as {destination_file_path}")

        except Exception as e:
            print(f"Error processing file {file_name}: {e}")

print("\nProcessing complete.")

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
Processing file: subject_10_n5yny96tpq_vid_1_21_output.txt
Finished processing: subject_10_n5yny96tpq_vid_1_21_output.txt. Saved as /content/drive/My Drive/output_final/subject_10_n5yny96tpq_vid_1_21.txt
Processing file: subject_104_06lc4xlb48_vid_2_41_output.txt
Finished processing: subject_104_06lc4xlb48_vid_2_41_output.txt. Saved as /content/drive/My Drive/output_final/subject_104_06lc4xlb48_vid_2_41.txt
Processing file: subject_19_ah2mcee837_vid_1_18_output.txt
Finished processing: subject_19_ah2mcee837_vid_1_18_output.txt. Saved as /content/drive/My Drive/output_final/subject_19_ah2mcee837_vid_1_18.txt
Processing file: subject_103_xcjreth6q9_vid_2_30_output.txt
Finished processing: subject_103_xcjreth6q9_vid_2_30_output.txt. Saved as /content/drive/My Drive/output_final/subject_103_xcjreth6q9_vid_2_30.txt
Processing file: subject_10_n5yny96tpq_vid_1_20_output.txt
Finished processing: subject_10_n5yny96tpq_vid_1_20_output.txt. Saved as /cont

In [20]:
import os

# 定义路径
processed_folder = "/content/drive/My Drive/output_processed"  # 原始文件夹
final_output_folder = "/content/drive/My Drive/output_final"  # 最终输出文件夹

# 创建最终输出文件夹
if not os.path.exists(final_output_folder):
    os.makedirs(final_output_folder)

# 遍历去重后的文件夹
file_count = 0  # 成功处理的文件计数
skipped_files = []  # 保存被跳过的文件名
for file_name in os.listdir(processed_folder):
    source_file_path = os.path.join(processed_folder, file_name)

    # 仅处理 .txt 文件
    if file_name.endswith(".txt"):
        print(f"Processing file: {file_name}")

        try:
            # 检查文件是否为空
            if os.stat(source_file_path).st_size == 0:
                print(f"Skipping empty file: {file_name}")
                skipped_files.append(file_name)
                continue

            # 打开文件并读取内容
            with open(source_file_path, "r", encoding="utf-8") as file:
                lines = file.readlines()  # 读取所有行

            print(f"Original content of {file_name}:")
            print(lines)  # 打印文件原始内容，便于调试

            # 初始化存储处理后行的列表
            cleaned_lines = []

            for line in lines:
                line = line.strip()  # 去除行首尾的空格和换行符

                # 跳过分隔线和标记行
                if line.startswith("===") or line.startswith("Classification Output:") or line.startswith("Video:"):
                    continue

                # 保存非空的有效行
                if line:
                    cleaned_lines.append(line)

            # 如果清理后没有有效内容，跳过当前文件
            if not cleaned_lines:
                print(f"No valid content found in file: {file_name}")
                skipped_files.append(file_name)
                continue

            # 修改文件名：删除 "_output" 后缀
            if "_output" in file_name:
                new_file_name = file_name.replace("_output", "")
            else:
                new_file_name = file_name

            # 确保文件名唯一：如果文件名已存在，自动增加编号
            destination_file_path = os.path.join(final_output_folder, new_file_name)
            counter = 1
            while os.path.exists(destination_file_path):
                base_name, ext = os.path.splitext(new_file_name)
                destination_file_path = os.path.join(final_output_folder, f"{base_name}_{counter}{ext}")
                counter += 1

            # 保存处理后的内容到目标文件夹
            with open(destination_file_path, "w", encoding="utf-8") as file:
                file.write("\n".join(cleaned_lines))  # 按行写入清理后的内容

            print(f"Finished processing: {file_name}. Saved as {destination_file_path}")
            file_count += 1  # 成功处理的文件计数

        except Exception as e:
            print(f"Error processing file {file_name}: {e}")

# 总结处理结果
print(f"\nSummary:")
print(f"Processed {file_count} files.")
if skipped_files:
    print(f"Skipped {len(skipped_files)} files due to no valid content or being empty:")
    for skipped_file in skipped_files:
        print(f" - {skipped_file}")
else:
    print("No files were skipped.")

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
Original content of subject_120_randomkwkczvv409_vid_0_13_output.txt:
No valid content found in file: subject_120_randomkwkczvv409_vid_0_13_output.txt
Processing file: subject_109_cgpemx4wd1_vid_2_30_output.txt
Original content of subject_109_cgpemx4wd1_vid_2_30_output.txt:
No valid content found in file: subject_109_cgpemx4wd1_vid_2_30_output.txt
Processing file: subject_104_06lc4xlb48_vid_0_2_output.txt
Original content of subject_104_06lc4xlb48_vid_0_2_output.txt:
No valid content found in file: subject_104_06lc4xlb48_vid_0_2_output.txt
Processing file: subject_1_1zug2hqfz1_vid_1_12_output.txt
Original content of subject_1_1zug2hqfz1_vid_1_12_output.txt:
No valid content found in file: subject_1_1zug2hqfz1_vid_1_12_output.txt
Processing file: subject_126_kto30htac2_vid_0_18_output.txt
Original content of subject_126_kto30htac2_vid_0_18_output.txt:
No valid content found in file: subject_126_kto30htac2_vid_0_18_output.txt
Processing file: subj

In [21]:
import os

# 定义路径
final_output_folder = "/content/drive/My Drive/output_final"  # 最终文件夹路径

# 遍历目标文件夹中的所有文件
for file_name in os.listdir(final_output_folder):
    file_path = os.path.join(final_output_folder, file_name)

    # 检查文件是否大于 1KB（1024字节）
    if os.path.isfile(file_path) and os.stat(file_path).st_size > 1024:
        print(f"Processing file: {file_name}")

        try:
            # 打开文件并读取所有行
            with open(file_path, "r", encoding="utf-8") as file:
                lines = file.readlines()  # 读取所有行

            # 去重并保持行的顺序
            seen = set()
            unique_lines = []
            for line in lines:
                line = line.strip()  # 去除首尾空格和换行符
                if line not in seen:
                    unique_lines.append(line)
                    seen.add(line)

            # 覆盖原文件，写入去重后的内容
            with open(file_path, "w", encoding="utf-8") as file:
                file.write("\n".join(unique_lines))  # 按行写入去重后的内容

            print(f"Finished processing: {file_name}. File size: {os.stat(file_path).st_size} bytes")

        except Exception as e:
            print(f"Error processing file {file_name}: {e}")

print("\nDe-duplication complete.")

Processing file: subject_108_randomn93c6rsmts_vid_0_3.txt
Finished processing: subject_108_randomn93c6rsmts_vid_0_3.txt. File size: 4375 bytes
Processing file: subject_121_8h5y1xzwge_vid_0_15.txt
Finished processing: subject_121_8h5y1xzwge_vid_0_15.txt. File size: 4450 bytes
Processing file: subject_103_xcjreth6q9_vid_1_35.txt
Finished processing: subject_103_xcjreth6q9_vid_1_35.txt. File size: 4462 bytes
Processing file: subject_123_random3v0bx7575d_vid_0_29.txt
Finished processing: subject_123_random3v0bx7575d_vid_0_29.txt. File size: 4584 bytes
Processing file: subject_12_g0lww4zt4d_vid_2_11.txt
Finished processing: subject_12_g0lww4zt4d_vid_2_11.txt. File size: 4487 bytes
Processing file: subject_108_randomn93c6rsmts_vid_0_4.txt
Finished processing: subject_108_randomn93c6rsmts_vid_0_4.txt. File size: 4651 bytes
Processing file: subject_16_znxhms0ar6_vid_2_13.txt
Finished processing: subject_16_znxhms0ar6_vid_2_13.txt. File size: 4458 bytes
Processing file: subject_123_random3v0bx7

In [28]:
import os

def remove_repeated_sentences(text):
    """
    从文本中去重重复的句子，并按原顺序保留。

    Args:
        text (str): 输入的长文本。

    Returns:
        str: 去重后的文本。
    """
    # 分割文本为句子，简单以句号（.）作为分隔符
    sentences = text.split(". ")
    seen = set()  # 用于存储已出现的句子
    unique_sentences = []  # 存储去重后的句子

    # 遍历句子，去重并保持顺序
    for sentence in sentences:
        # 去掉首尾的空格，防止空格影响判断
        sentence = sentence.strip()
        # 跳过重复的句子
        if sentence not in seen:
            unique_sentences.append(sentence)
            seen.add(sentence)

    # 将去重后的句子重新拼接为文本，用 ". " 作为句子间的分隔符
    return ". ".join(unique_sentences) + ("." if text.strip().endswith(".") else "")


# 文件夹路径
final_output_folder = "/content/drive/My Drive/output_final"  # 替换为你的文件夹路径

# 遍历目标文件夹中的所有文件
for file_name in os.listdir(final_output_folder):
    file_path = os.path.join(final_output_folder, file_name)

    # 检查是否是文件，并且文件大小大于 1KB（1024 字节）
    if os.path.isfile(file_path) and os.stat(file_path).st_size > 1024:
        print(f"Processing file: {file_name}")

        try:
            # 打开文件并读取内容
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()  # 读取整个文件内容

            # 调用去重函数处理文本
            deduplicated_content = remove_repeated_sentences(content)

            # 覆盖原文件，写入去重后的内容
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(deduplicated_content)

            print(f"Finished processing: {file_name}. File size after deduplication: {os.stat(file_path).st_size} bytes")

        except Exception as e:
            print(f"Error processing file {file_name}: {e}")

print("\nDe-duplication complete.")

Processing file: subject_108_randomn93c6rsmts_vid_0_3.txt
Finished processing: subject_108_randomn93c6rsmts_vid_0_3.txt. File size after deduplication: 368 bytes
Processing file: subject_121_8h5y1xzwge_vid_0_15.txt
Finished processing: subject_121_8h5y1xzwge_vid_0_15.txt. File size after deduplication: 630 bytes
Processing file: subject_103_xcjreth6q9_vid_1_35.txt
Finished processing: subject_103_xcjreth6q9_vid_1_35.txt. File size after deduplication: 728 bytes
Processing file: subject_123_random3v0bx7575d_vid_0_29.txt
Finished processing: subject_123_random3v0bx7575d_vid_0_29.txt. File size after deduplication: 508 bytes
Processing file: subject_12_g0lww4zt4d_vid_2_11.txt
Finished processing: subject_12_g0lww4zt4d_vid_2_11.txt. File size after deduplication: 427 bytes
Processing file: subject_108_randomn93c6rsmts_vid_0_4.txt
Finished processing: subject_108_randomn93c6rsmts_vid_0_4.txt. File size after deduplication: 490 bytes
Processing file: subject_16_znxhms0ar6_vid_2_13.txt
Finish

In [29]:
import os

# 定义路径
final_output_folder = "/content/drive/My Drive/output_final"  # 最终文件夹路径

# 初始化统计
large_files = []  # 用于存储大于 1KB 的文件信息

# 遍历文件夹中的所有文件
for file_name in os.listdir(final_output_folder):
    file_path = os.path.join(final_output_folder, file_name)

    # 检查是否是文件，并且文件大小大于 1KB
    if os.path.isfile(file_path):
        file_size = os.stat(file_path).st_size  # 获取文件大小（字节）
        if file_size > 1024:  # 判断是否大于 1KB
            large_files.append((file_name, file_size))

# 打印结果
if large_files:
    print(f"Found {len(large_files)} files larger than 1KB:")
    for file_name, file_size in large_files:
        print(f" - {file_name}: {file_size / 1024:.2f} KB")
else:
    print("No files larger than 1KB found in the folder.")

print("\nCheck complete.")

Found 5 files larger than 1KB:
 - subject_1_1zug2hqfz1_vid_1_29.txt: 1.29 KB
 - subject_102_ax9bbn1mcc_vid_0_25.txt: 1.03 KB
 - subject_16_znxhms0ar6_vid_1_32.txt: 1.02 KB
 - subject_124_zw8quy4q8v_vid_0_23.txt: 1.09 KB
 - subject_107_t9w4xfx125_vid_1_16.txt: 1.14 KB

Check complete.
