In [4]:
import os
import re

def clean_text(text):
    """Strip bracketed annotation markers from ``text`` and return the result.

    Two passes are made, in order: first every span running from ``[#ns`` to
    the nearest following ``]`` is deleted, then the same is done for spans
    opening with ``[#nsc``. A span may cross line breaks. A marker that has no
    closing ``]`` does not match and is left in place.
    """
    stripped = text
    for marker_pattern in (r'\[#ns.*?\]', r'\[#nsc.*?\]'):
        # DOTALL lets the lazy ``.*?`` run across newlines inside a marker.
        stripped = re.compile(marker_pattern, re.DOTALL).sub('', stripped)
    return stripped

def process_privacy_policies(input_directory, output_directory='manual_policies'):
    """Write a cleaned copy of every ``privacy_policy.txt`` under ``input_directory``.

    The tree rooted at ``input_directory`` is walked recursively. Each file
    named exactly ``privacy_policy.txt`` is read as UTF-8, passed through
    ``clean_text``, and written as UTF-8 to
    ``<output_directory>/<containing folder name>_privacy_policy``
    (no file extension). One ``Processed ...`` line is printed per file.

    Args:
        input_directory: Root folder to search.
        output_directory: Folder that receives the cleaned files. It is
            created if it does not exist. Defaults to ``'manual_policies'``,
            resolved against the current working directory.

    Returns:
        None.
    """
    os.makedirs(output_directory, exist_ok=True)

    # Output names are derived from the containing folder's base name only, so
    # two policy folders with the same name in different parents map to the
    # same output file. Remember what this run wrote so that case is reported
    # instead of silently losing the earlier file.
    written_paths = set()

    # Walk through all directories and subdirectories
    for root, dirs, files in os.walk(input_directory):
        for file in files:
            if file != 'privacy_policy.txt':
                continue

            input_path = os.path.join(root, file)
            parent_dir = os.path.basename(root)
            output_filename = f'{parent_dir}_privacy_policy'
            output_path = os.path.join(output_directory, output_filename)

            if output_path in written_paths:
                print(f'Warning: {output_path} was already written in this run; '
                      f'overwriting it with {input_path}')
            written_paths.add(output_path)

            # Read the content of the file
            with open(input_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Clean the content
            cleaned_content = clean_text(content)

            # Write the cleaned content to a new file
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(cleaned_content)

            print(f'Processed {input_path} -> {output_path}')

# Example usage: clean every privacy_policy.txt found under `input_directory`.
# With the default settings the cleaned copies are written to the
# 'manual_policies' folder, relative to the current working directory.
input_directory = 'policies_annotated'  # Replace with the path to your directory
process_privacy_policies(input_directory)


Processed policies_annotated\annotated_policies\afmstudio.craigslistclassifiedslocal\privacy_policy.txt -> manual_policies\afmstudio.craigslistclassifiedslocal_privacy_policy
Processed policies_annotated\annotated_policies\ai.appfront.luckyslice\privacy_policy.txt -> manual_policies\ai.appfront.luckyslice_privacy_policy
Processed policies_annotated\annotated_policies\ai.blueplate.app\privacy_policy.txt -> manual_policies\ai.blueplate.app_privacy_policy
Processed policies_annotated\annotated_policies\ai.halloween.aifilter.art\privacy_policy.txt -> manual_policies\ai.halloween.aifilter.art_privacy_policy
Processed policies_annotated\annotated_policies\aiart.midjourney.dreamnow\privacy_policy.txt -> manual_policies\aiart.midjourney.dreamnow_privacy_policy
Processed policies_annotated\annotated_policies\at.tomtasche.reader.pro\privacy_policy.txt -> manual_policies\at.tomtasche.reader.pro_privacy_policy
Processed policies_annotated\annotated_policies\bau.moho.freevpn\privacy_policy.txt -> m

In [6]:
import os

def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())

def generate_report(directory):
    """Write a word-count report for the files under ``directory``.

    Every file found by walking ``directory`` recursively is read as UTF-8 and
    its words are counted with ``count_words``. The report lists one
    ``<file name>: <n> words`` line per file followed by the average word
    count (``0`` when no files were counted), and is saved as
    ``<directory>/report.txt``. The report path is printed when done.

    A ``report.txt`` left in ``directory`` by an earlier run is skipped, so
    re-running the function does not count the previous report as input.

    Args:
        directory: Folder whose files are counted and where the report is
            written.

    Returns:
        None.
    """
    report_path = os.path.join(directory, 'report.txt')
    report_abspath = os.path.abspath(report_path)

    report_lines = []
    total_words = 0
    file_count = 0

    for root, dirs, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)

            # The report is written into the tree being scanned; including a
            # stale copy would inflate the file count and skew the average.
            if os.path.abspath(file_path) == report_abspath:
                continue

            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            word_count = count_words(content)
            total_words += word_count
            file_count += 1
            report_lines.append(f'{file}: {word_count} words')

    # Guard against division by zero when the directory holds no files.
    average_word_count = total_words / file_count if file_count > 0 else 0
    report_lines.append(f'\nAverage word count: {average_word_count:.2f}')

    with open(report_path, 'w', encoding='utf-8') as report_file:
        report_file.write('\n'.join(report_lines))

    print(f'Report generated at {report_path}')

# Example usage: count the words in each file under `manual_policies_directory`
# and save the per-file counts plus their average to report.txt in that folder.
manual_policies_directory = 'manual_policies'  # Replace with the path to your manual_policies directory if different
generate_report(manual_policies_directory)


Report generated at manual_policies\report.txt
