In [1]:
import requests
from bs4 import BeautifulSoup
import os
import regex as re
import wordninja

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}


def process_app_page(app_id, directory_path):
    failed_apps = []  # List to store app ids/names for which scraping failed

    # Check if data_protection.txt already exists in the directory
    if os.path.exists(os.path.join(directory_path, 'data_protection.txt')):
        print(f"Skipping {app_id} - data_protection.txt already exists.")
        return

    # The URL of the Google Play Store page
    url = f'https://play.google.com/store/apps/datasafety?id={app_id}&hl=en_US&gl=US'

    try:
        # Send a GET request to the URL with headers and a timeout
        response = requests.get(url, headers=headers, timeout=10)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Parse the HTML content of the page
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the div with the class 'i1GNIe'
            main_div = soup.find(class_='i1GNIe')

            if main_div:
                # Extract and split the text content within the div by newlines
                div_text = main_div.get_text().strip()

                # Remove the initial part of the text
                start_index = div_text.lower().find("data shared")
                if start_index != -1:
                    div_text = div_text[start_index:]

                # Use regular expression to replace 'expand_more' with an empty string
                formatted_text = re.sub(r'expand_more', '', div_text, flags=re.IGNORECASE)

                # Split words using wordninja
                split_words = wordninja.split(formatted_text)

                # Join the split words with spaces
                formatted_text = ' '.join(split_words)

                # Create the directory if it doesn't exist
                os.makedirs(directory_path, exist_ok=True)

                # Define the file path
                file_path = os.path.join(directory_path, 'data_protection.txt')

                # Write the formatted text to the file
                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(formatted_text)

                print(f"Output written to {file_path}")
            else:
                print(f"Failed to retrieve the page. Unable to find the div with class 'i1GNIe' for {app_id}")
                failed_apps.append(app_id)
        else:
            print(f"Failed to retrieve the page. Status code: {response.status_code} for {app_id}")
            failed_apps.append(app_id)
    except requests.exceptions.Timeout:
        print(f"Timeout error: The request timed out for {app_id}")
        failed_apps.append(app_id)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e} for {app_id}")
        failed_apps.append(app_id)
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        failed_apps.append(app_id)
    # Write the failed app ids/names to a text file
    with open('failed_apps.txt', 'w') as failed_apps_file:
        for app_id in failed_apps:
            failed_apps_file.write(app_id + '\n')

'''# Loop through app directories
for app_id in os.listdir('policies_descriptions'):
    # Define the directory structure
    directory_path = os.path.join('policies_descriptions', app_id)

    
    # Process the app page
    process_app_page(app_id, directory_path)'''

app_id = 'ai.blueplate.app'
directory_path = '.'
process_app_page(app_id, directory_path)




Output written to .\data_protection.txt


In [5]:
# Loop through app directories
for country in os.listdir('eu_top_100/output'):
    for app in os.listdir(f'eu_top_100/output/{country}/free'):
            
        # Define the directory structure
        directory_path = os.path.join(f'eu_top_100/output/{country}/free', app)

        # Process the app page
        process_app_page(app, directory_path)
    for app in os.listdir(f'eu_top_100/output/{country}/paid'):
            
        # Define the directory structure
        directory_path = os.path.join(f'eu_top_100/output/{country}/paid', app)

        # Process the app page
        process_app_page(app, directory_path)


Output written to eu_top_100/output/Austria/free\ai.chat.gpt.bot\data_protection.txt
Output written to eu_top_100/output/Austria/free\aplicacion.tiempo\data_protection.txt
Output written to eu_top_100/output/Austria/free\at.apptec.europlasma\data_protection.txt
Output written to eu_top_100/output/Austria/free\at.atrust.tanapp\data_protection.txt
Output written to eu_top_100/output/Austria/free\at.austrosoft.t4me.MB_Wien40100\data_protection.txt
Output written to eu_top_100/output/Austria/free\at.bipa.bipaapp2\data_protection.txt
Output written to eu_top_100/output/Austria/free\at.bluesource.mobilepocket\data_protection.txt
Output written to eu_top_100/output/Austria/free\at.drei.up3\data_protection.txt
Output written to eu_top_100/output/Austria/free\at.erstebank.george\data_protection.txt
Output written to eu_top_100/output/Austria/free\at.erstebank.securityapp\data_protection.txt
Output written to eu_top_100/output/Austria/free\at.gv.oe.app\data_protection.txt
Output written to eu_to

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'eu_top_100/output/missing_privacy_policy.txt/free'