# **App scraper**

## **[Setup]**

In [8]:
!pip install google-play-scraper
import requests
import csv
import time
import logging

import pandas as pd
from google_play_scraper import search




[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: C:\Users\Pc\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [9]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [10]:
KEYWORDS = [
    'asthma',
    'asthma tracker',
    'inhaler tracker',
    'peak flow',
    'asthma management',
    'asthma log',
    'allergy induced asthma',
    'asthma symptoms',
    'asthma monitoring app',
    'asthma action plan'
]

## **[App Store - Apple]**

**[Configuration]**

In [11]:
# The base URL for the iTunes Search API
API_BASE_URL = 'https://itunes.apple.com/search'

# Output CSV file name
CSV_FILE = 'data/app_store_asthma_apps_desc.csv'

# Number of top results to fetch for each keyword
RESULT_LIMIT = 20

**[Scraper Function]**

In [12]:
def scrape_app_store():
    logging.info(f"Starting App Store scrape. Output will be saved to {CSV_FILE}")

    csv_headers = [
        'Keyword Searched',
        'App Name',
        'App ID',
        'Seller',
        'Price',
        'Average Rating',
        'Rating Count',
        'Genre',
        'App URL',
        'Description'
    ]

    try:
        with open(CSV_FILE, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(csv_headers)

            for keyword in KEYWORDS:
                logging.info(f"Searching for keyword: '{keyword}'...")

                params = {
                    'term': keyword,
                    'country': 'US',
                    'media': 'software',
                    'limit': RESULT_LIMIT
                }

                try:
                    response = requests.get(API_BASE_URL, params=params, timeout=10)
                    
                    response.raise_for_status() 

                    data = response.json()
                    results = data.get('results', [])

                    if not results:
                        logging.warning(f"No results found for keyword: '{keyword}'")
                        continue

                    for app in results:
                        writer.writerow([
                            keyword,
                            app.get('trackName'),
                            app.get('trackId'),
                            app.get('artistName'),
                            app.get('formattedPrice', app.get('price', 'N/A')),
                            app.get('averageUserRating', 'N/A'),
                            app.get('userRatingCount', 'N/A'),
                            app.get('primaryGenreName', 'N/A'),
                            app.get('trackViewUrl'),
                            app.get('description', 'N/A')
                        ])

                    logging.info(f"Successfully saved {len(results)} apps for '{keyword}'")

                except requests.exceptions.RequestException as e:
                    logging.error(f"HTTP Request failed for keyword '{keyword}': {e}")
                except Exception as e:
                    logging.error(f"An error occurred processing keyword '{keyword}': {e}")

                time.sleep(2)

    except IOError as e:
        logging.critical(f"Failed to open or write to CSV file {CSV_FILE}: {e}")
        return

    logging.info(f"--- Scraping complete. Data saved to {CSV_FILE} ---")

In [13]:
scrape_app_store()

2025-11-26 19:43:51,350 - INFO - Starting App Store scrape. Output will be saved to data/app_store_asthma_apps_desc.csv
2025-11-26 19:43:51,352 - INFO - Searching for keyword: 'asthma'...
2025-11-26 19:43:51,394 - INFO - Successfully saved 20 apps for 'asthma'
2025-11-26 19:43:53,402 - INFO - Searching for keyword: 'asthma tracker'...
2025-11-26 19:43:53,451 - INFO - Successfully saved 20 apps for 'asthma tracker'
2025-11-26 19:43:55,457 - INFO - Searching for keyword: 'inhaler tracker'...
2025-11-26 19:43:55,506 - INFO - Successfully saved 19 apps for 'inhaler tracker'
2025-11-26 19:43:57,521 - INFO - Searching for keyword: 'peak flow'...
2025-11-26 19:43:57,586 - INFO - Successfully saved 20 apps for 'peak flow'
2025-11-26 19:43:59,591 - INFO - Searching for keyword: 'asthma management'...
2025-11-26 19:43:59,654 - INFO - Successfully saved 17 apps for 'asthma management'
2025-11-26 19:44:01,655 - INFO - Searching for keyword: 'asthma log'...
2025-11-26 19:44:01,703 - INFO - Successf

In [14]:
app_store_list = pd.read_csv('data/app_store_asthma_apps_desc.csv')
app_store_list.head(5)

Unnamed: 0,Keyword Searched,App Name,App ID,Seller,Price,Average Rating,Rating Count,Genre,App URL,Description
0,asthma,Asthma Tracker゜,6444343217,Adam Cziko,Free,4.6087,69,Lifestyle,https://apps.apple.com/us/app/asthma-tracker/i...,Track asthma and take charge of your health.\n...
1,asthma,Asthma: Tracker & Reminders,6744072783,"No Worries! Lifestyle, LLC",Free,0.0,0,Medical,https://apps.apple.com/us/app/asthma-tracker-r...,Asthma Care Companion is a comprehensive asthm...
2,asthma,My Pollen Forecast - Allergies,1244428929,JRustonApps B.V.,Free,4.69123,25450,Health & Fitness,https://apps.apple.com/us/app/my-pollen-foreca...,My Pollen Forecast is the best app for trackin...
3,asthma,AsthmaTime,1658824279,SCIENTIFIC COMPUTING AND IMAGING INSTITUTE (SC...,Free,5.0,1,Health & Fitness,https://apps.apple.com/us/app/asthmatime/id165...,AsthmaTime is an app to assist in the self-man...
4,asthma,FindAir – Asthma Diary,1515944881,Findair sp. z o.o.,Free,2.75,4,Health & Fitness,https://apps.apple.com/us/app/findair-asthma-d...,FindAir application is a smart asthma diary fo...


In [15]:
print(f"Length of app_store_list: {len(app_store_list)}")

app_store_list = app_store_list.drop_duplicates(subset=['App ID'])

print(f"Length of app_store_list after dropping duplicates: {len(app_store_list)}")

Length of app_store_list: 183
Length of app_store_list after dropping duplicates: 87


In [16]:
app_store_list.to_csv('data/app_store_asthma_apps_desc_set.csv', index=False)

---

## **[Google Play - Android]**

In [17]:
# [Configuration]
GOOGLE_PLAY_CSV_FILE = 'data/google_play_asthma_apps_desc.csv'
GOOGLE_PLAY_RESULT_LIMIT = 20

# [Scraper Helpers]
def fetch_apps(keyword: str, limit: int = GOOGLE_PLAY_RESULT_LIMIT):
    return search(keyword, lang="en", country="us", n_hits=limit)


def extract_info(app, keyword):
    return {
        "Keyword Searched": keyword,
        "App ID": app.get("appId"),
        "App Name": app.get("title"),
        "Developer": app.get("developer"),
        "Rating": app.get("score"),
        "Installs": app.get("installs"),
        "Price": app.get("priceText"),
        "Genre": app.get("genre"),
        "Summary": app.get("summary"),
        "Content Rating": app.get("contentRating"),
        "App URL": app.get("url"),
    }


In [20]:
def scrape_google_play():
    logging.info(f"Starting Google Play scrape. Output will be saved to {GOOGLE_PLAY_CSV_FILE}")

    results = []

    for kw in KEYWORDS:
        logging.info(f"Searching Google Play for keyword: '{kw}'...")
        try:
            apps = fetch_apps(kw)
            if not apps:
                logging.warning(f"No results found on Google Play for keyword: '{kw}'")
                continue

            results.extend(extract_info(a, kw) for a in apps)
            logging.info(f"Successfully collected {len(apps)} apps for '{kw}' from Google Play")
        except Exception as e:
            logging.error(f"An error occurred while fetching Google Play apps for keyword '{kw}': {e}")

        time.sleep(2)

    if not results:
        logging.warning("No Google Play results collected. Nothing to write.")
        return

    try:
        with open(GOOGLE_PLAY_CSV_FILE, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=results[0].keys())
            writer.writeheader()
            writer.writerows(results)

        logging.info(f"--- Google Play scraping complete. Data saved to {GOOGLE_PLAY_CSV_FILE} ---")
    except IOError as e:
        logging.critical(f"Failed to open or write to CSV file {GOOGLE_PLAY_CSV_FILE}: {e}")

In [21]:
scrape_google_play()

2025-11-26 19:47:44,475 - INFO - Starting Google Play scrape. Output will be saved to data/google_play_asthma_apps_desc.csv
2025-11-26 19:47:44,476 - INFO - Searching Google Play for keyword: 'asthma'...
2025-11-26 19:47:44,821 - INFO - Successfully collected 20 apps for 'asthma' from Google Play
2025-11-26 19:47:46,828 - INFO - Searching Google Play for keyword: 'asthma tracker'...
2025-11-26 19:47:47,199 - INFO - Successfully collected 20 apps for 'asthma tracker' from Google Play
2025-11-26 19:47:49,208 - INFO - Searching Google Play for keyword: 'inhaler tracker'...
2025-11-26 19:47:49,560 - INFO - Successfully collected 20 apps for 'inhaler tracker' from Google Play
2025-11-26 19:47:51,563 - INFO - Searching Google Play for keyword: 'peak flow'...
2025-11-26 19:47:51,926 - INFO - Successfully collected 19 apps for 'peak flow' from Google Play
2025-11-26 19:47:53,935 - INFO - Searching Google Play for keyword: 'asthma management'...
2025-11-26 19:47:54,315 - INFO - Successfully col

In [22]:
google_play_list = pd.read_csv(GOOGLE_PLAY_CSV_FILE)
print(f"Length of google_play_list: {len(google_play_list)}")

google_play_list = google_play_list.drop_duplicates(subset=['App ID'])

print(f"Length of google_play_list after dropping duplicates: {len(google_play_list)}")

google_play_list.to_csv('data/google_play_asthma_apps_desc_set.csv', index=False)

Length of google_play_list: 199
Length of google_play_list after dropping duplicates: 83


---

## **[Github Repositories]**

**[Configuration]**

In [23]:
API_BASE_URL = 'https://api.github.com/search/repositories'

CSV_FILE = 'data/github_asthma_repos.csv'

RESULT_LIMIT = 20

**[Scraper Function]**

In [24]:
def scrape_github():
    logging.info(f"Starting GitHub scrape. Output will be saved to {CSV_FILE}")

    csv_headers = [
        'Keyword Searched',
        'Repository Name',
        'Owner',
        'Stars',
        'Language',
        'URL',
        'Description'
    ]

    try:
        with open(CSV_FILE, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(csv_headers)

            for keyword in KEYWORDS:
                logging.info(f"Searching for keyword: '{keyword}'...")

                params = {
                    'q': keyword,
                    'sort': 'stars',
                    'order': 'desc',
                    'per_page': RESULT_LIMIT
                }

                headers = {
                    'Accept': 'application/vnd.github.v3+json',
                    'User-Agent': 'python-requests/scraps' 
                }

                try:
                    response = requests.get(API_BASE_URL, params=params, headers=headers, timeout=10)
                    response.raise_for_status()

                    data = response.json()
                    results = data.get('items', [])

                    if not results:
                        logging.warning(f"No results found for keyword: '{keyword}'")
                        continue

                    for repo in results:
                        writer.writerow([
                            keyword,
                            repo.get('name'),
                            repo.get('owner', {}).get('login'),
                            repo.get('stargazers_count'),
                            repo.get('language', 'N/A'),
                            repo.get('html_url'),
                            repo.get('description', 'No description available')
                        ])

                    logging.info(f"Successfully saved {len(results)} repos for '{keyword}'")

                except requests.exceptions.RequestException as e:
                    logging.error(f"HTTP Request failed for keyword '{keyword}': {e}")
                except Exception as e:
                    logging.error(f"An error occurred processing keyword '{keyword}': {e}")

                time.sleep(6)

    except IOError as e:
        logging.critical(f"Failed to open or write to CSV file {CSV_FILE}: {e}")
        return

    logging.info(f"--- Scraping complete. Data saved to {CSV_FILE} ---")

In [25]:
scrape_github()

2025-11-26 19:48:45,290 - INFO - Starting GitHub scrape. Output will be saved to data/github_asthma_repos.csv
2025-11-26 19:48:45,291 - INFO - Searching for keyword: 'asthma'...
2025-11-26 19:48:46,065 - INFO - Successfully saved 20 repos for 'asthma'
2025-11-26 19:48:52,066 - INFO - Searching for keyword: 'asthma tracker'...
2025-11-26 19:48:52,897 - INFO - Successfully saved 20 repos for 'asthma tracker'
2025-11-26 19:48:58,902 - INFO - Searching for keyword: 'inhaler tracker'...
2025-11-26 19:48:59,499 - INFO - Successfully saved 11 repos for 'inhaler tracker'
2025-11-26 19:49:05,503 - INFO - Searching for keyword: 'peak flow'...
2025-11-26 19:49:06,184 - INFO - Successfully saved 20 repos for 'peak flow'
2025-11-26 19:49:12,196 - INFO - Searching for keyword: 'asthma management'...
2025-11-26 19:49:13,124 - INFO - Successfully saved 20 repos for 'asthma management'
2025-11-26 19:49:19,139 - INFO - Searching for keyword: 'asthma log'...
2025-11-26 19:49:19,672 - INFO - Successfully 

In [26]:
github_repo_list = pd.read_csv('data/github_asthma_repos.csv')
github_repo_list.head(5)

Unnamed: 0,Keyword Searched,Repository Name,Owner,Stars,Language,URL,Description
0,asthma,AsthmaHealth,ResearchKit,102,Objective-C,https://github.com/ResearchKit/AsthmaHealth,"ResearchKit app studying Asthma, developed by ..."
1,asthma,Gen-AI-Hackathon,shivam6862,26,Jupyter Notebook,https://github.com/shivam6862/Gen-AI-Hackathon,The Gen AI Hackathon project aims to utilize m...
2,asthma,Building-Breather,alexbaramilis,23,,https://github.com/alexbaramilis/Building-Brea...,Breather is an open-source iOS app that shows ...
3,asthma,asthma,mikelove,19,R,https://github.com/mikelove/asthma,RNA-seq quantifications: gene expression respo...
4,asthma,avatree,snousias,16,C++,https://github.com/snousias/avatree,"This paper presents AVATREE, a computational m..."


In [27]:
print(f"Length of github_repo_list: {len(github_repo_list)}")

github_repo_list = github_repo_list.drop_duplicates(subset=['Repository Name', 'Owner'])

print(f"Length of github_repo_list after dropping duplicates: {len(github_repo_list)}")

Length of github_repo_list: 147
Length of github_repo_list after dropping duplicates: 132


## **[GitLab Repositories]**

In [28]:
# [Configuration]
GITLAB_API_BASE_URL = 'https://gitlab.com/api/v4/projects'
GITLAB_CSV_FILE = 'data/gitlab_asthma_repos.csv'
GITLAB_RESULT_LIMIT = 20


In [29]:
def scrape_gitlab():
    logging.info(f"Starting GitLab scrape. Output will be saved to {GITLAB_CSV_FILE}")

    csv_headers = [
        'Keyword Searched',
        'Project Name',
        'Namespace',
        'Stars',
        'Language',
        'URL',
        'Description'
    ]

    try:
        with open(GITLAB_CSV_FILE, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(csv_headers)

            for keyword in KEYWORDS:
                logging.info(f"Searching GitLab for keyword: '{keyword}'...")

                params = {
                    'search': keyword,
                    'order_by': 'star_count',
                    'sort': 'desc',
                    'per_page': GITLAB_RESULT_LIMIT,
                    'simple': 'true'
                }

                try:
                    response = requests.get(GITLAB_API_BASE_URL, params=params, timeout=10)
                    response.raise_for_status()

                    results = response.json()
                    if not results:
                        logging.warning(f"No results found for keyword: '{keyword}' on GitLab")
                        continue

                    for project in results:
                        writer.writerow([
                            keyword,
                            project.get('name'),
                            project.get('path_with_namespace'),
                            project.get('star_count'),
                            project.get('language', 'N/A'),
                            project.get('web_url'),
                            project.get('description') or 'No description available'
                        ])

                    logging.info(f"Successfully saved {len(results)} GitLab projects for '{keyword}'")

                except requests.exceptions.RequestException as e:
                    logging.error(f"HTTP Request failed for GitLab keyword '{keyword}': {e}")
                except Exception as e:
                    logging.error(f"An error occurred processing GitLab keyword '{keyword}': {e}")

                time.sleep(6)

    except IOError as e:
        logging.critical(f"Failed to open or write to CSV file {GITLAB_CSV_FILE}: {e}")
        return

    logging.info(f"--- GitLab scraping complete. Data saved to {GITLAB_CSV_FILE} ---")

In [30]:
scrape_gitlab()

2025-11-26 19:50:05,564 - INFO - Starting GitLab scrape. Output will be saved to data/gitlab_asthma_repos.csv
2025-11-26 19:50:05,564 - INFO - Searching GitLab for keyword: 'asthma'...
2025-11-26 19:50:08,168 - INFO - Successfully saved 20 GitLab projects for 'asthma'
2025-11-26 19:50:14,169 - INFO - Searching GitLab for keyword: 'asthma tracker'...
2025-11-26 19:50:14,551 - INFO - Searching GitLab for keyword: 'inhaler tracker'...
2025-11-26 19:50:15,033 - INFO - Searching GitLab for keyword: 'peak flow'...
2025-11-26 19:50:17,268 - INFO - Successfully saved 17 GitLab projects for 'peak flow'
2025-11-26 19:50:23,274 - INFO - Searching GitLab for keyword: 'asthma management'...
2025-11-26 19:50:23,762 - INFO - Searching GitLab for keyword: 'asthma log'...
2025-11-26 19:50:24,513 - INFO - Successfully saved 1 GitLab projects for 'asthma log'
2025-11-26 19:50:30,515 - INFO - Searching GitLab for keyword: 'allergy induced asthma'...
2025-11-26 19:50:31,078 - INFO - Searching GitLab for ke

In [31]:
gitlab_repo_list = pd.read_csv(GITLAB_CSV_FILE)
print(f"Length of gitlab_repo_list: {len(gitlab_repo_list)}")

# Drop duplicates based on project name and namespace
gitlab_repo_list = gitlab_repo_list.drop_duplicates(subset=['Project Name', 'Namespace'])

print(f"Length of gitlab_repo_list after dropping duplicates: {len(gitlab_repo_list)}")

gitlab_repo_list.to_csv('data/gitlab_asthma_repos_set.csv', index=False)

Length of gitlab_repo_list: 38
Length of gitlab_repo_list after dropping duplicates: 37


In [32]:
github_repo_list.to_csv('data/github_asthma_repos_set.csv', index=False)