# **App scraper - Article methodology**

This notebook follows the methodology described in the article "Automatic market research of mobile health apps for the self‐management of allergic rhinitis".

## **[Setup]**

In [None]:
!pip install google-play-scraper pandas requests python-dateutil

import requests
import time
import logging
import pandas as pd
import re
from datetime import datetime
from dateutil.relativedelta import relativedelta
from dateutil import parser
from google_play_scraper import search



## **[Configuration - Methodology Alignment]**

In [None]:
# [Methodology: Search Scope]
# Search terms; every term is queried once per country in each store.
KEYWORDS = ['hay fever', 'hayfever', 'asthma', 'rhinitis', 'allergic rhinitis']

# Storefronts covered by the article: USA, UK, Australia.
COUNTRIES = ['us', 'gb', 'au']

# Maximum number of results requested per query (limits taken from the article).
LIMIT_APPLE, LIMIT_GOOGLE = 200, 250

# Output locations for the scraped data.
CSV_FILE_ALL = 'data/all_asthma_apps_raw.csv'
CSV_FILE_FILTERED = 'data/asthma_apps_filtered_relevant.csv'

# Timestamped INFO-level progress messages for the scraping run.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

## **[Filtering Logic]**

In [None]:
def check_eligibility(app_data):
    """
    Apply the article's exclusion criteria (Tables 1 & 2) to one app record.

    Reads the 'title', 'description', 'genre' and 'updated' entries of
    `app_data` (a dict-like object) and returns a `(is_relevant, reason)`
    tuple. `reason` is "Relevant" for accepted apps, otherwise it names the
    first check that rejected the app.
    """
    name = str(app_data.get('title', '')).lower()
    blurb = str(app_data.get('description', '')).lower()
    combined = name + " " + blurb

    # 1. CATEGORY CHECK -- only Medical, Health & Fitness and Weather apps.
    category = str(app_data.get('genre', '')).lower()
    if category not in ('medical', 'health & fitness', 'health_and_fitness', 'weather'):
        return False, "Category mismatch"

    # 2. RECENCY CHECK -- an update date is mandatory and must fall within
    # the last 3 years.
    updated_on = app_data.get('updated')
    if not updated_on:
        return False, "No date provided"
    cutoff = datetime.now() - relativedelta(years=3)
    if updated_on < cutoff:
        return False, "Outdated (>3 years)"

    # --- LOGICAL CONDITIONS (Table 2) ---

    # Condition 2: the description must mention "symptom".
    if blurb.find("symptom") == -1:
        return False, "Condition 2 failed (No 'symptom' in desc)"

    # Condition 3: reject alternative-medicine apps
    # ("home remed*", "natural remed*", "homeopath*", "alternative med*", "acupressure").
    alt_medicine = re.search(
        r"home remed|natural remed|homeopath|alternative med|acupressure",
        combined,
    )
    if alt_medicine:
        return False, "Condition 3 failed (Alternative med)"

    # Condition 5 = 5.1 AND 5.2
    #   5.1: Weather category OR "pollen" in the title
    #   5.2: an allergy-related keyword in the DESCRIPTION
    is_pollen_or_weather = category == 'weather' or 'pollen' in name
    has_allergy_term = re.search(
        r"symptom|rhinitis|hay fever|hayfever|asthma|allerg", blurb
    ) is not None
    cond_5 = is_pollen_or_weather and has_allergy_term

    # 6.1: a rhinitis-specific keyword in the TITLE or DESCRIPTION
    cond_6_1 = re.search(
        r"rhinitis|hay fever|hayfever|respiratory allerg", combined
    ) is not None

    # Condition 1: Condition 5 OR Condition 6 must hold, where
    # Condition 6 = (NOT 5) AND 6.1. The rejection test
    # "not (5 or ((not 5) and 6.1))" reduces to "(not 5) and (not 6.1)".
    if not cond_5 and not cond_6_1:
        return False, "Condition 1 failed (Relevance)"

    # Condition 4: reject food apps unless Condition 6.1 is met.
    if not cond_6_1 and "food" in combined:
        return False, "Condition 4 failed (Food app)"

    return True, "Relevant"

## **[Unified Scraper Function]**

In [None]:
def _parse_store_date(date_str):
    """Parse a store-provided date string into a naive datetime, or None if unusable."""
    if not date_str:
        return None
    try:
        # tzinfo is dropped so the value can be compared with the naive
        # `datetime.now()` used by the recency check in check_eligibility.
        return parser.parse(date_str).replace(tzinfo=None)
    except (ValueError, OverflowError, TypeError):
        return None


def _search_apple(keyword, country):
    """Query the iTunes Search API for `keyword` in `country` and return app records."""
    response = requests.get(
        "https://itunes.apple.com/search",
        params={
            'term': keyword,
            'country': country,
            'media': 'software',
            'limit': LIMIT_APPLE,
        },
        timeout=10,
    )
    if response.status_code != 200:
        # Previously ignored silently; make the gap in coverage visible.
        logging.warning(
            f"Apple ({country}/{keyword}): HTTP {response.status_code}, no results used"
        )
        return []

    records = []
    for item in response.json().get('results', []):
        date_str = item.get('currentVersionReleaseDate', item.get('releaseDate'))
        records.append({
            'store': 'Apple',
            'id': str(item.get('trackId')),
            'title': item.get('trackName'),
            'developer': item.get('artistName'),
            'description': item.get('description', ''),
            'genre': item.get('primaryGenreName'),
            'price': item.get('formattedPrice'),
            'rating': item.get('averageUserRating'),
            'updated': _parse_store_date(date_str),
            'url': item.get('trackViewUrl'),
            'country_found': country,
            'keyword_found': keyword,
        })
    return records


def _search_google(keyword, country):
    """Search the Google Play Store for `keyword` in `country` and return app records."""
    results = search(
        keyword,
        lang='en',  # Article implies English apps
        country=country,
        n_hits=LIMIT_GOOGLE,
    )

    records = []
    for item in results:
        # `or ''` guards against keys that are present but None, which would
        # otherwise raise TypeError and discard the rest of this query.
        summary = item.get('summary') or ''
        description = item.get('description') or ''
        records.append({
            'store': 'Google',
            'id': item.get('appId'),
            'title': item.get('title'),
            'developer': item.get('developer'),
            'description': summary + " " + description,
            'genre': item.get('genre', 'Health & Fitness'),
            'price': item.get('priceText'),
            'rating': item.get('score'),
            # Filled in later from the app's detail page.
            'updated': None,
            'url': item.get('url'),
            'country_found': country,
            'keyword_found': keyword,
        })
    return records


def _enrich_google_app(app):
    """Best-effort update of a Google record's description, genre and update date."""
    # Imported under an alias because `app` is also the record's name here.
    from google_play_scraper import app as get_app_details

    try:
        details = get_app_details(app['id'], lang='en', country=app['country_found'])
        app['description'] = details.get('description') or app['description']
        app['genre'] = details.get('genre') or app['genre']
        if details.get('updated'):
            app['updated'] = datetime.fromtimestamp(details['updated'])
    except Exception as e:
        # Still best-effort, but no longer silent: without details the record
        # keeps `updated=None` and is rejected by check_eligibility.
        logging.warning(f"Google details unavailable for '{app['id']}': {e}")


def scrape_and_filter():
    """
    Scrape both app stores, apply the eligibility filter and save the result.

    For every (country, keyword) pair from COUNTRIES x KEYWORDS the Apple App
    Store and then the Google Play Store are searched. Apps are de-duplicated
    per store by id (the first hit wins). Google records are completed from
    their detail page, each app is passed through `check_eligibility`, and
    the accepted apps are written to CSV_FILE_FILTERED.

    Returns None. A failing query or detail lookup is logged and skipped
    rather than aborting the run.
    """
    from pathlib import Path

    all_apps = {}

    stores = (
        ("Apple", "apple", "Starting Apple App Store Scrape...", _search_apple),
        ("Google", "google", "Starting Google Play Store Scrape...", _search_google),
    )
    for store, prefix, banner, run_search in stores:
        logging.info(banner)
        for country in COUNTRIES:
            for keyword in KEYWORDS:
                logging.info(f"{store}: Searching '{keyword}' in '{country}'...")
                try:
                    records = run_search(keyword, country)
                except Exception as e:
                    logging.error(f"{store} Error ({country}/{keyword}): {e}")
                    records = []
                for record in records:
                    # Keep the first occurrence of each app per store.
                    all_apps.setdefault(f"{prefix}_{record['id']}", record)
                # Pause between queries to stay polite to the store endpoints.
                time.sleep(1)

    # --- PROCESSING & FILTERING ---
    logging.info(f"Total raw unique apps found: {len(all_apps)}")

    filtered_apps = []
    for app in all_apps.values():
        if app['store'] == 'Google' and app['updated'] is None:
            _enrich_google_app(app)

        is_relevant, reason = check_eligibility(app)
        if is_relevant:
            filtered_apps.append(app)
        else:
            logging.debug(f"Rejected {app['store']} app '{app['id']}': {reason}")

    logging.info(f"Total apps after filtering: {len(filtered_apps)}")

    # Create the output directory first so a long scrape is not lost to a
    # missing folder at the very last step.
    Path(CSV_FILE_FILTERED).parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(filtered_apps).to_csv(CSV_FILE_FILTERED, index=False)
    logging.info(f"Saved relevant apps to {CSV_FILE_FILTERED}")


# Run the full pipeline: query both stores, filter the results and write the
# relevant apps to CSV_FILE_FILTERED.
scrape_and_filter()

2026-01-14 01:48:11,774 - INFO - Starting Apple App Store Scrape...
2026-01-14 01:48:11,775 - INFO - Apple: Searching 'hay fever' in 'us'...
2026-01-14 01:48:22,605 - INFO - Apple: Searching 'hayfever' in 'us'...
2026-01-14 01:48:26,546 - INFO - Apple: Searching 'asthma' in 'us'...
2026-01-14 01:48:32,601 - INFO - Apple: Searching 'rhinitis' in 'us'...
2026-01-14 01:48:38,311 - INFO - Apple: Searching 'allergic rhinitis' in 'us'...
2026-01-14 01:48:42,863 - INFO - Apple: Searching 'hay fever' in 'gb'...
2026-01-14 01:48:47,689 - INFO - Apple: Searching 'hayfever' in 'gb'...
2026-01-14 01:48:51,903 - INFO - Apple: Searching 'asthma' in 'gb'...
2026-01-14 01:48:56,968 - INFO - Apple: Searching 'rhinitis' in 'gb'...
2026-01-14 01:49:02,546 - INFO - Apple: Searching 'allergic rhinitis' in 'gb'...
2026-01-14 01:49:09,092 - INFO - Apple: Searching 'hay fever' in 'au'...
2026-01-14 01:49:13,314 - INFO - Apple: Searching 'hayfever' in 'au'...
2026-01-14 01:49:17,010 - INFO - Apple: Searching '

## **[Read relevant & remove duplicates]**

In [4]:
# Reload the filtered results from disk and drop rows whose title repeats an
# earlier row (pandas keeps the first occurrence by default).
relevant = pd.read_csv('data/asthma_apps_filtered_relevant.csv')
relevant_dedup = relevant.drop_duplicates(subset=['title'])
# NOTE(review): `display` is not defined in the visible code -- presumably the
# IPython/Jupyter built-in; confirm before running outside a notebook.
display(relevant_dedup)

Unnamed: 0,store,id,title,developer,description,genre,price,rating,updated,url,country_found,keyword_found
0,Apple,1244428929,My Pollen Forecast - Allergies,JRustonApps B.V.,My Pollen Forecast is the best app for trackin...,Health & Fitness,Free,4.69147,2026-01-04 01:31:39,https://apps.apple.com/us/app/my-pollen-foreca...,us,hay fever
1,Apple,707461899,Melbourne Pollen Count,AirHealth Pty Ltd,The Melbourne Pollen App provides Victorians w...,Weather,Free,4.5,2025-11-25 01:29:44,https://apps.apple.com/us/app/melbourne-pollen...,us,hay fever
2,Apple,1380057946,"klarify: Pollen app, Hay fever",Alk-Abello AS,Stay ahead of your hay fever with klarify. Use...,Medical,Free,4.52325,2025-11-17 09:46:10,https://apps.apple.com/us/app/klarify-pollen-a...,us,hay fever
3,Apple,1640644173,Perth Pollen Count,AirHealth Pty Ltd,Empower yourself against allergies with the cu...,Weather,Free,4.66667,2025-11-25 01:30:09,https://apps.apple.com/us/app/perth-pollen-cou...,us,hay fever
4,Apple,914338280,Canberra Pollen Count,AirHealth Pty Ltd,Canberra Pollen Count & Forecast: Your Allergy...,Weather,Free,4.8,2025-11-25 01:30:03,https://apps.apple.com/us/app/canberra-pollen-...,us,hay fever
5,Apple,1244429473,My Pollen Forecast Pro,JRustonApps B.V.,My Pollen Forecast Pro is the best app for tra...,Health & Fitness,$4.99,4.57172,2026-01-04 01:29:10,https://apps.apple.com/us/app/my-pollen-foreca...,us,hay fever
6,Apple,1474856970,Pollen Wise,Pollen Sense,Pollen Wise has reached version 5! We've been ...,Weather,Free,4.54631,2025-12-09 03:45:00,https://apps.apple.com/us/app/pollen-wise/id14...,us,hay fever
7,Apple,1610856212,Pollen Count & Forecast,kevin penture,Pollen Count & Alert is the ultimate companion...,Health & Fitness,Free,4.42856,2025-12-23 08:57:33,https://apps.apple.com/us/app/pollen-count-for...,us,hay fever
8,Apple,983596216,MASK-air,MASK-AIR,The best application for monitoring allergic r...,Medical,Free,0.0,2026-01-13 08:29:18,https://apps.apple.com/us/app/mask-air/id98359...,us,hay fever
9,Apple,1642592299,DocGo,DocGo Inc.,URGENT CARE IN-HOME VISITS\n\nAvoid crowded wa...,Medical,Free,4.5,2025-09-25 18:41:26,https://apps.apple.com/us/app/docgo/id16425922...,us,hay fever


In [5]:
# Save the title-deduplicated table to its own CSV, without the DataFrame index.
CSV_FILE_FILTERED_DEDUP = 'data/asthma_apps_filtered_relevant_dedup.csv'
relevant_dedup.to_csv(CSV_FILE_FILTERED_DEDUP, index=False)