<a href="https://colab.research.google.com/github/velgaks/Course-Practice/blob/main/UkraineEduDataCollector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tqdm requests pandas openpyxl



In [2]:
import requests
import pandas as pd
from io import BytesIO
import time
from tqdm import tqdm
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

# Setup logging
# Configures root logging at INFO level with a timestamped message format.
# Note: `logger` is created here, but the code visible in this notebook reports
# progress with print() and never calls it.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [3]:
class EDBODataCollector:
    """Downloads the per-institution XLSX export from the EDBO registry.

    One HTTP session (with a browser-like User-Agent) is shared by all
    requests issued through an instance.
    """

    def __init__(self, max_workers=3, delay_between_requests=2):
        """
        Args:
            max_workers: stored on the instance; not read by any method
                defined in this class body.
            delay_between_requests: base pause in seconds. In this class it
                is the unit of the linear retry back-off.
        """
        self.max_workers = max_workers
        self.delay_between_requests = delay_between_requests
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def fetch_university_data(self, university_info, retry_count=3):
        """Fetch the programs sheet for a single institution.

        Args:
            university_info: mapping with at least the keys
                ``"university_id"`` and ``"university_name"``.
            retry_count: maximum number of attempts. Only attempts that
                raise are retried; a well-formed response that lacks the
                expected content type or sheet returns immediately.

        Returns:
            A DataFrame of the programs sheet with ``university_id`` and
            ``university_name`` columns added and string column labels
            stripped of surrounding whitespace, or ``None`` when the
            response is not an ``application/*`` payload, the programs sheet
            is absent, or every attempt raised.
        """
        university_id = university_info["university_id"]
        university_name = university_info["university_name"]

        for attempt in range(retry_count):
            try:
                url = f"https://registry.edbo.gov.ua/api/university/?id={university_id}&exp=xlsx"
                print(f"Fetching: {university_name}")

                response = self.session.get(url, timeout=30)
                response.raise_for_status()

                if not response.headers.get('content-type', '').startswith('application/'):
                    print(f"‚úó Invalid response for {university_name}")
                    return None

                # Context manager releases the workbook handle even when
                # parsing raises.
                with pd.ExcelFile(BytesIO(response.content)) as xl:
                    if "–û—Å–≤—ñ—Ç–Ω—ñ –ø—Ä–æ–≥—Ä–∞–º–∏" not in xl.sheet_names:
                        print(f"‚ö† No programs sheet found for {university_name}")
                        return None
                    df = xl.parse("–û—Å–≤—ñ—Ç–Ω—ñ –ø—Ä–æ–≥—Ä–∞–º–∏")

                df["university_id"] = university_id
                df["university_name"] = university_name
                # Strip only string labels: the `.str` accessor would replace
                # non-string headers (e.g. a numeric year) with NaN.
                df.columns = [
                    label.strip() if isinstance(label, str) else label
                    for label in df.columns
                ]

                print(f"‚úì Got {len(df)} programs from {university_name}")
                return df

            except Exception as e:
                print(f"‚úó Error for {university_name}, attempt {attempt + 1}: {str(e)}")
                if attempt < retry_count - 1:
                    # Linear back-off: wait longer after each failed attempt.
                    time.sleep(self.delay_between_requests * (attempt + 1))
                else:
                    return None
        return None

In [4]:
def collect_all_data(self, uni_list_response, use_simple_method=True):
    """Collect program data for every state-financed institution in a list.

    Args:
        uni_list_response: object whose ``.json()`` returns a list of dicts
            with the keys ``university_id``, ``university_name`` and
            ``university_financing_type_name``.
        use_simple_method: when True, institutions are fetched one at a time
            with ``self.delay_between_requests`` seconds between requests.
            When False, they are fetched through a thread pool of
            ``self.max_workers`` workers, each worker pausing for the same
            delay after its request. (Previously False fetched nothing and
            reported an empty result.)

    Returns:
        ``(combined_df, failed_universities)``: the row-wise concatenation of
        all successful results (an empty DataFrame if there were none) and
        the names of the institutions for which the fetch returned ``None``.
        Results keep the order of the input list in both modes.
    """
    # Filter state universities
    state_universities = [
        uni for uni in uni_list_response.json()
        if uni.get("university_financing_type_name") == "–î–µ—Ä–∂–∞–≤–Ω–∞"
    ]

    print(f"Found {len(state_universities)} state universities")

    all_dataframes = []
    failed_universities = []
    total = len(state_universities)

    def _record(uni, result):
        # Route one fetch outcome into the success or the failure list.
        if result is not None:
            all_dataframes.append(result)
        else:
            failed_universities.append(uni["university_name"])

    if use_simple_method:
        # Simple sequential processing
        for i, uni in enumerate(state_universities):
            print(f"\n[{i+1}/{total}] Processing: {uni['university_name']}")
            _record(uni, self.fetch_university_data(uni))

            # Wait between requests (nothing follows the last one)
            if i + 1 < total:
                time.sleep(self.delay_between_requests)
    elif state_universities:
        def _fetch_then_pause(uni):
            # Each worker throttles itself so the per-worker request rate
            # matches the sequential path.
            result = self.fetch_university_data(uni)
            time.sleep(self.delay_between_requests)
            return result

        workers = max(1, int(getattr(self, "max_workers", 1) or 1))
        with ThreadPoolExecutor(max_workers=workers) as pool:
            # map() yields results in input order, keeping output deterministic.
            for uni, result in zip(state_universities, pool.map(_fetch_then_pause, state_universities)):
                _record(uni, result)

    # Combine results
    if all_dataframes:
        combined_df = pd.concat(all_dataframes, ignore_index=True)
        print(f"\nüéâ Success! Collected {len(combined_df)} programs from {len(all_dataframes)} universities")
    else:
        combined_df = pd.DataFrame()
        print("\nüòû No data collected")

    if failed_universities:
        print(f"\n‚ö† Failed universities ({len(failed_universities)}):")
        for name in failed_universities[:5]:  # Show first 5
            print(f"  - {name}")
        if len(failed_universities) > 5:
            print(f"  ... and {len(failed_universities) - 5} more")

    return combined_df, failed_universities

# Add method to class
# Assigning the module-level function as a class attribute makes it available
# on instances as the bound method `collector.collect_all_data(...)`.
EDBODataCollector.collect_all_data = collect_all_data

In [None]:
# Cell 5 - FIXED VERSION: Get University List from Excel
# Downloads the registry's institution list, scans every sheet of the returned
# workbook for one that looks like the main list, then narrows it to state
# institutions. `uni_list_df` is only assigned on the success path; on every
# other path it stays undefined (or keeps a value from an earlier run).
print("Getting university list from Excel endpoint...")

try:
    # The working endpoint returns Excel data
    # NOTE(review): unlike the per-university URL used by the collector, no
    # `exp=xlsx` query parameter is sent here - confirm the endpoint really
    # answers with a workbook.
    excel_url = "https://registry.edbo.gov.ua/api/universities/"
    response = requests.get(excel_url, timeout=30)

    print(f"Status code: {response.status_code}")
    print(f"Content type: {response.headers.get('content-type')}")
    print(f"Response length: {len(response.content):,} bytes")

    # A non-200 status skips everything below; nothing further is printed.
    if response.status_code == 200:
        # Parse the Excel file
        xlsx_data = BytesIO(response.content)
        xl = pd.ExcelFile(xlsx_data)

        print(f"‚úì Excel file loaded successfully!")
        print(f"Available sheets: {xl.sheet_names}")

        # Usually the main data is in the first sheet or a sheet with obvious name
        # Let's check each sheet
        university_data = None

        for sheet_name in xl.sheet_names:
            try:
                df = xl.parse(sheet_name)
                print(f"\nSheet '{sheet_name}': {len(df)} rows, {len(df.columns)} columns")

                if len(df) > 0:
                    print(f"  Columns: {list(df.columns)[:5]}...")  # Show first 5 columns

                    # Look for university-like data
                    # Substring heuristics on lower-cased column labels. A
                    # non-string label makes `.lower()` raise, which the
                    # per-sheet `except` below reports before moving on.
                    potential_id_cols = [col for col in df.columns if 'id' in col.lower() or '–∫–æ–¥' in col.lower()]
                    potential_name_cols = [col for col in df.columns if 'name' in col.lower() or '–Ω–∞–∑–≤' in col.lower() or '–Ω–∞–π–º–µ–Ω—É–≤–∞–Ω–Ω—è' in col.lower()]
                    potential_type_cols = [col for col in df.columns if '—Ç–∏–ø' in col.lower() or 'type' in col.lower() or '—Ñ—ñ–Ω–∞–Ω—Å—É–≤–∞–Ω–Ω—è' in col.lower()]

                    print(f"  Potential ID columns: {potential_id_cols}")
                    print(f"  Potential name columns: {potential_name_cols}")
                    print(f"  Potential type columns: {potential_type_cols}")

                    # If this looks like the main university list, save it
                    # If several sheets qualify, the last one scanned wins.
                    if len(df) > 100 and potential_name_cols:  # Likely the main list
                        university_data = df
                        main_sheet = sheet_name
                        print(f"  ‚Üí This looks like the main university list!")

            except Exception as e:
                print(f"  Error reading sheet '{sheet_name}': {e}")

        if university_data is not None:
            print(f"\nüéâ Found university data in sheet '{main_sheet}'!")
            print(f"Total universities: {len(university_data)}")

            # Show sample data
            print(f"\nSample row:")
            sample = university_data.iloc[0]
            for col in university_data.columns[:8]:  # Show first 8 columns
                print(f"  {col}: {sample[col]}")

            # Try to find state universities
            # Look for columns that might indicate financing type
            # The first candidate column containing at least one match is
            # used and the search stops there.
            state_filter_applied = False
            for col in university_data.columns:
                if '—Ñ—ñ–Ω–∞–Ω—Å—É–≤–∞–Ω–Ω—è' in col.lower() or 'financing' in col.lower() or '—Ç–∏–ø' in col.lower():
                    unique_values = university_data[col].value_counts()
                    print(f"\nValues in '{col}':")
                    print(unique_values.head(10))

                    # Look for the state-financing marker (case-insensitive substring)
                    state_mask = university_data[col].astype(str).str.contains('–î–µ—Ä–∂–∞–≤–Ω', case=False, na=False)
                    if state_mask.sum() > 0:
                        state_universities = university_data[state_mask].copy()
                        print(f"‚úì Found {len(state_universities)} state universities using column '{col}'")
                        state_filter_applied = True
                        break

            if not state_filter_applied:
                # Fallback: keep every row when no column yielded a match.
                print(f"‚ö† Couldn't auto-detect state universities, using all {len(university_data)} universities")
                state_universities = university_data.copy()

            # Save the university list for later use
            uni_list_df = state_universities
            print(f"\n‚úì Ready to collect data from {len(uni_list_df)} universities")

        else:
            print("‚ùå Couldn't find university data in any sheet")

except Exception as e:
    # Any failure (network, parsing, ...) is reported and swallowed so the
    # rest of the cell still runs.
    print(f"‚ùå Error loading university list: {e}")

# Let's also create a simple function to convert our DataFrame to the format expected by our collector
def df_to_university_list(df):
    """Convert DataFrame to list format expected by our collector.

    The ID and name columns are guessed from the column labels: the first
    label containing ``id`` (or the Cyrillic marker below) is the ID column,
    and the first label containing ``name`` (or one of the Cyrillic markers)
    is the name column. Matching is a case-insensitive substring test.

    Args:
        df: DataFrame with one row per institution. Column labels may be of
            any type; non-string labels are compared via ``str(label)``
            instead of raising.

    Returns:
        A list of dicts with the keys ``university_id``, ``university_name``
        and ``university_financing_type_name`` (always the state-financing
        marker), or ``[]`` when either column cannot be identified.
    """
    universities = []

    # Try to find the right columns
    id_col = None
    name_col = None

    for col in df.columns:
        # str() keeps the heuristic from crashing on numeric or NaN labels.
        label = str(col).lower()
        if id_col is None and ('id' in label or '–∫–æ–¥' in label):
            id_col = col
        if name_col is None and ('name' in label or '–Ω–∞–∑–≤' in label or '–Ω–∞–π–º–µ–Ω—É–≤–∞–Ω–Ω—è' in label):
            name_col = col

    if id_col is None or name_col is None:
        print(f"‚ùå Couldn't find ID column ({id_col}) or Name column ({name_col})")
        return []

    for _, row in df.iterrows():
        universities.append({
            "university_id": row[id_col],
            "university_name": row[name_col],
            "university_financing_type_name": "–î–µ—Ä–∂–∞–≤–Ω–∞"  # We already filtered for state unis
        })
    print(f"‚úì Converted {len(universities)} universities to expected format")
    print(f"Using ID column: '{id_col}', Name column: '{name_col}'")

    return universities

# Convert our DataFrame if we have it
# Guard first: without a loaded list there is nothing to convert.
if 'uni_list_df' not in locals():
    print("No university data available to convert")
else:
    uni_list_converted = df_to_university_list(uni_list_df)
    print(f"\nSample converted university:")
    # Show the first converted entry, one "key: value" line per field.
    if uni_list_converted:
        sample = uni_list_converted[0]
        for key in sample:
            value = sample[key]
            print(f"  {key}: {value}")

In [8]:
# Cell 5.5 - FILTER AND EXPLORE THE DATA BETTER
# Requires `uni_list_df` from the previous cell; if that cell did not reach its
# success path this cell fails with NameError. Defines `universities_only` and
# `state_universities` for the cells that follow.
print("üîç Let's explore and filter the data better...")

# Check what categories of institutions we have
print("\n=== CATEGORIES OF INSTITUTIONS ===")
if '–ö–∞—Ç–µ–≥–æ—Ä—ñ—è –∑–∞–∫–ª–∞–¥—É –æ—Å–≤—ñ—Ç–∏' in uni_list_df.columns:
    categories = uni_list_df['–ö–∞—Ç–µ–≥–æ—Ä—ñ—è –∑–∞–∫–ª–∞–¥—É –æ—Å–≤—ñ—Ç–∏'].value_counts()
    print(categories)

    # Filter for higher education institutions (universities)
    # The keywords are joined into one case-insensitive regex alternation and
    # matched as substrings of the category text.
    # NOTE(review): the saved output of this cell reports 1419 rows kept, which
    # equals 521 + 694 + 204 - so the substring match also keeps the 694- and
    # 204-row categories whose names contain one of the keywords. Confirm that
    # including those categories is intended.
    university_keywords = ['–≤–∏—â–æ—ó –æ—Å–≤—ñ—Ç–∏', '—É–Ω—ñ–≤–µ—Ä—Å–∏—Ç–µ—Ç', '—ñ–Ω—Å—Ç–∏—Ç—É—Ç', '–∞–∫–∞–¥–µ–º—ñ—è']
    is_university = uni_list_df['–ö–∞—Ç–µ–≥–æ—Ä—ñ—è –∑–∞–∫–ª–∞–¥—É –æ—Å–≤—ñ—Ç–∏'].astype(str).str.contains('|'.join(university_keywords), case=False, na=False)

    universities_only = uni_list_df[is_university].copy()
    print(f"\n‚úì Filtered to {len(universities_only)} higher education institutions")
else:
    # If no category column, filter by name patterns
    print("No category column found, filtering by institution names...")
    university_keywords = ['—É–Ω—ñ–≤–µ—Ä—Å–∏—Ç–µ—Ç', '—ñ–Ω—Å—Ç–∏—Ç—É—Ç', '–∞–∫–∞–¥–µ–º—ñ—è', '–∫–æ–ª–µ–¥–∂']
    is_university = uni_list_df['–ù–∞–∑–≤–∞ –∑–∞–∫–ª–∞–¥—É –æ—Å–≤—ñ—Ç–∏'].astype(str).str.contains('|'.join(university_keywords), case=False, na=False)
    universities_only = uni_list_df[is_university].copy()
    print(f"‚úì Filtered to {len(universities_only)} institutions with university-like names")

# Check for state/private distinction
print(f"\n=== LOOKING FOR STATE VS PRIVATE DISTINCTION ===")
potential_ownership_columns = [col for col in uni_list_df.columns if any(keyword in col.lower() for keyword in ['–≤–ª–∞—Å–Ω', '—Ñ–æ—Ä–º–∞', '—Ç–∏–ø', '—Å—Ç–∞—Ç—É—Å', '—Ñ—ñ–Ω–∞–Ω—Å'])]
print(f"Potential ownership columns: {potential_ownership_columns}")

state_universities = universities_only  # Default to all universities
for col in potential_ownership_columns:
    print(f"\nValues in '{col}':")
    values = universities_only[col].value_counts().head(10)
    print(values)

    # Look for state indicators
    # Within one column the first indicator with any match wins; later
    # indicators are not combined with it.
    state_indicators = ['–¥–µ—Ä–∂–∞–≤–Ω', '–∫–æ–º—É–Ω–∞–ª—å–Ω', 'public', 'state']
    for indicator in state_indicators:
        mask = universities_only[col].astype(str).str.contains(indicator, case=False, na=False)
        if mask.sum() > 0:
            state_universities = universities_only[mask].copy()
            print(f"‚úì Found {len(state_universities)} state institutions using '{indicator}' in column '{col}'")
            break
    # Stop scanning columns only once a filter actually reduced the set; a
    # mask that matched every row lets the loop continue to the next column.
    if len(state_universities) < len(universities_only):
        break

print(f"\nüéØ FINAL SELECTION: {len(state_universities)} institutions to process")

# Show some examples
print(f"\n=== SAMPLE INSTITUTIONS TO PROCESS ===")
for i in range(min(5, len(state_universities))):
    row = state_universities.iloc[i]
    print(f"{i+1}. {row['–ù–∞–∑–≤–∞ –∑–∞–∫–ª–∞–¥—É –æ—Å–≤—ñ—Ç–∏']} (ID: {row['–ö–æ–¥']})")

# Convert to the format our collector expects
def df_to_university_list_v2(df):
    """Build the collector's input records from the filtered registry frame.

    Each row contributes one dict holding the institution code, its name and
    a fixed state-financing marker. The two source columns are addressed by
    their exact labels, so a frame lacking either one raises ``KeyError``.

    Returns:
        list of dicts, one per row, in row order.
    """
    return [
        {
            "university_id": record['–ö–æ–¥'],
            "university_name": record['–ù–∞–∑–≤–∞ –∑–∞–∫–ª–∞–¥—É –æ—Å–≤—ñ—Ç–∏'],
            "university_financing_type_name": "–î–µ—Ä–∂–∞–≤–Ω–∞",
        }
        for _index, record in df.iterrows()
    ]

# Convert our filtered data
# `final_university_list` is the list of dicts that the test and full
# collection cells slice and wrap for the collector.
final_university_list = df_to_university_list_v2(state_universities)
print(f"\n‚úÖ Converted {len(final_university_list)} universities to expected format")

# Create a mock response object to work with our existing collector
class MockResponse:
    """Minimal stand-in for a response object that wraps already-parsed data.

    Only `.json()` is provided, which is the one method the collectors'
    `collect_all_data` calls on the object it receives.
    """

    def __init__(self, data):
        # Stored as-is (no copy); `json()` hands back this same object.
        self.data = data

    def json(self):
        """Return the wrapped data unchanged."""
        return self.data

# `uni_list` is the object later passed to `collect_all_data` for the full run.
uni_list = MockResponse(final_university_list)
print("‚úÖ Created mock response object for collector")

print(f"\nüöÄ Ready to start collection with {len(final_university_list)} institutions!")

üîç Let's explore and filter the data better...

=== CATEGORIES OF INSTITUTIONS ===
–ö–∞—Ç–µ–≥–æ—Ä—ñ—è –∑–∞–∫–ª–∞–¥—É –æ—Å–≤—ñ—Ç–∏
–ó–∞–∫–ª–∞–¥ –ø—Ä–æ—Ñ–µ—Å—ñ–π–Ω–æ—ó (–ø—Ä–æ—Ñ–µ—Å—ñ–π–Ω–æ-—Ç–µ—Ö–Ω—ñ—á–Ω–æ—ó) –æ—Å–≤—ñ—Ç–∏                         1147
–ó–∞–∫–ª–∞–¥ —Ñ–∞—Ö–æ–≤–æ—ó –ø–µ—Ä–µ–¥–≤–∏—â–æ—ó –æ—Å–≤—ñ—Ç–∏                                          694
–ó–∞–∫–ª–∞–¥ –≤–∏—â–æ—ó –æ—Å–≤—ñ—Ç–∏                                                       521
–Ü–Ω—à–∏–π –∑–∞–∫–ª–∞–¥ –æ—Å–≤—ñ—Ç–∏, —â–æ –Ω–∞–¥–∞—î –ø—Ä–æ—Ñ–µ—Å—ñ–π–Ω—É (–ø—Ä–æ—Ñ–µ—Å—ñ–π–Ω–æ-—Ç–µ—Ö–Ω—ñ—á–Ω—É –æ—Å–≤—ñ—Ç—É)     318
–ù–∞—É–∫–æ–≤—ñ —ñ–Ω—Å—Ç–∏—Ç—É—Ç–∏ (—É—Å—Ç–∞–Ω–æ–≤–∏)                                              204
–ó–∞–∫–ª–∞–¥ –ø—ñ—Å–ª—è–¥–∏–ø–ª–æ–º–Ω–æ—ó –æ—Å–≤—ñ—Ç–∏                                               24
–ó–∞–∫–ª–∞–¥ –∑–∞–≥–∞–ª—å–Ω–æ—ó —Å–µ—Ä–µ–¥–Ω—å–æ—ó –æ—Å–≤—ñ—Ç–∏                                           4
Name: count, dtype: int64

‚úì Filtered to 1419 higher education institutions

=== LOOKING FOR S

In [9]:
# Cell 6 - UPDATED: Initialize Collector and Start Collection
# Runs either a 5-institution smoke test or the full collection with the basic
# collector. Both branches store their result in `univ_data` / `failed_unis`.

# Initialize the collector with conservative settings
collector = EDBODataCollector(
    max_workers=2,              # Keep low to be nice to server
    delay_between_requests=4    # 4 seconds between requests - be respectful!
)

# Let's start with just a small test first
TEST_MODE = True  # Set to False when you want to process all universities

if TEST_MODE:
    # Test with first 5 institutions
    test_data = final_university_list[:5]
    test_response = MockResponse(test_data)

    print(f"üß™ TEST MODE: Processing {len(test_data)} institutions first")
    for i, uni in enumerate(test_data):
        print(f"  {i+1}. {uni['university_name']} (ID: {uni['university_id']})")

    print(f"\nStarting TEST collection...")
    univ_data, failed_unis = collector.collect_all_data(test_response, use_simple_method=True)

    if not univ_data.empty:
        print(f"\nüéâ TEST SUCCESSFUL!")
        print(f"Collected {len(univ_data)} programs from test institutions")
        print(f"Sample data preview:")
        print(univ_data.head(2))

        print(f"\n‚úÖ Test worked! Set TEST_MODE = False to process all {len(final_university_list)} institutions")
        print(f"‚ö†Ô∏è  Full processing will take several hours - make sure you're ready!")
    else:
        print(f"\n‚ùå Test failed - let's debug before processing all institutions")

else:
    # Full processing mode
    print(f"üöÄ FULL MODE: Processing ALL {len(final_university_list)} institutions")
    print(f"‚è∞ This will take 3-4+ hours - go get dinner! üçΩÔ∏è")
    print(f"üí° You can interrupt anytime with Ctrl+C and restart later")

    # Confirm before starting
    # (`time` is already imported in the notebook's import cell; this repeat
    # import is harmless.)
    import time
    print(f"\nStarting in 10 seconds... (interrupt now if you want to test first)")
    # One-second ticks give the user a window to interrupt the kernel.
    for i in range(10, 0, -1):
        print(f"‚è≥ {i}...", end=" ", flush=True)
        time.sleep(1)
    print(f"\nüèÅ Starting full collection!")

    univ_data, failed_unis = collector.collect_all_data(uni_list, use_simple_method=True)

üß™ TEST MODE: Processing 5 institutions first
  1. –íi–¥–æ–∫—Ä–µ–º–ª–µ–Ω–∏–π —Å—Ç—Ä—É–∫—Ç—É—Ä–Ω–∏–π –øi–¥—Ä–æ–∑–¥i–ª ¬´–ö–∏—ó–≤—Å—å–∫–∏–π —Ñ–∞—Ö–æ–≤–∏–π –∫–æ–ª–µ–¥–∂ —Ä–µ—Å—Ç–æ—Ä–∞–Ω–Ω–æ–≥–æ –≥–æ—Å–ø–æ–¥–∞—Ä—Å—Ç–≤–∞ –ù–∞—Üi–æ–Ω–∞–ª—å–Ω–æ–≥–æ —É–Ωi–≤–µ—Ä—Å–∏—Ç–µ—Ç—É —Ö–∞—Ä—á–æ–≤–∏—Ö —Ç–µ—Ö–Ω–æ–ª–æ–≥i–π¬ª (ID: 1014)
  2. –í—ñ–¥–æ–∫—Ä–µ–º–ª–µ–Ω–∏–π —Å—Ç—Ä—É–∫—Ç—É—Ä–Ω–∏–π –ø—ñ–¥—Ä–æ–∑–¥—ñ–ª ¬´–ö–∏—ó–≤—Å—å–∫–∏–π —Ç–æ—Ä–≥–æ–≤–µ–ª—å–Ω–æ-–µ–∫–æ–Ω–æ–º—ñ—á–Ω–∏–π —Ñ–∞—Ö–æ–≤–∏–π –∫–æ–ª–µ–¥–∂ –î–µ—Ä–∂–∞–≤–Ω–æ–≥–æ —Ç–æ—Ä–≥–æ–≤–µ–ª—å–Ω–æ-–µ–∫–æ–Ω–æ–º—ñ—á–Ω–æ–≥–æ —É–Ω—ñ–≤–µ—Ä—Å–∏—Ç–µ—Ç—É¬ª (ID: 6626)
  3. –í—ñ–¥–æ–∫—Ä–µ–º–ª–µ–Ω–∏–π —Å—Ç—Ä—É–∫—Ç—É—Ä–Ω–∏–π –ø—ñ–¥—Ä–æ–∑–¥—ñ–ª "–í—É–≥–ª–µ–¥–∞—Ä—Å—å–∫–∏–π —Ñ–∞—Ö–æ–≤–∏–π –∫–æ–ª–µ–¥–∂ –ú–∞—Ä—ñ—É–ø–æ–ª—å—Å—å–∫–æ–≥–æ –¥–µ—Ä–∂–∞–≤–Ω–æ–≥–æ —É–Ω—ñ–≤–µ—Ä—Å–∏—Ç–µ—Ç—É" (ID: 6502)
  4. –í—ñ–¥–æ–∫—Ä–µ–º–ª–µ–Ω–∏–π —Å—Ç—Ä—É–∫—Ç—É—Ä–Ω–∏–π –ø—ñ–¥—Ä–æ–∑–¥—ñ–ª "–Ü–Ω—Å—Ç–∏—Ç—É—Ç —ñ–Ω–Ω–æ–≤–∞—Ü—ñ–π–Ω–æ—ó –æ—Å–≤—ñ—Ç–∏ –ö–∏—ó–≤—Å—å–∫–æ–≥–æ –Ω–∞—Ü—ñ–æ–Ω–∞–ª—å–Ω–æ–≥–æ —É–Ω—ñ–≤–

In [10]:
# BEFORE STARTING FULL COLLECTION - RUN THIS CELL FIRST
# Prints a status summary and chooses the output directory (`save_dir`) for the
# full run.

print("üöÄ PREPARATION FOR FULL COLLECTION")
print("=" * 50)

# Check our current status
# Report what the earlier collection cell actually produced (`univ_data`)
# instead of numbers copied from a previous session, which go stale as soon as
# the registry or the selection changes.
# NOTE(review): assumes `univ_data` holds the test-run result at this point.
_test_result = globals().get("univ_data")
if _test_result is not None and not _test_result.empty:
    print(f"‚úÖ Test completed: {len(_test_result)} programs from {_test_result['university_name'].nunique()} institutions")
else:
    print("‚ö† No test results found - run the test collection cell before the full run")
print(f"üéØ Ready to process: {len(final_university_list)} total institutions")
# 4 = seconds of delay between requests configured for the collector; the
# estimate counts only that waiting time.
print(f"‚è±Ô∏è  Estimated time: {len(final_university_list) * 4 / 60:.1f} minutes minimum")
# 30 programs per institution is a rough guess.
print(f"üíæ Expected data size: ~{len(final_university_list) * 30:,} programs (rough estimate)")

# Set up automatic saving during collection
import os
from datetime import datetime

# Create a timestamp for this collection run
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
save_dir = f"edbo_collection_{timestamp}"

print(f"\nüìÅ Will save data to: {save_dir}/")
print(f"üìä Backup files will be created every 100 institutions")

# Enhanced collector with auto-save capability
class EnhancedEDBOCollector(EDBODataCollector):
    """Collector that checkpoints its results to disk while it runs.

    In addition to fetching every state institution sequentially it:

    * writes a cumulative ``partial_backup_<N>_institutions.csv`` every
      ``save_every`` processed institutions,
    * writes ``final_results.csv`` when at least one institution succeeded,
    * writes ``failed_institutions.txt`` whenever any institution failed
      (also when nothing at all was collected),
    * on Ctrl+C, saves what has been collected so far and re-raises.
    """

    def __init__(self, *args, save_every=100, save_dir=None, **kwargs):
        """
        Args:
            *args, **kwargs: forwarded to ``EDBODataCollector.__init__``.
            save_every: number of processed institutions between backups; a
                falsy value (0 / None) disables periodic backups.
            save_dir: output directory; defaults to a timestamped
                ``edbo_collection_<YYYYmmdd_HHMMSS>`` name.
        """
        super().__init__(*args, **kwargs)
        self.save_every = save_every
        self.save_dir = save_dir or f"edbo_collection_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        # Create save directory
        if not os.path.exists(self.save_dir):
            # exist_ok covers the directory appearing between check and call.
            os.makedirs(self.save_dir, exist_ok=True)
            print(f"üìÅ Created directory: {self.save_dir}")

    def _save_backup(self, all_dataframes, processed):
        """Write everything collected so far to a partial-backup CSV; never raises."""
        try:
            partial_df = pd.concat(all_dataframes, ignore_index=True)
            filename = f"{self.save_dir}/partial_backup_{processed}_institutions.csv"
            partial_df.to_csv(filename, index=False, encoding='utf-8')
            print(f"  üíæ Backup saved: {len(partial_df)} programs to {filename}")
        except Exception as save_error:
            print(f"  ‚ö†Ô∏è Backup save failed: {save_error}")

    def _save_failed_list(self, failed_universities):
        """Write the names of the failed institutions to a text file."""
        failed_filename = f"{self.save_dir}/failed_institutions.txt"
        with open(failed_filename, 'w', encoding='utf-8') as f:
            f.write(f"Failed institutions ({len(failed_universities)} total):\n")
            for name in failed_universities:
                f.write(f"- {name}\n")

    def collect_all_data(self, uni_list_response, use_simple_method=True):
        """Enhanced collection with auto-save.

        Args:
            uni_list_response: object whose ``.json()`` returns a list of
                dicts with the keys ``university_id``, ``university_name``
                and ``university_financing_type_name``.
            use_simple_method: accepted for signature compatibility with the
                base collector; processing here is always sequential.

        Returns:
            ``(combined_df, failed_universities)``: all successful results
            concatenated row-wise (an empty DataFrame if none) and the names
            of the institutions whose fetch returned ``None``.

        Raises:
            KeyboardInterrupt: re-raised after the data collected so far has
                been written to the save directory.
        """
        state_universities = [
            uni for uni in uni_list_response.json()
            if uni.get("university_financing_type_name") == "–î–µ—Ä–∂–∞–≤–Ω–∞"
        ]

        print(f"Found {len(state_universities)} state universities")

        all_dataframes = []
        failed_universities = []
        total = len(state_universities)
        processed = 0  # institutions fully handled so far

        try:
            for i, uni in enumerate(state_universities):
                # str() so a non-string name (e.g. NaN) cannot break the slice.
                print(f"\n[{i+1}/{total}] Processing: {str(uni['university_name'])[:60]}...")

                result = self.fetch_university_data(uni)
                if result is not None:
                    all_dataframes.append(result)
                    print(f"  ‚úÖ Success: {len(result)} programs")
                else:
                    failed_universities.append(uni["university_name"])
                    print(f"  ‚ùå Failed")
                processed = i + 1

                # Auto-save every N institutions
                if self.save_every and processed % self.save_every == 0 and all_dataframes:
                    self._save_backup(all_dataframes, processed)

                # Progress summary every 50 institutions
                if processed % 50 == 0:
                    total_programs = sum(len(df) for df in all_dataframes)
                    success_rate = len(all_dataframes) / processed * 100
                    print(f"  üìä Progress: {total_programs:,} programs collected, {success_rate:.1f}% success rate")

                # Throttle between requests; nothing follows the last one.
                if processed < total:
                    time.sleep(self.delay_between_requests)
        except KeyboardInterrupt:
            # Honour the "interrupt anytime" promise: persist what we have
            # before handing the interrupt back to the caller.
            if all_dataframes:
                self._save_backup(all_dataframes, processed)
            if failed_universities:
                try:
                    self._save_failed_list(failed_universities)
                except OSError as save_error:
                    print(f"  ‚ö†Ô∏è Backup save failed: {save_error}")
            raise

        # Save failed institutions list (independent of whether any data was collected)
        if failed_universities:
            self._save_failed_list(failed_universities)

        # Final combining and saving
        if all_dataframes:
            combined_df = pd.concat(all_dataframes, ignore_index=True)

            # Save final results
            final_filename = f"{self.save_dir}/final_results.csv"
            combined_df.to_csv(final_filename, index=False, encoding='utf-8')

            print(f"\nüéâ COLLECTION COMPLETE!")
            print(f"üìä Final results: {len(combined_df):,} programs from {len(all_dataframes)} institutions")
            print(f"üíæ Saved to: {final_filename}")
            print(f"‚ùå Failed: {len(failed_universities)} institutions")

        else:
            combined_df = pd.DataFrame()
            print(f"\nüòû No data collected")

        return combined_df, failed_universities

# Create enhanced collector
# Instantiating it creates `save_dir` on disk if it does not exist yet.
enhanced_collector = EnhancedEDBOCollector(
    max_workers=2,
    delay_between_requests=4,
    save_every=100,  # Save backup every 100 institutions
    save_dir=save_dir
)

# The figures printed below (100 institutions, 4 seconds) are literal text and
# must be kept in sync with the arguments above by hand.
print(f"\n‚úÖ Enhanced collector ready!")
print(f"üìÅ Save directory: {save_dir}")
print(f"üíæ Auto-backup every 100 institutions")
print(f"‚è±Ô∏è  Delay: 4 seconds between requests")

print(f"\nüö® IMPORTANT REMINDERS:")
print(f"1. This will take 3-4+ hours")
print(f"2. Keep your computer/browser open")
print(f"3. You can interrupt with Ctrl+C anytime")
print(f"4. Data is auto-saved every 100 institutions")
print(f"5. Don't refresh the browser during collection")

print(f"\nüöÄ Ready to start? Run the next cell!")

üöÄ PREPARATION FOR FULL COLLECTION
‚úÖ Test completed: 138 programs from 5 institutions
üéØ Ready to process: 858 total institutions
‚è±Ô∏è  Estimated time: 57.2 minutes minimum
üíæ Expected data size: ~25,740 programs (rough estimate)

üìÅ Will save data to: edbo_collection_20250826_175246/
üìä Backup files will be created every 100 institutions
üìÅ Created directory: edbo_collection_20250826_175246

‚úÖ Enhanced collector ready!
üìÅ Save directory: edbo_collection_20250826_175246
üíæ Auto-backup every 100 institutions
‚è±Ô∏è  Delay: 4 seconds between requests

üö® IMPORTANT REMINDERS:
1. This will take 3-4+ hours
2. Keep your computer/browser open
3. You can interrupt with Ctrl+C anytime
4. Data is auto-saved every 100 institutions
5. Don't refresh the browser during collection

üöÄ Ready to start? Run the next cell!


In [None]:
# FINAL COLLECTION CELL - RUN WHEN READY
# Runs the full collection with `enhanced_collector` and stores the result in
# `final_data` / `failed_institutions`.

print("üöÄ STARTING FULL COLLECTION OF UKRAINIAN UNIVERSITY DATA")
print("=" * 60)

# Final confirmation
import time
print(f"üìä About to process: {len(final_university_list)} institutions")
# Estimate range: 4 to 6 seconds per institution.
print(f"‚è±Ô∏è  Estimated time: {len(final_university_list) * 4 / 60:.0f}-{len(final_university_list) * 6 / 60:.0f} minutes")
print(f"üíæ Auto-save directory: {enhanced_collector.save_dir}")

print(f"\n‚ö†Ô∏è  LAST CHANCE TO CANCEL!")
print(f"Starting in 15 seconds... Press Ctrl+C to cancel")

try:
    # One-second ticks give the user a window to interrupt the kernel.
    for i in range(15, 0, -1):
        print(f"‚è≥ {i}...", end=" ", flush=True)
        time.sleep(1)

    print(f"\n\nüèÅ COLLECTION STARTED!")
    print(f"üìÖ Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Start the full collection
    start_time = time.time()
    final_data, failed_institutions = enhanced_collector.collect_all_data(uni_list, use_simple_method=True)
    end_time = time.time()

    # Final summary
    duration = end_time - start_time
    print(f"\n" + "="*60)
    print(f"üéä COLLECTION COMPLETED!")
    print(f"‚è±Ô∏è  Duration: {duration/60:.1f} minutes ({duration/3600:.1f} hours)")
    print(f"üìä Programs collected: {len(final_data):,}")
    print(f"üè´ Successful institutions: {final_data['university_name'].nunique() if not final_data.empty else 0}")
    print(f"‚ùå Failed institutions: {len(failed_institutions)}")
    print(f"üíæ Data saved in: {enhanced_collector.save_dir}/")

    if not final_data.empty:
        print(f"\nüìà QUICK STATS:")
        print(f"- Average programs per institution: {len(final_data) / final_data['university_name'].nunique():.1f}")
        print(f"- Most common degree: {final_data['–û—Å–≤—ñ—Ç–Ω—ñ–π —Å—Ç—É–ø—ñ–Ω—å'].mode().iloc[0] if '–û—Å–≤—ñ—Ç–Ω—ñ–π —Å—Ç—É–ø—ñ–Ω—å' in final_data.columns else 'N/A'}")
        print(f"- Total specialties covered: {final_data['–ù–∞–∑–≤–∞ —Å–ø–µ—Ü—ñ–∞–ª—å–Ω–æ—Å—Ç—ñ'].nunique() if '–ù–∞–∑–≤–∞ —Å–ø–µ—Ü—ñ–∞–ª—å–Ω–æ—Å—Ç—ñ' in final_data.columns else 'N/A'}")

        # Quick preview
        # Select only the columns that exist: the stats above already treat
        # these two columns as optional, and an unguarded lookup would turn a
        # finished run into an "Unexpected error" report.
        preview_columns = [
            column
            for column in ['university_name', '–ù–∞–∑–≤–∞ —Å–ø–µ—Ü—ñ–∞–ª—å–Ω–æ—Å—Ç—ñ', '–û—Å–≤—ñ—Ç–Ω—ñ–π —Å—Ç—É–ø—ñ–Ω—å']
            if column in final_data.columns
        ]
        print(f"\nüìã SAMPLE DATA:")
        print(final_data[preview_columns].head())

        print(f"\n‚úÖ SUCCESS! Your Ukrainian university data is ready for analysis!")
    else:
        # Do not announce success when the run produced no rows.
        print(f"\n‚ùå No programs were collected - nothing to analyze")

except KeyboardInterrupt:
    print(f"\n\n‚èπÔ∏è  Collection interrupted by user")
    print(f"üíæ Check {enhanced_collector.save_dir}/ for any partial data saved")
    print(f"üîÑ You can resume by adjusting the university list and running again")

except Exception as e:
    print(f"\n\n‚ùå Unexpected error: {e}")
    print(f"üíæ Check {enhanced_collector.save_dir}/ for any partial data saved")

In [None]:
# Check what we got
# The full-run cell stores its result in `final_data`, while `univ_data` may
# still hold only the small test run. Prefer the full result when it exists
# and has rows; otherwise fall back to `univ_data`.
collected_df = globals().get("final_data")
if collected_df is None or collected_df.empty:
    collected_df = univ_data

if not collected_df.empty:
    print("=== COLLECTION SUMMARY ===")
    print(f"üìä Total programs: {len(collected_df):,}")
    print(f"üè´ Universities: {collected_df['university_name'].nunique()}")
    print(f"üìã Columns available: {len(collected_df.columns)}")

    print(f"\n=== SAMPLE DATA ===")
    print(collected_df.head())

    print(f"\n=== COLUMN NAMES ===")
    for i, col in enumerate(collected_df.columns):
        print(f"{i+1:2d}. {col}")
else:
    print("üòû No data collected")

In [None]:
# Save to different formats
# The full-run cell stores its result in `final_data`, while `univ_data` may
# still hold only the small test run. Export the full result when it exists
# and has rows; otherwise fall back to `univ_data`.
collected_df = globals().get("final_data")
if collected_df is None or collected_df.empty:
    collected_df = univ_data

if not collected_df.empty:
    # Save as CSV
    collected_df.to_csv('ukrainian_universities_data.csv', index=False, encoding='utf-8')
    print("‚úì Saved as CSV")

    # Save as Excel
    collected_df.to_excel('ukrainian_universities_data.xlsx', index=False)
    print("‚úì Saved as Excel")

    # Quick stats
    programs_per_university = collected_df.groupby('university_name').size()
    print(f"\n=== QUICK STATS ===")
    print(f"Most programs: {programs_per_university.max()}")
    print(f"Average programs per uni: {programs_per_university.mean():.1f}")

    # Download files to your computer
    # The download helper exists only inside Colab; elsewhere the files
    # written above simply stay in the working directory.
    try:
        from google.colab import files
    except ImportError:
        print("Not running in Colab - files were written to the current working directory")
    else:
        files.download('ukrainian_universities_data.csv')
else:
    print("No data to save")

In [None]:
# Explore the data you collected
# The full-run cell stores its result in `final_data`, while `univ_data` may
# still hold only the small test run. Explore the full result when it exists
# and has rows; otherwise fall back to `univ_data`.
collected_df = globals().get("final_data")
if collected_df is None or collected_df.empty:
    collected_df = univ_data

if not collected_df.empty:
    print("=== TOP UNIVERSITIES BY PROGRAM COUNT ===")
    top_unis = collected_df.groupby('university_name').size().sort_values(ascending=False).head(10)
    for uni, count in top_unis.items():
        print(f"{count:3d} programs - {uni}")

    print(f"\n=== SAMPLE PROGRAM INFO ===")
    sample_program = collected_df.iloc[0]
    for col in collected_df.columns[:10]:  # Show first 10 columns
        print(f"{col}: {sample_program[col]}")