# In [0]:  (Jupyter notebook cell marker kept as a comment so the file parses as Python)
#!/usr/bin/env python3
"""
Simple API Data Extraction Example
65,000 records in chunks of 100
"""

import pandas as pd
import requests
import time
from datetime import datetime
import math

#def extract_api_data_simple(base_url, headers=None, total_records=65000, chunk_size=100):
def _extract_records(payload):
    """
    Pull the list of records out of a decoded JSON payload.

    A bare list is returned unchanged. For a dict, the value under 'data'
    is used if that key is present, otherwise 'results', otherwise 'items'
    (defaulting to an empty list when none of the keys exist).

    Raises:
        TypeError: If the payload is neither a list nor a dict.
    """
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        raise TypeError(
            f"Unexpected response payload type: {type(payload).__name__}"
        )
    for key in ('data', 'results'):
        if key in payload:
            return payload[key]
    return payload.get('items', [])


def extract_api_data_simple(total_records=65000, chunk_size=100,
                            base_url=None, headers=None, *,
                            session=None, delay=0.1):
    """
    Extract API data in chunks using offset/limit pagination.

    One GET request is issued per chunk with 'offset' and 'limit' query
    parameters. Each non-empty chunk becomes a DataFrame tagged with
    'chunk_number' and 'extracted_at' columns, and all chunks are
    concatenated at the end.

    Extraction is best-effort: a chunk that fails (HTTP error, bad JSON,
    unexpected payload shape) is reported on stdout and skipped, and the
    remaining chunks are still fetched.

    Args:
        total_records: Total number of records to extract
        chunk_size: Records per API call (must be positive)
        base_url: API endpoint URL (required; placed after the size
            arguments so existing positional calls keep working)
        headers: HTTP headers (dict), or None
        session: Optional object exposing a requests-compatible
            ``get(url, params=..., headers=..., timeout=...)`` method,
            e.g. a ``requests.Session``. Defaults to the ``requests`` module.
        delay: Seconds to wait between consecutive requests

    Returns:
        pandas.DataFrame: All extracted data (empty if nothing was fetched)

    Raises:
        ValueError: If base_url is missing or chunk_size is not positive
    """
    if base_url is None:
        raise ValueError("base_url is required")
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Only touch the requests module when the caller did not inject a session.
    http = requests if session is None else session

    all_data = []
    failed_chunks = 0
    total_chunks = math.ceil(total_records / chunk_size)

    print(f"üöÄ Extracting {total_records} records in {total_chunks} chunks of {chunk_size}")

    for chunk in range(total_chunks):
        # Rate limiting: pause between requests (also after a failed chunk),
        # but not before the first request or after the last one.
        if chunk and delay > 0:
            time.sleep(delay)

        offset = chunk * chunk_size
        # The final chunk may be shorter than chunk_size.
        limit = min(chunk_size, total_records - offset)
        params = {
            'offset': offset,
            'limit': limit,
        }

        print(f"üì• Fetching chunk {chunk + 1}/{total_chunks} (records {offset}-{offset + limit - 1})")

        try:
            response = http.get(base_url, params=params, headers=headers, timeout=30)
            response.raise_for_status()
            records = _extract_records(response.json())
            chunk_df = pd.DataFrame(records) if records else None
        except Exception as exc:
            # Deliberately broad: one bad chunk must not abort the whole
            # extraction. The failure is reported and counted, not hidden.
            failed_chunks += 1
            print(f"‚ùå Error in chunk {chunk + 1}: {exc}")
            continue

        if chunk_df is None:
            print(f"‚ö†Ô∏è No records in chunk {chunk + 1}")
        else:
            # Add metadata so each row can be traced back to its request.
            chunk_df['chunk_number'] = chunk
            chunk_df['extracted_at'] = datetime.now()
            all_data.append(chunk_df)
            print(f"‚úÖ Processed {len(records)} records")

        # Progress update every 50 chunks
        if (chunk + 1) % 50 == 0:
            progress = ((chunk + 1) / total_chunks) * 100
            print(f"üìä Progress: {progress:.1f}%")

    if failed_chunks:
        print(f"Chunks failed: {failed_chunks}/{total_chunks}")

    # Combine all chunks
    if not all_data:
        print("üí• No data extracted!")
        return pd.DataFrame()

    final_df = pd.concat(all_data, ignore_index=True)
    print(f"üéâ Extraction complete! Total records: {len(final_df)}")
    return final_df

# =============================================================================
# USAGE EXAMPLES
# =============================================================================

if __name__ == "__main__":

    # ---- Example 1: template for your own endpoint -------------------------
    print("üìä API Data Extraction - 65,000 Records")
    print("=" * 50)

    # Placeholder endpoint and credentials. Replace both with real values
    # before enabling the disabled template below.
    API_URL = "https://your-api-endpoint.com/data"
    API_HEADERS = {
        'Authorization': 'Bearer your-token-here',
        'Content-Type': 'application/json',
    }

    # Disabled template: full extraction followed by a timestamped CSV dump.
    #
    # df = extract_api_data_simple(
    #     base_url=API_URL,
    #     headers=API_HEADERS,
    #     total_records=65000,
    #     chunk_size=100
    # )
    # if not df.empty:
    #     filename = f"api_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    #     df.to_csv(filename, index=False)
    #     print(f"üíæ Data saved to: {filename}")

    # ---- Example 2: small run against a public test API --------------------
    print("\nüß™ Testing with JSONPlaceholder API (100 records)")
    demo_request = {
        'base_url': "https://jsonplaceholder.typicode.com/posts",
        'total_records': 100,
        'chunk_size': 10,
    }
    test_df = extract_api_data_simple(**demo_request)

    # Print a short summary only when the run produced a non-empty frame.
    has_rows = not test_df.empty
    if has_rows:
        summary_lines = (
            "\nüìã Test Results:",
            f"Records: {len(test_df)}",
            f"Columns: {list(test_df.columns)}",
            "\nFirst 3 records:",
        )
        for line in summary_lines:
            print(line)
        print(test_df.head(3))


