# FabManager Data Scraper

This notebook provides an interactive interface to scrape different types of data from the FabManager platform using the Open API.

## Supported Data Types
- **Users**: All user accounts in the system
- **Machines**: All available machines/equipment
- **Reservations**: All bookings (automatically divided by type: Machine, Training, Event)
- **Trainings**: All training sessions

## Features
- Automatic pagination handling
- RFC-5988 compliant pagination
- Timestamped output files
- Clean JSON export (replaces the unusual Unicode line terminators U+2028 and U+2029 with regular newlines)
- Connection testing before scraping

## 1. Import Required Libraries

In [None]:
import requests
import json
import re
from datetime import datetime
from typing import Dict, List, Optional, Generator
from pathlib import Path
from collections import defaultdict
import logging

# Configure the root logger: INFO and above, each record rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Logger named after this module; the code cells below log through it.
logger = logging.getLogger(__name__)

print("✓ Libraries imported successfully")

## 2. Configuration

Set your FabManager API credentials here:

In [None]:
# FabManager API Configuration
# Replace both placeholder values before running the cells below.
BASE_URL = 'Insert_your_FabManager_URL_here'
API_TOKEN = 'API_Token_Goes_Here'

# Exported JSON files go into ./exports, created here if it is missing
EXPORT_DIR = Path.cwd().joinpath('exports')
EXPORT_DIR.mkdir(exist_ok=True)

print(
    "✓ Configuration set",
    f"  - Base URL: {BASE_URL}",
    f"  - Export directory: {EXPORT_DIR}",
    sep="\n",
)

## 3. Define API Client Class

In [None]:
class FabManagerAPIClient:
    """Client for interacting with the FabManager Open API.

    Holds a ``requests.Session`` preloaded with the token and JSON ``Accept``
    headers, and walks paginated endpoints by inspecting the ``Link``
    response header of each page.
    """

    # One link-value: "<target>" plus its parameters (everything up to the
    # next "<"). Anchoring on the brackets keeps commas inside URLs intact.
    _LINK_VALUE_RE = re.compile(r'<([^>]*)>([^<]*)')
    # The rel parameter of a link-value: double-quoted, single-quoted or bare.
    _REL_PARAM_RE = re.compile(
        r'''(?:^|;)\s*rel\s*=\s*(?:"([^"]*)"|'([^']*)'|([^;,\s]+))''',
        re.IGNORECASE,
    )

    def __init__(self, base_url: str, api_token: str, timeout: float = 30.0):
        """Create a client bound to one FabManager instance.

        Args:
            base_url: Root URL of the instance; a trailing slash is removed.
            api_token: Open API token sent in the ``Authorization`` header.
            timeout: Seconds to wait on each paginated data request before
                ``requests`` raises a timeout error.
        """
        self.base_url = base_url.rstrip('/')
        self.api_token = api_token
        self.timeout = timeout
        self.session = requests.Session()

        # Set default headers
        self.session.headers.update({
            'Authorization': f'Token token={self.api_token}',
            'Accept': 'application/json'
        })

    def test_connection(self) -> tuple[bool, str]:
        """Test the API connection and authentication.

        Requests a single user record and maps the outcome to a message.

        Returns:
            ``(True, message)`` on HTTP 200, otherwise ``(False, reason)``.
            Request failures are reported in the message, never raised.
        """
        try:
            endpoint = f'{self.base_url}/open_api/v1/users'
            params = {'page': 1, 'per_page': 1}

            logger.info("Testing API connection...")
            response = self.session.get(endpoint, params=params, timeout=10)

            if response.status_code == 200:
                return True, "Connection successful"
            elif response.status_code == 401:
                return False, "Authentication failed - Invalid API token"
            elif response.status_code == 403:
                return False, "Access forbidden - Check API permissions"
            elif response.status_code == 404:
                return False, "API endpoint not found - Check base URL"
            else:
                return False, f"Unexpected response: HTTP {response.status_code}"

        except requests.exceptions.ConnectionError:
            return False, "Connection error - Check base URL and internet connection"
        except requests.exceptions.Timeout:
            return False, "Connection timeout - Server not responding"
        except requests.exceptions.RequestException as e:
            return False, f"Request error: {e}"
        except Exception as e:
            # Last-resort boundary: this method reports problems as text.
            return False, f"Unexpected error: {e}"

    def _get_endpoint_data(self, endpoint: str, data_key: str, page: int = 1, per_page: int = 100) -> Dict:
        """Fetch one page from an endpoint.

        Args:
            endpoint: Path appended to the base URL.
            data_key: Key under which the JSON body holds the records; when
                the body is not a dict containing that key, the whole body
                is returned as the data.
            page: Page number to request.
            per_page: Page size to request.

        Returns:
            ``{'data': ..., 'pagination': ...}`` where ``pagination`` is the
            dict built by ``_extract_pagination_info``.

        Raises:
            requests.exceptions.RequestException: On connection failure,
                timeout or a non-success HTTP status (logged, then re-raised).
        """
        full_endpoint = f'{self.base_url}{endpoint}'
        params = {'page': page, 'per_page': per_page}

        try:
            logger.info("Fetching page %s from %s", page, endpoint)
            # Without a timeout a stalled server would block the scrape forever.
            response = self.session.get(full_endpoint, params=params, timeout=self.timeout)
            response.raise_for_status()

            pagination_info = self._extract_pagination_info(response.headers)
            response_data = response.json()

            if isinstance(response_data, dict) and data_key in response_data:
                data = response_data[data_key]
            else:
                data = response_data

            return {'data': data, 'pagination': pagination_info}

        except requests.exceptions.RequestException as e:
            logger.error("Error fetching data from %s: %s", endpoint, e)
            raise

    def _get_all_data(self, endpoint: str, data_key: str, per_page: int = 100, max_pages: Optional[int] = None) -> Generator[Dict, None, None]:
        """Yield every record of an endpoint, requesting pages lazily.

        Stops when a page is empty, when the response advertises no ``next``
        link, or once ``max_pages`` pages were read. A falsy ``max_pages``
        (``None`` or ``0``) means no limit.
        """
        page = 1

        while True:
            if max_pages and page > max_pages:
                logger.info("Reached max_pages limit (%s)", max_pages)
                break

            result = self._get_endpoint_data(endpoint, data_key, page, per_page)
            data = result['data']
            pagination = result['pagination']

            if data:
                yield from data

            if not pagination.get('has_next') or not data:
                logger.info("Fetched all pages (total: %s)", page)
                break

            page += 1

    def fetch_all_as_list(self, endpoint: str, data_key: str, per_page: int = 100, max_pages: Optional[int] = None) -> List[Dict]:
        """Fetch all data and return as a list."""
        all_data = list(self._get_all_data(endpoint, data_key, per_page, max_pages))
        logger.info("Total items fetched: %s", len(all_data))
        return all_data

    @staticmethod
    def _header_as_int(headers: Dict, name: str) -> Optional[int]:
        """Return header ``name`` as an int, or None when absent or non-numeric."""
        if name not in headers:
            return None
        try:
            return int(headers[name])
        except ValueError:
            # Best effort: an unparseable counter is treated as missing.
            return None

    def _extract_pagination_info(self, headers: Dict) -> Dict:
        """Extract pagination information from response headers.

        Reads ``Total``, ``Per-Page`` and ``Link``.

        Returns:
            Dict with keys ``total``, ``per_page`` (ints or None),
            ``has_next``, ``has_prev`` (bools) and ``links`` (rel -> URL).
        """
        pagination = {
            'total': self._header_as_int(headers, 'Total'),
            'per_page': self._header_as_int(headers, 'Per-Page'),
            'has_next': False,
            'has_prev': False,
            'links': {},
        }

        if 'Link' in headers:
            links = self._parse_link_header(headers['Link'])
            pagination['links'] = links
            pagination['has_next'] = 'next' in links
            pagination['has_prev'] = 'prev' in links

        return pagination

    def _parse_link_header(self, link_header: str) -> Dict[str, str]:
        """Parse an RFC 5988 / RFC 8288 ``Link`` header into ``{rel: url}``.

        Each ``<url>; rel="..."`` link-value contributes one entry per
        relation type, so ``rel="next last"`` registers both ``next`` and
        ``last``. Link-values without a ``rel`` parameter are ignored; when
        a relation repeats, the later link-value wins.
        """
        links: Dict[str, str] = {}

        for link_value in self._LINK_VALUE_RE.finditer(link_header):
            url = link_value.group(1).strip()
            rel_param = self._REL_PARAM_RE.search(link_value.group(2))
            if rel_param is None:
                continue

            # Exactly one of the three alternatives captured the value.
            rel_text = next(g for g in rel_param.groups() if g is not None)
            # rel may carry several space-separated relation types.
            for rel in rel_text.split():
                links[rel] = url

        return links

print("✓ FabManagerAPIClient class defined")

## 4. Define Helper Functions

In [None]:
def clean_data_for_json(data):
    """Return *data* with U+2028 / U+2029 in strings replaced by a newline.

    Dicts and lists are rebuilt recursively (dict keys are left untouched);
    any other value is returned unchanged.
    """
    if isinstance(data, dict):
        return {name: clean_data_for_json(entry) for name, entry in data.items()}
    if isinstance(data, list):
        return [clean_data_for_json(element) for element in data]
    if isinstance(data, str):
        # LINE SEPARATOR and PARAGRAPH SEPARATOR both become a plain newline
        separators = str.maketrans({'\u2028': '\n', '\u2029': '\n'})
        return data.translate(separators)
    return data

def sanitize_filename(name: str) -> str:
    """Replace every character outside ``A-Z a-z 0-9 _ . -`` with ``_``.

    An empty result is reported as ``"unknown"``.
    """
    unsafe_char = re.compile(r"[^A-Za-z0-9_.-]")
    cleaned = unsafe_char.sub("_", name)
    if cleaned:
        return cleaned
    return "unknown"

def save_data(data: List[Dict], data_key: str, filename_prefix: str, export_dir: Path) -> str:
    """Write *data* to a timestamped JSON file and return the file's path.

    The file is named
    ``FabManager_ExportedData_<filename_prefix>_<dd_mm_YYYY_HH-MM>.json``
    inside *export_dir* and holds a single object ``{data_key: [...]}``.
    Strings are passed through ``clean_data_for_json`` before writing.
    """
    stamp = datetime.now().strftime("%d_%m_%Y_%H-%M")
    filename = f"FabManager_ExportedData_{filename_prefix}_{stamp}.json"
    target = export_dir / filename

    logger.info("Cleaning data to remove unusual line terminators...")
    payload = {data_key: clean_data_for_json(data)}

    logger.info(f"Saving {len(data)} items to {filename}...")
    # newline='\n' keeps line endings identical on every platform
    with target.open('w', encoding='utf-8', newline='\n') as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)

    return str(target)

def divide_reservations_by_type(reservations: List[Dict], export_dir: Path) -> Dict[str, str]:
    """Write one JSON file per distinct reservation type.

    A reservation's type is its ``reservable_type``, else its ``type``, else
    ``"unknown"``; non-string values are converted with ``str``. Each file
    holds ``{"reservations": [...]}`` for one type and all files share the
    same timestamp.

    Returns:
        Mapping of type name to the path of the file written for it.
    """
    logger.info("Dividing reservations by type...")

    by_type = defaultdict(list)
    for entry in reservations:
        kind = entry.get("reservable_type") or entry.get("type") or "unknown"
        kind = kind if isinstance(kind, str) else str(kind)
        by_type[kind].append(entry)

    stamp = datetime.now().strftime("%d_%m_%Y_%H-%M")
    written = {}

    for kind, members in by_type.items():
        safe_kind = sanitize_filename(kind)
        filename = f"FabManager_ExportedData_Reservations_{safe_kind}_{stamp}.json"
        destination = export_dir / filename

        payload = {"reservations": clean_data_for_json(members)}
        with destination.open('w', encoding='utf-8', newline='\n') as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

        written[kind] = str(destination)
        logger.info(f"  - {kind}: {len(members)} items saved to {filename}")

    return written

print("✓ Helper functions defined")

## 5. Initialize API Client and Test Connection

In [None]:
# Build the client from the values set in the Configuration cell
print("Initializing FabManager API client...")
client = FabManagerAPIClient(BASE_URL, API_TOKEN)

# Probe the API once so URL or token problems show up before any scraping
print("Testing API connection...")
success, message = client.test_connection()

if success:
    print(f"✓ {message}", "✓ API client ready", "\nYou can now proceed to scrape data!", sep="\n")
else:
    print(f"\n✗ API Connection Failed: {message}")
    print("\nPlease check your configuration in the Configuration cell above.")

## 6. Scrape Data

Run the cells below to scrape different types of data. You can run them individually or all at once.

### 6.1 Scrape Users

In [None]:
print("=" * 60, "Scraping Users".center(60), "=" * 60, sep="\n")

try:
    # Walk every page of the users endpoint, 100 records per request
    users_data = client.fetch_all_as_list('/open_api/v1/users', 'users', per_page=100)

    print(f"\n✓ Total users fetched: {len(users_data)}")

    # Preview the first record, capped at 500 characters
    if users_data:
        print("\nSample user data:")
        print(f"{json.dumps(users_data[0], indent=2, ensure_ascii=False)[:500]}...")

    filepath = save_data(users_data, 'users', 'Users', EXPORT_DIR)
    print(f"\n✓ Successfully saved to: {filepath}")

except Exception as exc:
    # Notebook boundary: report the failure instead of raising a traceback
    print(f"\n✗ Error scraping users: {exc}")

### 6.2 Scrape Machines

In [None]:
print("=" * 60, "Scraping Machines".center(60), "=" * 60, sep="\n")

try:
    # Walk every page of the machines endpoint, 100 records per request
    machines_data = client.fetch_all_as_list('/open_api/v1/machines', 'machines', per_page=100)

    print(f"\n✓ Total machines fetched: {len(machines_data)}")

    # Preview the first record, capped at 500 characters
    if machines_data:
        print("\nSample machine data:")
        print(f"{json.dumps(machines_data[0], indent=2, ensure_ascii=False)[:500]}...")

    filepath = save_data(machines_data, 'machines', 'Machines', EXPORT_DIR)
    print(f"\n✓ Successfully saved to: {filepath}")

except Exception as exc:
    # Notebook boundary: report the failure instead of raising a traceback
    print(f"\n✗ Error scraping machines: {exc}")

### 6.3 Scrape Reservations

This will create **one file per reservation type** found in the data (typically three: Machine, Training, Event). Reservations without a type are saved under `unknown`.

In [None]:
print("=" * 60, "Scraping Reservations".center(60), "=" * 60, sep="\n")

try:
    # Walk every page of the reservations endpoint, 100 records per request
    reservations_data = client.fetch_all_as_list('/open_api/v1/reservations', 'reservations', per_page=100)

    print(f"\n✓ Total reservations fetched: {len(reservations_data)}")

    # Preview the first record, capped at 500 characters
    if reservations_data:
        print("\nSample reservation data:")
        print(f"{json.dumps(reservations_data[0], indent=2, ensure_ascii=False)[:500]}...")

    # One output file per reservation type found in the data
    print("\nDividing reservations by type and saving...")
    saved_files = divide_reservations_by_type(reservations_data, EXPORT_DIR)

    print("\n✓ Reservations divided and saved by type:")
    for rtype, filepath in saved_files.items():
        print(f"  - {rtype}: {filepath}")

except Exception as exc:
    # Notebook boundary: report the failure instead of raising a traceback
    print(f"\n✗ Error scraping reservations: {exc}")

### 6.4 Scrape Trainings

In [None]:
print("=" * 60, "Scraping Trainings".center(60), "=" * 60, sep="\n")

try:
    # Walk every page of the trainings endpoint, 100 records per request
    trainings_data = client.fetch_all_as_list('/open_api/v1/trainings', 'trainings', per_page=100)

    print(f"\n✓ Total trainings fetched: {len(trainings_data)}")

    # Preview the first record, capped at 500 characters
    if trainings_data:
        print("\nSample training data:")
        print(f"{json.dumps(trainings_data[0], indent=2, ensure_ascii=False)[:500]}...")

    filepath = save_data(trainings_data, 'trainings', 'Trainings', EXPORT_DIR)
    print(f"\n✓ Successfully saved to: {filepath}")

except Exception as exc:
    # Notebook boundary: report the failure instead of raising a traceback
    print(f"\n✗ Error scraping trainings: {exc}")

## 7. Scrape All Data Types at Once

Run this cell to scrape all data types in one go:

In [None]:
print("="*60)
print("Scraping ALL Data Types".center(60))
print("="*60)
print()

# One entry per endpoint: display name, API path, JSON key holding the
# records, and the prefix used in the export filename.
data_types = [
    {'name': 'Users', 'endpoint': '/open_api/v1/users', 'data_key': 'users', 'prefix': 'Users'},
    {'name': 'Machines', 'endpoint': '/open_api/v1/machines', 'data_key': 'machines', 'prefix': 'Machines'},
    {'name': 'Reservations', 'endpoint': '/open_api/v1/reservations', 'data_key': 'reservations', 'prefix': 'Reservations'},
    {'name': 'Trainings', 'endpoint': '/open_api/v1/trainings', 'data_key': 'trainings', 'prefix': 'Trainings'}
]

# Names of the data types whose fetch or save raised, so the closing banner
# can report the real outcome instead of always claiming success.
failed_types = []

for data_type in data_types:
    print(f"\n{'='*60}")
    print(f"Scraping {data_type['name']}...".center(60))
    print(f"{'='*60}\n")

    try:
        data = client.fetch_all_as_list(
            endpoint=data_type['endpoint'],
            data_key=data_type['data_key'],
            per_page=100
        )

        print(f"\n✓ Total {data_type['name'].lower()} fetched: {len(data)}")

        if data:
            print(f"\nSample data:")
            print(json.dumps(data[0], indent=2, ensure_ascii=False)[:300] + "...")

        # Special handling for reservations: one file per reservation type.
        # An empty reservation list falls through to the generic save below.
        if data_type['data_key'] == 'reservations' and data:
            print("\nDividing reservations by type and saving...")
            saved_files = divide_reservations_by_type(data, EXPORT_DIR)
            print(f"\n✓ Reservations divided and saved by type:")
            for rtype, filepath in saved_files.items():
                print(f"  - {rtype}: {filepath}")
        else:
            filepath = save_data(data, data_type['data_key'], data_type['prefix'], EXPORT_DIR)
            print(f"\n✓ Successfully saved to: {filepath}")

    except Exception as e:
        # Keep going with the remaining data types, but remember the failure.
        failed_types.append(data_type['name'])
        print(f"\n✗ Error scraping {data_type['name']}: {e}")

    print()

print(f"\n{'='*60}")
if failed_types:
    print(f"Completed with errors: {', '.join(failed_types)}".center(60))
else:
    print("All data types scraped successfully!".center(60))
print(f"{'='*60}")
print(f"\nAll files saved to: {EXPORT_DIR}")

## 8. Summary

Run this cell to see a summary of exported files:

In [None]:
print("Exported Files Summary", "=" * 60, f"Export directory: {EXPORT_DIR}\n", sep="\n")

if not EXPORT_DIR.exists():
    print("Export directory doesn't exist yet.")
else:
    # Newest first, ordered by modification time
    files = sorted(EXPORT_DIR.glob('*.json'), key=lambda x: x.stat().st_mtime, reverse=True)

    if not files:
        print("No exported files found yet.")
    else:
        print(f"Total files: {len(files)}\n")

        # List only the 20 most recently modified files
        for i, file in enumerate(files[:20], start=1):
            size = file.stat().st_size / 1024  # bytes -> KB
            modified = datetime.fromtimestamp(file.stat().st_mtime).strftime('%Y-%m-%d %H:%M:%S')
            print(f"{i:2d}. {file.name}", f"    Size: {size:.2f} KB | Modified: {modified}", sep="\n")

        if len(files) > 20:
            print(f"\n... and {len(files) - 20} more files")