# Notebook 1: Batch Import

## 🎯 Objective

Create datasets and collections on NAKALA from CSV files.

## 📋 What This Notebook Does

1. Reads `folder_data_items.csv` and `folder_collections.csv`
2. Uploads files to NAKALA
3. Creates datasets with metadata
4. Creates collections linking datasets
5. **Auto-generates** CSVs for modification and deletion

## 🔄 Workflow

```
Input CSVs → Upload Files → Create Datasets → Create Collections → Generate CSVs for Next Steps
```

---

## Step 1: Setup and Imports

In [None]:
import sys
import os
from pathlib import Path

# Add parent directory to path to import nakala package
sys.path.insert(0, str(Path.cwd().parent))

# Import required libraries
import csv
import json
import logging
from typing import Dict

# Import from nakala package
from nakala import CsvConverter, API_URL, API_KEY
from nakala.api_client import upload_file as upload_file_to_nakala
from nakala.api_client import create_dataset as create_nakala_dataset
from nakala.api_client import create_collection as create_nakala_collection

# Confirm the setup cell ran and show which API endpoint was imported
print(
    "âœ“ Imports successful",
    f"âœ“ API URL: {API_URL}",
    "âœ“ Using test API key",
    sep="\n",
)

## Step 2: Configure Paths

In [None]:
# Directory layout, resolved relative to the parent of the current working directory
BASE_PATH = Path.cwd().parent
DATA_PATH = BASE_PATH.joinpath('data')
FILES_DIR = BASE_PATH.joinpath('files')

# CSVs read by the import steps
DATASETS_CSV = DATA_PATH.joinpath('folder_data_items.csv')
COLLECTIONS_CSV = DATA_PATH.joinpath('folder_collections.csv')

# CSVs the import steps write their results to
OUTPUT_DATASETS_CSV = DATA_PATH.joinpath('output_datasets.csv')
OUTPUT_COLLECTIONS_CSV = DATA_PATH.joinpath('output_collections.csv')

# Echo the resolved locations
print(
    f"âœ“ Base path: {BASE_PATH}",
    f"âœ“ Data path: {DATA_PATH}",
    f"âœ“ Files directory: {FILES_DIR}",
    sep="\n",
)

# Report whether each input CSV is present on disk
print("\nâœ“ Input CSVs:")
print(
    *(
        f"  - {p.name}: {'âœ“ exists' if p.exists() else 'âœ— missing'}"
        for p in (DATASETS_CSV, COLLECTIONS_CSV)
    ),
    sep="\n",
)

## Step 3: Preview Input CSVs

Let's see what we're going to import:

In [None]:
# Print a short summary (title, files, type, status) of every dataset row
divider = "=" * 80
print(divider)
print("DATASETS TO IMPORT (folder_data_items.csv)")
print(divider)

with open(DATASETS_CSV, 'r', encoding='utf-8') as csv_file:
    for index, record in enumerate(csv.DictReader(csv_file), start=1):
        print(f"\nDataset {index}:")
        # Titles are cut to 60 characters; the ellipsis is always appended
        title_preview = record.get('title', 'N/A')[:60]
        print(f"  Title: {title_preview}...")
        for label, column in (("Files", "file"), ("Type", "type"), ("Status", "status")):
            print(f"  {label}: {record.get(column, 'N/A')}")

In [None]:
# Print a short summary (title, data items, status) of every collection row
separator = "=" * 80
print("\n" + separator)
print("COLLECTIONS TO CREATE (folder_collections.csv)")
print(separator)

with open(COLLECTIONS_CSV, 'r', encoding='utf-8') as csv_file:
    for index, record in enumerate(csv.DictReader(csv_file), start=1):
        print(f"\nCollection {index}:")
        # Titles are cut to 60 characters; the ellipsis is always appended
        title_preview = record.get('title', 'N/A')[:60]
        print(f"  Title: {title_preview}...")
        for label, column in (("Data items", "data_items"), ("Status", "status")):
            print(f"  {label}: {record.get(column, 'N/A')}")

## Step 4: Import Datasets

This will:
1. Upload files to NAKALA
2. Create datasets with metadata
3. Save results to `output_datasets.csv`

In [None]:
def _upload_dataset_files(file_paths, api_key):
    """
    Upload files one by one, stopping at the first failed upload.

    Returns: tuple ``(files_metadata, output_files, failed_name)``.
        ``files_metadata`` holds the upload results of the files sent so far,
        ``output_files`` the matching entries for the results CSV, and
        ``failed_name`` is None when every upload succeeded, otherwise the
        name of the file whose upload returned a falsy result.
    """
    files_metadata = []
    output_files = []

    for file_path in file_paths:
        print(f"  Uploading: {file_path.name}")
        file_info = upload_file_to_nakala(file_path, api_key)

        if not file_info:
            print("    âœ— Upload failed")
            # Failed file is recorded by name only (no sha1 available)
            output_files.append(file_path.name)
            return files_metadata, output_files, file_path.name

        files_metadata.append(file_info)
        output_files.append(f"{file_path.name},{file_info['sha1']}")
        print(f"    âœ“ Uploaded: {file_info['sha1'][:16]}...")

    return files_metadata, output_files, None


def import_datasets(csv_path: Path, base_path: Path, api_key: str) -> Dict[str, str]:
    """
    Import datasets from CSV file, one dataset per row.

    For every row, the files named by the ``file`` column are uploaded, the
    row is converted to NAKALA metadata and a dataset is created. Exactly one
    result line per row (status ``OK`` or ``ERROR``) is written to
    ``OUTPUT_DATASETS_CSV``. A failure on one row is reported and does not
    stop the remaining rows.

    Args:
        csv_path: CSV file describing the datasets to create.
        base_path: Base directory passed to the converter to resolve files.
        api_key: API key passed to the upload and creation calls.

    Returns: Dictionary mapping folder paths (the row's ``file`` value) to
        dataset IDs, for the rows whose dataset was created (HTTP 201).
    """
    converter = CsvConverter()
    dataset_map: Dict[str, str] = {}  # folder_path -> dataset_id

    # The context manager closes the results file even if reading the input
    # fails; newline='' is what the csv module expects for files it writes.
    with open(OUTPUT_DATASETS_CSV, 'w', encoding='utf-8', newline='') as output:
        output_writer = csv.writer(output)
        output_writer.writerow(['dataset_id', 'files', 'title', 'folder_path', 'status', 'response'])

        print("=" * 80)
        print("STARTING DATASET IMPORT")
        print("=" * 80)

        with open(csv_path, 'r', encoding='utf-8', newline='') as f:
            reader = csv.DictReader(f)

            for row_num, row in enumerate(reader, 1):
                # DictReader yields None for cells missing from short rows
                title = row.get('title') or ''
                folder_path = row.get('file') or ''

                # Built before the try block so the except handler always
                # reports on *this* row, never on a stale or unbound record
                output_data = ['', '', title, folder_path, '', '']

                try:
                    print(f"\n{'=' * 80}")
                    print(f"ROW {row_num}: {(title or 'Untitled')[:50]}")
                    print(f"{'=' * 80}")

                    # Parse file paths
                    file_paths = converter.parse_files(folder_path, base_path)

                    if not file_paths:
                        print(f"âš  No files found for: {folder_path}")
                        output_data[4] = 'ERROR'
                        output_data[5] = 'No files found'
                        output_writer.writerow(output_data)
                        continue

                    print(f"âœ“ Found {len(file_paths)} file(s)")

                    # Upload files; a single failed upload aborts this row
                    files_metadata, output_files, failed_name = _upload_dataset_files(file_paths, api_key)
                    output_data[1] = ';'.join(output_files)

                    if failed_name is not None:
                        output_data[4] = 'ERROR'
                        output_data[5] = f'Failed to upload: {failed_name}'
                        output_writer.writerow(output_data)
                        continue

                    # Convert CSV row to NAKALA metadata
                    metas = converter.csv_row_to_nakala_metas(row)

                    # Build dataset JSON; a missing or blank status falls back to 'pending'
                    dataset = {
                        'status': (row.get('status') or '').strip() or 'pending',
                        'files': files_metadata,
                        'metas': metas
                    }

                    print(f"âœ“ Dataset JSON prepared ({len(metas)} metadata objects)")

                    # Create dataset on NAKALA
                    print("  Creating dataset on NAKALA...")
                    response = create_nakala_dataset(dataset, api_key)

                    if response.status_code == 201:
                        dataset_id = response.json()['payload']['id']
                        print(f"  âœ“ Dataset created successfully: {dataset_id}")

                        output_data[0] = dataset_id
                        output_data[4] = 'OK'
                        output_data[5] = response.text

                        # Store mapping
                        dataset_map[folder_path] = dataset_id
                    else:
                        print(f"  âœ— Dataset creation failed: {response.status_code}")
                        print(f"  Response: {response.text}")
                        output_data[4] = 'ERROR'
                        output_data[5] = response.text

                    output_writer.writerow(output_data)

                except Exception as e:
                    # Per-row boundary: record the failure and carry on with the next row
                    print(f"âœ— Error processing row {row_num}: {str(e)}")
                    output_data[4] = 'ERROR'
                    output_data[5] = str(e)
                    output_writer.writerow(output_data)

    print(f"\nâœ“ Dataset import complete. Results saved to: {OUTPUT_DATASETS_CSV.name}")

    return dataset_map

# Run the dataset import, then report how many datasets were created
dataset_map = import_datasets(csv_path=DATASETS_CSV, base_path=BASE_PATH, api_key=API_KEY)
print("\n" + "=" * 80)
print("SUMMARY: Created {} datasets".format(len(dataset_map)))
print("=" * 80)

## Step 5: Import Collections

This will:
1. Create collections with metadata
2. Link datasets to collections
3. Save results to `output_collections.csv`

In [None]:
def _build_collection_metas(converter, row):
    """Build the list of NAKALA metadata objects for one collection CSV row."""
    xsd_string = "http://www.w3.org/2001/XMLSchema#string"
    metas = []

    def with_lang(meta, lang):
        # The 'lang' key is only added when the parsed part carries a language
        if lang:
            meta['lang'] = lang
        return meta

    # Title (multilingual) - emitted without a typeUri
    if row.get('title'):
        for part in converter.parse_multilingual_field(row['title']):
            metas.append(with_lang({
                "propertyUri": converter.property_uris['title'],
                "value": part['value']
            }, part['lang']))

    # Description (multilingual)
    if row.get('description'):
        for part in converter.parse_multilingual_field(row['description']):
            metas.append(with_lang({
                "propertyUri": converter.property_uris['description'],
                "value": part['value'],
                "typeUri": xsd_string
            }, part['lang']))

    # Keywords: each language part may hold several values, one meta per keyword
    if row.get('keywords'):
        for part in converter.parse_multilingual_field(row['keywords']):
            for keyword in converter.parse_multiple_values(part['value']):
                metas.append(with_lang({
                    "propertyUri": converter.property_uris['subject'],
                    "value": keyword,
                    "typeUri": xsd_string
                }, part['lang']))

    # Language
    if row.get('language'):
        metas.append({
            "propertyUri": converter.property_uris['language'],
            "value": row['language'].strip(),
            "typeUri": xsd_string
        })

    # Creators
    if row.get('creator'):
        metas.extend(converter.parse_creator(row['creator']))

    return metas


def import_collections(csv_path: Path, dataset_map: Dict[str, str], api_key: str):
    """
    Import collections from CSV file, one collection per row.

    Each row is converted to NAKALA metadata, the folder paths listed in its
    ``data_items`` column ('|'-separated) are translated to dataset IDs through
    ``dataset_map``, and a collection is created. Exactly one result line per
    row (status ``OK`` or ``ERROR``) is written to ``OUTPUT_COLLECTIONS_CSV``.
    A failure on one row is reported and does not stop the remaining rows.

    Args:
        csv_path: CSV file describing the collections to create.
        dataset_map: Mapping of folder path -> dataset ID; folder paths that
            are not in the mapping are reported and left out of the collection.
        api_key: API key passed to the creation call.
    """
    converter = CsvConverter()

    # The context manager closes the results file even if reading the input
    # fails; newline='' is what the csv module expects for files it writes.
    with open(OUTPUT_COLLECTIONS_CSV, 'w', encoding='utf-8', newline='') as output:
        output_writer = csv.writer(output)
        output_writer.writerow(['collection_id', 'title', 'datasets', 'status', 'response'])

        print("\n" + "=" * 80)
        print("STARTING COLLECTION IMPORT")
        print("=" * 80)

        with open(csv_path, 'r', encoding='utf-8', newline='') as f:
            reader = csv.DictReader(f)

            for row_num, row in enumerate(reader, 1):
                # DictReader yields None for cells missing from short rows
                title = row.get('title') or ''

                # Built before the try block so the except handler always
                # reports on *this* row, never on a stale or unbound record
                output_data = ['', title, '', '', '']

                try:
                    print(f"\n{'=' * 80}")
                    print(f"COLLECTION {row_num}: {(title or 'Untitled')[:50]}")
                    print(f"{'=' * 80}")

                    # Convert CSV row to NAKALA metadata
                    metas = _build_collection_metas(converter, row)

                    # Map folder paths to dataset IDs
                    dataset_ids = []
                    data_items_str = row.get('data_items') or ''
                    for folder_path in (p.strip() for p in data_items_str.split('|')):
                        if not folder_path:
                            continue
                        if folder_path in dataset_map:
                            dataset_ids.append(dataset_map[folder_path])
                            print(f"  âœ“ Linked dataset: {dataset_map[folder_path]} ({folder_path})")
                        else:
                            print(f"  âš  Dataset not found for folder: {folder_path}")

                    output_data[2] = ' | '.join(dataset_ids)

                    # Build collection JSON; a missing or blank status falls back to 'private'
                    collection = {
                        'status': (row.get('status') or '').strip() or 'private',
                        'metas': metas
                    }

                    # Add datasets if any were found
                    if dataset_ids:
                        collection['datas'] = dataset_ids

                    print(f"âœ“ Collection JSON prepared ({len(metas)} metadata objects, {len(dataset_ids)} datasets)")

                    # Create collection on NAKALA
                    print("  Creating collection on NAKALA...")
                    response = create_nakala_collection(collection, api_key)

                    if response.status_code == 201:
                        collection_id = response.json()['payload']['id']
                        print(f"  âœ“ Collection created successfully: {collection_id}")

                        output_data[0] = collection_id
                        output_data[3] = 'OK'
                        output_data[4] = response.text
                    else:
                        print(f"  âœ— Collection creation failed: {response.status_code}")
                        print(f"  Response: {response.text}")
                        output_data[3] = 'ERROR'
                        output_data[4] = response.text

                    output_writer.writerow(output_data)

                except Exception as e:
                    # Per-row boundary: record the failure and carry on with the next row
                    print(f"âœ— Error processing collection {row_num}: {str(e)}")
                    output_data[3] = 'ERROR'
                    output_data[4] = str(e)
                    output_writer.writerow(output_data)

    print(f"\nâœ“ Collection import complete. Results saved to: {OUTPUT_COLLECTIONS_CSV.name}")

# Run the collection import unless the dataset import produced no datasets
if not dataset_map:
    print("âš  No datasets created, skipping collection import")
else:
    import_collections(csv_path=COLLECTIONS_CSV, dataset_map=dataset_map, api_key=API_KEY)

## Step 6: Generate CSVs for Next Steps

This will auto-generate:
- `modification_data_items.csv` - For modifying datasets (Notebook 2)
- `modification_collections.csv` - For modifying collections (Notebook 2)
- `delete_data_items.csv` - For deleting datasets (Notebook 3)
- `delete_collections.csv` - For deleting collections (Notebook 3)

In [None]:
def _folder_lookup_key(folder_path):
    """Return the folder path with a trailing '/' so index and lookup keys agree."""
    folder_path = folder_path or ''
    return folder_path if folder_path.endswith('/') else folder_path + '/'


def _read_csv_rows(path):
    """Return every row of a CSV file as a dict, or [] when the file does not exist."""
    if not path.exists():
        return []
    with open(path, 'r', encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))


def _append_marker(value, marker):
    """Append marker to value, or insert it before each '|' when value contains one."""
    # NOTE(review): when '|' is present the last segment gets no marker -
    # behaviour kept as-is; confirm whether every segment should be marked.
    value = value or ''
    return value.replace('|', marker + '|') if '|' in value else value + marker


def update_downstream_csvs():
    """
    Update modification and deletion CSVs with IDs from successful imports.

    Reads the rows with status ``OK`` from ``OUTPUT_DATASETS_CSV`` and
    ``OUTPUT_COLLECTIONS_CSV``, combines them with the original rows of
    ``DATASETS_CSV`` / ``COLLECTIONS_CSV`` and writes, into ``DATA_PATH``:

    - ``modification_data_items.csv`` and ``delete_data_items.csv`` when at
      least one dataset was imported,
    - ``modification_collections.csv`` and ``delete_collections.csv`` when at
      least one collection was imported.

    Nothing is written when no successful import is found.
    """
    print("\n" + "=" * 80)
    print("GENERATING CSVs FOR NEXT STEPS")
    print("=" * 80)

    # Parse successful imports from output CSVs
    dataset_ids = [
        {
            'dataset_id': row['dataset_id'],
            'folder_path': row.get('folder_path') or '',
            'title': row.get('title') or ''
        }
        for row in _read_csv_rows(OUTPUT_DATASETS_CSV)
        if row.get('status') == 'OK' and row.get('dataset_id')
    ]
    collection_ids = [
        {
            'collection_id': row['collection_id'],
            'title': row.get('title') or ''
        }
        for row in _read_csv_rows(OUTPUT_COLLECTIONS_CSV)
        if row.get('status') == 'OK' and row.get('collection_id')
    ]

    print(f"âœ“ Found {len(dataset_ids)} successfully imported datasets")
    print(f"âœ“ Found {len(collection_ids)} successfully imported collections")

    if not dataset_ids and not collection_ids:
        print("âš  No successful imports found, skipping CSV generation")
        return

    # Read original metadata from folder CSVs. Dataset rows are indexed by the
    # normalised folder path; the same normalisation is applied on lookup below.
    folder_datasets = {
        _folder_lookup_key(row.get('file', row.get('folder_name', ''))): row
        for row in _read_csv_rows(DATASETS_CSV)
    }
    folder_collections = {
        row.get('title', ''): row
        for row in _read_csv_rows(COLLECTIONS_CSV)
    }

    # Generate modification_data_items.csv
    if dataset_ids:
        mod_datasets_path = DATA_PATH / 'modification_data_items.csv'
        with open(mod_datasets_path, 'w', encoding='utf-8', newline='') as f:
            fieldnames = ['dataset_id', 'title', 'creator', 'date', 'license', 'type', 'description', 'keywords', 'status']
            writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
            writer.writeheader()

            for item in dataset_ids:
                # Normalise the lookup key exactly like the index keys; the raw
                # 'folder_path' from the results CSV may lack the trailing '/'
                original = folder_datasets.get(_folder_lookup_key(item['folder_path']), {})

                title = original.get('title') or item['title']

                writer.writerow({
                    'dataset_id': item['dataset_id'],
                    'title': _append_marker(title, ' v2'),
                    'creator': original.get('creator', ''),
                    'date': original.get('date', ''),
                    'license': original.get('license', ''),
                    'type': original.get('type', ''),
                    'description': _append_marker(original.get('description'), ' (updated)'),
                    'keywords': original.get('keywords', ''),
                    'status': 'pending'
                })

        print(f"âœ“ Generated: {mod_datasets_path.name}")

    # Generate modification_collections.csv
    if collection_ids:
        mod_collections_path = DATA_PATH / 'modification_collections.csv'
        with open(mod_collections_path, 'w', encoding='utf-8', newline='') as f:
            fieldnames = ['collection_id', 'title', 'description', 'keywords', 'status']
            writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
            writer.writeheader()

            for item in collection_ids:
                title = item['title']
                # Collections are matched to their original row by title
                original = folder_collections.get(title, {})

                writer.writerow({
                    'collection_id': item['collection_id'],
                    'title': _append_marker(title, ' v2'),
                    'description': _append_marker(original.get('description'), ' (updated)'),
                    'keywords': original.get('keywords', ''),
                    'status': 'private'
                })

        print(f"âœ“ Generated: {mod_collections_path.name}")

    # Generate delete_data_items.csv
    if dataset_ids:
        del_datasets_path = DATA_PATH / 'delete_data_items.csv'
        with open(del_datasets_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['dataset_id', 'confirm_delete'])
            writer.writerows([item['dataset_id'], 'YES'] for item in dataset_ids)

        print(f"âœ“ Generated: {del_datasets_path.name}")

    # Generate delete_collections.csv
    if collection_ids:
        del_collections_path = DATA_PATH / 'delete_collections.csv'
        with open(del_collections_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['collection_id', 'confirm_delete'])
            writer.writerows([item['collection_id'], 'YES'] for item in collection_ids)

        print(f"âœ“ Generated: {del_collections_path.name}")

    print("\n" + "=" * 80)
    print("CSV GENERATION COMPLETE")
    print("=" * 80)

# Execute CSV generation: builds the modification_* and delete_* CSVs
# from the rows marked OK in the import result CSVs
update_downstream_csvs()

## ✅ Summary

### What Was Created

Check the `data/` directory for:
- `output_datasets.csv` - Import results for datasets
- `output_collections.csv` - Import results for collections
- `modification_data_items.csv` - **Ready for Notebook 2**
- `modification_collections.csv` - **Ready for Notebook 2**
- `delete_data_items.csv` - **Ready for Notebook 3**
- `delete_collections.csv` - **Ready for Notebook 3**

### Next Steps

1. **Review** the generated `modification_*.csv` files
2. **Edit** them if you want to change metadata (e.g., modify titles, descriptions)
3. **Run** `2_batch_modify.ipynb` to apply your changes

---

**Note**: The generated CSVs have "v2" and "(updated)" markers as examples. You can edit these to any values you want before running Notebook 2!