# TravelPurpose Advanced Usage

This notebook covers advanced features of TravelPurpose, including custom source weights and classifiers, ontology exploration, pipeline execution, and data analysis.

In [None]:
from travelpurpose import predict_purpose, tags
from travelpurpose.utils.io import load_ontology, load_cities_data
from travelpurpose.utils.scoring import calculate_tag_weights, aggregate_scores_by_category
import pandas as pd
import json

## 1. Understanding Tag Sources and Weights

Examine how tags from different sources are weighted:

In [None]:
# Collect the tags for Paris with caching turned off
city_tags = tags("Paris", use_cache=False)

# Convert the collected tags into per-tag weights
tag_weights = calculate_tag_weights(city_tags)

# Order the (tag, weight) pairs from heaviest to lightest, then list the first 15
sorted_tags = list(tag_weights.items())
sorted_tags.sort(key=lambda pair: pair[1], reverse=True)
print("Top 15 weighted tags:")
for name, score in sorted_tags[:15]:
    print(f"{name:25} | Weight: {score:.2f}")

## 2. Custom Tag Source Weights

Customize weights for different sources:

In [None]:
# Per-source weights to pass in place of the defaults
custom_weights = dict(
    wikidata=2.0,  # Trust Wikidata more
    booking=1.2,
    agoda=1.0,
    trivago=0.8,
)

# Weight the same tags again, this time with the custom source weights
custom_tag_weights = calculate_tag_weights(city_tags, source_weights=custom_weights)

# Show both weightings side by side for the ten top-ranked tags
print("Comparison of default vs custom weights:")
for tag_name, _ranked_weight in sorted_tags[:10]:
    default_w = tag_weights.get(tag_name, 0)
    custom_w = custom_tag_weights.get(tag_name, 0)
    print(f"{tag_name:20} | Default: {default_w:.2f} | Custom: {custom_w:.2f}")

## 3. Exploring the Ontology Structure

In [None]:
ontology = load_ontology()

# Headline counts for each part of the ontology
print("Ontology Statistics:")
print(f"Main Categories: {len(ontology['main_categories'])}")
subcategory_total = sum(map(len, ontology['subcategories'].values()))
print(f"Total Subcategories: {subcategory_total}")
print(f"Tag Mappings: {len(ontology['tag_mappings'])}")

# Preview the first five main categories, listing up to three subcategories each
print("\nCategory Hierarchy:")
for category in ontology['main_categories'][:5]:
    children = ontology['subcategories'].get(category, [])
    print(f"\n{category} ({len(children)} subcategories):")
    for child in children[:3]:
        print(f"  - {child}")
    hidden = len(children) - 3
    if hidden > 0:
        print(f"  ... and {hidden} more")

## 4. Analyzing Tag-to-Category Mappings

In [None]:
# Roll the tag weights up into main-category and subcategory scores
tag_mappings = ontology.get('tag_mappings', {})
main_scores, sub_scores = aggregate_scores_by_category(tag_weights, tag_mappings)

# Print each score table highest-first: top 5 main categories, top 8 subcategories
for heading, scores, limit in (
    ("Main Category Scores:", main_scores, 5),
    ("\nTop Subcategory Scores:", sub_scores, 8),
):
    print(heading)
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    for cat, score in ranked[:limit]:
        print(f"{cat:25} | Score: {score:.3f}")

## 5. Dataset Analysis

Analyze the cities dataset (if available):

In [None]:
# Read the cities dataset; a None result is reported as "no dataset"
df = load_cities_data()

if df is None:
    print("No dataset available. Run the pipeline to generate data.")
else:
    # Size, column names, then a rich preview of the first rows
    print(f"Total cities: {len(df)}")
    print(f"\nColumns: {df.columns.tolist()}")
    print("\nSample:")
    display(df.head())

## 6. Running the Data Pipeline

Execute the data pipeline programmatically:

In [None]:
# Note: This will make network requests. Use sample_size to limit them.
# The call is commented out on purpose; uncomment the lines below to run it:

# from scripts.pipeline import run_pipeline
#
# run_pipeline(
#     nbd_path=None,
#     output_dir="./output",
#     min_population=200000,
#     sample_size=5  # Small sample for testing
# )

## 7. Custom Classification Logic

Build your own classifier using the library components:

In [None]:
from travelpurpose.utils.normalize import normalize_tag
from travelpurpose.utils.scoring import normalize_scores, select_top_labels

def custom_classifier(city_name: str, min_confidence: float = 0.20):
    """
    Classify a city with a caller-chosen confidence threshold.

    The city's tags are weighted, aggregated into main-category and
    subcategory scores via the ontology's tag mappings, normalized, and
    then filtered with ``select_top_labels``.

    Args:
        city_name: City to look up; passed to ``tags`` with caching enabled.
        min_confidence: Threshold handed to label selection for main
            categories (with ``max_labels=3``). Subcategories are selected
            with ``min_confidence * 0.8`` and ``max_labels=5``.

    Returns:
        None when ``tags`` returns an empty/falsy result. Otherwise a dict
        with the selected labels under 'main' and 'sub', and the selected
        (label, score) pairs as dicts under 'main_scores' and 'sub_scores'.
    """
    found_tags = tags(city_name, use_cache=True)
    if not found_tags:
        # Nothing to score for this city
        return None

    weights = calculate_tag_weights(found_tags)

    # Map weighted tags onto the ontology's categories
    mappings = load_ontology()['tag_mappings']
    raw_main, raw_sub = aggregate_scores_by_category(weights, mappings)

    # Normalize each level separately before applying the thresholds
    main_norm = normalize_scores(raw_main)
    sub_norm = normalize_scores(raw_sub)

    # Subcategories get a threshold at 80% of the main-category one
    top_main = select_top_labels(main_norm, threshold=min_confidence, max_labels=3)
    top_sub = select_top_labels(sub_norm, threshold=min_confidence * 0.8, max_labels=5)

    main_labels = [name for name, _ in top_main]
    sub_labels = [name for name, _ in top_sub]
    return dict(
        main=main_labels,
        sub=sub_labels,
        main_scores=dict(top_main),
        sub_scores=dict(top_sub),
    )

# Try the classifier on Dubai with a 0.25 threshold and pretty-print the outcome
result = custom_classifier("Dubai", min_confidence=0.25)
report = json.dumps(result, indent=2)
print(report)

## 8. Comparative Analysis

Compare classifications across similar cities:

In [None]:
# Destinations grouped under the label used as each section heading
city_groups = {
    "Beach Resorts": ["Antalya", "Cancun", "Phuket", "Maldives"],
    "Business Hubs": ["Singapore", "Frankfurt", "Dubai", "Hong Kong"],
    "Cultural Cities": ["Rome", "Athens", "Kyoto", "Jerusalem"],
}

# One row per city: its first two main categories and the reported confidence
for group_name in city_groups:
    print(f"\n{group_name}:")
    for city in city_groups[group_name]:
        prediction = predict_purpose(city, use_cache=True)
        top_two = prediction["main"][:2]
        main_cats = ", ".join(top_two)
        confidence = prediction["confidence"]
        print(f"  {city:15} | {main_cats:40} | Conf: {confidence:.2f}")

## Conclusion

This notebook demonstrated:
- Custom weighting of tag sources
- Ontology exploration and tag-to-category score aggregation
- Building custom classifiers
- Dataset analysis
- Comparative city analysis

For more information, see the [documentation](https://github.com/teyfikoz/Travel_Purpose-City_Tags).