In [37]:
# Install required libraries
!pip install spacy geopy pandas transformers sentence-transformers folium
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [38]:
import json
import pandas as pd
import spacy
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import shelve
import logging
from datetime import datetime

# Hugging Face transformers
from transformers import pipeline

# For semantic similarity (optional)
from sentence_transformers import SentenceTransformer, util

# For map visualization
import folium

from huggingface_hub import login
login("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")

# Configure logging to capture warnings and errors to a file and console
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s: %(message)s',
    handlers=[
        logging.FileHandler("cycling_tours_debug.log"),
        logging.StreamHandler()
    ]
)

# Initialize geolocator with a unique user agent
geolocator = Nominatim(user_agent="cycling_tour_app")

# Initialize Hugging Face pipelines
bert_ner = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

#"sk-proj-fePlCq2Vh56F1KglJDXbT3BlbkFJTdLnvyq7Y9mrinJFs1u3"
# Initialize Sentence Transformer model (optional for semantic similarity)
# sentence_model = SentenceTransformer('all-MiniLM-L6-v2')


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [39]:
import json
import pandas as pd
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)

# Define a mapping from abbreviated to full month names
month_abbreviation_mapping = {
    'Jan': 'January',
    'Feb': 'February',
    'Mar': 'March',
    'Apr': 'April',
    'May': 'May',
    'Jun': 'June',
    'Jul': 'July',
    'Aug': 'August',
    'Sep': 'September',
    'Oct': 'October',
    'Nov': 'November',
    'Dec': 'December'
}

def load_tours(json_file):
    """
    Loads cycling tours from a GeoJSON file and normalizes them into a pandas DataFrame.
    Consolidates multiple LineStrings into a single route and extracts start/end coordinates.

    Parameters:
        json_file (str): Path to the GeoJSON file.

    Returns:
        pd.DataFrame: DataFrame containing normalized tour data.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        try:
            tours_data = json.load(file)
            logging.info(f"Successfully loaded JSON data from {json_file}.")
        except json.JSONDecodeError as e:
            logging.error(f"Error decoding JSON: {e}")
            return pd.DataFrame()  # Return empty DataFrame on error

    features = tours_data.get('features', [])
    normalized_data = []
    skipped_features = 0  # Counter for skipped features

    for feature in features:
        properties = feature.get('properties', {})
        geometry = feature.get('geometry', {})
        feature_id = feature.get('id', 'Unknown ID')

        # Initialize list to hold LineString coordinates
        line_strings = []

        geom_type = geometry.get('type', None)

        if not geom_type:
            logging.warning(f"Feature ID {feature_id} skipped: Missing 'type' in geometry.")
            skipped_features += 1
            continue

        if geom_type == 'LineString':
            coordinates = geometry.get('coordinates', [])
            if not coordinates:
                logging.warning(f"Feature ID {feature_id} skipped: Empty 'coordinates' for LineString.")
                skipped_features += 1
                continue
            line_strings.append(coordinates)
        elif geom_type == 'GeometryCollection':
            geometries = geometry.get('geometries', [])
            if not geometries:
                logging.warning(f"Feature ID {feature_id} skipped: Empty 'geometries' in GeometryCollection.")
                skipped_features += 1
                continue
            for geom in geometries:
                if geom.get('type') == 'LineString':
                    coords = geom.get('coordinates', [])
                    if coords:
                        line_strings.append(coords)
            if not line_strings:
                logging.warning(f"Feature ID {feature_id} skipped: No LineString found in GeometryCollection.")
                skipped_features += 1
                continue
        else:
            logging.warning(f"Feature ID {feature_id} skipped: Unsupported geometry type '{geom_type}'.")
            skipped_features += 1
            continue

        # Consolidate all LineStrings into a single list of coordinates
        consolidated_coords = []
        for coords in line_strings:
            if len(coords) < 2:
                logging.warning(f"Feature ID {feature_id} skipped: LineString has fewer than 2 points.")
                skipped_features += 1
                continue
            # Reorder to (latitude, longitude) and ignore elevation if present
            for coord in coords:
                if len(coord) >= 2:
                    lat, lon = coord[1], coord[0]
                    consolidated_coords.append((lat, lon))
                else:
                    logging.warning(f"Feature ID {feature_id} has invalid coordinate format: {coord}")

        if len(consolidated_coords) < 2:
            logging.warning(f"Feature ID {feature_id} skipped: Consolidated route has fewer than 2 points.")
            skipped_features += 1
            continue

        # Extract month availability
        months = {
            'jan': properties.get('meta.month_jan', '0'),
            'feb': properties.get('meta.month_feb', '0'),
            'mar': properties.get('meta.month_mar', '0'),
            'apr': properties.get('meta.month_apr', '0'),
            'may': properties.get('meta.month_may', '0'),
            'jun': properties.get('meta.month_jun', '0'),
            'jul': properties.get('meta.month_jul', '0'),
            'aug': properties.get('meta.month_aug', '0'),
            'sep': properties.get('meta.month_sep', '0'),
            'oct': properties.get('meta.month_oct', '0'),
            'nov': properties.get('meta.month_nov', '0'),
            'dec': properties.get('meta.month_dec', '0'),
        }

        # Convert abbreviated month names to full names
        available_months = [
            month_abbreviation_mapping.get(month.capitalize(), month.capitalize())
            for month, open_status in months.items() if open_status == '1'
        ]

        # Compile normalized entry
        entry = {
            'id': feature_id,
            'name': properties.get('name', 'Unnamed Tour'),
            'coordinates': consolidated_coords,  # Consolidated coordinates
            'route_length_km': float(properties.get('meta.route_length', 0)),
            'elevation_gain_m': float(properties.get('meta.meter_up', 0)),  # Use 'meter_up' as elevation gain
            'max_altitude_m': float(properties.get('meta.max_altitude', 0)),  # Store max altitude if needed
            'difficulty_level': int(properties.get('meta.difficulty', -1)),  # Assuming 0: easy, 1: medium, 2: hard
            'tour_type': properties.get('meta.tour_type', 'Single'),  # e.g., 'Loop', 'Out-and-Back'
            'round_trip': properties.get('meta.round_trip', '0') == "1",
            'available_months': available_months,  # List of full month names the tour is available
            # Extract start and end coordinates for mapping
            'start_latitude': consolidated_coords[0][0],
            'start_longitude': consolidated_coords[0][1],
            'end_latitude': consolidated_coords[-1][0],
            'end_longitude': consolidated_coords[-1][1],
            # Add other relevant fields as needed
            'stamina': int(properties.get('meta.stamina', 0)),  # Assuming stamina levels
            'difficulty_ebike': int(properties.get('meta.difficulty_ebike', 0)),
            # Add more fields as necessary
        }

        normalized_data.append(entry)

    if skipped_features > 0:
        logging.info(f"Total features skipped due to errors: {skipped_features}")

    logging.info("Tours loaded successfully.")
    df = pd.DataFrame(normalized_data)

    # Verify that 'difficulty_level' was mapped correctly
    print("Unique 'difficulty_level' values after mapping:")
    print(df['difficulty_level'].unique())

    # Additional: Check if 'available_months' are correctly populated
    print("\nUnique 'available_months' combinations:")
    # Convert lists to tuples to make them hashable
    unique_available_months = df['available_months'].apply(tuple).unique()
    for months in unique_available_months:
        print(months)

    return df

# Path to your GeoJSON file
json_path = '/content/bike_tours_tyrol.geojson'  # Update this path as needed

# Load tours data
tours_df = load_tours(json_path)

# Display the first few rows of the DataFrame
if not tours_df.empty:
    print("\nSample of loaded tours after adjustments:")
    display(tours_df.head())
else:
    print("No valid tour data found.")




Unique 'difficulty_level' values after mapping:
[0 1 2]

Unique 'available_months' combinations:
('March', 'April', 'May', 'June', 'July', 'August', 'September', 'October')
('May', 'June', 'July', 'August', 'September', 'October')
('April', 'May', 'June', 'July', 'August', 'September', 'October')
()
('May', 'June', 'July', 'August', 'September')
('June', 'July', 'August', 'September', 'October')
('June', 'July', 'August', 'September')
('April', 'May', 'June', 'July', 'August', 'September', 'October', 'November')
('March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November')

Sample of loaded tours after adjustments:


Unnamed: 0,id,name,coordinates,route_length_km,elevation_gain_m,max_altitude_m,difficulty_level,tour_type,round_trip,available_months,start_latitude,start_longitude,end_latitude,end_longitude,stamina,difficulty_ebike
0,51997,A - Genussradeln in der Unteren Schranne,"[(47.58391, 12.16813), (47.58725, 12.16749), (...",28.3,100.0,542.0,0,Single,False,"[March, April, May, June, July, August, Septem...",47.58391,12.16813,47.58392,12.16814,1,1
1,72850,Langer Grund (MTB Route 226),"[(47.44473, 12.15846), (47.44455, 12.15869), (...",21.1,850.0,1440.0,0,Single,False,"[May, June, July, August, September, October]",47.44473,12.15846,47.29879,12.03792,2,0
2,72858,Gruberberg (MTB Route 274),"[(47.44452, 12.15874), (47.44419, 12.15906), (...",4.8,290.0,890.0,1,Single,False,"[May, June, July, August, September, October]",47.44452,12.15874,47.41754,12.17791,2,1
3,72862,Rundweg Möslalm - Pfaffenberg (MTB 337),"[(47.48013, 12.07169), (47.48032, 12.07219), (...",9.7,480.0,1020.0,2,Single,False,"[May, June, July, August, September, October]",47.48013,12.07169,47.48242,12.11526,1,0
4,72894,Haag Alm Runde (MTB Route 229),"[(47.44453, 12.15871), (47.44421, 12.15905), (...",28.0,950.0,1500.0,2,Single,True,"[May, June, July, August, September, October]",47.44453,12.15871,47.44454,12.15868,3,1


In [40]:
if not tours_df.empty:
    print("\nSample of loaded tours after adjustments:")
    display(tours_df[['name', 'route_length_km', 'elevation_gain_m', 'difficulty_level', 'available_months']].head(10))
else:
    print("No valid tour data found.")




Sample of loaded tours after adjustments:


Unnamed: 0,name,route_length_km,elevation_gain_m,difficulty_level,available_months
0,A - Genussradeln in der Unteren Schranne,28.3,100.0,0,"[March, April, May, June, July, August, Septem..."
1,Langer Grund (MTB Route 226),21.1,850.0,0,"[May, June, July, August, September, October]"
2,Gruberberg (MTB Route 274),4.8,290.0,1,"[May, June, July, August, September, October]"
3,Rundweg Möslalm - Pfaffenberg (MTB 337),9.7,480.0,2,"[May, June, July, August, September, October]"
4,Haag Alm Runde (MTB Route 229),28.0,950.0,2,"[May, June, July, August, September, October]"
5,Salvenbergrunde (MTB Route 269),15.8,725.0,2,"[May, June, July, August, September, October]"
6,Rennrad: Reintaler Seenrunde,62.8,218.0,1,"[May, June, July, August, September, October]"
7,Radtour: Kleine Wildschönaurunde,23.5,370.0,1,"[May, June, July, August, September, October]"
8,Brixentalradweg Nr. 21,44.1,360.0,0,"[May, June, July, August, September, October]"
9,Salvenradrunde,62.7,650.0,1,"[April, May, June, July, August, September, Oc..."


In [41]:
# Cell 4: Define Seasons and Their Corresponding Months

SEASONS = {
    'spring': ['spring', 'springs'],
    'summer': ['summer', 'summers'],
    'autumn': ['autumn', 'autumns', 'fall', 'falls'],
    'winter': ['winter', 'winters']
}

SEASON_MONTHS = {
    'spring': ['March', 'April', 'May'],
    'summer': ['June', 'July', 'August'],
    'autumn': ['September', 'October', 'November'],
    'winter': ['December', 'January', 'February']
}

VALID_MONTHS = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]

TYROLEAN_CITIES = [
    "Innsbruck", "Kufstein", "Seefeld", "Imst", "Hall in Tirol",
    "Sankt Johann in Tirol", "Schwaz", "Sterzing", "Brixlegg",
    "Matrei in Osttirol", "Rum", "Sölden", "Mayrhofen", "Axamer Lizum",
    "Telfs", "Hörsching", "Fügen", "Jenbach", "Bad Gastein", "Zirl",
    "Mittersill", "Pitztal", "Lienz", "Tirol", "Wattens"
]


In [42]:
import spacy
from spacy.pipeline import EntityRuler
import re
import logging

# Initialize spaCy English model
nlp = spacy.load('en_core_web_sm')

# Add the EntityRuler to the spaCy pipeline using its factory name
ruler = nlp.add_pipe("entity_ruler", before="ner")

# Define patterns for known cities
city_patterns = [{"label": "GPE", "pattern": city} for city in TYROLEAN_CITIES]

# Define patterns for elevation-related terms
elevation_patterns = [
    {"label": "ELEVATION_GAIN", "pattern": [{"LOWER": "elevation"}, {"LOWER": "gain"}]},
    {"label": "ELEVATION_GAIN", "pattern": [{"LOWER": "gain"}, {"LOWER": "in"}, {"LOWER": "elevation"}]},
    {"label": "ELEVATION_GAIN", "pattern": [{"LOWER": "uphill"}]},
    {"label": "ELEVATION_GAIN", "pattern": [{"LOWER": "ascend"}]},
    # Add more patterns as necessary
]

# Combine all patterns
all_patterns = city_patterns + elevation_patterns

# Add patterns to the EntityRuler
ruler.add_patterns(all_patterns)

In [43]:
# Cell 6: Define Difficulty Levels and User Types

# Define more descriptive candidate labels for zero-shot classification, excluding "None"
candidate_labels = [
    "Easy: Designed for beginners and those seeking a relaxed cycling experience.",
    "Medium: Geared towards intermediate cyclists looking for moderately challenging routes.",
    "Hard: Intended for experienced cyclists who desire challenging and strenuous routes."
]

# Mapping from descriptive labels to integer difficulty levels, excluding "None"
label_description_mapping = {
    "Easy: Designed for beginners and those seeking a relaxed cycling experience.": 0,
    "Medium: Geared towards intermediate cyclists looking for moderately challenging routes.": 1,
    "Hard: Intended for experienced cyclists who desire challenging and strenuous routes.": 2
}

user_type_to_difficulties = {
    'family': [0, 1],       # Easy and Medium
    'couple': [0, 1],       # Easy and Medium
    'kids': [0],            # Easy
    'elderly': [0],         # Easy
    'solo': [0, 1, 2],      # Easy, Medium, Hard
    'enthusiast': [1, 2],   # Medium and Hard
    'beginner': [0],        # Easy
    'competitive': [2],     # Hard
    'group': [0, 1, 2]      # Easy, Medium, Hard
}


In [44]:
# Cell 7: Map Difficulty Levels Using Zero-Shot Classification

import re
import logging

def map_difficulty(query, classifier, candidate_labels, label_mapping):
    """
    Maps user-specified difficulty terms from the query to predefined difficulty levels using descriptive label mapping.

    Parameters:
        query (str): The user's query.
        classifier: The initialized zero-shot classifier pipeline.
        candidate_labels (list): List of descriptive difficulty labels.
        label_mapping (dict): Mapping from descriptive labels to integer difficulty levels.

    Returns:
        list or None: A list of mapped difficulty levels ([0,1], etc.) or None if no specific difficulty is detected.
    """
    # Define phrases that explicitly indicate no specific difficulty
    general_phrases = [
        "for anyone", "anyone", "for everybody", "everybody",
        "no specific", "no preference", "any difficulty", "no requirements", "all",
        "list all", "show me all", "include all", "no particular", "without specifying"
    ]

    # Check for general phrases indicating no difficulty preference
    if any(phrase in query.lower() for phrase in general_phrases):
        return None  # Indicate that all difficulty levels should be included

    # Perform zero-shot classification using descriptive labels
    result = classifier(query, candidate_labels)

    # Extract labels and their scores
    labels = result['labels']
    scores = result['scores']

    # Log the classification result
    logging.info(f"Zero-shot classification result: {result}")

    # Initialize an empty list for difficulty levels
    difficulty_levels = []

    # Define a threshold for considering a label as relevant
    threshold = 0.3  # Adjust based on desired sensitivity

    for label, score in zip(labels, scores):
        if score >= threshold:
            level = label_mapping.get(label, None)
            if level is not None:
                difficulty_levels.append(level)

    # Post-processing overrides to ensure accurate difficulty mapping
    user_type_present = False
    for user_type in user_type_to_difficulties.keys():
        if user_type in query.lower():
            difficulty_levels = user_type_to_difficulties[user_type]
            user_type_present = True
            logging.info(f"Mapped difficulty levels based on user type '{user_type}': {difficulty_levels}")
            break

    if not user_type_present and difficulty_levels:
        # Remove duplicates and sort the list
        difficulty_levels = sorted(list(set(difficulty_levels)))

    if difficulty_levels:
        return difficulty_levels
    else:
        return None  # If no relevant difficulty levels are found


In [45]:
# Cell 8: Define Utility Functions

import re
import logging

def capitalize_cities(query, cities):
    """
    Capitalizes the city names in the query to ensure accurate entity recognition.

    Parameters:
        query (str): The user input query.
        cities (list): List of known city names.

    Returns:
        str: Query with city names capitalized.
    """
    for city in cities:
        # Use regex to match whole words, case-insensitive
        query = re.sub(r'\b' + re.escape(city.lower()) + r'\b', city, query, flags=re.IGNORECASE)
    return query

def remove_location_prefix(query):
    """
    Removes prefixes like 'City', 'Town', etc., from location names in the query.

    Parameters:
        query (str): The user input query.

    Returns:
        str: Query with location prefixes removed.
    """
    prefixes = ['city', 'town', 'village', 'municipality']
    for prefix in prefixes:
        pattern = rf'\b{prefix}\s+({ "|".join([re.escape(city) for city in TYROLEAN_CITIES]) })\b'
        query = re.sub(pattern, r'\1', query, flags=re.IGNORECASE)
    return query

def extract_months_and_seasons(query):
    """
    Extracts seasons and months from the user query, handling both singular and plural forms.

    Parameters:
        query (str): The user input query.

    Returns:
        list: A list of detected months.
    """
    detected_seasons = []
    detected_months = []

    # Tokenize the query using spaCy
    doc = nlp(query)

    for token in doc:
        token_lower = token.text.lower()
        # Check for seasons (handling plural forms)
        for season, variants in SEASONS.items():
            if token_lower in variants:
                detected_seasons.append(season)
                break  # Avoid multiple matches for the same token

        # Check for months
        if token.text.capitalize() in VALID_MONTHS:
            detected_months.append(token.text.capitalize())

    # Map detected seasons to their corresponding months
    mapped_season_months = []
    for season in detected_seasons:
        mapped_season_months.extend(SEASON_MONTHS.get(season, []))

    # Combine mapped season months with directly detected months
    combined_months = list(set(mapped_season_months + detected_months))

    return combined_months


In [46]:
# Cell 9: Parse User Query to Extract Criteria

import re
import logging

def parse_query(query):
    """
    Parses the user query to extract search criteria, including user type, difficulty levels, months, and seasons.

    Parameters:
        query (str): The natural language query input by the user.

    Returns:
        dict: Dictionary containing extracted criteria.
    """
    criteria = {
        'type': None,
        'location': None,  # Single location
        'user_type': None,  # e.g., 'family', 'couple', 'solo', 'kids', etc.
        'difficulty_levels': None,  # List of integers [0,1], [0], [1], etc.
        'preferred_month': None,  # Can be a single month or a list of months
        'min_route_length_km': None,
        'max_route_length_km': None,
        'min_elevation_gain': None,
        'max_elevation_gain': None
    }

    # Preprocessing: Capitalize city names and remove prefixes
    query = capitalize_cities(query, TYROLEAN_CITIES)
    query = remove_location_prefix(query)

    # 1. Entity Extraction using BERT-Based NER
    bert_entities = bert_ner(query)
    logging.info(f"BERT NER Results: {bert_entities}")

    # Extract location from BERT NER results
    bert_locations = [ent['word'] for ent in bert_entities if ent['entity_group'] == 'LOC']
    # Filter to include only known cities
    bert_locations = [loc for loc in bert_locations if loc in TYROLEAN_CITIES]
    if bert_locations:
        criteria['location'] = bert_locations[0]  # Take the first detected location
        logging.info(f"Detected location via BERT NER: {criteria['location']}")
    else:
        # 2. Fallback to spaCy's NER if BERT-Based NER fails to detect a location
        doc = nlp(query)
        spaCy_locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
        if spaCy_locations:
            criteria['location'] = spaCy_locations[0]  # Take the first detected location
            logging.info(f"Detected location via spaCy NER: {criteria['location']}")

    # 3. Numerical Extraction for Route Length
    # **Modified Line: Added 'over' to the operator group**
    route_length_pattern = r'\b(?:length|route length|distance|route)\b.*?(<|<=|>|>=|less than|more than|over)\s*([\d\.]+)\s*(km|kilometers|meters|m|metres|kms)'
    route_length_matches = re.findall(route_length_pattern, query.lower())
    for match in route_length_matches:
        operator, value, unit = match
        value = float(value)  # Allow decimal values
        unit = unit.lower()
        # Convert to kilometers if necessary
        if unit in ['meter', 'meters', 'm', 'metres']:
            value_km = value / 1000
        else:
            value_km = value  # Already in km
        # Map operator to min or max route length
        if operator in ['<', 'less than', '<=']:
            criteria['max_route_length_km'] = value_km
            logging.info(f"Detected maximum route length: {value_km} km")
        elif operator in ['>', 'more than', 'greater than', '>=', 'over']:
            criteria['min_route_length_km'] = value_km
            logging.info(f"Detected minimum route length: {value_km} km")

    # 4. Numerical Extraction for Elevation Gain
    # **Modified Line: Added 'over' to the operator group**
    elevation_gain_pattern = r'\b(?:elevation gain|gain in elevation|ascend|uphill)\b.*?(<|<=|>|>=|less than|more than|over)\s*([\d\.]+)\s*(km|kilometers|meters|m|metres)'
    elevation_gain_matches = re.findall(elevation_gain_pattern, query.lower())
    for match in elevation_gain_matches:
        operator, value, unit = match
        value = float(value)  # Allow decimal values
        unit = unit.lower()
        # Convert to meters if necessary
        if unit in ['km', 'kilometers']:
            value_m = value * 1000
        elif unit in ['meter', 'meters', 'm', 'metres']:
            value_m = value
        else:
            value_m = value  # Default to meters if unit unrecognized
        # Map operator to min or max elevation gain
        if operator in ['<', 'less than', '<=']:
            criteria['max_elevation_gain'] = value_m
            logging.info(f"Detected maximum elevation gain: {value_m} meters")
        elif operator in ['>', 'more than', 'greater than', '>=', 'over']:
            criteria['min_elevation_gain'] = value_m
            logging.info(f"Detected minimum elevation gain: {value_m} meters")

    # 5. Determine if any numerical constraints are present
    numerical_constraints = any([
        criteria['min_route_length_km'] is not None,
        criteria['max_route_length_km'] is not None,
        criteria['min_elevation_gain'] is not None,
        criteria['max_elevation_gain'] is not None
    ])

    # 6. User Type Detection
    user_type_keywords = {
        'family': ['family-friendly', 'families', 'family'],
        'couple': ['couples', 'couple'],
        'kids': ['kids', 'children', 'child-friendly'],
        'elderly': ['elderly', 'senior', 'seniors'],
        'solo': ['solo', 'individual', 'by myself', 'alone'],
        'enthusiast': ['enthusiast', 'enthusiastic', 'enthusiasts'],
        'beginner': ['beginner', 'new to cycling'],
        'competitive': ['competitive', 'race', 'training'],
        'group': ['group', 'friends', 'club']
    }

    for user_type, keywords in user_type_keywords.items():
        for keyword in keywords:
            if keyword in query.lower():
                criteria['user_type'] = user_type
                logging.info(f"Detected user type: {user_type}")
                break
        if criteria['user_type']:
            break

    # 7. Difficulty Classification based on user type and difficulty mentions
    if criteria['user_type'] and not numerical_constraints:
        # Map user_type to difficulty_levels
        criteria['difficulty_levels'] = user_type_to_difficulties.get(criteria['user_type'], None)
        logging.info(f"Mapped difficulty levels based on user type '{criteria['user_type']}': {criteria['difficulty_levels']}")
    elif not criteria['user_type'] and not numerical_constraints:
        # Proceed with existing difficulty classification
        CATEGORIES = candidate_labels  # Descriptive labels excluding "None"
        mapped_difficulty = map_difficulty(query, classifier, CATEGORIES, label_description_mapping)
        criteria['difficulty_levels'] = mapped_difficulty
        if mapped_difficulty is not None:
            logging.info(f"Mapped difficulty level(s): {mapped_difficulty}")
        else:
            logging.info("No specific difficulty detected; including all categories.")
    else:
        # If numerical constraints are present, set difficulty_levels to None
        criteria['difficulty_levels'] = None
        logging.info("Numerical constraints detected; setting 'difficulty_levels' to None to include all difficulty levels.")

    # 8. Detecting Seasons and Months using dedicated extraction
    detected_months = extract_months_and_seasons(query)

    # Set preferred_month
    if detected_months:
        criteria['preferred_month'] = detected_months
        logging.info(f"Set preferred_month: {criteria['preferred_month']}")
    else:
        criteria['preferred_month'] = None
        logging.info("No specific month or season detected.")

    # 9. Keyword-based extraction for tour type using regular expressions
    # Define a regex pattern to capture various forms of "loop" and "round-trip"
    tour_type_pattern = re.compile(r'\b(loop|looped|loops|round-trip|round trip|out-and-back|out and back)\b', re.IGNORECASE)

    # Update the tour type detection logic
    if tour_type_pattern.search(query):
        criteria['type'] = 'loop'
        logging.info(f"Detected tour type: loop")

    return criteria



In [47]:
# Cell 10: Geocoding with Caching

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import shelve
import logging

# Initialize geolocator with a unique user agent
geolocator = Nominatim(user_agent="cycling_tour_app")

# Implement rate limiter to avoid overwhelming the geocoding service
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

def get_coordinates_cached(city_name):
    """
    Retrieves the geographical coordinates for a given city name, using caching to minimize API calls.

    Parameters:
        city_name (str): The name of the city to geocode.

    Returns:
        tuple or None: (latitude, longitude) of the city or None if geocoding fails.
    """
    with shelve.open('geocoding_cache') as cache:
        if city_name in cache:
            return cache[city_name]
        else:
            location = geocode(city_name)
            if location:
                coords = (location.latitude, location.longitude)
                cache[city_name] = coords
                return coords
            else:
                logging.error(f"Geocoding failed for city: {city_name}")
                return None


In [48]:
# Cell 11: Check if Route is Near City

import logging

def is_route_near_city(route_coords, city_coords, radius_km=50):
    """
    Determines if any point in the route is within the specified radius of the city.

    Parameters:
        route_coords (list): List of coordinates defining the route.
        city_coords (tuple): (latitude, longitude) of the city.
        radius_km (int): Radius in kilometers.

    Returns:
        bool: True if any point is within the radius, else False.
    """
    for coord in route_coords:
        if not isinstance(coord, tuple) or len(coord) != 2:
            continue  # Skip invalid coordinate entries
        try:
            distance = geodesic(city_coords, coord).kilometers
            if distance <= radius_km:
                return True
        except ValueError as e:
            logging.error(f"Invalid coordinate {coord}: {e}")
            continue
    return False


In [49]:
# Cell 12: Ranking and Selecting Top Tours
import geopy.distance


def rank_and_select_top_n(filtered_df, n=5, query_location_coords=None):
    """
    Ranks the filtered tours based on proximity, route length, and elevation gain,
    and selects the top n tours.

    Parameters:
        filtered_df (pd.DataFrame): DataFrame containing filtered tours.
        n (int): Number of top tours to select.
        query_location_coords (tuple or None): Latitude and longitude of the query location.

    Returns:
        pd.DataFrame: DataFrame containing the top n ranked tours.
    """
    if query_location_coords:
        # Calculate distance for each tour based on proximity
        def calculate_min_distance(coords_list, query_location_coords):
            """Calculates the minimum distance between a query location and a list of coordinate pairs.

            Args:
                coords_list: A list of (latitude, longitude) tuples.
                query_location_coords: A (latitude, longitude) tuple representing the query location.

            Returns:
                The minimum distance in kilometers.
            """

            if not coords_list:
                return float('inf')  # Return infinity if the list is empty

            distances = [geopy.distance.distance(query_location_coords, coord).km for coord in coords_list]
            return min(distances)

        # Apply the function with both arguments using a lambda
        filtered_df['distance_km'] = filtered_df['coordinates'].apply(
            lambda coords: calculate_min_distance(coords, query_location_coords)
        )

    # Define sorting keys and order based on available criteria
    sort_keys = []
    ascending = []

    if query_location_coords and 'distance_km' in filtered_df.columns:
        sort_keys.append('distance_km')
        ascending.append(True)  # Closer distance preferred

    if 'route_length_km' in filtered_df.columns:
        sort_keys.append('route_length_km')
        ascending.append(False)  # Longer routes preferred

    if 'elevation_gain_m' in filtered_df.columns:
        sort_keys.append('elevation_gain_m')
        ascending.append(False)  # Higher elevation gain preferred

    if sort_keys:
        # Sort the DataFrame based on the defined keys and order
        filtered_df = filtered_df.sort_values(by=sort_keys, ascending=ascending)

    # Select the top n tours
    top_tours = filtered_df.head(n).copy()

    return top_tours


In [50]:
# Cell 13: Find Cycling Tours Based on Query

def find_cycling_tours(query, tours_df, radius_km=50, apply_season_filter=False, current_month=None):
    """
    Finds cycling tours based on the user's query and specified criteria.

    Parameters:
        query (str): The user input query.
        tours_df (pd.DataFrame): DataFrame containing cycling tours data.
        radius_km (float): Radius in kilometers for proximity filtering.
        apply_season_filter (bool): Whether to filter tours by season.
        current_month (str): The current month for seasonal filtering.

    Returns:
        pd.DataFrame: DataFrame containing the top 5 matched cycling tours.
    """
    # Parse the user query to extract search criteria
    criteria = parse_query(query)
    print(f"\nParsed Criteria: {criteria}")

    # Initialize the filtered DataFrame
    filtered_df = tours_df.copy()
    initial_count = len(filtered_df)
    print(f"Initial number of tours: {initial_count}")

    # Apply location filter if location is specified
    if criteria['location']:
        # Get coordinates of the location
        location_coords = get_coordinates_cached(criteria['location'])

        if location_coords is None:
            logging.error(f"Could not retrieve coordinates for location: {criteria['location']}")
            return pd.DataFrame()  # Return empty DataFrame if geocoding fails

        # Apply the proximity filter
        filtered_df = filtered_df[filtered_df['coordinates'].apply(
            lambda coords: is_route_near_city(coords, location_coords, radius_km)
        )]
        after_proximity = len(filtered_df)
        print(f"Number of tours after proximity filter: {after_proximity}")

    # Apply route length filter if specified
    if criteria['min_route_length_km']:
        filtered_df = filtered_df[filtered_df['route_length_km'] >= criteria['min_route_length_km']]
        after_min_length = len(filtered_df)
        print(f"Number of tours after applying minimum route length ({criteria['min_route_length_km']} km): {after_min_length}")
    if criteria['max_route_length_km']:
        filtered_df = filtered_df[filtered_df['route_length_km'] <= criteria['max_route_length_km']]
        after_max_length = len(filtered_df)
        print(f"Number of tours after applying maximum route length ({criteria['max_route_length_km']} km): {after_max_length}")

    # Apply elevation gain filter if specified
    if criteria['min_elevation_gain']:
        filtered_df = filtered_df[filtered_df['elevation_gain_m'] >= criteria['min_elevation_gain']]
        after_min_elevation = len(filtered_df)
        print(f"Number of tours after applying minimum elevation gain ({criteria['min_elevation_gain']} m): {after_min_elevation}")
    if criteria['max_elevation_gain']:
        filtered_df = filtered_df[filtered_df['elevation_gain_m'] <= criteria['max_elevation_gain']]
        after_max_elevation = len(filtered_df)
        print(f"Number of tours after applying maximum elevation gain ({criteria['max_elevation_gain']} m): {after_max_elevation}")

    # Apply difficulty level filter if specified
    if criteria['difficulty_levels'] is not None:
        filtered_df = filtered_df[filtered_df['difficulty_level'].isin(criteria['difficulty_levels'])]
        after_difficulty = len(filtered_df)
        print(f"Number of tours after applying difficulty levels {criteria['difficulty_levels']}: {after_difficulty}")

    # **Apply Additional Constraints Based on User Type**
    if criteria['user_type']:
        user_type = criteria['user_type']
        logging.info(f"Applying additional constraints for user type: {user_type}")

        # Define additional constraints for specific user types
        # Example constraints:
        # - 'family': route_length <= 15km for difficulty 0 and 1, elevation_gain <= 500m
        # - 'beginner': route_length <= 10km for difficulty 0, elevation_gain <= 300m
        additional_constraints = {}

        if user_type == 'family':
            # Apply route length and elevation gain constraints
            condition = (
                ((filtered_df['difficulty_level'] == 0) & (filtered_df['route_length_km'] <= 10)) |
                ((filtered_df['difficulty_level'] == 1) & (filtered_df['route_length_km'] <= 15))
            )
            filtered_df = filtered_df[condition & (filtered_df['elevation_gain_m'] <= 500)]
            logging.info("Applied family-specific constraints: route_length <=10km for difficulty 0 and <=15km for difficulty 1, elevation_gain <=500m")
            print(f"Number of tours after applying family-specific constraints: {len(filtered_df)}")

        elif user_type == 'beginner':
            condition = (
                (filtered_df['difficulty_level'] == 0) & (filtered_df['route_length_km'] <= 10)
            )
            filtered_df = filtered_df[condition & (filtered_df['elevation_gain_m'] <= 300)]
            logging.info("Applied beginner-specific constraints: route_length <=10km for difficulty 0, elevation_gain <=300m")
            print(f"Number of tours after applying beginner-specific constraints: {len(filtered_df)}")

        # Add more user types and their constraints as needed
        # elif user_type == 'another_type':
        #     ...

    # Apply season filter if specified
    if apply_season_filter and criteria['preferred_month']:
        # Since 'available_months' is a list of full month names, we can directly filter
        filtered_df = filtered_df[filtered_df['available_months'].apply(
            lambda months: any(month in months for month in criteria['preferred_month'])
        )]
        after_season = len(filtered_df)
        print(f"Number of tours after applying preferred months {criteria['preferred_month']}: {after_season}")

    # Apply tour type filter if specified
    if criteria['type']:
        filtered_df = filtered_df[filtered_df['tour_type'].str.lower() == criteria['type'].lower()]
        after_tour_type = len(filtered_df)
        print(f"Number of tours after applying tour type '{criteria['type']}': {after_tour_type}")

    # Determine ranking criteria based on presence of location, route length, and elevation gain
    ranking_present = False
    query_location_coords = get_coordinates_cached(criteria['location']) if criteria['location'] else None

    # Implement the ranking logic based on priority
    if query_location_coords:
        # If location is present, prioritize proximity
        ranking_present = True
    elif criteria['min_route_length_km'] or criteria['max_route_length_km']:
        # If route length is specified, prioritize longer routes
        ranking_present = 'route_length'
    elif criteria['min_elevation_gain'] or criteria['max_elevation_gain']:
        # If elevation gain is specified, prioritize higher elevation gains
        ranking_present = 'elevation_gain'

    # Rank and select top 5 tours based on the updated ranking logic
    top_tours = rank_and_select_top_n(filtered_df, n=5, query_location_coords=query_location_coords)
    final_count = len(top_tours)
    print(f"Number of top tours selected: {final_count}")

    return top_tours


In [51]:
def present_results(tours, location_coords=None):
    """
    Presents the matching cycling tours.

    Parameters:
        tours (pd.DataFrame): DataFrame containing matched tours.
        location_coords (tuple or None): Coordinates of the user's location for mapping.

    Returns:
        None
    """
    if tours.empty:
        print("No matching tours found based on your criteria.")
        return

    print("Matching Cycling Tours:\n")
    for idx, tour in tours.iterrows():
        print(f"Tour Name: {tour['name']}")
        print(f"Route Length: {tour['route_length_km']} km")
        print(f"Difficulty Level: {tour['difficulty_level']}")
        print(f"Elevation Gain: {tour['elevation_gain_m']} meters")
        print(f"Tour Type: {tour['tour_type']}")
        print(f"Round Trip: {'Yes' if tour['round_trip'] else 'No'}")
        print(f"Available Months: {', '.join(tour['available_months'])}")
        print(f"Coordinates: Start at ({tour['start_latitude']}, {tour['start_longitude']}), "
              f"End at ({tour['end_latitude']}, {tour['end_longitude']})\n")


In [52]:
# # Cell 15: Display Interactive Map

# def display_map(location_coords, tours_df):
#     """
#     Displays an interactive map with the user's location and matched cycling tours.

#     Parameters:
#         location_coords (tuple or None): (latitude, longitude) of the recognized location.
#         tours_df (pd.DataFrame): DataFrame containing the matched cycling tours.
#     """
#     if not location_coords or not all(location_coords):
#         print("No valid location coordinates to display on the map.")
#         return

#     # Initialize the map centered around the user's location
#     m = folium.Map(location=location_coords, zoom_start=12)

#     # Add a marker for the user's location
#     folium.Marker(
#         location=location_coords,
#         popup="Your Location",
#         icon=folium.Icon(color='blue', icon='user')
#     ).add_to(m)

#     # Add routes to the map
#     for index, row in tours_df.iterrows():
#         # Extract the route coordinates
#         route_coords = row['coordinates']

#         # Add the route as a polyline
#         folium.PolyLine(
#             locations=route_coords,
#             popup=row['name'],
#             color='green' if row['difficulty_level'].lower() == 'easy' else 'orange' if row['difficulty_level'].lower() == 'medium' else 'red',
#             weight=5
#         ).add_to(m)

#     # Display the map
#     display(m)


In [53]:
# Cell 15: Display Interactive Map

def display_map(location_coords, tours_df):
    """
    Displays an interactive map with the recognized location and matching tours, including route paths.

    Parameters:
        location_coords (tuple): (latitude, longitude) of the recognized location.
        tours_df (pd.DataFrame): DataFrame containing matching tours.
    """
    if not location_coords or not all(location_coords):
        print("No valid location coordinates to display on the map.")
        return

    # Initialize the map centered at the recognized location
    m = folium.Map(location=location_coords, zoom_start=12)

    # Add a marker for the recognized location with a larger icon
    folium.Marker(
        location=location_coords,
        popup="Search Location",
        icon=folium.Icon(color='red', icon='info-sign', icon_size=(40, 40))
    ).add_to(m)

    # Add markers and route paths for each tour
    for _, row in tours_df.iterrows():
        # Add start marker with smaller icon
        folium.Marker(
            location=(row['start_latitude'], row['start_longitude']),
            popup=f"Start: {row['name']}",
            icon=folium.Icon(color='green', icon='play', prefix='fa', icon_size=(30, 30))
        ).add_to(m)

        # Add end marker with smaller icon
        folium.Marker(
            location=(row['end_latitude'], row['end_longitude']),
            popup=f"End: {row['name']}",
            icon=folium.Icon(color='blue', icon='stop', prefix='fa', icon_size=(35, 35))
        ).add_to(m)

        # Add route path using PolyLine
        route_coords = row['coordinates']
        # Convert to (latitude, longitude) tuples
        route_latlon = [(coord[0], coord[1]) for coord in route_coords]

        folium.PolyLine(
            locations=route_latlon,
            color='blue',
            weight=2,
            opacity=0.6,
            tooltip=row['name']
        ).add_to(m)

    # Display the map
    display(m)


In [54]:
# Cell 16: Example Queries and Processing

# Example query with user type and no location
user_query_1 = "Find cycling routes near Landeck suitable for families."
# Example query with location and numerical constraints
user_query_2 = "Find a cycling tour near Kufstein with a route length more than 1.2km"

# Parameters for Query 1
radius_km_1 = 50  # Irrelevant as location is specified
apply_season_filter_1 = True
current_month_1 = None  # Not needed since preferred_month is extracted from query

# Process Query 1
print("\n--- Query 1:---")
try:
    # Find matching tours
    matching_tours_1 = find_cycling_tours(
        query=user_query_1,
        tours_df=tours_df,
        radius_km=radius_km_1,
        apply_season_filter=apply_season_filter_1,
        current_month=current_month_1
    )

    # Extract coordinates of the recognized location for presentation
    parsed_criteria_1 = parse_query(user_query_1)
    location_1 = parsed_criteria_1['location']
    if location_1:
        location_coords_1 = get_coordinates_cached(location_1)
    else:
        location_coords_1 = (None, None)

    # Present results
    present_results(matching_tours_1, location_coords_1)

    # Display on a map
    if location_coords_1 and all(location_coords_1):
        display_map(location_coords_1, matching_tours_1)
except Exception as e:
    logging.error(f"Error finding tours: {e}")

# Process Query 2
print("\n--- Query 2:---")
try:
    # Find matching tours
    matching_tours_2 = find_cycling_tours(
        query=user_query_2,
        tours_df=tours_df,
        radius_km=radius_km_1,
        apply_season_filter=True,
        current_month=current_month_1
    )

    # Extract coordinates of the recognized location for presentation
    parsed_criteria_2 = parse_query(user_query_2)
    location_2 = parsed_criteria_2['location']
    if location_2:
        location_coords_2 = get_coordinates_cached(location_2)
    else:
        location_coords_2 = (None, None)

    # Present results
    present_results(matching_tours_2, location_coords_2)

    # Display on a map
    if location_coords_2 and all(location_coords_2):
        display_map(location_coords_2, matching_tours_2)
except Exception as e:
    logging.error(f"Error finding tours: {e}")



--- Query 1:---

Parsed Criteria: {'type': None, 'location': 'Landeck', 'user_type': 'family', 'difficulty_levels': [0, 1], 'preferred_month': None, 'min_route_length_km': None, 'max_route_length_km': None, 'min_elevation_gain': None, 'max_elevation_gain': None}
Initial number of tours: 249
Number of tours after proximity filter: 43
Number of tours after applying difficulty levels [0, 1]: 32
Number of tours after applying family-specific constraints: 12
Number of top tours selected: 5
Matching Cycling Tours:

Tour Name: Fallwandweg/Tiefentalalm 612
Route Length: 4.7 km
Difficulty Level: 1
Elevation Gain: 500.0 meters
Tour Type: Single
Round Trip: No
Available Months: May, June, July, August, September, October
Coordinates: Start at (47.05962, 10.85135), End at (47.06229, 10.83038)

Tour Name: Arzler Alm Weg 611
Route Length: 3.9 km
Difficulty Level: 1
Elevation Gain: 500.0 meters
Tour Type: Single
Round Trip: No
Available Months: May, June, July, August, September, October
Coordinates


--- Query 2:---

Parsed Criteria: {'type': None, 'location': 'Kufstein', 'user_type': None, 'difficulty_levels': None, 'preferred_month': None, 'min_route_length_km': 1.2, 'max_route_length_km': None, 'min_elevation_gain': None, 'max_elevation_gain': None}
Initial number of tours: 249
Number of tours after proximity filter: 159
Number of tours after applying minimum route length (1.2 km): 159
Number of top tours selected: 5
Matching Cycling Tours:

Tour Name: 11 - Kitzbüheler Horn
Route Length: 129.8 km
Difficulty Level: 2
Elevation Gain: 1863.0 meters
Tour Type: Single
Round Trip: Yes
Available Months: May, June, July, August, September, October
Coordinates: Start at (47.58286, 12.16923), End at (47.58288, 12.16917)

Tour Name: 06A - Variante Hochtal-Runde
Route Length: 79.0 km
Difficulty Level: 1
Elevation Gain: 1348.0 meters
Tour Type: Single
Round Trip: Yes
Available Months: May, June, July, August, September, October
Coordinates: Start at (47.58288, 12.16896), End at (47.58293, 1

In [55]:
# Cell 17: Test Queries

# Test Query 2: Cycling routes near Innsbruck for summers
test_query_2 = "List me cycling tours near Kufstein with a route length > 20 km & elevation gain > 600 meters"
parsed_criteria_2 = parse_query(test_query_2)
print("\nParsed and Mapped Criteria for Query 2:")
print(parsed_criteria_2)

# Test Query 11: Family-Friendly Cycling Tour
test_query_11 = "Find cycling routes near Schwaz with route length greater than 30km and elevation gain over 800 meters."
parsed_criteria_11 = parse_query(test_query_11)
print("\nParsed and Mapped Criteria for Query 11:")
print(parsed_criteria_11)

# Test Query 11: Family-Friendly Cycling Tour
test_query_11 = "Find a cycling tour near Kufstein with a route length over 1.2km"
parsed_criteria_11 = parse_query(test_query_11)
print("\nParsed and Mapped Criteria for Query 11:")
print(parsed_criteria_11)



Parsed and Mapped Criteria for Query 2:
{'type': None, 'location': 'Kufstein', 'user_type': None, 'difficulty_levels': None, 'preferred_month': None, 'min_route_length_km': 20.0, 'max_route_length_km': None, 'min_elevation_gain': 600.0, 'max_elevation_gain': None}

Parsed and Mapped Criteria for Query 11:
{'type': None, 'location': 'Schwaz', 'user_type': None, 'difficulty_levels': None, 'preferred_month': None, 'min_route_length_km': 0.8, 'max_route_length_km': None, 'min_elevation_gain': 800.0, 'max_elevation_gain': None}

Parsed and Mapped Criteria for Query 11:
{'type': None, 'location': 'Kufstein', 'user_type': None, 'difficulty_levels': None, 'preferred_month': None, 'min_route_length_km': 1.2, 'max_route_length_km': None, 'min_elevation_gain': None, 'max_elevation_gain': None}


In [56]:
# Cell 18: Inspect Entity Detection

def inspect_BertNER_entities(query):
    """
    Inspects and prints the entities detected by Hugging Face's BERT-based NER pipeline in the given query.

    Parameters:
        query (str): The natural language query to inspect.
    """
    bert_results = bert_ner(query)
    print(f"\nQuery: '{query}'")
    print("Entities (BERT NER):")
    for ent in bert_results:
        print(f"  - {ent['word']} ({ent['entity_group']})")

def inspect_SpaCyNER_entities(query):
    """
    Inspects and prints the entities detected by spaCy's NER pipeline in the given query.

    Parameters:
        query (str): The natural language query to inspect.
    """
    doc = nlp(query)
    print(f"\nQuery: '{query}'")
    print("Entities (spaCy NER):")
    for ent in doc.ents:
        print(f"  - {ent.text} ({ent.label_})")

# Example usage
inspect_BertNER_entities("List me the looped cycling tours near the Hall of Tirol or Innsbruck for young people for the month of October")



Query: 'List me the looped cycling tours near the Hall of Tirol or Innsbruck for young people for the month of October'
Entities (BERT NER):
  - Hall of Tirol (LOC)
  - Innsbruck (LOC)


In [57]:
# Cell 19: Inspect NER with Another Example

# Example query
example_query = "List me the looped cycling tours near Wattens or Innsbruck for young people for the month of October"

# Process the query with spaCy
doc = nlp(example_query)

# Print detected entities
print("\nDetected Entities (spaCy NER):")
for ent in doc.ents:
    print(f"  - {ent.text} ({ent.label_})")



Detected Entities (spaCy NER):
  - Wattens (GPE)
  - Innsbruck (GPE)
  - the month of October (DATE)


In [58]:
# Cell 20: Summary of Loaded Data

if not tours_df.empty:
    print(f"\nTotal tours loaded: {len(tours_df)}")
    # Display summary statistics
    print("\nSummary Statistics:")
    print(tours_df.describe(include='all'))
else:
    print("\nNo valid tour data available for analysis.")



Total tours loaded: 249

Summary Statistics:
           id            name  \
count     249             249   
unique    249             248   
top     51997  Salvenradrunde   
freq        1               2   
mean      NaN             NaN   
std       NaN             NaN   
min       NaN             NaN   
25%       NaN             NaN   
50%       NaN             NaN   
75%       NaN             NaN   
max       NaN             NaN   

                                              coordinates  route_length_km  \
count                                                 249       249.000000   
unique                                                248              NaN   
top     [(47.40308, 11.56647), (47.40308, 11.56647), (...              NaN   
freq                                                    2              NaN   
mean                                                  NaN        34.698394   
std                                                   NaN        51.520927   
min        

In [59]:
# Cell 21: Comprehensive Testing of Updated System

def inspect_BertNER_entities(query):
    """
    Inspects and prints the entities detected by Hugging Face's BERT-based NER pipeline in the given query.

    Parameters:
        query (str): The natural language query to inspect.
    """
    bert_results = bert_ner(query)
    print(f"\nQuery: '{query}'")
    print("Entities (BERT NER):")
    for ent in bert_results:
        print(f"  - {ent['word']} ({ent['entity_group']})")

def inspect_SpaCyNER_entities(query):
    """
    Inspects and prints the entities detected by spaCy's NER pipeline in the given query.

    Parameters:
        query (str): The natural language query to inspect.
    """
    doc = nlp(query)
    print(f"\nQuery: '{query}'")
    print("Entities (spaCy NER):")
    for ent in doc.ents:
        print(f"  - {ent.text} ({ent.label_})")

# Example Test Queries
test_queries = [
    # 1. Basic Queries Without Location
    "List me all the cycling routes.",

    # 2. Seasonal Availability Queries
    "List me the cycling routes for summer.",
    "List me the cycling routes for fall.",
    "List me the cycling routes for falls.",  # Testing misspelled season

    # 3. Location-Based Queries
    "List me cycling tours near Kufstein where the elevation gain is more than 1.2km.",
    "Find a cycling tour near Innsbruck with a route length more than 15km.",
    "Find a cycling tour near Kufstein with a route length more than 10km and elevation gain less than 500 meters.",
    "Find a cycling tour near Schwaz with route length over 30km and elevation gain over 800 meters.",
    "Find cycling routes near Landeck suitable for families.",

    # 4. Difficulty Level and User Type Queries
    "Find family-friendly cycling routes.",
    "Show me beginner cycling tours with route length less than 10km.",
    "List experienced cycling routes near Landeck with elevation gain over 500m.",
    "Show me cycling tours for kids in summer.",
    "Find cycling routes for high stamina riders in winter.",

    # 5. Combined Criteria Queries
    "List me cycling tours near Kufstein with a route length more than 20km and elevation gain more than 600 meters.",
    "Find round-trip cycling routes available all year round.",
    "List me cycling tours for intermediate riders with route length between 15km and 25km.",
    "Find cycling routes that are loop tours available in September.",
    "Find all cycling routes for autumn with difficulty level medium.",

    # 6. Additional Comprehensive Queries
    "Find cycling tours near Innsbruck suitable for experienced riders with elevation gain less than 300 meters."
]


for query in test_queries:
    print(f"\n---\nQuery: {query}")
    parsed_criteria = parse_query(query)
    print("Parsed and Mapped Criteria:")
    print(parsed_criteria)

    # Find matching tours
    matching_tours = find_cycling_tours(query, tours_df, radius_km=50, apply_season_filter=True)
    present_results(matching_tours, get_coordinates_cached(parsed_criteria['location']) if parsed_criteria['location'] else None)

    # # Optionally, display the map
    # if parsed_criteria['location']:
    #     location_coords = get_coordinates_cached(parsed_criteria['location'])
    #     if location_coords:
    #         display_map(location_coords, matching_tours)



---
Query: List me all the cycling routes.
Parsed and Mapped Criteria:
{'type': None, 'location': None, 'user_type': None, 'difficulty_levels': None, 'preferred_month': None, 'min_route_length_km': None, 'max_route_length_km': None, 'min_elevation_gain': None, 'max_elevation_gain': None}

Parsed Criteria: {'type': None, 'location': None, 'user_type': None, 'difficulty_levels': None, 'preferred_month': None, 'min_route_length_km': None, 'max_route_length_km': None, 'min_elevation_gain': None, 'max_elevation_gain': None}
Initial number of tours: 249
Number of top tours selected: 5
Matching Cycling Tours:

Tour Name: Freundschaftsradroute München–Venedig
Route Length: 560.0 km
Difficulty Level: 2
Elevation Gain: 3000.0 meters
Tour Type: Single
Round Trip: No
Available Months: 
Coordinates: Start at (48.13966, 11.55902), End at (45.43834, 12.31863)

Tour Name: 13 - Glockner-Tour
Route Length: 298.9 km
Difficulty Level: 2
Elevation Gain: 4040.0 meters
Tour Type: Single
Round Trip: Yes
Avai