In [None]:
# Authenticate with the Hugging Face Hub using a personal access token.
# NOTE(review): the token is blank in this copy of the notebook; supply it
# locally and avoid saving a real token together with the notebook.
HUGGINGFACE_TOKEN = ""
from huggingface_hub import login
login(token = HUGGINGFACE_TOKEN)

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
pip install transformers accelerate bitsandbytes

MISTRAL-LABELLED




In [None]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import random

# Define model repo
MODEL_NAME = "matteanedda/Itinerary_Selection_Mistral"#"matteanedda/mistral-7b-instruct-logical-itinerary-selection-4bit-v5"

# POI Type Mapping
TYPE_MAPPING = {
    "Museo": "museum",
    "FermataDellaMetropolitana": "metro_station",
    "GalleriaDArte": "art_gallery",
    "Monumento": "monument",
    "Scultura": "sculpture",
    "Teatro": "theater"
}

def initialize_model():
    """Load the causal language model and tokenizer named by ``MODEL_NAME``.

    No quantization arguments are passed here, so any quantization applied
    to the weights presumably comes from the configuration stored with the
    checkpoint itself (TODO confirm against the model repo).

    Returns:
        tuple: ``(model, tokenizer)``.

    Raises:
        SystemExit: with exit status 1 when the model or tokenizer cannot
            be loaded; the underlying error is printed first.
    """
    try:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            device_map="auto",
            trust_remote_code=True
        )
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    except Exception as e:
        print(f"Error initializing model: {e}")
        # Raise SystemExit directly instead of calling exit(): that helper is
        # installed by the `site` module for interactive sessions and would
        # report a successful (0) exit status for this failure.
        raise SystemExit(1) from e
    return model, tokenizer

def map_types_numeric(df):
    """Translate the POI type names of both connection endpoints.

    Despite the function name, the values are replaced by the string labels
    found in ``TYPE_MAPPING``; a type that is missing from the mapping
    becomes ``0``. The two columns are overwritten on the frame that was
    passed in, and that same frame is returned.
    """
    for column in ("From_Type", "To_Type"):
        translated = df[column].map(TYPE_MAPPING)
        df[column] = translated.fillna(0)
    return df

def preprocess_pois_numeric(df, allowed_types=None):
    """Filter connections by POI type, shuffle them and translate the types.

    Connections touching a 'FermataDellaMetropolitana' endpoint are never
    removed because of their metro endpoint: that type is always added to
    the allowed set. No duration-based filtering is done here.

    Parameters:
    df (DataFrame): Connection table with 'From_Type' and 'To_Type' columns
        holding the original (untranslated) POI type names.
    allowed_types (iterable of str): POI types to keep. If None or empty,
        all types are kept. The caller's collection is not modified.

    Returns:
    DataFrame: The kept rows in random order (unseeded shuffle) with a fresh
        0..n-1 index, after being passed through map_types_numeric.
    """
    if allowed_types:
        # Work on a private list so tuples/sets are accepted and the
        # caller's object is left untouched.
        allowed = list(allowed_types)
        if "FermataDellaMetropolitana" not in allowed:
            allowed.append("FermataDellaMetropolitana")

        # Keep a connection only when BOTH endpoints have an allowed type.
        both_allowed = df['From_Type'].isin(allowed) & df['To_Type'].isin(allowed)
        df = df[both_allowed]

    # Shuffle the rows and renumber them.
    df = df.sample(frac=1).reset_index(drop=True)

    return map_types_numeric(df)

def generate_prompt_numeric(filtered_df, interval, numb_itineraries, starting_poi, numb_pois_to_be_visited, additional_informations):
    """Build the itinerary-selection prompt for the model.

    Every row of ``filtered_df`` becomes one line of the
    ``[AVAILABLE CONNECTIONS]`` section, prefixed with ``METRO CONNECTION:``
    when either endpoint has type ``metro_station`` and with
    ``REGULAR CONNECTION:`` otherwise. The walking time is the
    'Duration (s)' value converted to whole minutes.

    Parameters:
    filtered_df (DataFrame): Connections with the columns 'From_Label',
        'From_Type', 'To_Label', 'To_Type' and 'Duration (s)'.
    interval (int): Target difference in total time between itineraries,
        in minutes (inserted verbatim into the prompt).
    numb_itineraries (int): Number of itineraries requested.
    starting_poi (str): Label of the POI every itinerary starts from.
    numb_pois_to_be_visited (int): Number of legs per itinerary; the prompt
        asks for this number minus one stops before the metro station.
    additional_informations (str): Accepted for interface compatibility;
        it is not inserted into the prompt.

    Returns:
    str: The complete prompt, ending with the ``[SOLUTION]`` marker.
    """
    # One pass over the table: only the connection lines are needed by the
    # prompt, so no auxiliary lookup structures are built.
    transitions = []
    for _, row in filtered_df.iterrows():
        is_metro_connection = row['From_Type'] == "metro_station" or row['To_Type'] == "metro_station"
        prefix = "METRO CONNECTION: " if is_metro_connection else "REGULAR CONNECTION: "
        transitions.append(
            f"{prefix}{row['From_Label']}, {row['From_Type']} - "
            f"{row['To_Label']}, {row['To_Type']} ----"
            f"WALKING TIME: {round(row['Duration (s)'] / 60)} min"
        )

    transitions_text = "\n".join(transitions)

    prompt = f"""Generate {numb_itineraries} distinct walking itineraries by finding EXACT string matches.

The itineraries must visit {numb_pois_to_be_visited -1} cultural locations before arriving at a metro_station

[CRITICAL RULES]
- ONLY use connections where your Current Location appears EXACTLY (including type)
- Each itinerary must visit exactly {numb_pois_to_be_visited - 1} points before reaching metro_station
- You CANNOT use connections that don't contain your Current Location
- METRO CONNECTION must not appear in intermediate legs connections
- The itineraries should have a total time difference of around {interval} min.

[STEP BY STEP PROCESS]
For each step:

1. CURRENT LOCATION CHECK
Take your Current Location
Search the connections for the EXACT string
Search REGULAR CONNECTION
You can ONLY use connections where the EXACT string appears
Do not consider METRO CONNECTION

2. VERIFY CONNECTION
When you find a matching connection in the provided list:
- Copy the FULL connection string including walking time from the provided list
- Verify your Current Location appears exactly as one of the two locations
- Verify the walking time has been copied correctly
- Take the other location as your Next Location

3. CONTINUE PATH
- Your Next Location becomes your new Current Location
- Repeat the process
- Stop reaching a metro station at step {numb_pois_to_be_visited}

**IMPORTANT**
1. The connection for Leg 1 must contain {starting_poi} since the itinerary start from there. Otherwise it's invalid
2. Only the last leg (Leg {numb_pois_to_be_visited}) must be a METRO CONNECTION, all the other LEGs must not.
3. When you look for the connection containing a certain POI, check the full list. Do not suppose the connection.
4. When you select the connection to be used keep the walking time as well. Do not generate the time by yourself
5. Make sure the connection is the same as listed in the connection list
6. METRO CONNECTION must not be considered for the intermediate leg in each itinerary.
7. Leg 1 connection must be unique across all itineraries
8. Total Time must differ by around {interval} min across the {numb_itineraries} itineraries.

[OUTPUT FORMAT]
Itinerary #[N]:
Leg 1:
Current Location: {starting_poi}, type
Looking for REGULAR CONNECTION containing exactly: "{starting_poi}, type";
Found Connection: "[paste entire connection containing {starting_poi}]"
→ Next Location: [other location from connection, type]

Leg 2:
Current Location: [previous Next Location, type]
Looking for REGULAR CONNECTION containing exactly: "[previous Next Location, type]";
Found Connection: "[paste entire connection]"
→ Next Location: [other location from connection, type]

.....
Final Leg (Leg {numb_pois_to_be_visited}):
Current Location: [previous Next Location, type]
Looking for METRO CONNECTION containing exactly: "[previous Next Location]";
VERIFICATION: This connection DOES contain metro_station (REQUIRED)
Found Connection: "[paste entire connection]"
Final location: [metro_station]

Total Time: [time leg 1 + time leg 2 + ..... + time leg {numb_pois_to_be_visited}] = h min
------
Make sure to follow the logic for the previous location and next location.
Make sure to find the exact connection from the connection list and do not generate a connection by yourself
Make sure METRO CONNECTION DOES NOT APPEAR IN ANY INTERMEDIATE LEG
Make sure to use type METRO CONNECTION only as final leg (destination of the itinerary)
Make sure to retrive also the walking time from the connection list
Make sure to select different connection string for leg 1 across the different itineraries
Make sure to correctly sum the time of each leg as total time
Make sure Leg 1 connection is unique across all itineraries
Make sure to show the maths for the total time
Make sure to have {starting_poi} in the first Leg connection
------

[AVAILABLE CONNECTIONS]
All the existing connections are listed below:
{transitions_text}

Build exactly {numb_itineraries} itineraries with {numb_pois_to_be_visited} legs starting from {starting_poi}
[SOLUTION]
"""
    return prompt


def get_model_response(model, tokenizer, prompt, max_tokens=2000):
    """Generate text for ``prompt`` and return the decoded output.

    The whole first sequence returned by ``model.generate`` is decoded; in
    the sample run recorded in this notebook that text begins with the
    prompt itself, followed by the model's continuation.

    Parameters:
    model: Model object exposing ``generate`` and ``device``.
    tokenizer: Tokenizer used to encode the prompt and decode the output.
    prompt (str): Prompt text.
    max_tokens (int): Upper bound on newly generated tokens.

    Returns:
    str or None: Decoded text, or None if any step failed (the error is
        printed).
    """
    try:
        # Send the inputs to the device the model was placed on instead of
        # assuming CUDA; the model is loaded with device_map="auto".
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=max_tokens)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error calling model: {e}")
        return None

def display_response(response):
    """Print a header, then the model output or a notice when there is none.

    An empty string and None both count as "no response".
    """
    print("\n--- Generated Itineraries ---\n")
    if response:
        print(response)
    else:
        print("No valid response received.")


In [None]:
def main():
    """Run one end-to-end itinerary generation and return the raw model text.

    The model is loaded first, then the connection table is read and
    filtered, the prompt is built, the model is queried and its answer is
    printed. All run settings are fixed inside this function.
    """
    # Source table of pairwise POI connections.
    df_path = r"/content/gdrive/MyDrive/BICOCCA/TESI/poi_distances_with_types_and_wikidata_POI_final_subset_20ina_POIs.csv"

    # POI types to keep. Valid entries: 'Teatro', 'Scultura', 'Monumento',
    # 'GalleriaDArte', 'Museo'; an empty list keeps every type.
    allowed_types = ['Monumento', 'Museo', 'GalleriaDArte']

    prompt_settings = dict(
        interval=8,                          # allowed duration difference (minutes)
        numb_itineraries=3,                  # number of itineraries
        starting_poi='Museo del Novecento',  # starting POI
        numb_pois_to_be_visited=4,
        additional_informations='',          # add additional infos as needed
    )

    model, tokenizer = initialize_model()

    connections = pd.read_csv(df_path)
    filtered_df = preprocess_pois_numeric(connections, allowed_types)

    prompt = generate_prompt_numeric(filtered_df, **prompt_settings)
    itinerary_response = get_model_response(model, tokenizer, prompt)

    display_response(itinerary_response)
    return itinerary_response


if __name__ == "__main__":
    # Keep the result in a module-level name so later cells can parse it.
    response = main()

Unused kwargs: ['quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.



--- Generated Itineraries ---

Generate 3 distinct walking itineraries by finding EXACT string matches.

The itineraries must visit 3 cultural locations before arriving at a metro_station

[CRITICAL RULES]
- ONLY use connections where your Current Location appears EXACTLY (including type)
- Each itinerary must visit exactly 3 points before reaching metro_station
- You CANNOT use connections that don't contain your Current Location
- METRO CONNECTION must not appear in intermediate legs connections
- The itineraries should have a total time difference of around 8 min.

[STEP BY STEP PROCESS]
For each step:

1. CURRENT LOCATION CHECK
Take your Current Location
Search the connections for the EXACT string
Search REGULAR CONNECTION
You can ONLY use connections where the EXACT string appears
Do not consider METRO CONNECTION

2. VERIFY CONNECTION
When you find a matching connection in the provided list:
- Copy the FULL connection string including walking time from the provided list
- Verify 

LLM DOMANDE


In [None]:
import pandas as pd
import re

# Column order of the frame returned by parse_itineraries.
_ITINERARY_COLUMNS = ['Itinerary', 'Poi_name', 'role', 'type', 'walking_time']

# A leg spans several lines, so these two patterns need DOTALL.
_LEG_RE = re.compile(
    r'Leg \d+:.*?(?=Leg \d+:|Final Leg \(Leg \d+\):|Total Time:|$)', re.DOTALL)
_FINAL_LEG_RE = re.compile(r'Final Leg \(Leg \d+\):.*?(?=Total Time:|$)', re.DOTALL)

# "<name>, <type>" fields sit on a single line. [^\n] stops a line that lacks
# the ", " separator from swallowing the lines that follow it.
_CURRENT_RE = re.compile(r'Current Location: ([^\n]*?), ([^\n]*?)(?=Looking|\n)')
_NEXT_RE = re.compile(r'→ Next Location: ([^\n]*?), ([^\n]*?)(?=\n|$)')
_FINAL_RE = re.compile(r'Final location: ([^\n]*?), ([^\n]*?)(?=\n|$)')
_WALK_RE = re.compile(r'WALKING TIME: (\d+) min')


def _name_and_type(match):
    """Return the stripped (name, type) pair captured by a location pattern."""
    return match.group(1).strip(), match.group(2).strip()


def _walking_minutes(leg_text):
    """Return the first 'WALKING TIME: N min' value in leg_text, or 0."""
    found = _WALK_RE.search(leg_text)
    return int(found.group(1)) if found else 0


def _collect_itinerary(data, itinerary_num, itinerary_text):
    """Append the Start/Stop rows of one itinerary to ``data``."""
    def add(name, poi_type, role, minutes):
        data.append({
            'Itinerary': itinerary_num,
            'Poi_name': name,
            'role': role,
            'type': poi_type,
            'walking_time': minutes,
        })

    # Regular legs first, then the "Final Leg (Leg N):" section if present.
    legs = _LEG_RE.findall(itinerary_text) + _FINAL_LEG_RE.findall(itinerary_text)

    for leg_index, leg in enumerate(legs):
        if leg_index == 0:
            start = _CURRENT_RE.search(leg)
            if start:
                # The starting point has no walking time of its own.
                add(*_name_and_type(start), 'Start', 0)

        following = _NEXT_RE.search(leg)
        if following:
            add(*_name_and_type(following), f'Stop{leg_index + 1}', _walking_minutes(leg))

    final = _FINAL_RE.search(itinerary_text)
    if final is None:
        return

    name, poi_type = _name_and_type(final)
    last = data[-1] if data else None
    already_listed = (
        last is not None
        and last['Itinerary'] == itinerary_num
        and last['Poi_name'] == name
        and last['type'] == poi_type
    )
    if not already_listed:
        # The final stop's walking time comes from the last leg found.
        minutes = _walking_minutes(legs[-1]) if legs else 0
        add(name, poi_type, f'Stop{len(legs)}', minutes)


def parse_itineraries(text):
    """Parse the model's itinerary text into one row per visited POI.

    Only the part after the first '[SOLUTION]' marker is parsed when the
    marker is present. Each 'Itinerary #N:' section yields a 'Start' row
    (from the first leg's Current Location), one 'StopK' row per
    '→ Next Location' line and a final row from 'Final location:'.

    Parameters:
    text (str or None): Raw model output. None or an empty string gives an
        empty result instead of raising.

    Returns:
    DataFrame: Columns 'Itinerary', 'Poi_name', 'role', 'type' and
        'walking_time' (minutes). The columns are present even when no
        itinerary could be parsed.
    """
    data = []

    if text:
        if '[SOLUTION]' in text:
            text = text.split('[SOLUTION]')[1]

        # re.split with one capture group gives
        # [preamble, number, body, number, body, ...]; drop the preamble.
        pieces = re.split(r'Itinerary #(\d+):', text)[1:]

        for number, body in zip(pieces[0::2], pieces[1::2]):
            try:
                _collect_itinerary(data, int(number.strip()), body)
            except (ValueError, IndexError) as e:
                print(f"Error processing itinerary: {e}")

    return pd.DataFrame(data, columns=_ITINERARY_COLUMNS)

In [None]:
df = parse_itineraries(response)
df

Unnamed: 0,Itinerary,Poi_name,role,type,walking_time
0,1,Museo del Novecento,Start,museum,0
1,1,Collections of the Ospedale Maggiore,Stop1,museum,9
2,1,Duomo di Milano Museum,Stop2,museum,8
3,1,Monument to Federico Borromeo,Stop3,monument,8
4,1,Missori,Stop4,metro_station,6
5,2,Museo del Novecento,Start,museum,0
6,2,Pinacoteca Ambrosiana,Stop1,art_gallery,5
7,2,Monument to Giuseppe Missori,Stop2,monument,5
8,2,San Fedele museum,Stop3,museum,10
9,2,San Babila,Stop4,metro_station,7


In [None]:
path_wikidata = r"/content/gdrive/MyDrive/BICOCCA/TESI/pois_with_wikidata.csv"
df_wikidata = pd.read_csv(path_wikidata)

In [None]:
df_wikidata

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,POI,TYPE,PLACE,WIKIDATA,LATITUDE,LONGITUDE,DISTANCE,wikidata_id,label
0,0,0,106481,Scultura,via-san-maurilio,Q56604977,45.46284,9.18291,0.583524,Q56604977,Grande bassorilievo
1,1,1,108187,Museo,via-guglielmo-marconi,Q261233,45.463414,9.190261,0.094433,Q261233,Museo del Novecento
2,2,2,115154,Scultura,piazza-cordusio,Q63633198,45.465456,9.186394,0.319298,Q63633198,Monument to Giuseppe Parini
3,3,3,119513,Museo,piazza-della-scala,Q55832575,45.467339,9.189588,0.345171,Q55832575,Leonardo3 Museum
4,4,4,108242,Museo,via-dell-ambrosiana,Q55376499,45.46343,9.185222,0.392464,Q55376499,Museo Mangini Bonomi
5,5,5,98652,Museo,universita-degli-studi-di-milano,Q17154436,45.460009,9.195317,0.622704,Q17154436,Collections of the Ospedale Maggiore
6,6,6,109486,GalleriaDArte,piazza-pio-xi,Q1085811,45.46378,9.185952,0.328911,Q1085811,Pinacoteca Ambrosiana
7,7,7,107078,Museo,cortile-del-palazzo-reale,Q51105,45.463054,9.191153,0.156383,Q51105,Royal Palace of Milan
8,8,8,120148,Monumento,piazza-belgioioso,Q109768623,45.467622,9.192375,0.413647,Q109768623,Monument to Cristina Trivulzio di Belgiojoso (...
9,9,9,110593,Teatro,piazza-cesare-beccaria,Q30880124,45.464135,9.194584,0.348843,Q30880124,Teatro Gerolamo


In [None]:
# Attach coordinates and the Wikidata reference to every parsed stop by
# matching the stop's Poi_name against the `label` column of df_wikidata.
# Left join: stops with no matching label are kept, with NaN in the added columns.
df_merged = df.merge(df_wikidata[['label','LATITUDE', 'LONGITUDE','WIKIDATA']],
                       left_on='Poi_name', right_on='label',
                       how='left')

In [None]:
df_merged

Unnamed: 0,Itinerary,Poi_name,role,type,walking_time,label,LATITUDE,LONGITUDE,WIKIDATA
0,1,Museo del Novecento,Start,museum,0,Museo del Novecento,45.463414,9.190261,Q261233
1,1,Collections of the Ospedale Maggiore,Stop1,museum,9,Collections of the Ospedale Maggiore,45.460009,9.195317,Q17154436
2,1,Duomo di Milano Museum,Stop2,museum,8,Duomo di Milano Museum,45.463246,9.191625,Q16337697
3,1,Monument to Federico Borromeo,Stop3,monument,8,Monument to Federico Borromeo,45.463196,9.185228,Q115868258
4,1,Missori,Stop4,metro_station,6,Missori,45.460556,9.188333,Q1088817
5,2,Museo del Novecento,Start,museum,0,Museo del Novecento,45.463414,9.190261,Q261233
6,2,Pinacoteca Ambrosiana,Stop1,art_gallery,5,Pinacoteca Ambrosiana,45.46378,9.185952,Q1085811
7,2,Monument to Giuseppe Missori,Stop2,monument,5,Monument to Giuseppe Missori,45.4611,9.1881,Q21141701
8,2,San Fedele museum,Stop3,museum,10,San Fedele museum,45.466442,9.191131,Q24940273
9,2,San Babila,Stop4,metro_station,7,San Babila,45.466667,9.1975,Q1087622


In [None]:
pip install SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.1.3-py3-none-any.whl.metadata (11 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Downloading rdflib-7.1.3-py3-none-any.whl (564 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/564.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.9/564.9 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 rdflib-7.1.3


In [None]:
import pandas as pd
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import re

# SPARQL endpoint
endpoint_url = "https://query.wikidata.org/sparql"

# SPARQL query template
query_template = """
SELECT ?rdfsLabel ?instanceOfLabel ?materialLabel ?heritageStatusLabel
       ?startime ?startime_2 ?creatorLabel ?description
WHERE {{
  wd:{entity_id} rdfs:label ?rdfsLabel.
  OPTIONAL {{ wd:{entity_id} wdt:P31 ?instanceOf. }}
  OPTIONAL {{ wd:{entity_id} wdt:P186 ?material. }}
  OPTIONAL {{ wd:{entity_id} wdt:P1435 ?heritageStatus. }}
  OPTIONAL {{ wd:{entity_id} wdt:P580 ?startime. }}
  OPTIONAL {{ wd:{entity_id} wdt:P571 ?startime_2. }}
  OPTIONAL {{ wd:{entity_id} wdt:P170 ?creator. }}
  OPTIONAL {{ wd:{entity_id} schema:description ?description. FILTER(LANG(?description) = "en") }}

  FILTER(LANG(?rdfsLabel) = "en")

  SERVICE wikibase:label {{
    bd:serviceParam wikibase:language "en".
    ?instanceOf rdfs:label ?instanceOfLabel.
    ?material rdfs:label ?materialLabel.
    ?heritageStatus rdfs:label ?heritageStatusLabel.
    ?creator rdfs:label ?creatorLabel.
  }}
}}
"""


def get_results(endpoint_url, query):
    """Run ``query`` against the SPARQL endpoint and return the converted JSON result.

    The request identifies itself with a user agent that carries the running
    Python major.minor version.
    """
    major, minor = sys.version_info[0], sys.version_info[1]
    client = SPARQLWrapper(endpoint_url, agent=f"WDQS-example Python/{major}.{minor}")
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()

def extract_qid(wikidata_url):
    """Return the first 'Q<digits>' identifier found in the value, or None.

    Missing values (None/NaN) and values without such an identifier both
    give None. Non-string values are converted with str() before searching.
    """
    if not pd.isna(wikidata_url):
        found = re.search(r'Q\d+', str(wikidata_url))
        if found:
            return found.group(0)
    return None

# Enrich every stop with facts from Wikidata.
# `data` is the same object as df_merged (no copy is made here), so the new
# columns also show up on df_merged. It must have a 'WIKIDATA' column.
data = df_merged

# Output column -> variable name in the SPARQL result bindings.
WIKIDATA_FIELDS = {
    "label_": "rdfsLabel",
    "Instance_of": "instanceOfLabel",
    "Material": "materialLabel",
    "Heritage_Status": "heritageStatusLabel",
    "Since": "startime",
    "Since_2": "startime_2",
    "Created_by": "creatorLabel",
    "Description": "description",
}

# Create the output columns up front so rows that are skipped or fail keep None.
for column in WIKIDATA_FIELDS:
    data[column] = None

# Query Wikidata once per row.
for index, row in data.iterrows():
    entity_id = extract_qid(row["WIKIDATA"])
    if entity_id is None:
        continue

    try:
        results = get_results(endpoint_url, query_template.format(entity_id=entity_id))

        # When a property has several values the query returns several
        # bindings; the value from the last binding that carries it wins.
        values = dict.fromkeys(WIKIDATA_FIELDS)
        for result in results["results"]["bindings"]:
            for column, variable in WIKIDATA_FIELDS.items():
                if variable in result:
                    values[column] = result[variable]["value"]

        for column, value in values.items():
            data.at[index, column] = value

    except Exception as e:
        # Best effort: report the failure and leave this row's columns as None.
        print(f"Error querying {entity_id}: {e}")

# Filter out metro stops. Take an explicit copy so that columns added to
# `data` afterwards do not trigger pandas' SettingWithCopyWarning.
data = data[data["type"] != "metro_station"].copy()

In [None]:
import pandas as pd
import re
from SPARQLWrapper import SPARQLWrapper, JSON
import urllib.parse  # For URL encoding

df = data  # Assuming data is your DataFrame

# SPARQL Query function
def get_dbpedia_abstract(label):
    """Return the English DBpedia abstract for the resource named like ``label``.

    A leading "Monument to " and any parenthesised text are removed from the
    label, spaces become underscores and the result is percent-encoded to
    form the DBpedia resource name. The resource is first checked for
    existence (ASK), then its abstract is fetched.

    Parameters:
    label (str): POI label. Non-string values (e.g. NaN from an unmatched
        merge) and blank strings are treated as "nothing to look up".

    Returns:
    str: The abstract, or "" when the label is unusable, the resource or
        its English abstract does not exist, or the lookup fails. Failures
        are printed rather than returned, so error text never ends up being
        stored as an abstract.
    """
    if not isinstance(label, str) or not label.strip():
        return ""

    # Remove "Monument to" from the label if it exists
    if label.lower().startswith("monument to "):
        label = label[len("Monument to "):]

    # Remove parentheses content (e.g., "(some text)")
    label = re.sub(r"\(.*?\)", "", label).strip()

    # Encode the label to be URL-safe
    encoded_label = urllib.parse.quote(label.replace(" ", "_"))

    exists_query = f"""
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX dbr: <http://dbpedia.org/resource/>

    ASK WHERE {{
        dbr:{encoded_label} ?p ?o .
    }}
    """

    abstract_query = f"""
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX dbr: <http://dbpedia.org/resource/>

        SELECT ?abstract WHERE {{
            dbr:{encoded_label} dbo:abstract ?abstract .
            FILTER (lang(?abstract) = 'en')
        }}
        LIMIT 1
        """

    sparql = SPARQLWrapper("https://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)

    try:
        # Check if the resource exists in DBpedia before asking for its abstract.
        sparql.setQuery(exists_query)
        if not sparql.query().convert()["boolean"]:
            return ""

        sparql.setQuery(abstract_query)
        bindings = sparql.query().convert()["results"]["bindings"]
        return bindings[0]["abstract"]["value"] if bindings else ""

    except Exception as e:
        print(f"Error fetching DBpedia abstract for '{label}': {e}")
        return ""

# Apply the function to each row
df["Abstract"] = df["label"].apply(get_dbpedia_abstract)
df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Abstract"] = df["label"].apply(get_dbpedia_abstract)


Unnamed: 0,Itinerary,Poi_name,role,type,walking_time,label,LATITUDE,LONGITUDE,WIKIDATA,label_,Instance_of,Material,Heritage_Status,Since,Since_2,Created_by,Description,Abstract
0,1,Museo del Novecento,Start,museum,0,Museo del Novecento,45.463414,9.190261,Q261233,Museo del Novecento,art museum,,Italian national heritage,,2010-12-06T00:00:00Z,,"museum of twentieth-century art in Milan, Italy","The Museo del Novecento (""museum of the twenti..."
1,1,Collections of the Ospedale Maggiore,Stop1,museum,9,Collections of the Ospedale Maggiore,45.460009,9.195317,Q17154436,Collections of the Ospedale Maggiore,private museum,,,,1456-01-01T00:00:00Z,,museum in Italy,
2,1,Duomo di Milano Museum,Stop2,museum,8,Duomo di Milano Museum,45.463246,9.191625,Q16337697,Duomo di Milano Museum,art museum,,,,,,museum in Italy,
3,1,Monument to Federico Borromeo,Stop3,monument,8,Monument to Federico Borromeo,45.463196,9.185228,Q115868258,Monument to Federico Borromeo,memorial,,,,1865-01-01T00:00:00Z,Costantino Corti,,Federico Borromeo (18 August 1564 – 21 Septemb...
5,2,Museo del Novecento,Start,museum,0,Museo del Novecento,45.463414,9.190261,Q261233,Museo del Novecento,art museum,,Italian national heritage,,2010-12-06T00:00:00Z,,"museum of twentieth-century art in Milan, Italy","The Museo del Novecento (""museum of the twenti..."
6,2,Pinacoteca Ambrosiana,Stop1,art_gallery,5,Pinacoteca Ambrosiana,45.46378,9.185952,Q1085811,Pinacoteca Ambrosiana,art museum,,Italian national heritage,,1618-01-01T00:00:00Z,,art museum in Milan,
7,2,Monument to Giuseppe Missori,Stop2,monument,5,Monument to Giuseppe Missori,45.4611,9.1881,Q21141701,Monument to Giuseppe Missori,monument,bronze,,,1916-01-01T00:00:00Z,Riccardo Ripamonti,"statue by Riccardo Ripamonti in Milan, Italy",Giuseppe Missori (11 June 1829 - 25 March 1911...
8,2,San Fedele museum,Stop3,museum,10,San Fedele museum,45.466442,9.191131,Q24940273,San Fedele museum,private museum,,,,,,museum in Italy,
10,3,Museo del Novecento,Start,museum,0,Museo del Novecento,45.463414,9.190261,Q261233,Museo del Novecento,art museum,,Italian national heritage,,2010-12-06T00:00:00Z,,"museum of twentieth-century art in Milan, Italy","The Museo del Novecento (""museum of the twenti..."
11,3,Royal Palace of Milan,Stop1,museum,2,Royal Palace of Milan,45.463054,9.191153,Q51105,Royal Palace of Milan,art museum,,Italian national heritage,,,,art museum,The Royal Palace of Milan (Italian: Palazzo Re...


In [None]:
import pandas as pd
import re
from SPARQLWrapper import SPARQLWrapper, JSON
import urllib.parse  # For URL encoding

df = data  # Assuming data is your DataFrame

# SPARQL Query function
def get_dbpedia_abstract(label):
    """Return an English DBpedia abstract for ``label``, with a text-search fallback.

    A leading "Monument to " and any parenthesised text are removed from the
    label. The abstract of the DBpedia resource with that name is tried
    first. If there is none, the first English abstract of ANY resource
    whose text contains the label (case-insensitive) is returned, so the
    fallback result may describe a different entity than the POI.

    Parameters:
    label (str): POI label. Non-string values (e.g. NaN from an unmatched
        merge) and blank strings are treated as "nothing to look up".

    Returns:
    str: The abstract, or "" when the label is unusable, nothing was found,
        or the lookup fails. Failures are printed rather than returned, so
        error text never ends up being stored as an abstract.
    """
    if not isinstance(label, str) or not label.strip():
        return ""

    # Remove "Monument to" from the label if it exists
    if label.lower().startswith("monument to "):
        label = label[len("Monument to "):]

    # Remove parentheses content (e.g., "(some text)")
    label = re.sub(r"\(.*?\)", "", label).strip()

    # Encode label safely for DBpedia URI
    encoded_label = urllib.parse.quote(label.replace(" ", "_"))

    # Escape for use inside a double-quoted SPARQL string literal:
    # backslashes first, then quotes, so the added backslashes stay intact.
    safe_label = label.replace('\\', '\\\\').replace('"', '\\"')

    direct_query = f"""
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX dbr: <http://dbpedia.org/resource/>

    SELECT ?abstract WHERE {{
        dbr:{encoded_label} dbo:abstract ?abstract .
        FILTER (lang(?abstract) = 'en')
    }}
    LIMIT 1
    """

    search_query = f"""
        PREFIX dbo: <http://dbpedia.org/ontology/>

        SELECT ?abstract WHERE {{
            ?subject dbo:abstract ?abstract .
            FILTER (lang(?abstract) = 'en')
            FILTER (CONTAINS(LCASE(?abstract), LCASE("{safe_label}")))
        }}
        LIMIT 1
        """

    sparql = SPARQLWrapper("https://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)

    try:
        # Try the direct match first, then the text search over all abstracts.
        for query in (direct_query, search_query):
            sparql.setQuery(query)
            bindings = sparql.query().convert()["results"]["bindings"]
            if bindings:
                return bindings[0]["abstract"]["value"]
        return ""

    except Exception as e:
        print(f"Error fetching DBpedia abstract for '{label}': {e}")
        return ""

# Apply function to fill empty abstracts
df["Abstract"] = df["label"].apply(get_dbpedia_abstract)
df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Abstract"] = df["label"].apply(get_dbpedia_abstract)


Unnamed: 0,Itinerary,Poi_name,role,type,walking_time,label,LATITUDE,LONGITUDE,WIKIDATA,label_,Instance_of,Material,Heritage_Status,Since,Since_2,Created_by,Description,Abstract
0,1,Museo del Novecento,Start,museum,0,Museo del Novecento,45.463414,9.190261,Q261233,Museo del Novecento,art museum,,Italian national heritage,,2010-12-06T00:00:00Z,,"museum of twentieth-century art in Milan, Italy","The Museo del Novecento (""museum of the twenti..."
1,1,Collections of the Ospedale Maggiore,Stop1,museum,9,Collections of the Ospedale Maggiore,45.460009,9.195317,Q17154436,Collections of the Ospedale Maggiore,private museum,,,,1456-01-01T00:00:00Z,,museum in Italy,
2,1,Duomo di Milano Museum,Stop2,museum,8,Duomo di Milano Museum,45.463246,9.191625,Q16337697,Duomo di Milano Museum,art museum,,,,,,museum in Italy,
3,1,Monument to Federico Borromeo,Stop3,monument,8,Monument to Federico Borromeo,45.463196,9.185228,Q115868258,Monument to Federico Borromeo,memorial,,,,1865-01-01T00:00:00Z,Costantino Corti,,Federico Borromeo (18 August 1564 – 21 Septemb...
5,2,Museo del Novecento,Start,museum,0,Museo del Novecento,45.463414,9.190261,Q261233,Museo del Novecento,art museum,,Italian national heritage,,2010-12-06T00:00:00Z,,"museum of twentieth-century art in Milan, Italy","The Museo del Novecento (""museum of the twenti..."
6,2,Pinacoteca Ambrosiana,Stop1,art_gallery,5,Pinacoteca Ambrosiana,45.46378,9.185952,Q1085811,Pinacoteca Ambrosiana,art museum,,Italian national heritage,,1618-01-01T00:00:00Z,,art museum in Milan,
7,2,Monument to Giuseppe Missori,Stop2,monument,5,Monument to Giuseppe Missori,45.4611,9.1881,Q21141701,Monument to Giuseppe Missori,monument,bronze,,,1916-01-01T00:00:00Z,Riccardo Ripamonti,"statue by Riccardo Ripamonti in Milan, Italy",Giuseppe Missori (11 June 1829 - 25 March 1911...
8,2,San Fedele museum,Stop3,museum,10,San Fedele museum,45.466442,9.191131,Q24940273,San Fedele museum,private museum,,,,,,museum in Italy,
10,3,Museo del Novecento,Start,museum,0,Museo del Novecento,45.463414,9.190261,Q261233,Museo del Novecento,art museum,,Italian national heritage,,2010-12-06T00:00:00Z,,"museum of twentieth-century art in Milan, Italy","The Museo del Novecento (""museum of the twenti..."
11,3,Royal Palace of Milan,Stop1,museum,2,Royal Palace of Milan,45.463054,9.191153,Q51105,Royal Palace of Milan,art museum,,Italian national heritage,,,,art museum,The Royal Palace of Milan (Italian: Palazzo Re...


In [None]:
df.to_csv(r"/content/gdrive/MyDrive/BICOCCA/TESI/data_for_questions.csv", index=False)

In [None]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Model Configuration
# Hugging Face Hub repository id handed to both from_pretrained() calls in
# initialize_model(). NOTE(review): the "-bnb-4bit" suffix suggests a
# pre-quantized bitsandbytes checkpoint; confirm against the model card.
MODEL_NAME = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
# Collects one decoded model output per POI whose generation succeeded;
# consumed later by process_poi_list().
response_list = []
# Function to Initialize Model & Tokenizer
def initialize_model():
    """Load the tokenizer and causal language model named by ``MODEL_NAME``.

    The tokenizer is loaded first, then the model with automatic device
    placement and float16 as the requested torch dtype.

    Returns:
        tuple: ``(model, tokenizer)`` on success. If anything raises while
        loading, the error is printed and ``(None, None)`` is returned so the
        caller can decide how to stop.
    """
    try:
        tok = AutoTokenizer.from_pretrained(MODEL_NAME)
        # float16 is requested for efficiency; device_map="auto" leaves the
        # placement of the weights to the library.
        load_options = {"device_map": "auto", "torch_dtype": torch.float16}
        lm = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_options)
    except Exception as err:
        print(f"Error initializing model: {err}")
        return None, None
    return lm, tok

# Load Model & Tokenizer
model, tokenizer = initialize_model()

# Ensure model is loaded before proceeding
if model is None or tokenizer is None:
    print("Model initialization failed. Exiting.")
    exit()

# Load the dataset
df = pd.read_csv("/content/gdrive/MyDrive/BICOCCA/TESI/data_for_questions.csv")
# Rows whose role is 'Start' are excluded from question generation.
df = df[df['role'] != 'Start']

# Replace missing values with empty strings so that optional fields can be
# tested by plain truthiness below (and never show up as "nan" in the prompt).
df = df.fillna("")

# Iterate over POIs and generate quiz questions & facts
for _, row in df.iterrows():
    label = row["label_"]
    description = row["Description"]

    # Collect available information dynamically: only non-empty fields
    # contribute a sentence to the prompt.
    info_parts = []
    if row["Since_2"]:
        info_parts.append(f"The {label} was Built/Established around {row['Since_2']}")
    if row["Created_by"]:
        info_parts.append(f"The Creator/Architect of '{label}' is {row['Created_by']}")
    if row["Material"]:
        info_parts.append(f"{label} is made out of {row['Material']}")
    if row["Heritage_Status"]:
        info_parts.append(f"The Heritage Status of {label} is {row['Heritage_Status']}")
    if row["Abstract"]:
        info_parts.append(row["Abstract"])  # Abstract is a standalone text

    # Combine the dynamically constructed info section
    info_text = "\n".join(info_parts) if info_parts else "No additional information available."

    # Generate a prompt
    prompt = f"""
You are an expert tour guide in Milan about {label}, {description}.

### TASK:
You must create a Multiple-choice question using the information provided:

Carefully read these information below to generate the question and the answers:
{info_text}

GENERATE THE QUESTION AND THE ANSWER BASED ONLY ON THE PROVIDED INFORMATION.
### OUTPUT FORMAT:
Your response must follow this exact structure:
Q: [Write a clear, engaging question about {label} based on above abstract and information]
A) * [Correct answer]
B) [Incorrect answer]
C) [Incorrect answer]
-------------------------------
Fact: [One interesting fact about {label} based on the provided information]

### REQUIREMENTS:
1. Ensure to have a correct answer
2. Create incorrect options.
3. Mark the correct answer with an asterisk *
4. The fact should highlight something notable about {label}.
5. Use the information provided to generate the question and the correct answer
6. Ensure the correct answer is clearly marked.
7. Check that the marked answer is really correct based on the provided knowledge
8. Do not invent answers
9. There must be a correct option in the answers and must be marked *

GENERATE EXACTLY ONE QUESTION
"""

    # Tokenize and generate inside one try-block so that a single bad row is
    # reported and skipped instead of aborting the whole run.
    try:
        # Send the inputs to wherever the model was placed (device_map="auto")
        # rather than assuming a CUDA device.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        outputs = model.generate(**inputs, max_new_tokens=2000)
        # The decoded text echoes the prompt before the generated answer (see
        # the cell output); the parsing cell relies on that echo to recover
        # the POI name and to locate the generated section.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        response_list.append(response)
        print(f"\n--- {label} ---\n{response}")
    except Exception as e:
        print(f"Error generating response for {label}: {e}")

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]


--- Collections of the Ospedale Maggiore ---

You are an expert tour guide in Milan about Collections of the Ospedale Maggiore, museum in Italy.

### TASK:
You must create a Multiple-choice question using the information provided:

Carefully read these information below to generate the question and the answers:
The Collections of the Ospedale Maggiore was Built/Established around 1456-01-01T00:00:00Z

GENERATE THE QUESTION AND THE ANSWER BASED ONLY ON THE PROVIDED INFORMATION.
### OUTPUT FORMAT:
Your response must follow this exact structure:
Q: [Write a clear, engaging question about Collections of the Ospedale Maggiore based on above abstract and information]
A) * [Correct answer]
B) [Incorrect answer]
C) [Incorrect answer]
-------------------------------
Fact: [One interesting fact about Collections of the Ospedale Maggiore based on the provided information]

### REQUIREMENTS:
1. Ensure to have a correct answer
2. Create incorrect options.
3. Mark the correct answer with an asteris

In [None]:
def extract_poi_info(text):
    """Parse one raw model response into its POI name, question, answers and fact.

    ``text`` is expected to be the echoed prompt followed by the generated
    quiz, i.e. it contains ``about <POI name>, ...`` in the prompt part and,
    after the last ``GENERATE EXACTLY ONE QUESTION`` marker, a ``Q: `` line,
    the answer options and a ``Fact:`` line.

    Missing markers never produce garbage slices: the corresponding field is
    returned empty instead.

    Args:
        text (str): Decoded model output for a single POI.

    Returns:
        dict: ``{'poi_name': str, 'question': str, 'answers': list[str],
        'fact': str}``. ``answers`` holds at most three stripped, non-empty
        lines found after the question line.
    """
    # POI name: text between the first 'about ' and the first comma on that
    # same line (or the end of the line when there is no comma).
    _, about_marker, after_about = text.partition('about ')
    if about_marker:
        name_line = after_about.partition('\n')[0]
        poi_name = name_line.partition(',')[0].strip()
    else:
        poi_name = ''

    # Everything after the last prompt marker is the model's own output.
    question_section = text.split('GENERATE EXACTLY ONE QUESTION')[-1].strip()

    # Question: remainder of the line that carries the 'Q: ' marker. Using
    # partition keeps the last character even when no newline follows.
    _, question_marker, after_question = question_section.partition('Q: ')
    if question_marker:
        question_line, _, remainder = after_question.partition('\n')
        question = question_line.strip()
    else:
        question = ''
        # No marker: fall back to treating the first line as the question line.
        remainder = question_section.partition('\n')[2]

    # Answers: up to three non-empty lines following the question line,
    # stopping early at the dashed separator or at the fact line so those
    # never leak into the options.
    answers = []
    for line in remainder.split('\n'):
        candidate = line.strip()
        if not candidate:
            continue
        if candidate.startswith('Fact:') or not candidate.strip('-'):
            break
        answers.append(candidate)
        if len(answers) == 3:
            break

    # Fact: everything after the first 'Fact:' marker, or '' when absent.
    _, fact_marker, after_fact = question_section.partition('Fact:')
    fact = after_fact.strip() if fact_marker else ''

    return {
        'poi_name': poi_name,
        'question': question,
        'answers': answers,
        'fact': fact
    }

def process_poi_list(text_list):
    """
    Run ``extract_poi_info`` over every text and collect the results in order.

    A text that cannot be parsed does not stop the run: the error is printed
    and a placeholder entry is stored in its position instead.

    Args:
    text_list (list): Raw texts, one per POI

    Returns:
    list: One dictionary per input text, either the extracted POI
    information or the 'Error' placeholder
    """
    results = []

    for raw_text in text_list:
        try:
            entry = extract_poi_info(raw_text)
        except Exception as exc:
            print(f"Error processing text: {exc}")
            # Keep a placeholder so the output stays aligned with the input.
            entry = {
                'poi_name': 'Error',
                'question': 'Error',
                'answers': [],
                'fact': 'Error processing text'
            }
        results.append(entry)

    return results

# Parse every raw model response collected in response_list into a dict with
# the keys poi_name / question / answers / fact.
processed_pois = process_poi_list(response_list)

# Uncomment to print out all extracted information:
# for poi in processed_pois:
#     print("POI Name:", poi['poi_name'])
#     print("Question:", poi['question'])
#     print("Answers:", poi['answers'])
#     print("Fact:", poi['fact'])
#     print("---")

In [None]:
processed_pois

[{'poi_name': 'Collections of the Ospedale Maggiore',
  'question': 'When was the Ospedale Maggiore in Milan established?',
  'answers': ['A) 1456-01-01T00:00:00Z *',
   'B) 1556-01-01T00:00:00Z',
   'C) 1656-01-01T00:00:00Z'],
  'fact': 'The Ospedale Maggiore in Milan was established around the year 1456.'},
 {'poi_name': 'Duomo di Milano Museum',
  'question': 'Which of the following is a unique feature of the Duomo di Milano Museum?',
  'answers': ['A) * A collection of ancient Roman artifacts',
   'B) A gallery of modern Italian art',
   'C) A display of medieval weaponry'],
  'fact': 'The Duomo di Milano Museum houses a collection of original tools and materials used in the construction of the cathedral.'},
 {'poi_name': 'Monument to Federico Borromeo',
  'question': 'Who was the architect of the Monument to Federico Borromeo?',
  'answers': ['A) * Costantino Corti',
   'B) Leonardo da Vinci',
   'C) Michelangelo'],
  'fact': 'The Monument to Federico Borromeo was built around 186