In [1]:
import random
import json

def generate_examples(num_examples=2000, plausible_ratio=0.5):
    """
    Generate unique plausible/implausible examples from a large mapping of cities->attractions.

    A plausible example pairs a city with one of its own attractions; an
    implausible example pairs a city with an attraction that is not listed
    for that city. Every returned sentence is unique.

    num_examples: total number of examples desired (must be >= 0)
    plausible_ratio: fraction of examples that should be plausible
        (e.g., 0.5 means 50%). Must lie in [0, 1].

    Returns: a shuffled list of dicts, each with:
        {
            "input": "...",
            "target_scores": {
                "plausible": 1 or 0,
                "implausible": 0 or 1
            }
        }

    Raises:
        ValueError: if num_examples is negative, plausible_ratio is outside
            [0, 1], or more unique plausible/implausible sentences are
            requested than the city/attraction table and templates can
            produce (previously this case looped forever).
    """
    if num_examples < 0:
        raise ValueError("num_examples must be non-negative")
    if not 0 <= plausible_ratio <= 1:
        raise ValueError("plausible_ratio must be between 0 and 1")

    # 1) Define a large dictionary of city -> list of plausible attractions.
    #    The number of unique sentences this table supports is computed below,
    #    so nothing here needs to be kept in sync by hand.
    cities_and_attractions = {
        "Rome": [
            "the Colosseum",
            "the Trevi Fountain",
            "the Pantheon",
            "the Roman Forum",
            "Piazza Navona"
        ],
        "Paris": [
            "the Eiffel Tower",
            "the Louvre Museum",
            "Notre-Dame Cathedral",
            "Champs-Élysées",
            "Sacré-Cœur Basilica"
        ],
        "New York": [
            "the Statue of Liberty",
            "Times Square",
            "Central Park",
            "the Empire State Building",
            "the Brooklyn Bridge"
        ],
        "London": [
            "Big Ben",
            "the Tower of London",
            "Buckingham Palace",
            "the British Museum",
            "Trafalgar Square"
        ],
        "Beijing": [
            "the Great Wall of China",
            "the Forbidden City",
            "Tiananmen Square",
            "the Summer Palace",
            "the Temple of Heaven"
        ],
        "Cairo": [
            "the Pyramids of Giza",
            "the Sphinx",
            "the Egyptian Museum",
            "Khan el-Khalili",
            "Al-Azhar Mosque"
        ],
        "Tokyo": [
            "Tokyo Tower",
            "Shibuya Crossing",
            "the Imperial Palace",
            "Sensoji Temple",
            "Meiji Shrine"
        ],
        "Sydney": [
            "the Sydney Opera House",
            "Bondi Beach",
            "the Harbour Bridge",
            "Darling Harbour",
            "Taronga Zoo"
        ],
        "Rio de Janeiro": [
            "Christ the Redeemer",
            "Copacabana Beach",
            "Sugarloaf Mountain",
            "Ipanema Beach",
            "Maracanã Stadium"
        ],
        "Bangkok": [
            "the Grand Palace",
            "Wat Pho",
            "Chatuchak Market",
            "Wat Arun",
            "Khao San Road"
        ],
        "Berlin": [
            "the Brandenburg Gate",
            "the Reichstag Building",
            "Checkpoint Charlie",
            "Museum Island",
            "the Berlin Wall Memorial"
        ],
        "Barcelona": [
            "La Sagrada Família",
            "Park Güell",
            "La Rambla",
            "Casa Milà",
            "Casa Batlló"
        ],
        "Athens": [
            "the Acropolis",
            "the Parthenon",
            "the Temple of Olympian Zeus",
            "the Ancient Agora",
            "Mount Lycabettus"
        ],
        "Moscow": [
            "Red Square",
            "the Kremlin",
            "Saint Basil's Cathedral",
            "the Bolshoi Theatre",
            "Gorky Park"
        ],
        "Istanbul": [
            "the Hagia Sophia",
            "the Blue Mosque",
            "Topkapi Palace",
            "the Grand Bazaar",
            "the Bosphorus Bridge"
        ],
        "Amsterdam": [
            "the Anne Frank House",
            "the Rijksmuseum",
            "the Van Gogh Museum",
            "Dam Square",
            "the Heineken Experience"
        ],
        "Venice": [
            "St. Mark’s Basilica",
            "the Grand Canal",
            "the Rialto Bridge",
            "Doge’s Palace",
            "Piazza San Marco"
        ],
        "Dubai": [
            "the Burj Khalifa",
            "the Dubai Mall",
            "Palm Jumeirah",
            "the Dubai Fountain",
            "the Burj Al Arab"
        ],
        "Singapore": [
            "Marina Bay Sands",
            "Gardens by the Bay",
            "the Merlion",
            "Orchard Road",
            "Sentosa Island"
        ],
        "Los Angeles": [
            "Hollywood Boulevard",
            "the Hollywood Sign",
            "Santa Monica Pier",
            "Universal Studios",
            "Rodeo Drive"
        ],
        "San Francisco": [
            "the Golden Gate Bridge",
            "Alcatraz Island",
            "Fisherman’s Wharf",
            "Chinatown",
            "Lombard Street"
        ],
        "Toronto": [
            "the CN Tower",
            "Niagara Falls (nearby)",
            "the Royal Ontario Museum",
            "Casa Loma",
            "Ripley's Aquarium of Canada"
        ],
        "Vancouver": [
            "Stanley Park",
            "Granville Island",
            "Capilano Suspension Bridge",
            "Grouse Mountain",
            "Science World"
        ],
        "Munich": [
            "Marienplatz",
            "Neuschwanstein Castle (nearby)",
            "the English Garden",
            "Hofbräuhaus",
            "Olympiapark"
        ],
        "Vienna": [
            "Schönbrunn Palace",
            "St. Stephen’s Cathedral",
            "the Hofburg",
            "the Belvedere",
            "the Vienna State Opera"
        ],
        "Florence": [
            "the Uffizi Gallery",
            "the Ponte Vecchio",
            "the Duomo (Cathedral of Santa Maria del Fiore)",
            "Piazza della Signoria",
            "the Accademia Gallery"
        ],
        "Milan": [
            "the Duomo di Milano",
            "Galleria Vittorio Emanuele II",
            "the Sforza Castle",
            "the Teatro alla Scala",
            "the Navigli District"
        ],
        "Madrid": [
            "the Prado Museum",
            "Puerta del Sol",
            "Plaza Mayor",
            "the Royal Palace",
            "El Retiro Park"
        ],
        "Lisbon": [
            "São Jorge Castle",
            "Belém Tower",
            "Jerónimos Monastery",
            "Alfama District",
            "the Commerce Square"
        ],
        "Prague": [
            "Charles Bridge",
            "Prague Castle",
            "the Old Town Square",
            "the Astronomical Clock",
            "Wenceslas Square"
        ],
        "Budapest": [
            "the Parliament Building",
            "Buda Castle",
            "Fisherman’s Bastion",
            "Heroes’ Square",
            "the Széchenyi Thermal Bath"
        ],
        "Warsaw": [
            "the Royal Castle",
            "the Old Town Market Square",
            "Łazienki Park",
            "the Warsaw Uprising Museum",
            "the Palace of Culture and Science"
        ],
        "Stockholm": [
            "the Vasa Museum",
            "Gamla Stan (Old Town)",
            "the Royal Palace",
            "Skansen",
            "Drottningholm Palace"
        ],
        "Oslo": [
            "the Viking Ship Museum",
            "the Oslo Opera House",
            "the Royal Palace",
            "Vigeland Park",
            "Aker Brygge"
        ],
        "Copenhagen": [
            "Nyhavn",
            "the Little Mermaid",
            "Tivoli Gardens",
            "Rosenborg Castle",
            "Amalienborg Palace"
        ],
        "Helsinki": [
            "Helsinki Cathedral",
            "Suomenlinna Fortress",
            "Market Square",
            "Temppeliaukio Church",
            "the Sibelius Monument"
        ],
        "Seoul": [
            "Gyeongbokgung Palace",
            "Bukchon Hanok Village",
            "Myeongdong",
            "N Seoul Tower",
            "Changdeokgung Palace"
        ],
        "Hong Kong": [
            "Victoria Peak",
            "Tsim Sha Tsui Promenade",
            "Lantau Island’s Big Buddha",
            "Hong Kong Disneyland",
            "Temple Street Night Market"
        ],
        "Mumbai": [
            "the Gateway of India",
            "Marine Drive",
            "the Elephanta Caves",
            "Chhatrapati Shivaji Terminus",
            "the Haji Ali Dargah"
        ],
        "Delhi": [
            "the Red Fort",
            "India Gate",
            "Qutub Minar",
            "Humayun's Tomb",
            "Lotus Temple"
        ],
        "Jerusalem": [
            "the Western Wall",
            "the Dome of the Rock",
            "the Church of the Holy Sepulchre",
            "the Mount of Olives",
            "the Tower of David"
        ],
        "Cape Town": [
            "Table Mountain",
            "Robben Island",
            "V&A Waterfront",
            "Cape Point",
            "Kirstenbosch National Botanical Garden",
            "Bo-Kaap",
            "Camps Bay Beach",
            "District Six Museum",
            "Boulders Beach",
            "Signal Hill"
        ],
        "Edinburgh": [
            "Edinburgh Castle",
            "The Royal Mile",
            "Holyrood Palace",
            "Arthur’s Seat",
            "Princes Street Gardens",
            "St Giles’ Cathedral",
            "The Scotch Whisky Experience",
            "Calton Hill",
            "National Museum of Scotland",
            "Greyfriars Kirkyard"
        ],
        "Buenos Aires": [
            "La Boca",
            "Teatro Colón",
            "Plaza de Mayo",
            "Recoleta Cemetery",
            "Casa Rosada",
            "San Telmo Market",
            "Caminito Street Museum",
            "Palermo Soho",
            "Café Tortoni",
            "Obelisco de Buenos Aires"
        ],
        "Dublin": [
            "Trinity College & the Book of Kells",
            "Guinness Storehouse",
            "Temple Bar District",
            "Dublin Castle",
            "St. Patrick’s Cathedral",
            "Ha’penny Bridge",
            "Phoenix Park",
            "Kilmainham Gaol",
            "Christ Church Cathedral",
            "Grafton Street"
        ],
        "Reykjavík": [
            "Hallgrímskirkja",
            "The Sun Voyager sculpture",
            "Perlan (The Pearl)",
            "Harpa Concert Hall",
            "National Museum of Iceland",
            "Reykjavik Old Harbour",
            "Laugavegur shopping street",
            "Tjörnin Pond",
            "Aurora Reykjavík (Northern Lights Center)",
            "Whales of Iceland exhibition"
        ],
        "Melbourne": [
            "Federation Square",
            "Queen Victoria Market",
            "Royal Botanic Gardens",
            "National Gallery of Victoria",
            "St Kilda Beach",
            "Melbourne Cricket Ground (MCG)",
            "Flinders Street Station",
            "Shrine of Remembrance",
            "Hosier Lane (street art)",
            "Yarra River promenade"
        ],
        "Auckland": [
            "Sky Tower",
            "Auckland War Memorial Museum",
            "Viaduct Harbour",
            "Rangitoto Island",
            "Waiheke Island",
            "Mount Eden",
            "Kelly Tarlton’s Sea Life Aquarium",
            "Auckland Domain",
            "One Tree Hill (Maungakiekie)",
            "Mission Bay Beach"
        ],
        "Santiago": [
            "Plaza de Armas",
            "Cerro San Cristóbal",
            "La Moneda Palace",
            "Barrio Bellavista",
            "Santa Lucía Hill",
            "Chilean Museum of Pre-Columbian Art",
            "Mercado Central",
            "Metropolitan Cathedral of Santiago",
            "Centro Cultural Gabriela Mistral (GAM)",
            "Costanera Center (Sky Costanera)"
        ],
        "Mexico City": [
            "Zócalo (Plaza de la Constitución)",
            "Templo Mayor",
            "Palacio de Bellas Artes",
            "Chapultepec Castle",
            "National Museum of Anthropology",
            "Frida Kahlo Museum (La Casa Azul)",
            "Coyoacán neighborhood",
            "Xochimilco canals",
            "Basilica of Our Lady of Guadalupe",
            "Torre Latinoamericana"
        ],
        "Havana": [
            "Old Havana (Habana Vieja)",
            "El Malecón",
            "Castillo de la Real Fuerza",
            "Plaza de la Catedral",
            "Museo de la Revolución",
            "La Bodeguita del Medio",
            "El Capitolio",
            "Plaza de Armas",
            "Fusterlandia",
            "Gran Teatro de La Habana"
        ],
        "Manila": [
            "Intramuros (Walled City)",
            "Fort Santiago",
            "San Agustin Church",
            "Manila Cathedral",
            "Rizal Park (Luneta)",
            "National Museum of the Philippines",
            "Binondo (oldest Chinatown)",
            "Quiapo Church",
            "Cultural Center of the Philippines",
            "Malacañang Palace"
        ],
        "Lima": [
            "Plaza de Armas (Plaza Mayor)",
            "Larco Museum",
            "Miraflores District",
            "Huaca Pucllana",
            "Barranco neighborhood",
            "San Francisco Monastery",
            "Kennedy Park",
            "Magic Water Circuit (Circuito Mágico del Agua)",
            "Government Palace",
            "Parque del Amor (Love Park)"
        ],
        "Brussels": [
            "Grand Place (Grote Markt)",
            "Manneken Pis",
            "Atomium",
            "Royal Palace of Brussels",
            "Magritte Museum",
            "Cinquantenaire Park",
            "Belgian Comic Strip Center",
            "Galeries Royales Saint-Hubert",
            "Saint-Michel Cathedral",
            "Mini-Europe"
        ],
        "Ghent": [
            "Gravensteen Castle",
            "Saint Bavo’s Cathedral",
            "Graslei and Korenlei",
            "Belfry of Ghent",
            "St. Nicholas’ Church",
            "Design Museum Gent",
            "Patershol neighborhood",
            "Vrijdagmarkt",
            "Ghent City Museum (STAM)",
            "MSK – Museum of Fine Arts"
        ],
        "Seville": [
            "Seville Cathedral & Giralda Tower",
            "Real Alcázar of Seville",
            "Plaza de España",
            "Metropol Parasol (Las Setas)",
            "Torre del Oro",
            "Barrio Santa Cruz",
            "Archivo de Indias",
            "Parque de María Luisa",
            "Flamenco Museum",
            "Triana neighborhood"
        ],
        "Granada": [
            "The Alhambra",
            "Generalife Gardens",
            "Albaicín neighborhood",
            "Granada Cathedral",
            "Royal Chapel of Granada",
            "Sacromonte district (Flamenco caves)",
            "Mirador de San Nicolás",
            "Carrera del Darro",
            "Corral del Carbón",
            "Monastery of San Jerónimo"
        ],
        "Valencia": [
            "City of Arts and Sciences",
            "Valencia Cathedral & El Micalet tower",
            "Plaza de la Virgen",
            "Mercado Central",
            "Turia Gardens",
            "Llotja de la Seda (Silk Exchange)",
            "Oceanogràfic (aquarium)",
            "Barrio del Carmen",
            "Bioparc Valencia",
            "Albufera Natural Park"
        ],
        "Porto": [
            "Ribeira District",
            "Dom Luís I Bridge",
            "Livraria Lello (famous bookstore)",
            "São Bento Railway Station",
            "Clérigos Tower",
            "Porto Cathedral (Sé do Porto)",
            "Palácio da Bolsa",
            "Cais da Ribeira",
            "Casa da Música",
            "Vila Nova de Gaia wine cellars"
        ],
        "Bruges": [
            "The Markt (Market Square)",
            "Belfry of Bruges (Belfort)",
            "The Burg Square",
            "Basilica of the Holy Blood",
            "Groeningemuseum",
            "Minnewater (Lake of Love)",
            "Church of Our Lady (Onze-Lieve-Vrouwekerk)",
            "Rozenhoedkaai",
            "Sint-Janshospitaal (Hospital museum)",
            "Choco-Story (Chocolate Museum)"
        ],
        "Lucerne": [
            "Chapel Bridge (Kapellbrücke)",
            "Lion Monument (Löwendenkmal)",
            "Lake Lucerne",
            "Old Town Lucerne",
            "Jesuit Church",
            "Mount Pilatus (nearby)",
            "Mount Rigi (nearby)",
            "Rosengart Collection",
            "Musegg Wall",
            "Glacier Garden (Gletschergarten)"
        ],
        "Zürich": [
            "Lake Zurich",
            "Grossmünster Church",
            "Fraumünster Church",
            "Bahnhofstrasse",
            "Uetliberg Mountain",
            "Swiss National Museum (Landesmuseum)",
            "Kunsthaus Zürich (art museum)",
            "Lindenhof Hill",
            "Old Town (Altstadt)",
            "Zoo Zürich"
        ],
        "Naples": [
            "Pompeii (nearby)",
            "Mount Vesuvius (visible / nearby)",
            "National Archaeological Museum",
            "Piazza del Plebiscito",
            "Castel dell’Ovo",
            "Castel Nuovo (Maschio Angioino)",
            "Spaccanapoli street",
            "Sansevero Chapel Museum",
            "Naples Cathedral (Duomo di Napoli)",
            "Galleria Umberto I"
        ],
        "Bologna": [
            "The Two Towers (Asinelli and Garisenda)",
            "Piazza Maggiore",
            "Basilica di San Petronio",
            "Fountain of Neptune",
            "University of Bologna",
            "Archiginnasio of Bologna",
            "Quadrilatero district (food market)",
            "Santo Stefano complex (Seven Churches)",
            "Portico di San Luca",
            "Museo per la Memoria di Ustica"
        ],
        "Hamburg": [
            "Speicherstadt (warehouse district)",
            "Elbphilharmonie",
            "St. Michael’s Church (Der Michel)",
            "Reeperbahn (St. Pauli district)",
            "Miniatur Wunderland",
            "Port of Hamburg (HafenCity)",
            "Alster Lakes",
            "Planten un Blomen park",
            "Fischmarkt (Fish Market)",
            "Rathaus (City Hall)"
        ],
        "Cologne": [
            "Cologne Cathedral (Kölner Dom)",
            "Hohenzollern Bridge (Lovelocks Bridge)",
            "Old Town (Altstadt)",
            "Museum Ludwig",
            "Chocolate Museum (Schokoladenmuseum)",
            "Rheinauhafen waterfront",
            "Cologne Triangle (observation deck)",
            "Fragrance Museum (Farina House)",
            "Botanical Garden Flora",
            "NS Documentation Center (EL-DE Haus)"
        ],
        "Frankfurt": [
            "Römerberg square",
            "St. Paul’s Church (Paulskirche)",
            "Frankfurt Cathedral (Kaiserdom)",
            "Main Tower (observation deck)",
            "Goethe House and Museum",
            "Museumsufer (Museum Embankment)",
            "Palmengarten (botanical garden)",
            "Zeil shopping street",
            "Alte Oper (Old Opera House)",
            "Kleinmarkthalle"
        ],
        "Bilbao": [
            "Guggenheim Museum Bilbao",
            "Casco Viejo (Old Town)",
            "Plaza Nueva",
            "Zubizuri Bridge (Calatrava Bridge)",
            "Mount Artxanda (funicular)",
            "Azkuna Zentroa (Alhóndiga)",
            "Mercado de la Ribera",
            "Basilica of Begoña",
            "Doña Casilda Iturrizar Park",
            "Arriaga Theatre"
        ],
        "Marseille": [
            "Old Port (Vieux Port)",
            "Basilique Notre-Dame de la Garde",
            "Le Panier district",
            "MuCEM (Museum of European and Mediterranean Civilizations)",
            "Fort Saint-Jean",
            "Calanques National Park (nearby)",
            "Château d’If",
            "La Canebière avenue",
            "Palais Longchamp",
            "Cathédrale La Major"
        ],
        "Bordeaux": [
            "Place de la Bourse & Miroir d’Eau",
            "Cathédrale Saint-André",
            "La Cité du Vin (wine museum)",
            "Grand Théâtre de Bordeaux",
            "Rue Sainte-Catherine",
            "Pont de Pierre",
            "Grosse Cloche",
            "Jardin Public",
            "Musee d’Aquitaine",
            "Quinconces Square (Place des Quinconces)"
        ],
        "Nice": [
            "Promenade des Anglais",
            "Old Town (Vieux Nice)",
            "Castle Hill (Colline du Château)",
            "Cours Saleya Market",
            "Musée Matisse",
            "Place Masséna",
            "Marc Chagall National Museum",
            "Cathédrale Sainte-Réparate",
            "Port Lympia",
            "Villa Ephrussi de Rothschild (nearby Saint-Jean-Cap-Ferrat)"
        ]
    }

    # 2) Collect every distinct attraction for building implausible pairs.
    #    Per-city sets give O(1) "does this attraction belong to this city"
    #    checks. The flat list is sorted because set iteration order of
    #    strings varies between interpreter runs, which would otherwise make
    #    results irreproducible even after random.seed().
    attractions_by_city = {
        city: set(attrs) for city, attrs in cities_and_attractions.items()
    }
    all_attractions = sorted(set().union(*attractions_by_city.values()))

    # 3) Decide how many plausible vs. implausible examples we want.
    num_plausible = int(num_examples * plausible_ratio)
    num_implausible = num_examples - num_plausible

    # 4) Simple sentence templates (WITHOUT extra years, seasons, etc.)
    templates = [
        "When I was in {}, I visited {}.",
        "During my stay in {}, I saw {}.",
        "While I was in {}, I went to {}.",
        "In {}, I checked out {}."
    ]

    # Each (city, attraction, template) triple yields one distinct sentence.
    # Refuse requests beyond what exists; otherwise the rejection-sampling
    # loops below could never finish.
    max_plausible = len(templates) * sum(
        len(own) for own in attractions_by_city.values()
    )
    max_implausible = len(templates) * sum(
        len(all_attractions) - len(own) for own in attractions_by_city.values()
    )
    if num_plausible > max_plausible:
        raise ValueError(
            f"Requested {num_plausible} plausible examples, but only "
            f"{max_plausible} unique ones can be generated."
        )
    if num_implausible > max_implausible:
        raise ValueError(
            f"Requested {num_implausible} implausible examples, but only "
            f"{max_implausible} unique ones can be generated."
        )

    # 5) Generate plausible examples (city + one of its own attractions).
    city_names = list(cities_and_attractions.keys())
    used_sentences = set()
    examples = []

    plausible_count = 0
    while plausible_count < num_plausible:
        city = random.choice(city_names)
        attraction = random.choice(cities_and_attractions[city])
        sentence = random.choice(templates).format(city, attraction)

        # Rejection sampling: redraw whenever the sentence was already used.
        if sentence in used_sentences:
            continue
        used_sentences.add(sentence)
        examples.append({
            "input": sentence,
            "target_scores": {
                "plausible": 1,
                "implausible": 0
            }
        })
        plausible_count += 1

    # 6) Generate implausible examples (city + attraction NOT listed for it).
    implausible_count = 0
    while implausible_count < num_implausible:
        city = random.choice(city_names)
        attraction = random.choice(all_attractions)

        # An attraction name shared by several cities (e.g. "the Royal
        # Palace") is rejected for each city that lists it.
        if attraction in attractions_by_city[city]:
            continue

        sentence = random.choice(templates).format(city, attraction)
        if sentence in used_sentences:
            continue
        used_sentences.add(sentence)
        examples.append({
            "input": sentence,
            "target_scores": {
                "plausible": 0,
                "implausible": 1
            }
        })
        implausible_count += 1

    # 7) Shuffle the final examples so the two classes are interleaved.
    random.shuffle(examples)

    return examples

if __name__ == "__main__":
    # Generate 2,000 total examples
    data = {"examples": generate_examples(num_examples=2000, plausible_ratio=0.5)}

    # Print to console in JSON format
    print(json.dumps(data, indent=4, ensure_ascii=False))

    # Save the same JSON to disk. The encoding is explicit because
    # ensure_ascii=False writes non-ASCII place names (e.g. "Champs-Élysées")
    # verbatim, which can fail under a non-UTF-8 platform default encoding.
    # NOTE: open() does not create directories; data/city_reasoning/ must exist.
    with open('data/city_reasoning/city_reasoning.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

{
    "examples": [
        {
            "input": "While I was in Amsterdam, I went to Archivo de Indias.",
            "target_scores": {
                "plausible": 0,
                "implausible": 1
            }
        },
        {
            "input": "During my stay in Delhi, I saw Qutub Minar.",
            "target_scores": {
                "plausible": 1,
                "implausible": 0
            }
        },
        {
            "input": "When I was in Valencia, I visited the Gateway of India.",
            "target_scores": {
                "plausible": 0,
                "implausible": 1
            }
        },
        {
            "input": "In Lucerne, I checked out Mount Pilatus (nearby).",
            "target_scores": {
                "plausible": 1,
                "implausible": 0
            }
        },
        {
            "input": "While I was in Budapest, I went to the Parliament Building.",
            "target_scores": {
                "plausible": 1