In [15]:
import random
import math
from llm import generate_simple

# How many interviews to simulate; keep this small while testing.
N_INTERVIEWS = 3

# Reference cities: name, latitude/longitude in decimal degrees, and a
# "pop" figure (presumably population -- it is not read elsewhere in this section).
CITIES = [
    {"city": name, "latitude": lat, "longitude": lon, "pop": pop}
    for name, lat, lon, pop in (
        ("Helsinki", 60.1695, 24.9354, 684589),
        ("Tampere", 61.4978, 23.7608, 260358),
        ("Turku", 60.4518, 22.2666, 206035),
        ("Oulu", 65.0121, 25.4651, 216194),
        ("Rovaniemi", 66.5039, 25.7294, 65738),
        ("Kuopio", 62.8910, 27.6780, 125668),
        ("Joensuu", 62.6010, 29.7639, 78743),
        ("Jyväskylä", 62.2426, 25.7473, 149269),
    )
]

# A generated point farther than this many kilometers from its city is
# treated as rural; at or below it, as urban.
DISTANCE_THRESHOLD_KM = 30.0

# Standard deviation (in degrees) of the Gaussian noise added to a city's
# coordinates. 0.3 degrees is roughly 33 km north-south and less east-west
# at Finnish latitudes; raise or lower it to spread points farther or closer.
LOCATION_NOISE_STD_DEV = 0.3

# Theme vocabulary handed to the LLM.
COMMON_THEMES = ["linnut", "luonto"]       # used for every interview
RURAL_THEMES = ["maaseutu", "hiljaisuus"]  # appended for rural locations
URBAN_THEMES = ["kaupunki", "äänet"]       # appended for urban locations

# Model identifier passed to the text generator; swap for another suitable model.
GENERATION_MODEL = "llama3.3:70b"


In [16]:
def generate_location(cities, noise_std_dev):
    """
    Pick a city uniformly at random and jitter its coordinates with Gaussian noise.

    Args:
        cities: Non-empty sequence of city records; each must provide
            'latitude' and 'longitude' in decimal degrees.
        noise_std_dev: Standard deviation, in degrees, of the noise added
            independently to the latitude and to the longitude.

    Returns:
        (latitude, longitude, city), where ``city`` is the chosen record
        itself (the whole dict, not just its name), so the caller can read
        the city's own coordinates and name from it.

    Raises:
        IndexError: If ``cities`` is empty (raised by ``random.choice``).
    """
    city = random.choice(cities)

    # Draw latitude noise first, then longitude noise, so a seeded run
    # always consumes the random stream in the same order.
    gen_lat = city['latitude'] + random.gauss(0, noise_std_dev)
    gen_lon = city['longitude'] + random.gauss(0, noise_std_dev)

    return gen_lat, gen_lon, city

def calculate_distance_km(lat1, lon1, lat2, lon2):
    """
    Great-circle distance between two points using the Haversine formula.

    Args:
        lat1, lon1: First point, in decimal degrees.
        lat2, lon2: Second point, in decimal degrees.

    Returns:
        Distance in kilometers on a sphere of radius 6371 km.
    """
    R = 6371 # Earth radius in kilometers

    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad

    a = math.sin(dlat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2)**2

    # Floating-point rounding can leave `a` a hair outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/asin raise ValueError.
    # Clamp it. The argument order is deliberate: with `a` first, a NaN
    # input still propagates as NaN instead of being silently turned into 0.
    a = min(max(a, 0.0), 1.0)
    c = 2 * math.asin(math.sqrt(a))

    return R * c

def select_themes(distance_to_city, threshold, common, rural, urban):
    """
    Build the theme list for one location.

    The common themes always come first. The rural themes are appended when
    the distance is strictly greater than the threshold; otherwise (including
    a distance exactly at the threshold) the urban themes are appended.
    """
    location_specific = rural if distance_to_city > threshold else urban
    return common + location_specific


In [17]:
# Simulate N_INTERVIEWS interviews: for each one, draw a location near a random
# city, classify it as rural or urban by its distance to that city, ask the LLM
# for a short Finnish interview excerpt on the matching themes, and collect one
# record per interview in `simulated_data`.
simulated_data = []

# The instruction is identical for every interview, so it is built once.
instruction = """
Olet haastattelusimulaattori. Tehtäväsi on luoda lyhyt kuvitteellinen haastattelukatkelma (muutama kappale).
Henkilö puhuu kokemuksistaan luonnosta ja linnuista.
Sisällytä vastaukseen viittauksia annettuihin teemoihin.
Älä mainitse teemoja suoraan termeillä, vaan kuvaile niihin liittyviä asioita luontevasti osana kertomusta.
Vastaa suomeksi.
"""

print(f"Starting simulation of {N_INTERVIEWS} interviews...")

for i in range(N_INTERVIEWS):
    # 1. Generate Location (the third element is the chosen city's full record)
    latitude, longitude, city = generate_location(CITIES, LOCATION_NOISE_STD_DEV)
    city_lat, city_lon = city['latitude'], city['longitude']

    # 2. Calculate distance and select themes
    distance = calculate_distance_km(latitude, longitude, city_lat, city_lon)
    themes_for_interview = select_themes(distance, DISTANCE_THRESHOLD_KM, COMMON_THEMES, RURAL_THEMES, URBAN_THEMES)
    # Same strict "greater than" comparison that select_themes applies.
    location_type = "rural" if distance > DISTANCE_THRESHOLD_KM else "urban"
    themes_joined = ', '.join(themes_for_interview)

    # 3. Prepare the per-interview part of the LLM prompt
    content = f"""
Teemat, joita tulee käsitellä implisiittisesti: {themes_joined}
"""

    # 4. Generate Interview Text using LLM
    try:
        result = generate_simple(
            instruction,
            content,
            model=GENERATION_MODEL,
            # The loop index is passed as the seed; presumably this makes each
            # generation repeatable while differing between interviews --
            # confirm against llm.generate_simple.
            seed=i
        )
        interview_text = result['message']['content']
    except Exception as e:
        # Best effort: a failed generation is reported and stored as a
        # placeholder so the remaining interviews still run.
        print(f"Error generating interview {i}: {e}")
        interview_text = f"ERROR: Could not generate text (Themes: {themes_joined})"

    # 5. Store results
    simulated_data.append({
        "interview_id": f"sim_{i:04d}",
        "latitude": latitude,
        "longitude": longitude,
        # Read the name from this iteration's city record. A bare `city_name`
        # is not defined in this cell and would either raise NameError on a
        # fresh kernel or pick up a stale value from earlier kernel state.
        "city_context": city['city'],
        "distance_to_city_km": round(distance, 2),
        "location_type_generated": location_type,
        "themes_used_for_generation": themes_for_interview,
        "interview_text": interview_text.strip()
    })

    # Print progress
    print(f"Generated interview {i + 1}/{N_INTERVIEWS}")

print("Simulation finished.")


Starting simulation of 3 interviews...
Generated interview 1/3
Generated interview 2/3
Generated interview 3/3
Simulation finished.


In [23]:
# Pretty-print every simulated record in full, including each interview text.
# With a large N_INTERVIEWS this output gets long.
from pprint import pprint
pprint(simulated_data)


[{'city_context': 'Tampere',
  'distance_to_city_km': 24.65,
  'interview_id': 'sim_0000',
  'interview_text': 'Olen aina ollut kiinnostunut elävien olentojen '
                    'tutkimisesta ja tarkkailusta. Erityisesti nuo pienet, '
                    'siivekkäät ystävämme ovat aina ottaneet haltuun mieleni. '
                    'Muistan lapsuudestani, kun istuin ikkunan alla '
                    'kuuntelemassa heidän iloisia laulujaan. Se oli niin '
                    'rauhoittavaa.\n'
                    '\n'
                    'Kun asuin kaupungissa, nämä äänet olivat harvinaisia, '
                    'mutta sitten kun muutin maaseutuun, tämä köyhyyden ja '
                    'yksinkertaisuuden maailma paljastui minulle. Aamulla '
                    'herätessäni ensimmäinen asia, joka välittyi korviini, oli '
                    'tämä kauneus, jota en voinut käsittää. Jotkut laulavat '
                    'yksin, toiset kuorossa, ja heidän äänensä sekoittuvat '
        