In [20]:
import pandas as pd
from pyproj import Transformer

# Read the public-transport stop inventory; the E / N columns hold Swiss
# LV95 (CH1903+, EPSG:2056) coordinates.
df_stops = pd.read_csv(
    "/home/vincent/repos/isc-summer-school/haltestellen-oev_2056_fr.csv/PointExploitation.csv",
    sep=",",
)

# LV95 -> WGS84. always_xy=True fixes the axis order to (x, y), i.e.
# (easting, northing) in and (longitude, latitude) out.
transformer = Transformer.from_crs(
    "EPSG:2056",
    "EPSG:4326",
    always_xy=True,
)

# Project every stop in one vectorised call and keep the result next to the
# original E / N columns.
df_stops["longitude"], df_stops["latitude"] = transformer.transform(
    df_stops["E"].values,
    df_stops["N"].values,
)

# Plain (latitude, longitude) pairs, in the order the distance helper expects.
public_transport_stops = list(
    zip(df_stops["latitude"], df_stops["longitude"])
)

In [21]:
# Inspect the stops table, including the longitude / latitude columns added above.
df_stops

Unnamed: 0,xtf_id,Numero,Nom,Abreviation,EntrepriseTransport_Numero,EntrepriseTransport_Abreviation,TypePointExploitation_Code,TypePointExploitation_Designation,MoyenTransport_Code,MoyenTransport_Designation,...,Commune_Nom,Validite_DebutValidite,Validite_FinValidite,Validite_Etat,rArretSuperieur,E,N,H,longitude,latitude
0,ch14uvag00044646,8590569,"Dübendorf, Kämmaten",,560,VBG,VP,Arrêt,A,Bus,...,Dübendorf,20210401,,20250727,,2687433,1249385,484,8.596653,47.389490
1,ch14uvag00044507,8579614,"Dombresson, temple",,47,TRN,VP,Arrêt,A,Bus,...,Val-de-Ruz,20210401,,20250727,,2563688,1213566,743,6.960551,47.072110
2,ch14uvag00065200,8591257,"Zürich, Limmatplatz",,346,VBZ,VP,Arrêt,AC,Bus / Tram,...,Zürich,20210401,,20250727,,2682531,1248771,405,8.531623,47.384600
3,ch14uvag00045454,8573597,"Eschenbach SG, Twirren",,801,BRER,VP,Arrêt,A,Bus,...,Eschenbach (SG),20210401,,20250727,,2713165,1233290,484,8.933333,47.240833
4,ch14uvag00049732,8573198,"Kleinandelfingen, Bad",,7,PAG,VP,Arrêt,A,Bus,...,Kleinandelfingen,20210401,,20250727,,2693870,1273063,365,8.686885,47.601534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28817,ch14uvag00060467,8502893,"Staffelegg, Passhöhe",,7,PAG,VP,Arrêt,A,Bus,...,Densbüren,20210401,,20250727,,2646901,1253850,620,8.060317,47.433779
28818,ch14uvag00055567,8581620,"Opfershofen TG, Dorf",,7,PAG,VP,Arrêt,A,Bus,...,Bürglen (TG),20220726,,20250727,,2730673,1269101,448,9.174979,47.559612
28819,ch14uvag00054393,8505723,"Naters, Moos",,7,PAG,VP,Arrêt,A,Bus,...,Naters,20210401,,20250727,,2642065,1131699,955,7.984953,46.335368
28820,ch14uvag00043524,8587384,"Chêne-Bougeries, Gradelle",,316,TPG,VP,Arrêt,A,Bus,...,Chêne-Bougeries,20241215,,20250727,,2502893,1117837,421,6.180466,46.204965


In [23]:
import pandas as pd, numpy as np, random
from geopy.distance import geodesic

# reproducibility: seed both random streams this cell draws from
# (NumPy's global generator and the stdlib `random` module)
SEED = 42
N_ROWS = 500  # number of synthetic listings to generate

np.random.seed(SEED)
random.seed(SEED)

# ------------------------------------------------------------------
# reference coordinates
# ------------------------------------------------------------------
# campus name -> (latitude, longitude)
hesso_campuses = {
    "HES-SO Sion": (46.2270, 7.3625),
    "HES-SO Sierre": (46.2919, 7.5356),
    "HES-SO Loeche": (46.3786, 7.6265),
}

# town name -> (latitude, longitude) of the town centre.
# Insertion order matters: listings pick a town by position in this dict.
valais_locations = {
    "Sion": (46.2276, 7.3585),
    "Sierre": (46.2919, 7.5356),
    "Visp": (46.2930, 7.8836),
    "Martigny": (46.1020, 7.0723),
    "Monthey": (46.2520, 6.9486),
    "Brig": (46.3167, 7.9833),
    "Leuk": (46.3170, 7.6340),
    "Fully": (46.1381, 7.1107),
    "Conthey": (46.2170, 7.3000),
    "Vétroz": (46.2220, 7.2830),
    "Saxon": (46.1494, 7.1753),
    "St-Maurice": (46.2194, 7.0011),
    "Riddes": (46.1725, 7.2163),
}

# town name -> postal code (kept as strings); same keys as valais_locations
postal_code = {
    "Sion": "1950",
    "Sierre": "3960",
    "Visp": "3930",
    "Martigny": "1920",
    "Monthey": "1870",
    "Brig": "3900",
    "Leuk": "3953",
    "Fully": "1926",
    "Conthey": "1964",
    "Vétroz": "1963",
    "Saxon": "1907",
    "St-Maurice": "1890",
    "Riddes": "1908",
}

# ------------------------------------------------------------------
# helpers
# ------------------------------------------------------------------
def nearest_hesso(lat, lon):
    """Locate the campus in ``hesso_campuses`` closest to ``(lat, lon)``.

    Returns:
        tuple: ``(distance_km, campus_name)`` where ``distance_km`` is the
        geodesic distance to that campus rounded to two decimals. On a tie
        the campus listed first in ``hesso_campuses`` wins.
    """
    campus, km = min(
        (
            (name, geodesic((lat, lon), coords).km)
            for name, coords in hesso_campuses.items()
        ),
        key=lambda pair: pair[1],
    )
    return round(km, 2), campus

def random_surface(n_rooms):
    """Draw a random surface for a dwelling with ``n_rooms`` rooms.

    The value is sampled uniformly from a range that depends on the room
    count (up to 1 room: 15-30, up to 2 rooms: 28-45, otherwise 40-65) using
    NumPy's global random state, and rounded to one decimal.
    """
    if n_rooms <= 1.0:
        low, high = 15, 30
    elif n_rooms <= 2.0:
        low, high = 28, 45
    else:
        low, high = 40, 65
    return round(np.random.uniform(low, high), 1)

def print_stats(label, values):
    """Print a one-line min / max / mean summary.

    Args:
        label: Text shown first, left-aligned in a 12-character column.
        values: Any object exposing ``min()``, ``max()`` and ``mean()``
            (e.g. a pandas Series).
    """
    lowest = values.min()
    highest = values.max()
    average = values.mean()
    print(f"{label:<12} → min: {lowest:>4},  max: {highest:>4},  mean: {average:.1f}")

def distance_to_nearest_stop(lat, lon, stops=None):
    """Return the geodesic distance in km from ``(lat, lon)`` to the closest stop.

    Args:
        lat: Latitude of the query point, in degrees.
        lon: Longitude of the query point, in degrees.
        stops: Iterable of ``(latitude, longitude)`` pairs. When omitted, the
            module-level ``public_transport_stops`` is looked up at call
            time, so re-running the cell that builds it is picked up without
            redefining this function.

    Returns:
        float: The smallest ``geodesic(...).km`` over all stops. This is the
        same value an exhaustive scan would give.

    Raises:
        ValueError: If ``stops`` is empty or is not a sequence of pairs.
    """
    if stops is None:
        stops = public_transport_stops

    stops = list(stops)
    if not stops:
        raise ValueError("stops must contain at least one (latitude, longitude) pair")

    coords = np.asarray(stops, dtype=float)
    if coords.ndim != 2 or coords.shape[1] != 2:
        raise ValueError("stops must be an iterable of (latitude, longitude) pairs")

    # Stage 1: vectorised great-circle (haversine) distance to every stop.
    # Far cheaper than one Python-level ellipsoidal solve per stop.
    earth_radius_km = 6371.0088
    lat0, lon0 = np.radians(lat), np.radians(lon)
    stop_lats = np.radians(coords[:, 0])
    stop_lons = np.radians(coords[:, 1])
    hav = (
        np.sin((stop_lats - lat0) / 2.0) ** 2
        + np.cos(lat0) * np.cos(stop_lats) * np.sin((stop_lons - lon0) / 2.0) ** 2
    )
    approx_km = 2.0 * earth_radius_km * np.arcsin(np.sqrt(np.clip(hav, 0.0, 1.0)))

    # Stage 2: exact geodesic on the plausible winners only. Spherical and
    # WGS-84 distances differ by well under 1 %, so a 2 % margin (plus 1 m of
    # absolute slack for coincident points) cannot exclude the true nearest
    # stop.
    cutoff_km = approx_km.min() * 1.02 + 0.001
    candidates = coords[approx_km <= cutoff_km].tolist()
    return min(
        geodesic((lat, lon), (stop_lat, stop_lon)).km
        for stop_lat, stop_lon in candidates
    )
        
def make_listing():
    """Generate one synthetic rental listing as a flat dict.

    A town is picked at random from ``valais_locations`` and jittered by up
    to ±0.005° on each axis; the dwelling attributes are then drawn at random
    and combined into a price. Draws come from both the stdlib ``random``
    module and NumPy's global generator, so the result is reproducible only
    when both are seeded.

    Returns:
        dict: One row with the raw price formula (``computed``), the floored
        price (``base_price``), the noisy final price (``price_chf``) and the
        listing attributes.
    """
    # ----------------- location -----------------
    city = random.choice(list(valais_locations))
    centre_lat, centre_lon = valais_locations[city]
    lat = round(centre_lat + random.uniform(-0.005, 0.005), 6)
    lon = round(centre_lon + random.uniform(-0.005, 0.005), 6)
    prox_km, nearest = nearest_hesso(lat, lon)

    # ----------------- dwelling specs ----------
    # 80 % single rooms; the rest are spread evenly over the larger sizes.
    if random.random() < 0.8:
        num_rooms = 1
    else:
        num_rooms = random.choice([1.5, 2, 2.5, 3])
    is_room = num_rooms <= 1
    accom_type = "room" if is_room else "entire_home"
    surface_m2 = random_surface(num_rooms)

    is_furn = random.choice([True, False])
    floor = np.random.randint(0, 6)
    wifi_incl = random.choice([True, False])
    chg_incl = random.choice([True, False])
    pub_trans_km = round(distance_to_nearest_stop(lat, lon), 3)
    parking_probability = 0.150 if is_room else 0.4
    car_park = random.random() < parking_probability

    # ----------------- pricing -----------------
    # Term order is kept as-is so the floating-point sum is bit-identical.
    computed = (
        20 * surface_m2
        + 50 * chg_incl
        + 40 * wifi_incl
        + 90 * is_furn
        - min(10 * prox_km, 150)
        - min(40 * pub_trans_km, 120)
        + 500 * (not is_room)
        + 60 * car_park
    )
    price_floor = 300 if is_room else 800
    base_price = max(price_floor, round(computed))
    # Gaussian noise is added after flooring, so the final price may dip
    # below the floor.
    price = int(base_price + np.random.normal(0, 30))

    # lat, lon and pub_trans_km are already rounded above.
    listing = {
        "computed": computed,
        "base_price": base_price,
        "price_chf": price,
        "city": city,
        "postal_code": postal_code[city],
        "latitude": lat,
        "longitude": lon,
        "surface_m2": surface_m2,
        "num_rooms": num_rooms,
        "type": accom_type,
        "is_furnished": is_furn,
        "floor": floor,
        "wifi_incl": wifi_incl,
        "charges_incl": chg_incl,
        "car_park": car_park,
        "dist_public_transport_km": pub_trans_km,
        "proxim_hesso_km": prox_km,
        "nearest_hesso_name": nearest,
    }
    return listing

# ------------------------------------------------------------------
# produce dataset
# ------------------------------------------------------------------
# Generate every listing up front, then build the frame in a single step.
df = pd.DataFrame([make_listing() for _ in range(N_ROWS)])

# Summarise the three pricing stages: raw formula, floored price, noisy price.
for column in ("computed", "base_price", "price_chf"):
    print_stats(column, df[column])
print(df.head())

# The intermediate pricing columns are diagnostics only; leave them out of
# the exported dataset.
df = df.drop(columns=["computed", "base_price"])
df.to_csv("synthetic_valais_price.csv", index=False)

computed     → min: 135.8,  max: 1900.5,  mean: 632.4
base_price   → min:  300,  max: 1900,  mean: 638.7
price_chf    → min:  240,  max: 1862,  mean: 638.4
   computed  base_price  price_chf      city postal_code   latitude  \
0    422.68         423        439     Saxon        1907  46.145513   
1    597.80         598        579      Sion        1950  46.222898   
2    349.16         349        288    Vétroz        1963  46.219782   
3    279.12         300        285  Martigny        1920  46.106572   
4    193.96         300        306    Riddes        1908  46.167935   

   longitude  surface_m2  num_rooms  type  is_furnished  floor  wifi_incl  \
0   7.177716        20.6        1.0  room          True      4       True   
1   7.355686        24.0        1.0  room          True      1       True   
2   7.286693        17.3        1.0  room          True      2      False   
3   7.070666        17.1        1.0  room          True      2      False   
4   7.215894        15.3        