In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import pickle
from sklearn.model_selection import cross_val_score
from geopy.distance import geodesic


In [2]:

# Load the data
df_bfs_data = pd.read_csv('apartments_data_enriched_lat_lon_combined.csv', sep=',', encoding='utf-8')


In [3]:

# Define train stations in Zurich canton
train_stations = {
    "Zürich HB": (47.378177, 8.540192),
    "Zürich Oerlikon": (47.4116, 8.5446),
    "Zürich Altstetten": (47.3913, 8.4850),
    "Zürich Enge": (47.3643, 8.5312),
    "Zürich Stadelhofen": (47.3663, 8.5485),
    "Winterthur": (47.4998, 8.7257),
    "Dietikon": (47.4052, 8.4009),
    "Uster": (47.3476, 8.7207),
    "Dübendorf": (47.3978, 8.6189),
    "Wetzikon": (47.3275, 8.7976),
    "Bülach": (47.5202, 8.5385),
    "Meilen": (47.2699, 8.6458),
    "Thalwil": (47.2911, 8.5647),
    "Horgen": (47.2597, 8.5975)
}


In [4]:

# Function to calculate distance to nearest train station
def get_nearest_station_distance(lat, lon):
    return min([geodesic((lat, lon), coords).km for coords in train_stations.values()])

# Function to get name of nearest train station
def get_nearest_station_name(lat, lon):
    return min(train_stations.keys(), key=lambda station: geodesic((lat, lon), train_stations[station]).km)

# Calculate distances and add as new features
df_bfs_data['distance_to_train_station'] = df_bfs_data.apply(
    lambda row: get_nearest_station_distance(row['lat'], row['lon']), axis=1)
df_bfs_data['nearest_train_station'] = df_bfs_data.apply(
    lambda row: get_nearest_station_name(row['lat'], row['lon']), axis=1)


In [5]:

# Model performance evaluation function
def model_performance(features, df_bfs_data, random_forest_model=RandomForestRegressor(random_state=42)):
    df_bfs_data = df_bfs_data.sample(frac=1, random_state=42)
    X, y = df_bfs_data[features], df_bfs_data['price']
    scores = cross_val_score(random_forest_model, X, y, scoring="neg_root_mean_squared_error", cv=5)
    print('CV results RMSE: ', np.round(scores))
    print('Mean RMSE: ', np.mean(np.round(scores, 0)))
    return scores


In [6]:

# Price prediction function
def predict_price(model, rooms, area, pop, pop_dens, frg_pct, emp, tax_income, distance_to_train_station):
    input_data = np.array([[rooms, area, pop, pop_dens, frg_pct, emp, tax_income, distance_to_train_station]])
    return model.predict(input_data)[0]


In [7]:

# Define features for the old and new models
old_features = ['rooms', 'area', 'pop', 'pop_dens', 'frg_pct', 'emp', 'tax_income']
new_features = old_features + ['distance_to_train_station']


In [8]:

# Define location dictionary
locations = {
    "Zürich": 261,
    "Kloten": 62,
    "Uster": 198,
    "Illnau-Effretikon": 296,
    "Feuerthalen": 27,
    "Pfäffikon": 177,
    "Ottenbach": 11,
    "Dübendorf": 191,
    "Richterswil": 138,
    "Maur": 195,
    "Embrach": 56,
    "Bülach": 53,
    "Winterthur": 230,
    "Oetwil am See": 157,
    "Russikon": 178,
    "Obfelden": 10,
    "Wald (ZH)": 120,
    "Niederweningen": 91,
    "Dällikon": 84,
    "Buchs (ZH)": 83,
    "Rüti (ZH)": 118,
    "Hittnau": 173,
    "Bassersdorf": 52,
    "Glattfelden": 58,
    "Opfikon": 66,
    "Hinwil": 117,
    "Regensberg": 95,
    "Langnau am Albis": 136,
    "Dietikon": 243,
    "Erlenbach (ZH)": 151,
    "Kappel am Albis": 6,
    "Stäfa": 158,
    "Zell (ZH)": 231,
    "Turbenthal": 228,
    "Oberglatt": 92,
    "Winkel": 72,
    "Volketswil": 199,
    "Kilchberg (ZH)": 135,
    "Wetzikon (ZH)": 121,
    "Zumikon": 160,
    "Weisslingen": 180,
    "Elsau": 219,
    "Hettlingen": 221,
    "Rüschlikon": 139,
    "Stallikon": 13,
    "Dielsdorf": 86,
    "Wallisellen": 69,
    "Dietlikon": 54,
    "Meilen": 156,
    "Wangen-Brüttisellen": 200,
    "Flaach": 28,
    "Regensdorf": 96,
    "Niederhasli": 90,
    "Bauma": 297,
    "Aesch (ZH)": 241,
    "Schlieren": 247,
    "Dürnten": 113,
    "Unterengstringen": 249,
    "Gossau (ZH)": 115,
    "Oberengstringen": 245,
    "Schleinikon": 98,
    "Aeugst am Albis": 1,
    "Rheinau": 38,
    "Höri": 60,
    "Rickenbach (ZH)": 225,
    "Rafz": 67,
    "Adliswil": 131,
    "Zollikon": 161,
    "Urdorf": 250,
    "Hombrechtikon": 153,
    "Birmensdorf (ZH)": 242,
    "Fehraltorf": 172,
    "Weiach": 102,
    "Männedorf": 155,
    "Küsnacht (ZH)": 154,
    "Hausen am Albis": 4,
    "Hochfelden": 59,
    "Fällanden": 193,
    "Greifensee": 194,
    "Mönchaltorf": 196,
    "Dägerlen": 214,
    "Thalheim an der Thur": 39,
    "Uetikon am See": 159,
    "Seuzach": 227,
    "Uitikon": 248,
    "Affoltern am Albis": 2,
    "Geroldswil": 244,
    "Niederglatt": 89,
    "Thalwil": 141,
    "Rorbas": 68,
    "Pfungen": 224,
    "Weiningen (ZH)": 251,
    "Bubikon": 112,
    "Neftenbach": 223,
    "Mettmenstetten": 9,
    "Otelfingen": 94,
    "Flurlingen": 29,
    "Stadel": 100,
    "Grüningen": 116,
    "Henggart": 31,
    "Dachsen": 25,
    "Bonstetten": 3,
    "Bachenbülach": 51,
    "Horgen": 295
}


In [9]:

# Train and evaluate models (optional - can be commented out if model is already saved)
print("Evaluating models...")
print("Performance with original features:")
old_scores = model_performance(old_features, df_bfs_data)
print("\nPerformance with added train station distance feature:")
new_scores = model_performance(new_features, df_bfs_data)

# Train the model with the new features
final_model = RandomForestRegressor(random_state=42)
X = df_bfs_data[new_features]
y = df_bfs_data['price']
final_model.fit(X, y)


Evaluating models...
Performance with original features:
CV results RMSE:  [-837. -687. -749. -891. -629.]
Mean RMSE:  -758.6

Performance with added train station distance feature:
CV results RMSE:  [-781. -600. -682. -838. -589.]
Mean RMSE:  -698.0


In [10]:

# Save the new model
try:
    with open('random_forest_regression_with_station.pkl', 'wb') as f:
        pickle.dump(final_model, f)
    print('\nNew model trained and saved successfully with', final_model.n_features_in_, 'features')
    print('Features used:', new_features)
except Exception as e:
    print(f"Error saving model: {e}")



New model trained and saved successfully with 8 features
Features used: ['rooms', 'area', 'pop', 'pop_dens', 'frg_pct', 'emp', 'tax_income', 'distance_to_train_station']


In [11]:

# Load the trained model
try:
    with open('random_forest_regression_with_station.pkl', 'rb') as f:
        random_forest_model = pickle.load(f)
    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Using the newly trained model instead")
    random_forest_model = final_model


Model loaded successfully


In [12]:

# Define the core prediction function
def predict_apartment(rooms, area, town):
    if town not in locations:
        print(f"Town '{town}' not found in locations dictionary")
        return -1
        
    bfs_number = locations[town]
    
    # Get municipality data 
    municipality_data = df_bfs_data[df_bfs_data['bfs_number'] == bfs_number]
    
    if len(municipality_data) == 0:
        print(f"No data found for BFS number {bfs_number} (town: {town})")
        return -1
    
    # Create a copy of the first row from this municipality
    prediction_row = municipality_data.iloc[0:1].copy()
    
    # Update with our input parameters
    prediction_row.loc[prediction_row.index[0], 'rooms'] = rooms
    prediction_row.loc[prediction_row.index[0], 'area'] = area
    
    # Ensure we have the distance_to_train_station feature
    if 'distance_to_train_station' not in prediction_row.columns and 'lat' in prediction_row.columns and 'lon' in prediction_row.columns:
        prediction_row['distance_to_train_station'] = prediction_row.apply(
            lambda row: get_nearest_station_distance(row['lat'], row['lon']), axis=1)
    
    # Check that we have all required features
    required_features = ['rooms', 'area', 'pop', 'pop_dens', 'frg_pct', 'emp', 'tax_income', 'distance_to_train_station']
    missing_features = [feat for feat in required_features if feat not in prediction_row.columns]
    
    if missing_features:
        print(f"Missing features for prediction: {missing_features}")
        return -1
        # Make prediction
    try:
        prediction = random_forest_model.predict(prediction_row[required_features])
        return np.round(prediction[0], 0)
    except Exception as e:
        print(f"Error making prediction: {e}")
        print(f"Available columns: {prediction_row.columns.tolist()}")
        return -1

    


In [13]:

# Example usage
if __name__ == "__main__":
    # Print some data for diagnostic purposes
    print("\nFirst few rows of the dataset:")
    print(df_bfs_data.head(2))
    print("\nColumns available:")
    print(df_bfs_data.columns.tolist())
    
    # Example predictions
    test_towns = ['Zürich', 'Winterthur', 'Uster']
    rooms = 3
    area = 100
    
    print("\nTesting predictions:")
    for town in test_towns:
        price = predict_apartment(rooms, area, town)
        if price != -1:
            print(f"Predicted price for a {rooms}-room, {area}m² apartment in {town}: CHF {price}")
        else:
            print(f"Failed to predict price for {town}")
            
   # Generate more predictions with different parameters
    print("\nVarying parameters in Zürich:")
    for r in [1, 2, 3, 4, 5]:
        for a in [50, 75, 100, 125]:
            price = predict_apartment(r, a, 'Zürich')
            if price != -1:
                print(f"{r} rooms, {a}m²: CHF {price}")


First few rows of the dataset:
   bfs_number  rooms  area  price  postalcode  \
0         261    4.5   148   4180        8050   
1         261    2.0   122   3190        8050   

                                address     town  \
0  Schaffhauserstrasse 363, 8050 Zürich   Zürich   
1         Max Bill Platz 5, 8050 Zürich   Zürich   

                                     description_raw bfs_name     pop  \
0  ««Renovierte 4.5-Zimmerwohnung an zentraler La...   Zürich  420217   
1          «Modernes Wohnen im Zentrum von Oerlikon»   Zürich  420217   

      pop_dens    frg_pct       emp  tax_income        lat       lon  \
0  4778.994655  32.458468  491193.0       85446  47.411068  8.546547   
1  4778.994655  32.458468  491193.0       85446  47.413803  8.539404   

               x           y  distance_to_train_station nearest_train_station  
0  251729.250000  683615.375                   0.158395       Zürich Oerlikon  
1  252026.109375  683072.000                   0.462351       Züri