In [21]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Datensatz laden
data = pd.read_csv('data_immo_geocoded.csv')

# Spaltennamen anzeigen
print("Spaltennamen:", data.columns)

# Entferne die ID-Spalte, falls vorhanden
if 'id' in data.columns:
    data = data.drop(columns=['id'])

# Entferne Hausnummern aus der 'Street'-Spalte
data['Street'] = data['Street'].str.replace(r'\d+', '', regex=True).str.strip()

# Entferne Ausreißer in der 'Price'-Spalte mittels IQR
Q1 = data['Price'].quantile(0.25)
Q3 = data['Price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
filtered_data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)]

# Filtere alle fehlenden Werte heraus
filtered_data = filtered_data.dropna()

# Initialize encoders
onehot_encoder = OneHotEncoder(drop='first')
label_encoder = LabelEncoder()

# Encode 'Street' using label encoding
filtered_data['Street'] = label_encoder.fit_transform(filtered_data['Street'])

# Definiere die Preprocessing Pipeline
def get_preprocessor(include_coordinates):
    numerical_features = ['Rooms', 'Area']
    categorical_features = ['Street', 'Region', 'City']
    if include_coordinates:
        numerical_features.extend(['Latitude', 'Longitude'])
    return ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numerical_features),
            ('cat', onehot_encoder, categorical_features)
        ],
        remainder='passthrough'
    )

# Definiere Modelle
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Decision Tree': DecisionTreeRegressor()
}

# Funktionen zur Bewertung der Modelle
def evaluate_model(X, y, model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    r2 = r2_score(y_test, y_pred)
    return mae, rmse, r2

# Vergleiche Modelle ohne Koordinaten
print("Vergleich ohne Koordinaten:")
X = filtered_data.drop(columns=['Price', 'Latitude', 'Longitude'])
y = filtered_data['Price']
preprocessor = get_preprocessor(include_coordinates=False)
X_transformed = preprocessor.fit_transform(X)
results_without_coordinates = {}

for name, model in models.items():
    mae, rmse, r2 = evaluate_model(X_transformed, y, model)
    results_without_coordinates[name] = (mae, rmse, r2)
    print(f'{name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, R^2: {r2:.2f}')

# Vergleiche Modelle mit Koordinaten
print("\nVergleich mit Koordinaten:")
X = filtered_data.drop(columns=['Price'])
y = filtered_data['Price']
preprocessor = get_preprocessor(include_coordinates=True)
X_transformed = preprocessor.fit_transform(X)
results_with_coordinates = {}

for name, model in models.items():
    mae, rmse, r2 = evaluate_model(X_transformed, y, model)
    results_with_coordinates[name] = (mae, rmse, r2)
    print(f'{name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, R^2: {r2:.2f}')

# Bereinigten Datensatz speichern
filtered_data.to_csv('cleaned_data.csv', index=False)


Spaltennamen: Index(['id', 'Region', 'Rooms', 'Area', 'Price', 'Street', 'Zip', 'City',
       'Latitude', 'Longitude'],
      dtype='object')
Vergleich ohne Koordinaten:
Linear Regression - MAE: 187.41, RMSE: 272.33, R^2: 0.74
Random Forest - MAE: 172.05, RMSE: 259.34, R^2: 0.77
Gradient Boosting - MAE: 221.86, RMSE: 299.80, R^2: 0.69
Decision Tree - MAE: 176.29, RMSE: 295.10, R^2: 0.70

Vergleich mit Koordinaten:
Linear Regression - MAE: 187.90, RMSE: 272.53, R^2: 0.74
Random Forest - MAE: 167.15, RMSE: 248.97, R^2: 0.79
Gradient Boosting - MAE: 218.42, RMSE: 294.99, R^2: 0.70
Decision Tree - MAE: 184.72, RMSE: 309.28, R^2: 0.67


In [22]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

# Datensatz laden
data = pd.read_csv('data_immo_geocoded.csv')

# Spaltennamen anzeigen
print("Spaltennamen:", data.columns)

# Entferne die ID-Spalte, falls vorhanden
if 'id' in data.columns:
    data = data.drop(columns=['id'])

# Entferne Hausnummern aus der 'Street'-Spalte
data['Street'] = data['Street'].str.replace(r'\d+', '', regex=True).str.strip()

# Entferne Ausreißer in der 'Price'-Spalte mittels IQR
Q1 = data['Price'].quantile(0.25)
Q3 = data['Price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
filtered_data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)]


filtered_data = filtered_data.dropna()


label_encoder = LabelEncoder()
filtered_data['Street'] = label_encoder.fit_transform(filtered_data['Street'])


kmeans = KMeans(n_clusters=10, random_state=42)
filtered_data['Zone'] = kmeans.fit_predict(filtered_data[['Latitude', 'Longitude']])


onehot_encoder = OneHotEncoder(drop='first')

# Definiere die Preprocessing Pipeline
def get_preprocessor():
    numerical_features = ['Rooms', 'Area']
    categorical_features = ['Street', 'Region', 'City', 'Zone']
    return ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numerical_features),
            ('cat', onehot_encoder, categorical_features)
        ],
        remainder='passthrough'
    )

# Definiere das Modell
model = RandomForestRegressor()

# Funktionen zur Bewertung des Modells
def evaluate_model(X, y, model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    r2 = r2_score(y_test, y_pred)
    return mae, rmse, r2

# Modelle bewerten
print("Vergleich mit Zonen (K-Means Clustering):")
X = filtered_data.drop(columns=['Price', 'Latitude', 'Longitude'])
y = filtered_data['Price']
preprocessor = get_preprocessor()
X_transformed = preprocessor.fit_transform(X)

mae, rmse, r2 = evaluate_model(X_transformed, y, model)
print(f'Random Forest - MAE: {mae:.2f}, RMSE: {rmse:.2f}, R^2: {r2:.2f}')


Spaltennamen: Index(['id', 'Region', 'Rooms', 'Area', 'Price', 'Street', 'Zip', 'City',
       'Latitude', 'Longitude'],
      dtype='object')
Vergleich mit Zonen (K-Means Clustering):
Random Forest - MAE: 170.03, RMSE: 256.09, R^2: 0.77


In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

# Custom transformer for KMeans clustering
class KMeansTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, random_state=42):
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
    
    def fit(self, X, y=None):
        self.kmeans.fit(X[['Latitude', 'Longitude']])
        return self
    
    def transform(self, X):
        zones = self.kmeans.predict(X[['Latitude', 'Longitude']])
        X = X.copy()
        X['Zone'] = zones
        return X

# Custom transformer for Label Encoding 'Street'
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.classes_ = None
    
    def fit(self, X, y=None):
        self.label_encoder.fit(np.append(X, 'unknown'))
        self.classes_ = self.label_encoder.classes_
        return self
    
    def transform(self, X):
        X_encoded = X.apply(lambda x: x if x in self.classes_ else 'unknown')
        return self.label_encoder.transform(X_encoded)
    
    def inverse_transform(self, X):
        return self.label_encoder.inverse_transform(X)

# Load dataset
data = pd.read_csv('data_immo_geocoded.csv')

# Display column names
print("Column names:", data.columns)

# Remove the ID column if present
if 'id' in data.columns:
    data = data.drop(columns=['id'])

# Remove house numbers from the 'Street' column
data['Street'] = data['Street'].str.replace(r'\d+', '', regex=True).str.strip()

# Remove outliers in the 'Price' column using IQR
Q1 = data['Price'].quantile(0.25)
Q3 = data['Price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
filtered_data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)]

# Remove all missing values
filtered_data = filtered_data.dropna()

# Encode 'Street' using custom label encoding
label_encoder = CustomLabelEncoder()
filtered_data['Street'] = label_encoder.fit_transform(filtered_data['Street'])

# Initialize encoders
onehot_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

# Define the Preprocessing Pipeline
def get_preprocessor():
    numerical_features = ['Rooms', 'Area']
    categorical_features = ['Street', 'Region', 'City', 'Zone', 'Zip']
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numerical_features),
            ('cat', onehot_encoder, categorical_features)
        ],
        remainder='passthrough'
    )
    return preprocessor

# Define the model
model = RandomForestRegressor(n_jobs=-1)

# Cross-Validation
print("Cross-Validation with Zones (K-Means Clustering):")
X = filtered_data.drop(columns=['Price'])
y = filtered_data['Price']

# Create the pipeline with KMeansTransformer
pipeline = Pipeline(steps=[
    ('kmeans', KMeansTransformer()),
    ('preprocessor', get_preprocessor()),
    ('model', model)
])

# Define Scoring Metrics
scoring = {
    'MAE': make_scorer(mean_absolute_error),
    'RMSE': make_scorer(mean_squared_error, greater_is_better=False),
    'R2': make_scorer(r2_score)
}

# Perform Cross-Validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_val_score(pipeline, X, y, cv=cv, scoring='neg_mean_absolute_error')
cv_rmse_results = cross_val_score(pipeline, X, y, cv=cv, scoring='neg_mean_squared_error')
cv_r2_results = cross_val_score(pipeline, X, y, cv=cv, scoring='r2')

# Display Results
print(f"MAE: {np.mean(-cv_results):.2f}")
print(f"RMSE: {np.sqrt(np.mean(-cv_rmse_results)):.2f}")
print(f"R2: {np.mean(cv_r2_results):.2f}")

# Training the pipeline with the full data
pipeline.fit(X, y)

# Save the trained pipeline
joblib.dump(pipeline, 'real_estate_pipeline.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

Column names: Index(['id', 'Region', 'Rooms', 'Area', 'Price', 'Street', 'Zip', 'City',
       'Latitude', 'Longitude'],
      dtype='object')
Cross-Validation with Zones (K-Means Clustering):




MAE: 163.95
RMSE: 246.59
R2: 0.79


['label_encoder.pkl']

In [2]:
import pandas as pd
import re
import joblib

# Load the trained pipeline and label encoder
pipeline = joblib.load('real_estate_pipeline.pkl')
label_encoder = joblib.load('label_encoder.pkl')

# Function to make predictions with new data
def predict_new_data(new_data):
    new_data = new_data.copy()
    new_data['Street'] = label_encoder.transform(new_data['Street'].str.replace(r'\d+', '', regex=True).str.strip())
    return pipeline.predict(new_data)

# Example usage of the prediction function with the required new data
new_data = pd.DataFrame({
    'Rooms': [1.5, 4, 5.5,3.5],
    'Area': [30, 79, 115, 80],
    'Street': ['Segnesstrasse', 'Albisstrasse','Puentstrasse', 'Kirchrain'],
    'Region': ['Zuerich', 'Zuerich','Zuerich','Zuerich'],
    'City': ['Zuerich', 'Thalwil','Horgen', 'Hirzel'],
    'Latitude': [47.3887, 47.28947945131419,47.25566413947244,47.21792114153681],
    'Longitude': [8.4855, 8.562855462704258,8.59696745636132, 8.609008997637604],
    'Zip': ['8048', '8800','8810','8816']
})

predictions = predict_new_data(new_data)
print("Predictions for new data:", predictions)

Predictions for new data: [1859.46166667 2031.45       2777.21       2188.09      ]




In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb
import joblib

# Custom transformer for KMeans clustering
class KMeansTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, random_state=42):
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
    
    def fit(self, X, y=None):
        self.kmeans.fit(X[['Latitude', 'Longitude']])
        return self
    
    def transform(self, X):
        zones = self.kmeans.predict(X[['Latitude', 'Longitude']])
        X = X.copy()
        X['Zone'] = zones
        return X

# Custom transformer for Label Encoding 'Street'
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.classes_ = None
    
    def fit(self, X, y=None):
        self.label_encoder.fit(np.append(X, 'unknown'))
        self.classes_ = self.label_encoder.classes_
        return self
    
    def transform(self, X):
        X_encoded = X.apply(lambda x: x if x in self.classes_ else 'unknown')
        return self.label_encoder.transform(X_encoded)
    
    def inverse_transform(self, X):
        return self.label_encoder.inverse_transform(X)

# Load dataset
data = pd.read_csv('data_immo_geocoded.csv')

# Display column names
print("Column names:", data.columns)

# Remove the ID column if present
if 'id' in data.columns:
    data = data.drop(columns=['id'])

# Remove house numbers from the 'Street' column
data['Street'] = data['Street'].str.replace(r'\d+', '', regex=True).str.strip()

# Remove outliers in the 'Price' column using IQR
Q1 = data['Price'].quantile(0.25)
Q3 = data['Price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
filtered_data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)]

# Remove all missing values
filtered_data = filtered_data.dropna()

# Encode 'Street' using custom label encoding
label_encoder = CustomLabelEncoder()
filtered_data['Street'] = label_encoder.fit_transform(filtered_data['Street'])

# Initialize encoders
onehot_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

# Define the Preprocessing Pipeline
def get_preprocessor():
    numerical_features = ['Rooms', 'Area']
    categorical_features = ['Street', 'Region', 'City', 'Zone', 'Zip']
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numerical_features),
            ('cat', onehot_encoder, categorical_features)
        ],
        remainder='passthrough'
    )
    return preprocessor

# Define the model using XGBoost
model = xgb.XGBRegressor(n_jobs=-1, random_state=42)

# Create the pipeline with KMeansTransformer
pipeline = Pipeline(steps=[
    ('kmeans', KMeansTransformer()),
    ('preprocessor', get_preprocessor()),
    ('model', model)
])

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0]
}

# Define Scoring Metrics
scoring = {
    'MAE': make_scorer(mean_absolute_error),
    'RMSE': make_scorer(mean_squared_error, greater_is_better=False),
    'R2': make_scorer(r2_score)
}

# Load dataset and prepare features and target
X = filtered_data.drop(columns=['Price'])
y = filtered_data['Price']

# Perform Grid Search with Cross-Validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_search.fit(X, y)

# Display best parameters and results
print(f"Best parameters found: {grid_search.best_params_}")
best_model = grid_search.best_estimator_
cv_results = cross_val_score(best_model, X, y, cv=cv, scoring='neg_mean_absolute_error')
cv_rmse_results = cross_val_score(best_model, X, y, cv=cv, scoring='neg_mean_squared_error')
cv_r2_results = cross_val_score(best_model, X, y, cv=cv, scoring='r2')

print(f"Best MAE: {np.mean(-cv_results):.2f}")
print(f"Best RMSE: {np.sqrt(np.mean(-cv_rmse_results)):.2f}")
print(f"Best R2: {np.mean(cv_r2_results):.2f}")

# Training the pipeline with the full data
best_model.fit(X, y)

# Save the trained pipeline
joblib.dump(best_model, 'real_estate_pipeline_xgb_tuned.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')


Column names: Index(['id', 'Region', 'Rooms', 'Area', 'Price', 'Street', 'Zip', 'City',
       'Latitude', 'Longitude'],
      dtype='object')
Best parameters found: {'model__colsample_bytree': 1.0, 'model__learning_rate': 0.2, 'model__max_depth': 7, 'model__n_estimators': 300, 'model__subsample': 0.8}




Best MAE: 182.19
Best RMSE: 250.79
Best R2: 0.78


['label_encoder.pkl']