In [2]:
import pandas as pd
import joblib
import logging
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    median_absolute_error,
    explained_variance_score,
    mean_absolute_percentage_error,
)
import xgboost as xgb

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 1. Data Loading
# The path lives in one constant so the "not found" log line always names the
# file that was actually requested (it used to name 'houses_sqm.csv').
DATA_PATH = 'apartments_sqm.csv'

try:
    df = pd.read_csv(DATA_PATH)
    logging.info("Data loaded successfully.")
except FileNotFoundError:
    # Log for the notebook reader, then re-raise so later code never runs
    # against a missing `df`.
    logging.error(f"File '{DATA_PATH}' not found.")
    raise
except Exception as e:
    # Any other read failure is logged and propagated unchanged.
    logging.error(f"Error loading data: {e}")
    raise

# 2. Data Preprocessing

# Define feature categories
cat_features = ['state_building',
                'zip_code', 'locality', 'province']
num_features = ["total_area_sqm", "construction_year", "nbr_bedrooms"]
dummy_features = ["fl_furnished",
                  "fl_terrace", "fl_double_glazing"]

# Maximum number of distinct levels kept per categorical column; every other
# value is folded into the 'Other' bucket.
MAX_CATEGORIES = 50

# Separate features and target
X = df[num_features + dummy_features + cat_features]
y = df['price']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=535
)
# Work on explicit copies: the split frames are derived from X, and assigning
# columns on them directly triggers pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()
logging.info("Data split into training and testing sets.")

# Handle rare categories by grouping them into 'Other' and ensure all are strings.
# The frequent levels are learned from the training split only and then applied
# to both splits. value_counts() skips missing values, so they land in 'Other'
# as well and the categorical imputer below has nothing left to fill.
for cat in cat_features:
    top_categories = X_train[cat].value_counts().nlargest(MAX_CATEGORIES).index
    for frame in (X_train, X_test):
        frame[cat] = frame[cat].apply(lambda x: str(x) if x in top_categories else 'Other')
    logging.info(f"Handled rare categories in '{cat}'.")

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
        ]), cat_features),
        ('dummy', 'passthrough', dummy_features)  # Assuming these are already binary
    ]
)

# Fit and transform the training data, transform the testing data
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)
logging.info("Data preprocessing completed.")

# Retrieve feature names after encoding; the order mirrors the transformer
# order declared above (num, cat, dummy).
ohe_features = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(cat_features)
all_features = num_features + list(ohe_features) + dummy_features

# Convert to DataFrame for easier handling.
# ColumnTransformer only returns a SciPy sparse matrix when the stacked output
# is sparse enough (its `sparse_threshold`); otherwise it is already a dense
# ndarray, which has no .toarray(). Densify only when needed.
X_train_dense = X_train_enc.toarray() if hasattr(X_train_enc, "toarray") else X_train_enc
X_test_dense = X_test_enc.toarray() if hasattr(X_test_enc, "toarray") else X_test_enc
X_train_enc = pd.DataFrame(X_train_dense, columns=all_features, index=X_train.index)
X_test_enc = pd.DataFrame(X_test_dense, columns=all_features, index=X_test.index)

# 3. Model Training
# Hyper-parameters are gathered in one mapping so the configuration can be read
# (and later tuned) at a glance before the estimator is built.
xgb_params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=535,
)
model = xgb.XGBRegressor(**xgb_params)

# Fit on the encoded training matrix produced by the preprocessing step.
model.fit(X_train_enc, y_train)
logging.info("Model training completed.")

# 4. Model Evaluation
y_pred_test = model.predict(X_test_enc)

# Calculate evaluation metrics
r2 = r2_score(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)
median_ae = median_absolute_error(y_test, y_pred_test)
explained_variance = explained_variance_score(y_test, y_pred_test)

# scikit-learn's MAPE is a fraction (0.15 means 15 %) and never raises
# ZeroDivisionError: a zero target is divided by a tiny epsilon instead, which
# produces an arbitrarily large score. Detect that situation explicitly rather
# than relying on an exception that cannot occur.
mape = mean_absolute_percentage_error(y_test, y_pred_test)
if (y_test == 0).any():
    logging.warning("MAPE is unreliable: at least one actual price is zero.")

# Log evaluation metrics
logging.info(f"R-squared (Test): {r2:.4f}")
logging.info(f"Mean Absolute Error (MAE): {mae:.2f}")
logging.info(f"Median Absolute Error: {median_ae:.2f}")
logging.info(f"Explained Variance Score: {explained_variance:.4f}")
# The '%' presentation type multiplies by 100, so the fraction is shown as a
# real percentage (previously the raw fraction was printed with a '%' suffix).
logging.info(f"Mean Absolute Percentage Error (MAPE): {mape:.2%}")

# 5. Results Visualization

# Table of true vs. predicted prices, extended with the derived error columns
comparison_df = pd.DataFrame({
    'Actual Price': y_test,
    'Predicted Price': y_pred_test
})
price_gap = comparison_df['Actual Price'] - comparison_df['Predicted Price']
comparison_df['Absolute Error'] = price_gap.abs()
relative_error = comparison_df['Absolute Error'] / comparison_df['Actual Price']
comparison_df['Percentage Error'] = relative_error * 100

# Display first 10 rows
print("First 10 Rows of Comparison:")
print(comparison_df.head(10))

# One figure, two panels: scatter on the left, error histogram on the right
fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: actual vs predicted, with the y = x reference line in dashed red
sns.scatterplot(data=comparison_df, x='Actual Price', y='Predicted Price', alpha=0.5, ax=ax_scatter)
price_range = [y_test.min(), y_test.max()]
ax_scatter.plot(price_range, price_range, 'r--')
ax_scatter.set_xlabel('Actual Price')
ax_scatter.set_ylabel('Predicted Price')
ax_scatter.set_title('Actual vs Predicted Prices')

# Right panel: distribution of the percentage errors
sns.histplot(comparison_df['Percentage Error'], bins=50, kde=True, color='skyblue', ax=ax_hist)
ax_hist.set_xlabel('Percentage Error (%)')
ax_hist.set_title('Distribution of Percentage Errors')

fig.tight_layout()
plt.show()

# 6. Saving the Model
# NOTE(review): only the regressor is written out. It was fitted on the encoded
# matrix, so the fitted `preprocessor` is needed again at prediction time.
joblib.dump(model, 'appartments_xgb_model.joblib')
logging.info("Model saved as 'appartments_xgb_model.joblib'.")

2024-11-05 11:59:20,381 - ERROR - File 'houses_sqm.csv' not found.


FileNotFoundError: [Errno 2] No such file or directory: 'apartments_sqm.csv'

In [7]:
import pandas as pd
import joblib
import logging
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    median_absolute_error,
    explained_variance_score,
    mean_absolute_percentage_error,
)
import xgboost as xgb

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 1. Data Loading
# The path lives in one constant so the "not found" log line reports the full
# location that was searched, not just a hand-copied base name.
DATA_PATH = 'Data/Preprocessed/houses_sqm.csv'

try:
    df = pd.read_csv(DATA_PATH)
    logging.info("Data loaded successfully.")
except FileNotFoundError:
    # Log for the notebook reader, then re-raise so later code never runs
    # against a missing `df`.
    logging.error(f"File '{DATA_PATH}' not found.")
    raise
except Exception as e:
    # Any other read failure is logged and propagated unchanged.
    logging.error(f"Error loading data: {e}")
    raise

# 2. Data Preprocessing

# Define feature categories
cat_features = ['state_building',
                'zip_code', 'locality', 'province']
num_features = ["total_area_sqm", "construction_year", "nbr_bedrooms"]
dummy_features = ["fl_furnished",
                  "fl_terrace", "fl_double_glazing"]

# Maximum number of distinct levels kept per categorical column; every other
# value is folded into the 'Other' bucket.
MAX_CATEGORIES = 50

# Separate features and target
X = df[num_features + dummy_features + cat_features]
y = df['price']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=535
)
# Work on explicit copies: the split frames are derived from X, and assigning
# columns on them directly triggers pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()
logging.info("Data split into training and testing sets.")

# Handle rare categories by grouping them into 'Other' and ensure all are strings.
# The frequent levels are learned from the training split only and then applied
# to both splits. value_counts() skips missing values, so they land in 'Other'
# as well and the categorical imputer below has nothing left to fill.
for cat in cat_features:
    top_categories = X_train[cat].value_counts().nlargest(MAX_CATEGORIES).index
    for frame in (X_train, X_test):
        frame[cat] = frame[cat].apply(lambda x: str(x) if x in top_categories else 'Other')
    logging.info(f"Handled rare categories in '{cat}'.")

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
        ]), cat_features),
        ('dummy', 'passthrough', dummy_features)  # Assuming these are already binary
    ]
)

# Fit and transform the training data, transform the testing data
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)
logging.info("Data preprocessing completed.")

# Retrieve feature names after encoding; the order mirrors the transformer
# order declared above (num, cat, dummy).
ohe_features = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(cat_features)
all_features = num_features + list(ohe_features) + dummy_features

# Convert to DataFrame for easier handling.
# ColumnTransformer only returns a SciPy sparse matrix when the stacked output
# is sparse enough (its `sparse_threshold`); otherwise it is already a dense
# ndarray, which has no .toarray(). Densify only when needed.
X_train_dense = X_train_enc.toarray() if hasattr(X_train_enc, "toarray") else X_train_enc
X_test_dense = X_test_enc.toarray() if hasattr(X_test_enc, "toarray") else X_test_enc
X_train_enc = pd.DataFrame(X_train_dense, columns=all_features, index=X_train.index)
X_test_enc = pd.DataFrame(X_test_dense, columns=all_features, index=X_test.index)

# 3. Model Training
# Hyper-parameters are gathered in one mapping so the configuration can be read
# (and later tuned) at a glance before the estimator is built.
xgb_params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=535,
)
model = xgb.XGBRegressor(**xgb_params)

# Fit on the encoded training matrix produced by the preprocessing step.
model.fit(X_train_enc, y_train)
logging.info("Model training completed.")

# 4. Model Evaluation
y_pred_test = model.predict(X_test_enc)

# Calculate evaluation metrics
r2 = r2_score(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)
median_ae = median_absolute_error(y_test, y_pred_test)
explained_variance = explained_variance_score(y_test, y_pred_test)

# scikit-learn's MAPE is a fraction (0.15 means 15 %) and never raises
# ZeroDivisionError: a zero target is divided by a tiny epsilon instead, which
# produces an arbitrarily large score. Detect that situation explicitly rather
# than relying on an exception that cannot occur.
mape = mean_absolute_percentage_error(y_test, y_pred_test)
if (y_test == 0).any():
    logging.warning("MAPE is unreliable: at least one actual price is zero.")

# Log evaluation metrics
logging.info(f"R-squared (Test): {r2:.4f}")
logging.info(f"Mean Absolute Error (MAE): {mae:.2f}")
logging.info(f"Median Absolute Error: {median_ae:.2f}")
logging.info(f"Explained Variance Score: {explained_variance:.4f}")
# The '%' presentation type multiplies by 100, so the fraction is shown as a
# real percentage (previously the raw fraction was printed with a '%' suffix).
logging.info(f"Mean Absolute Percentage Error (MAPE): {mape:.2%}")

# 5. Results Visualization

# Table of true vs. predicted prices, extended with the derived error columns
comparison_df = pd.DataFrame({
    'Actual Price': y_test,
    'Predicted Price': y_pred_test
})
price_gap = comparison_df['Actual Price'] - comparison_df['Predicted Price']
comparison_df['Absolute Error'] = price_gap.abs()
relative_error = comparison_df['Absolute Error'] / comparison_df['Actual Price']
comparison_df['Percentage Error'] = relative_error * 100

# Display first 10 rows
print("First 10 Rows of Comparison:")
print(comparison_df.head(10))

# One figure, two panels: scatter on the left, error histogram on the right
fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: actual vs predicted, with the y = x reference line in dashed red
sns.scatterplot(data=comparison_df, x='Actual Price', y='Predicted Price', alpha=0.5, ax=ax_scatter)
price_range = [y_test.min(), y_test.max()]
ax_scatter.plot(price_range, price_range, 'r--')
ax_scatter.set_xlabel('Actual Price')
ax_scatter.set_ylabel('Predicted Price')
ax_scatter.set_title('Actual vs Predicted Prices')

# Right panel: distribution of the percentage errors
sns.histplot(comparison_df['Percentage Error'], bins=50, kde=True, color='skyblue', ax=ax_hist)
ax_hist.set_xlabel('Percentage Error (%)')
ax_hist.set_title('Distribution of Percentage Errors')

fig.tight_layout()
plt.show()

# 6. Saving the Model
# A single constant feeds both the dump and the log line so they cannot
# disagree (the message previously named 'appartments_xgb_model.joblib').
# NOTE(review): only the regressor is written out. It was fitted on the encoded
# matrix, so the fitted `preprocessor` is needed again at prediction time.
MODEL_PATH = 'houses_xgb.joblib'
joblib.dump(model, MODEL_PATH)
logging.info(f"Model saved as '{MODEL_PATH}'.")

2024-11-05 15:59:32,710 - ERROR - File 'houses_sqm.csv' not found.


FileNotFoundError: [Errno 2] No such file or directory: 'Data/Preprocessed/houses_sqm.csv'

In [2]:
import joblib

# Folder holding the serialized models (the trailing slash is required because
# the file names are appended directly)
model_directory = "C:/Users/Becode-son/Documents/Machine_Learning_Becode/ml_immoweb/Trained_Models/"

# Deserialize the three regressors. joblib.load unpickles its input, so it
# should only ever be pointed at trusted files.
apartments_model = joblib.load(model_directory + "appartments_xgb_model.joblib")
houses_model = joblib.load(model_directory + "houses_xgb.joblib")
trained_model = joblib.load(model_directory + "trained_xgb_model.joblib")

# Sanity check: show the class of every loaded object
for loaded_model in (apartments_model, houses_model, trained_model):
    print(type(loaded_model))


<class 'xgboost.sklearn.XGBRegressor'>
<class 'xgboost.sklearn.XGBRegressor'>
<class 'xgboost.sklearn.XGBRegressor'>


# Below is more granular on zip_code


In [3]:
# Report the input feature names recorded by each loaded model. The attribute
# is probed with hasattr first, so a model without it is silently skipped.
labelled_models = (
    ("Features used in apartments model:", apartments_model),
    ("Features used in houses model:", houses_model),
    ("Features used in trained model:", trained_model),
)
for heading, estimator in labelled_models:
    if hasattr(estimator, "feature_names_in_"):
        print(heading, estimator.feature_names_in_)


Features used in apartments model: ['total_area_sqm' 'construction_year' 'nbr_bedrooms' 'state_building_GOOD'
 'state_building_JUST_RENOVATED' 'state_building_MISSING'
 'state_building_TO_BE_DONE_UP' 'state_building_TO_RENOVATE'
 'state_building_TO_RESTORE' 'zip_code_1020' 'zip_code_1030'
 'zip_code_1040' 'zip_code_1050' 'zip_code_1060' 'zip_code_1070'
 'zip_code_1080' 'zip_code_1082' 'zip_code_1090' 'zip_code_1120'
 'zip_code_1140' 'zip_code_1180' 'zip_code_1190' 'zip_code_1200'
 'zip_code_1420' 'zip_code_1480' 'zip_code_1500' 'zip_code_1800'
 'zip_code_2000' 'zip_code_2018' 'zip_code_2060' 'zip_code_2100'
 'zip_code_2140' 'zip_code_2170' 'zip_code_2300' 'zip_code_2500'
 'zip_code_2600' 'zip_code_2610' 'zip_code_2640' 'zip_code_2800'
 'zip_code_3000' 'zip_code_3500' 'zip_code_4000' 'zip_code_4020'
 'zip_code_5000' 'zip_code_5100' 'zip_code_7000' 'zip_code_7060'
 'zip_code_7500' 'zip_code_7700' 'zip_code_8300' 'zip_code_8370'
 'zip_code_8400' 'zip_code_8430' 'zip_code_8500' 'zip_code_8

In [None]:
import pandas as pd
import joblib
import logging
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    median_absolute_error,
    explained_variance_score,
    mean_absolute_percentage_error,
)
import xgboost as xgb

# Configure the root logger: INFO and above, timestamped records
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# 1. Data Loading
# Read the houses dataset from the working directory; every failure is logged
# and then re-raised so the rest of the cell never runs without `df`.
try:
    df = pd.read_csv('houses_sqm.csv')
except FileNotFoundError:
    logging.error("File 'houses_sqm.csv' not found.")
    raise
except Exception as load_error:
    logging.error(f"Error loading data: {load_error}")
    raise
else:
    logging.info("Data loaded successfully.")

# 2. Data Preprocessing

# Define feature categories.
# A comma was missing between 'propertysubtype' and 'building_condition';
# Python concatenated the adjacent literals into one non-existent column name,
# which makes the column selection below fail with a KeyError.
cat_features = ['state_building', 'zip_code', 'propertysubtype', 'building_condition']
num_features = ["total_area_sqm", "construction_year", "nbr_bedrooms"]
dummy_features = ["fl_furnished", "fl_terrace", "fl_double_glazing"]

# Maximum number of distinct levels kept per categorical column; every other
# value is folded into the 'Other' bucket.
MAX_CATEGORIES = 50

# Separate features and target
X = df[num_features + dummy_features + cat_features]
y = df['price']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=535
)
# Work on explicit copies: the split frames are derived from X, and assigning
# columns on them directly triggers pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()
logging.info("Data split into training and testing sets.")

# Handle rare categories.
# The frequent levels are learned from the training split only and then applied
# to both splits. value_counts() skips missing values, so they land in 'Other'
# as well and the categorical imputer below has nothing left to fill.
for cat in cat_features:
    top_categories = X_train[cat].value_counts().nlargest(MAX_CATEGORIES).index
    for frame in (X_train, X_test):
        frame[cat] = frame[cat].apply(lambda x: str(x) if x in top_categories else 'Other')
    logging.info(f"Handled rare categories in '{cat}'.")

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
        ]), cat_features),
        ('dummy', 'passthrough', dummy_features)  # Assuming these are already binary
    ]
)

# Fit and transform the training data, transform the testing data
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)
logging.info("Data preprocessing completed.")

# Retrieve feature names after encoding; the order mirrors the transformer
# order declared above (num, cat, dummy).
ohe_features = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(cat_features)
all_features = num_features + list(ohe_features) + dummy_features

# Convert to DataFrame for easier handling.
# Convert to dense array if sparse: ColumnTransformer only returns a SciPy
# sparse matrix when the stacked output is sparse enough (its
# `sparse_threshold`); otherwise it is already a dense ndarray, which has no
# .toarray().
X_train_dense = X_train_enc.toarray() if hasattr(X_train_enc, "toarray") else X_train_enc
X_test_dense = X_test_enc.toarray() if hasattr(X_test_enc, "toarray") else X_test_enc
X_train_enc = pd.DataFrame(X_train_dense, columns=all_features, index=X_train.index)
X_test_enc = pd.DataFrame(X_test_dense, columns=all_features, index=X_test.index)

# 3. Model Training
# Hyper-parameters are gathered in one mapping so the configuration can be read
# (and later tuned) at a glance before the estimator is built.
xgb_params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=535,
)
model = xgb.XGBRegressor(**xgb_params)

# Fit on the encoded training matrix produced by the preprocessing step.
model.fit(X_train_enc, y_train)
logging.info("Model training completed.")

# 4. Model Evaluation
y_pred_test = model.predict(X_test_enc)

# Calculate evaluation metrics
r2 = r2_score(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)
median_ae = median_absolute_error(y_test, y_pred_test)
explained_variance = explained_variance_score(y_test, y_pred_test)

# scikit-learn's MAPE is a fraction (0.15 means 15 %) and never raises
# ZeroDivisionError: a zero target is divided by a tiny epsilon instead, which
# produces an arbitrarily large score. Detect that situation explicitly rather
# than relying on an exception that cannot occur.
mape = mean_absolute_percentage_error(y_test, y_pred_test)
if (y_test == 0).any():
    logging.warning("MAPE is unreliable: at least one actual price is zero.")

# Log evaluation metrics
logging.info(f"R-squared (Test): {r2:.4f}")
logging.info(f"Mean Absolute Error (MAE): {mae:.2f}")
logging.info(f"Median Absolute Error: {median_ae:.2f}")
logging.info(f"Explained Variance Score: {explained_variance:.4f}")
# The '%' presentation type multiplies by 100, so the fraction is shown as a
# real percentage (previously the raw fraction was printed with a '%' suffix).
logging.info(f"Mean Absolute Percentage Error (MAPE): {mape:.2%}")

# 5. Results Visualization
# Table of true vs. predicted prices, extended with the derived error columns
comparison_df = pd.DataFrame({
    'Actual Price': y_test,
    'Predicted Price': y_pred_test
})
price_gap = comparison_df['Actual Price'] - comparison_df['Predicted Price']
comparison_df['Absolute Error'] = price_gap.abs()
relative_error = comparison_df['Absolute Error'] / comparison_df['Actual Price']
comparison_df['Percentage Error'] = relative_error * 100

# Display first 10 rows
print("First 10 Rows of Comparison:")
print(comparison_df.head(10))

# One figure, two panels: scatter on the left, error histogram on the right
fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: actual vs predicted, with the y = x reference line in dashed red
sns.scatterplot(data=comparison_df, x='Actual Price', y='Predicted Price', alpha=0.5, ax=ax_scatter)
price_range = [y_test.min(), y_test.max()]
ax_scatter.plot(price_range, price_range, 'r--')
ax_scatter.set_xlabel('Actual Price')
ax_scatter.set_ylabel('Predicted Price')
ax_scatter.set_title('Actual vs Predicted Prices')

# Right panel: distribution of the percentage errors
sns.histplot(comparison_df['Percentage Error'], bins=50, kde=True, color='skyblue', ax=ax_hist)
ax_hist.set_xlabel('Percentage Error (%)')
ax_hist.set_title('Distribution of Percentage Errors')

fig.tight_layout()
plt.show()

# 6. Saving the Model
# A single constant feeds both the dump and the log line so they cannot
# disagree (the message previously claimed 'houses_xgb_test2.joblib' while the
# file written was 'houses_xgb.joblib').
# NOTE(review): the written file name is kept as-is. If a separate
# 'houses_xgb_test2.joblib' artifact was intended, change MODEL_PATH; as it
# stands this reuses the file name of the other houses training cell.
# NOTE(review): only the regressor is written out. It was fitted on the encoded
# matrix, so the fitted `preprocessor` is needed again at prediction time.
MODEL_PATH = 'houses_xgb.joblib'
joblib.dump(model, MODEL_PATH)
logging.info(f"Model saved as '{MODEL_PATH}'.")


2024-11-05 15:56:11,817 - ERROR - File 'houses_sqm.csv' not found.


FileNotFoundError: [Errno 2] No such file or directory: 'houses_sqm.csv'