In [14]:
# Import libraries:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error, r2_score


In [15]:
# Define file paths:
data_folder = '/Users/xenanurbilgin/DSP-Sena-Nur-Bilgin/Data/house-prices-advanced-regression-techniques/'
# Read data
data_train = pd.read_csv(os.path.join(data_folder, 'train.csv'))
data_test = pd.read_csv(os.path.join(data_folder, 'test.csv'))
target_variable = 'SalePrice'
models_folder = '/Users/xenanurbilgin/DSP-Sena-Nur-Bilgin/Models'


# Define selected columns and features:
selected_columns = ['YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea',
                    'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea',
                    'OverallQual', 'KitchenQual', 'GarageFinish']

numeric_features = ['YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea',
                    'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea']

ordinal_features = ["OverallQual", "KitchenQual", "GarageFinish"]

In [16]:

def train_test_split_data(data: pd.DataFrame, target_variable: str, test_size: float = 0.25, random_state: int = 42):
    """Split the dataset into train and test sets."""
    X = data[selected_columns]
    y = data[target_variable]
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_valid, y_train, y_valid


def preprocess_features(data, features, strategy='mean'):
    """Preprocess features."""
    imputer = SimpleImputer(strategy=strategy)
    imputer.fit(data[features])
    return imputer


def engineer_features(data, ordinal_features, numeric_features):
    """Engineer features."""
    ordinal_encoder = OrdinalEncoder()
    ordinal_encoder.fit(data[ordinal_features])

    scaler = StandardScaler()
    scaler.fit(data[numeric_features])

    return ordinal_encoder, scaler


def filter_outliers(X, y, numeric_features, z_score_threshold=3):
    """Filter outliers."""
    z_scores = X[numeric_features].apply(lambda x: np.abs((x - x.mean()) / x.std()))
    outliers = z_scores.max(axis=1) > z_score_threshold
    return X[~outliers], y[~outliers]


def save_model_and_transformers(regressor, numeric_imputer, categorical_imputer, ordinal_encoder, scaler, models_folder):
    """Save model and transformers."""
    model_path = os.path.join(models_folder, 'model.joblib')
    numeric_imputer_path = os.path.join(models_folder, 'numeric_imputer.joblib')
    categorical_imputer_path = os.path.join(models_folder, 'categorical_imputer.joblib')
    encoder_path = os.path.join(models_folder, 'ordinal_encoder.joblib')
    scaler_path = os.path.join(models_folder, 'scaler.joblib')

    joblib.dump(regressor, model_path)
    joblib.dump(numeric_imputer, numeric_imputer_path)
    joblib.dump(categorical_imputer, categorical_imputer_path)
    joblib.dump(ordinal_encoder, encoder_path)
    joblib.dump(scaler, scaler_path)


def load_model_and_transformers(models_folder):
    """Load model and transformers."""
    model_path = os.path.join(models_folder, 'model.joblib')
    numeric_imputer_path = os.path.join(models_folder, 'numeric_imputer.joblib')
    categorical_imputer_path = os.path.join(models_folder, 'categorical_imputer.joblib')
    encoder_path = os.path.join(models_folder, 'ordinal_encoder.joblib')
    scaler_path = os.path.join(models_folder, 'scaler.joblib')

    regressor = joblib.load(model_path)
    numeric_imputer = joblib.load(numeric_imputer_path)
    categorical_imputer = joblib.load(categorical_imputer_path)
    ordinal_encoder = joblib.load(encoder_path)
    scaler = joblib.load(scaler_path)

    return regressor, numeric_imputer, categorical_imputer, ordinal_encoder, scaler


def preprocess_new_data(new_data, ordinal_features, numeric_features, numeric_imputer, categorical_imputer, ordinal_encoder, scaler):
    """Preprocess new data."""
    new_data[ordinal_features] = categorical_imputer.transform(new_data[ordinal_features])
    new_data[numeric_features] = numeric_imputer.transform(new_data[numeric_features])
    new_data[ordinal_features] = ordinal_encoder.transform(new_data[ordinal_features])
    new_data[numeric_features] = scaler.transform(new_data[numeric_features])
    return new_data


def evaluate_model(y_true, y_pred):
    """Evaluate model."""
    rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
    mse = mean_squared_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {
        'RMSLE': round(rmsle, 3),
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2-score': round(r2, 2)
    }


def transform_data(data_train, data_valid, ordinal_features, numeric_features):
    """Transform data."""
    categorical_imputer = preprocess_features(data_train, ordinal_features, strategy='most_frequent')
    data_train[ordinal_features] = categorical_imputer.transform(data_train[ordinal_features])
    data_valid[ordinal_features] = categorical_imputer.transform(data_valid[ordinal_features])
    numeric_imputer = preprocess_features(data_train, numeric_features)
    data_train[numeric_features] = numeric_imputer.transform(data_train[numeric_features])
    data_valid[numeric_features] = numeric_imputer.transform(data_valid[numeric_features])
    ordinal_encoder, scaler = engineer_features(data_train, ordinal_features, numeric_features)
    data_train[ordinal_features] = ordinal_encoder.transform(data_train[ordinal_features])
    data_valid[ordinal_features] = ordinal_encoder.transform(data_valid[ordinal_features])
    data_train[numeric_features] = scaler.transform(data_train[numeric_features])
    data_valid[numeric_features] = scaler.transform(data_valid[numeric_features])
    return data_train, data_valid


def build_model(data: pd.DataFrame, target_variable: str, models_folder):
    """Build model."""
    X_train, X_valid, y_train, y_valid = train_test_split_data(data, target_variable)
    X_train, y_train = filter_outliers(X_train, y_train, numeric_features)
    X_valid, y_valid = filter_outliers(X_valid, y_valid, numeric_features)
    transformed_X_train, transformed_X_valid = transform_data(X_train.copy(), X_valid.copy(), ordinal_features, numeric_features)
    y_train_log, y_valid_log = np.log(y_train), np.log(y_valid)
    model = LinearRegression()
    model.fit(transformed_X_train, y_train_log)
    y_valid_pred_log = model.predict(transformed_X_valid)
    y_valid_pred = np.exp(y_valid_pred_log)
    evaluation_results = evaluate_model(y_valid, y_valid_pred)
    save_model_and_transformers(model, preprocess_features(data, numeric_features),  # Pass data instead of data[numeric_features]
                                preprocess_features(data, ordinal_features, strategy='most_frequent'),  # Pass data and strategy
                                *engineer_features(data, ordinal_features, numeric_features), 
                                models_folder)
    return evaluation_results



def make_prediction(data_test, model_path):
    """Make predictions."""
    regressor, numeric_imputer, categorical_imputer, ordinal_encoder, scaler = load_model_and_transformers(model_path)
    X_test = data_test[selected_columns].copy()
    z_score_threshold = 3
    z_scores = X_test[numeric_features].apply(lambda x: np.abs((x - x.mean()) / x.std()))
    outliers = z_scores.max(axis=1) > z_score_threshold
    X_test = X_test[~outliers]
    X_test_transformed = preprocess_new_data(X_test, ordinal_features, numeric_features, 
                                              numeric_imputer, categorical_imputer, ordinal_encoder, scaler)
    predictions = regressor.predict(X_test_transformed)
    return np.exp(predictions)





In [18]:
if __name__ == "__main__":
    target_variable = 'SalePrice'  # Example target variable
    evaluation_results = build_model(data_train, target_variable, models_folder)
    print("Evaluation Results:")
    for metric, value in evaluation_results.items():
        print(f"{metric}: {value}")

    predictions = make_prediction(data_test, models_folder)
    print("Predictions:")
    print(predictions)


Evaluation Results:
RMSLE: 0.152
MSE: 584840515.8266134
RMSE: 24183.476090641176
MAE: 17501.766050725353
R2-score: 0.89
Predictions:
[120740.4717788  139156.00190348 172068.71639766 ... 154993.39396393
 118444.19652196 223011.3201104 ]


