In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from datetime import datetime
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from mlflow.tracking import MlflowClient


In [None]:
# MLflow connection settings, kept as named constants so they are easy to spot and change.
MLFLOW_TRACKING_URI = "https://66e509f05a9319f04690f279.bm-east.lab.poridhi.io/proxy/5000/"  # Replace with your MLflow server URI
EXPERIMENT_NAME = "House Price Prediction lab 10"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)


In [None]:
# Load the training split from the local dataset folder and preview the first rows.
train_data = pd.read_csv('./house-prices-advanced-regression-techniques/train.csv')
train_data.head()


In [None]:
# Column dtypes and non-null counts for the training frame.
train_data.info()


In [None]:
# NOTE(review): a notebook cell renders only its last expression, so the per-column
# null counts computed on the next line are discarded and only the shape is shown.
train_data.isnull().sum()
train_data.shape


In [None]:
# Remove these six columns from the training frame (mutates train_data in place).
train_data.drop(
    columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Id', 'GarageYrBlt'],
    inplace=True,
)


In [None]:
# Replace missing values in each listed training column with that column's
# most frequent value (first mode).
for mode_col in ['LotFrontage', 'BsmtCond', 'BsmtQual', 'FireplaceQu',
                 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    train_data[mode_col] = train_data[mode_col].fillna(train_data[mode_col].mode()[0])


In [None]:
# Total number of missing cells still left in the training frame.
train_data.isnull().sum().sum()


In [None]:
import seaborn as sns
# Plot the boolean null mask of the training frame to see where values are still missing.
sns.heatmap(train_data.isnull(),yticklabels=False,cbar=False,cmap='coolwarm')


In [None]:
# Second imputation pass on the training frame: fill each listed column with its
# most frequent value (first mode).
for mode_col in ['MasVnrType', 'MasVnrArea', 'BsmtExposure', 'BsmtFinType2']:
    train_data[mode_col] = train_data[mode_col].fillna(train_data[mode_col].mode()[0])


In [None]:
# Re-plot the null mask after imputation (drawn before the rows below are removed).
sns.heatmap(train_data.isnull(), yticklabels=False, cbar=False, cmap='YlGnBu')
# Drop every row that still contains a null, in place. The combined frame is later
# split positionally into train/test rows, so the row count left here matters.
train_data.dropna(inplace=True)


In [None]:
# Load the test split from the local dataset folder and preview the first rows.
test_data = pd.read_csv('./house-prices-advanced-regression-techniques/test.csv')
test_data.head()


In [None]:
# NOTE(review): a notebook cell renders only its last expression, so the per-column
# null counts computed on the next line are discarded and only the shape is shown.
test_data.isnull().sum()
test_data.shape


In [None]:
# Remove the same six columns that were dropped from the training frame
# (mutates test_data in place).
test_data.drop(
    columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Id', 'GarageYrBlt'],
    inplace=True,
)


In [None]:
# Replace missing values in each listed test column with that column's most
# frequent value (first mode), computed on the test frame itself.
for mode_col in ['LotFrontage', 'BsmtCond', 'BsmtQual', 'FireplaceQu',
                 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                 'MasVnrType', 'MasVnrArea', 'BsmtExposure', 'BsmtFinType2']:
    test_data[mode_col] = test_data[mode_col].fillna(test_data[mode_col].mode()[0])


In [None]:
# Preview only the test columns that still contain at least one null.
test_data.loc[:, test_data.isnull().any()].head()


In [None]:
# Remaining test-only gaps. Each column is imputed independently of the others,
# so the two groups below can be processed in any order.

# Filled with the column's most frequent value (first mode).
for mode_col in ['Utilities', 'Exterior1st', 'Exterior2nd', 'BsmtFinType1',
                 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional',
                 'SaleType']:
    test_data[mode_col] = test_data[mode_col].fillna(test_data[mode_col].mode()[0])

# Filled with the column mean.
for mean_col in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                 'GarageCars', 'GarageArea']:
    test_data[mean_col] = test_data[mean_col].fillna(test_data[mean_col].mean())


In [None]:
# Shape of the test frame after column drops and imputation.
test_data.shape


In [None]:
# Categorical columns to one-hot encode. The order is significant: it fixes the
# order in which dummy columns are appended to the combined frame.
columns = [
    # lot / location
    'MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
    'LandSlope', 'Neighborhood',
    # building / sale
    'Condition2', 'BldgType', 'Condition1', 'HouseStyle', 'SaleType',
    'SaleCondition', 'ExterCond',
    # exterior / basement
    'ExterQual', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2',
    # roof / cladding / heating
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Heating', 'HeatingQC',
    'CentralAir',
    # interior
    'Electrical', 'KitchenQual', 'Functional',
    # fireplace / garage / driveway
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PavedDrive',
]


In [None]:
def category_onehot_multcols(multcolumns):
    """One-hot encode several columns of the module-level ``final_data`` frame.

    For every name in ``multcolumns`` the column is replaced by its dummy
    columns (first level dropped). The name is printed as a progress trace.

    Side effect: the encoded source columns are removed from the global
    ``final_data`` in place, exactly as before; the encoded result is returned
    as a new frame and is not written back to the global.

    Parameters
    ----------
    multcolumns : iterable of str
        Names of the columns in ``final_data`` to encode. An empty iterable
        now returns the remaining columns unchanged (the previous version
        returned every column twice in that case).

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``final_data`` followed by the dummy columns,
        in the order of ``multcolumns``.

    Raises
    ------
    KeyError
        If a name in ``multcolumns`` is not a column of ``final_data``.

    Notes
    -----
    NOTE(review): dummies are named after the category level only (no
    prefix), so equal level names from different source columns yield
    duplicate column names in the result.
    """
    dummy_frames = []
    for fields in multcolumns:
        print(fields)
        dummy_frames.append(pd.get_dummies(final_data[fields], drop_first=True))
        final_data.drop([fields], axis=1, inplace=True)

    # Single concatenation instead of growing a frame inside the loop.
    return pd.concat([final_data] + dummy_frames, axis=1)


In [None]:
# Snapshot of the cleaned training frame (not referenced again in the visible cells).
train_data2 = train_data.copy()


In [None]:
# Stack training rows on top of test rows so both are encoded with one shared
# set of dummy columns; the training rows come first.
final_data=pd.concat([train_data,test_data],axis=0)


In [None]:
# NOTE(review): a notebook cell renders only its last expression, so the SalePrice
# column selected on the next line is not displayed; only the shape is shown.
final_data['SalePrice']
final_data.shape


In [None]:
# One-hot encode every column listed in `columns` (the helper reads and mutates
# the global final_data, then returns the encoded frame).
final_data=category_onehot_multcols(columns)
# Keep only the first column for each repeated column name.
# NOTE(review): the dummies are named after the category level alone, so equal
# level names coming from different source columns collapse into a single column
# here — confirm this is intended.
final_data = final_data.loc[:,~final_data.columns.duplicated()]
final_data.isnull().sum().sum()




In [None]:
# Dummy columns that are converted to integers explicitly.
boolean_columns = ['Min1', 'Min2', 'Typ', 'Attchd', 'Basment', 'BuiltIn', 'CarPort', 'Detchd', 'RFn', 'P']
# One cast per column; casting the same column twice yields the same result as once.
final_data = final_data.astype({flag_col: int for flag_col in boolean_columns})

final_data.dtypes


In [None]:
# Cast every int64/float64 column to float; columns of any other dtype are left untouched.
numeric_columns = final_data.select_dtypes(include=['int64', 'float64']).columns
final_data[numeric_columns] = final_data[numeric_columns].astype(float)

final_data.dtypes


In [None]:
# Total null count before the final imputation (logged to MLflow in a later cell).
null_values_before = final_data.isnull().sum().sum()
# Names of the columns that still contain at least one null.
null_columns = final_data.columns[final_data.isnull().any()].tolist()
print("Columns with null values:")
for col in null_columns:
    print(f"{col}: {final_data[col].isnull().sum()} nulls")


In [None]:
# Final imputation over the combined train+test frame.
# NOTE(review): final_data stacks train_data on test_data; if test.csv has no
# SalePrice column, those rows are NaN in SalePrice here and receive the mean of
# the training prices — verify SalePrice is removed before predicting on them.
for col in null_columns:
    if final_data[col].dtype == 'object':
        # For categorical columns, fill with mode
        final_data[col] = final_data[col].fillna(final_data[col].mode()[0])
    else:
        # For numerical columns, fill with mean
        final_data[col] = final_data[col].fillna(final_data[col].mean())

# Verify all nulls are handled
print("Remaining null values:", final_data.isnull().sum().sum())


In [None]:
# Record the preprocessing stage as its own MLflow run.
with mlflow.start_run(run_name="data_preprocessing") as run:
    # Log dataset info
    # Sizes refer to the combined (train + test) frame after encoding.
    mlflow.log_param("dataset_size", len(final_data))
    mlflow.log_param("num_features", final_data.shape[1])
    
    # Log preprocessing steps
    # NOTE(review): this list is a hand-written summary; more columns were imputed
    # in the cells above than are named under "filled_null_columns".
    preprocessing_steps = {
        "dropped_columns": ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Id', 'GarageYrBlt'],
        "filled_null_columns": ["LotFrontage", "BsmtCond", "BsmtQual", "FireplaceQu", 
                              "GarageType", "GarageFinish", "GarageQual", "GarageCond"]
    }
    mlflow.log_dict(preprocessing_steps, "preprocessing_steps.json")
    
    # Log data quality metrics
    # null_values_before was captured before the final imputation cell ran.
    data_quality_metrics = {
        "null_values_before": null_values_before,
        "null_values_after": final_data.isnull().sum().sum()
    }
    mlflow.log_metrics(data_quality_metrics)


In [None]:
# final_data was built as concat([train_data, test_data]), so its first
# len(train_data) rows are the training rows and the rest are the test rows.
# Deriving the boundary from train_data (instead of a hard-coded 1422) keeps the
# split correct if the cleaning steps ever keep a different number of rows.
n_train_rows = len(train_data)
data_train=final_data.iloc[:n_train_rows,:]
data_test=final_data.iloc[n_train_rows:,:]


In [None]:
# Target vector first, then the feature matrix without the target column.
y_train = data_train['SalePrice']
X_train = data_train.drop(columns=['SalePrice'])


In [None]:
import xgboost

# Base estimator; every searched hyperparameter is supplied by the grid below.
regressor = xgboost.XGBRegressor()

# Candidate values for each searched hyperparameter (each list defined once).
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
learning_rate = [0.05, 0.1, 0.15, 0.20]
min_child_weight = [1, 2, 3, 4]
booster = ['gbtree', 'gblinear']
base_score = [0.25, 0.5, 0.75, 1]

# Search space handed to the randomized search.
hyperparameter_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    booster=booster,
    base_score=base_score,
)


In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized search over hyperparameter_grid: 50 sampled configurations, each
# evaluated with 5-fold cross-validation and scored by negative mean absolute
# error. random_state fixes which configurations are sampled; train scores are
# kept because they are logged alongside the test scores later.
random_cv = RandomizedSearchCV(estimator=regressor,
            param_distributions=hyperparameter_grid,
            cv=5, n_iter=50,
            scoring = 'neg_mean_absolute_error',n_jobs = 4,
            verbose = 5, 
            return_train_score = True,
            random_state=2)


In [None]:
def evaluate_model(model, X, y):
    """Score a fitted model against known targets.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X : features passed straight to ``model.predict``
    y : true target values

    Returns
    -------
    dict
        Keys ``"mse"``, ``"mae"``, ``"r2"`` and ``"rmse"`` (the square root of
        the MSE), in that order.
    """
    y_hat = model.predict(X)
    squared_error = mean_squared_error(y, y_hat)
    return {
        "mse": squared_error,
        "mae": mean_absolute_error(y, y_hat),
        "r2": r2_score(y, y_hat),
        "rmse": np.sqrt(squared_error),
    }


In [None]:
# Hyperparameter search, evaluation and model logging, tracked as one MLflow run.
with mlflow.start_run(run_name="model_training") as run:
    # Log hyperparameter search space
    mlflow.log_dict(hyperparameter_grid, "hyperparameter_grid.json")
    
    # Perform RandomizedSearchCV
    random_cv.fit(X_train, y_train)
    
    # Log best parameters
    mlflow.log_params(random_cv.best_params_)
    
    # Log cross-validation results.
    # cv_results_ stores NumPy arrays, which the JSON writer behind log_dict cannot
    # serialize; convert them to plain lists first.
    cv_results = {
        score_key: random_cv.cv_results_[score_key].tolist()
        for score_key in ("mean_test_score", "std_test_score",
                          "mean_train_score", "std_train_score")
    }
    mlflow.log_dict(cv_results, "cv_results.json")
    
    # Log best model metrics (computed on the training data itself, so these are
    # in-sample figures, not a generalization estimate).
    best_model = random_cv.best_estimator_
    train_metrics = evaluate_model(best_model, X_train, y_train)
    mlflow.log_metrics(train_metrics)
    
    # Log feature importance plot
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    fig = plt.figure(figsize=(10, 6))
    plt.bar(feature_importance['feature'][:10], feature_importance['importance'][:10])
    plt.xticks(rotation=45)
    plt.title('Top 10 Feature Importance')
    plt.tight_layout()
    mlflow.log_figure(fig, "feature_importance.png")
    # Release the figure once it is logged so it does not linger as the current
    # figure for later plotting cells.
    plt.close(fig)
    
    # Log the model
    mlflow.xgboost.log_model(best_model, "model",
                            registered_model_name="house_price_prediction_model")

    # Print best score and parameters
    print(f"Best score: {random_cv.best_score_}")
    print(f"Best parameters: {random_cv.best_params_}")


In [None]:
import pickle
# Persist the best estimator found by the search to a local pickle file.
best_model = random_cv.best_estimator_
filename = 'finalized_model.pkl'
# The context manager guarantees the file handle is flushed and closed, even if
# pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)


In [None]:
import os
# Predict on the test rows and record the predictions as an MLflow run.
with mlflow.start_run(run_name="model_prediction") as run:
    # Make predictions.
    # data_test is a row slice of final_data and therefore still carries the
    # SalePrice column that was removed from X_train; drop it so the features
    # passed to predict() match the ones the model was fitted on.
    X_test = data_test.drop(columns=['SalePrice'], errors='ignore')
    y_pred = best_model.predict(X_test)
    
    # Log prediction statistics (cast to built-in float for MLflow)
    prediction_stats = {
        "mean_predicted_price": float(np.mean(y_pred)),
        "median_predicted_price": float(np.median(y_pred)),
        "std_predicted_price": float(np.std(y_pred)),
        "min_predicted_price": float(np.min(y_pred)),
        "max_predicted_price": float(np.max(y_pred))
    }
    mlflow.log_metrics(prediction_stats)
    
    # Create and save predictions to a CSV file
    prediction_df = pd.DataFrame({
        'predicted_price': y_pred
    })
    
    # Save predictions locally
    predictions_path = "predictions.csv"
    prediction_df.to_csv(predictions_path, index=False)
    
    # Log the predictions file
    mlflow.log_artifact(predictions_path)
    
    # Create and log a histogram of predictions
    plt.figure(figsize=(10, 6))
    plt.hist(y_pred, bins=50)
    plt.title('Distribution of Predicted House Prices')
    plt.xlabel('Predicted Price')
    plt.ylabel('Frequency')
    mlflow.log_figure(plt.gcf(), "prediction_distribution.png")
    
    # Promote the newest unstaged registry version to Production. This is
    # best-effort: any registry error is reported and the run continues.
    client = MlflowClient()
    try:
        model_version = client.get_latest_versions("house_price_prediction_model", stages=["None"])[0]
        client.transition_model_version_stage(
            name="house_price_prediction_model",
            version=model_version.version,
            stage="Production"
        )
    except Exception as e:
        print(f"Error updating model version: {str(e)}")
    
    # Clean up: the CSV only needed to exist long enough to be uploaded
    plt.close()
    if os.path.exists(predictions_path):
        os.remove(predictions_path)


In [None]:
def compare_runs(experiment_name="House Price Prediction", top_n=5):
    """Compare the runs of an MLflow experiment, ranked by R2 (descending).

    The ranking is computed first; a ``model_comparison`` run is then started to
    store the comparison table (``run_comparison.csv``) and the best run's
    metrics. Searching before that run starts keeps the comparison run out of
    its own results.

    Parameters
    ----------
    experiment_name : str
        Name of the MLflow experiment whose runs are compared.
    top_n : int
        Maximum number of runs included in the comparison table.

    Returns
    -------
    pandas.DataFrame
        One row per run with columns ``run_id``, ``r2_score``, ``rmse``,
        ``mae`` and ``parameters``; empty if the experiment has no runs.

    Raises
    ------
    ValueError
        If no experiment called ``experiment_name`` exists on the tracking
        server.
    """
    client = MlflowClient()
    experiment = client.get_experiment_by_name(experiment_name)
    if experiment is None:
        # An unknown name yields None; fail with a message that names the
        # experiment instead of an AttributeError further down.
        raise ValueError(
            f"MLflow experiment {experiment_name!r} does not exist; "
            "pass the name used with mlflow.set_experiment()."
        )

    # Search for runs based on R2 score in descending order
    runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["metrics.r2 DESC"]
    )

    # Create a DataFrame to store the top N runs' performance and parameters.
    # Columns are given explicitly so an empty result still has the same layout.
    comparison_df = pd.DataFrame(
        [
            {
                'run_id': run.info.run_id,
                'r2_score': run.data.metrics.get('r2', None),
                'rmse': run.data.metrics.get('rmse', None),
                'mae': run.data.metrics.get('mae', None),
                'parameters': run.data.params
            }
            for run in runs[:top_n]
        ],
        columns=['run_id', 'r2_score', 'rmse', 'mae', 'parameters'],
    )

    # Start a new MLflow run to track this comparison process
    with mlflow.start_run(run_name="model_comparison"):
        # Log comparison DataFrame as an artifact
        comparison_df_path = "run_comparison.csv"
        comparison_df.to_csv(comparison_df_path, index=False)
        mlflow.log_artifact(comparison_df_path)

        # Log the best performing run's metrics for tracking. Only metrics the
        # run actually recorded are forwarded: log_metrics needs numeric values,
        # so missing entries are skipped instead of being sent as None.
        if runs:
            best_metrics = runs[0].data.metrics
            best_run_metrics = {
                logged_name: best_metrics[source_key]
                for logged_name, source_key in (
                    ("best_r2_score", "r2"),
                    ("best_rmse", "rmse"),
                    ("best_mae", "mae"),
                )
                if best_metrics.get(source_key) is not None
            }
            if best_run_metrics:
                mlflow.log_metrics(best_run_metrics)

        # Print and return the comparison dataframe
        print("Best performing runs:")
        print(comparison_df)

    return comparison_df

# Example usage after model training.
# The runs above were logged to the experiment selected with mlflow.set_experiment()
# at the top of the notebook, whose name differs from compare_runs' default, so the
# name is passed explicitly.
best_runs = compare_runs("House Price Prediction lab 10")


In [None]:
def register_best_model(experiment_name="House Price Prediction"):
    """Register the best performing model to the model registry.

    The best run is the first run of the experiment when ordered by R2
    descending. Its ``model`` artifact is registered as a new version of
    ``house_price_prediction_model`` and that version is moved to the
    ``Production`` stage.

    Parameters
    ----------
    experiment_name : str
        Name of the MLflow experiment to pick the best run from.

    Returns
    -------
    The model version object returned by ``mlflow.register_model``.

    Raises
    ------
    ValueError
        If the experiment does not exist, or if it has no run with an ``r2``
        metric to rank by.
    """
    client = MlflowClient()
    experiment = client.get_experiment_by_name(experiment_name)
    if experiment is None:
        # An unknown name yields None; fail with a message that names the
        # experiment instead of an AttributeError further down.
        raise ValueError(
            f"MLflow experiment {experiment_name!r} does not exist; "
            "pass the name used with mlflow.set_experiment()."
        )

    ranked_runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["metrics.r2 DESC"]
    )
    # Without an r2-scored run there is nothing meaningful to call "best";
    # refuse rather than promote an arbitrary run to Production.
    if not ranked_runs or ranked_runs[0].data.metrics.get('r2') is None:
        raise ValueError(
            f"Experiment {experiment_name!r} has no run with an 'r2' metric to register."
        )
    best_run = ranked_runs[0]
    
    # Register the model from the best run
    model_uri = f"runs:/{best_run.info.run_id}/model"
    mv = mlflow.register_model(model_uri, "house_price_prediction_model")
    
    # Transition the model to production
    client.transition_model_version_stage(
        name="house_price_prediction_model",
        version=mv.version,
        stage="Production"
    )
    
    return mv

# Run after model training.
# The training run was logged to the experiment selected with mlflow.set_experiment()
# at the top of the notebook, whose name differs from register_best_model's default,
# so the name is passed explicitly.
best_model_version = register_best_model("House Price Prediction lab 10")
print(f"Registered model version: {best_model_version.version}")
