In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.width', None)        
pd.set_option('display.expand_frame_repr', False)

In [None]:
df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')

In [None]:
df.shape

In [None]:
df.head()

# Cleaning

In [None]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
cols_to_drop = ['PoolQC', 'MiscFeature', 'Alley', 'Fence'] # too many missing values
X_train.drop(cols_to_drop, axis=1, inplace=True)
X_test.drop(cols_to_drop, axis=1, inplace=True)

In [None]:
train_ids = X_train.pop('Id')
test_ids = X_test.pop('Id')

In [None]:
print(X_train.isna().mean().sort_values(ascending=False).head(10))

In [None]:
X_train.drop('GarageYrBlt', axis=1, inplace=True)
X_test.drop('GarageYrBlt', axis=1, inplace=True)

# Feature Engineering


In [None]:
cat_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
num_cols = [col for col in X_train.columns if X_train[col].dtype != 'object']

print(f"Categorical columns ({len(cat_cols)}): {cat_cols}")
print(f"Numerical columns ({len(num_cols)}): {num_cols}")

In [None]:
# Print the frequency table of every categorical column in the training split.
for col in cat_cols:
    print(X_train[col].value_counts()) # how many times each value occurs in the column

In [None]:
s = X_train[cat_cols].nunique()
s

In [None]:
n = X_train[num_cols].nunique()
n

In [None]:
def analyze_numericals(df, num_cols):
    """Summarise the value distribution of each listed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    num_cols : iterable of str
        Column names to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column with the fields ``Column``, ``Unique Values``
        (distinct values, NaN counted as one), ``NA Values``,
        ``Dominant Value`` (most frequent value), ``Dominant %`` (its share of
        all rows, one decimal) and ``Value Counts`` (value -> count dict).
        The columns are always present, even when ``num_cols`` is empty.
    """
    summary_columns = ['Column', 'Unique Values', 'NA Values',
                       'Dominant Value', 'Dominant %', 'Value Counts']
    n_rows = len(df)

    analysis = []
    for col in num_cols:
        # dropna=False so that missing values compete for "dominant" as well
        value_counts = df[col].value_counts(dropna=False)

        if value_counts.empty:
            # Zero-row frame: there is no dominant value and nothing to divide by
            dominant_value = np.nan
            dominant_pct = 0.0
        else:
            dominant_value = value_counts.index[0]
            dominant_pct = round((value_counts.iloc[0] / n_rows) * 100, 1)

        analysis.append({
            'Column': col,
            'Unique Values': len(value_counts),
            'NA Values': df[col].isna().sum(),
            'Dominant Value': dominant_value,
            'Dominant %': dominant_pct,
            'Value Counts': value_counts.to_dict()
        })

    # Passing the column list keeps the schema stable when `analysis` is empty
    return pd.DataFrame(analysis, columns=summary_columns)

cat_analysis = analyze_numericals(X_train, num_cols)
pd.set_option('display.max_rows', None)
print(cat_analysis[['Column', 'Unique Values', 'NA Values', 'Dominant Value', 'Dominant %']])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Columns to eyeball against SalePrice before deciding how to treat them.
# NOTE(review): presumably these were flagged by the summaries above as
# dominated by a single value or as containing missing values — confirm the
# exact percentages from that output rather than from comments here.
cols_to_check = [
    'LowQualFinSF',
    'PoolArea',
    'MiscVal',
    '3SsnPorch',
    'LotFrontage',
    'FireplaceQu',
    'MasVnrArea'
]

for col in cols_to_check:
    if col not in X_train.columns:
        continue

    fig, ax = plt.subplots(figsize=(8, 4))
    if pd.api.types.is_numeric_dtype(X_train[col]):
        # A boxplot draws one box per distinct x value, which is unreadable
        # for a continuous feature; a scatter shows the relationship instead.
        sns.scatterplot(x=X_train[col], y=y_train, alpha=0.5, ax=ax)
    else:
        sns.boxplot(x=X_train[col], y=y_train, ax=ax)
    ax.set_title(f"SalePrice by {col}")
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Impute missing LotFrontage from LotArea with a simple linear model.
# The model is fitted on training rows only and then applied to BOTH splits:
# previously only X_train was imputed, so missing test values stayed NaN and
# were later turned into a frontage of 0 by the preprocessor's final fillna(0).
frontage_known = X_train['LotFrontage'].notna()

# Simple model using just log1p(LotArea) (could add more features)
frontage_model = LinearRegression()
frontage_model.fit(
    np.log1p(X_train.loc[frontage_known, 'LotArea'].values.reshape(-1, 1)),
    X_train.loc[frontage_known, 'LotFrontage']
)

for frame in (X_train, X_test):
    frontage_missing = frame['LotFrontage'].isna()
    # Skip splits with nothing to fill: predict() rejects a zero-row input
    if frontage_missing.any():
        frame.loc[frontage_missing, 'LotFrontage'] = frontage_model.predict(
            np.log1p(frame.loc[frontage_missing, 'LotArea'].values.reshape(-1, 1))
        )

In [None]:
X_train.drop(columns=['PoolArea'], inplace=True)
X_test.drop(columns=['PoolArea'], inplace=True)

In [None]:
X_train['MasVnrArea'] = X_train['MasVnrArea'].fillna(0)
X_test['MasVnrArea'] = X_test['MasVnrArea'].fillna(0)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

cols_to_drop = ['Street', 'Utilities', 'Condition2', 'RoofMatl']

# One panel per candidate column, laid out on a flattened 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()

for i, col in enumerate(cols_to_drop):
    # Guard first: report and move on when the column is already gone
    if col not in X_train.columns:
        print(f"\nColumn {col} not found in training data")
        continue

    # Absolute and relative frequency of every level
    value_counts = X_train[col].value_counts()
    percentages = X_train[col].value_counts(normalize=True) * 100

    # Side-by-side table for the text output
    df_display = pd.DataFrame({
        'Count': value_counts,
        'Percentage (%)': percentages.round(1)
    })
    print(f"\nValue distribution for {col}:")
    print(df_display)

    # Bar chart of the counts on this column's panel
    ax = axes[i]
    value_counts.plot(kind='bar', ax=ax, color='skyblue')
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel('')
    ax.set_ylabel('Count')

    # Label each bar with its count and its share of all training rows
    for p in ax.patches:
        bar_top = (p.get_x() + p.get_width() / 2., p.get_height())
        ax.annotate(
            f'{p.get_height()}\n({p.get_height()/len(X_train)*100:.1f}%)',
            bar_top,
            ha='center',
            va='center',
            xytext=(0, 10),
            textcoords='offset points',
        )

plt.tight_layout()
plt.show()

# Remove the inspected columns from both splits (missing ones are ignored)
for frame in (X_train, X_test):
    frame.drop(columns=cols_to_drop, inplace=True, errors='ignore')

print("\nColumns dropped successfully.")

In [None]:
cat_cols = [col for col in cat_cols if col not in cols_to_drop]

In [None]:
def analyze_categoricals(df, cat_cols):
    """Build a per-column summary of how values are distributed.

    For every name in ``cat_cols`` the result holds the column name, the number
    of distinct values (NaN counted as a value), the number of missing entries,
    the most frequent value, that value's share of all rows in percent (one
    decimal) and the full value -> count mapping.

    Returns a DataFrame with one row per inspected column.
    """
    total_rows = len(df)

    def _summarise(name):
        # NaN is kept so that a mostly-missing column reports NaN as dominant
        counts = df[name].value_counts(dropna=False)
        top_share = (counts.iloc[0] / total_rows) * 100
        return {
            'Column': name,
            'Unique Values': len(counts),
            'NA Values': df[name].isna().sum(),
            'Dominant Value': counts.index[0],
            'Dominant %': round(top_share, 1),
            'Value Counts': counts.to_dict(),
        }

    return pd.DataFrame([_summarise(name) for name in cat_cols])

cat_analysis = analyze_categoricals(X_train, cat_cols)
pd.set_option('display.max_rows', None)
print(cat_analysis[['Column', 'Unique Values', 'NA Values', 'Dominant Value', 'Dominant %']])

In [None]:
# Collapse GarageCond into a binary flag: 0 when the value is 'TA', 1 otherwise.
# NOTE(review): a missing GarageCond also compares unequal to 'TA', so NaN rows
# are flagged 1 as well — confirm that is the intended meaning.
X_train['GarageCond_Problem'] = (X_train['GarageCond'] != 'TA').astype(int)
X_test['GarageCond_Problem'] = (X_test['GarageCond'] != 'TA').astype(int)

# The original column is replaced by the flag in both splits
X_train.drop(columns=['GarageCond'], inplace=True)
X_test.drop(columns=['GarageCond'], inplace=True)

In [None]:
# Cardinality cut-off: categorical columns with more distinct levels than this
# get the WOE-style encoding, the rest are one-hot encoded.
# `s` holds the per-column nunique() of the categorical columns computed earlier.
threshold = 3

# Use the named constant instead of repeating the literal, so changing
# `threshold` actually changes the split.
woe_columns = list(s[s > threshold].index)
one_hot_columns = list(s[s <= threshold].index)

In [None]:
valid_woe_columns = [col for col in woe_columns if col in X_train.columns]
X_train[valid_woe_columns].mode().T[0].to_dict()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Encode categorical columns: a WOE-style score for some, one-hot for others.

    Parameters
    ----------
    woe_columns : list of str
        Columns replaced by a single numeric ``<col>_woe`` column.
    one_hot_columns : list of str
        Columns expanded with ``pd.get_dummies`` (first level dropped, extra
        NaN indicator column added).

    Attributes set by ``fit``
    -------------------------
    woe_columns_fill_na : dict
        Most frequent level of each WOE column; its score is used for values
        that have no mapping at transform time (NaN or unseen levels).
    woe_mappings : dict
        ``{column: {level: score}}``.
    iv_values : dict
        ``{column: summed information value}``.
    one_hot_categories : dict
        ``{column: levels seen during fit}``; pins the dummy columns so every
        frame passed to ``transform`` yields the same output columns.
    """

    def __init__(self, woe_columns, one_hot_columns):
        self.woe_columns = woe_columns
        self.one_hot_columns = one_hot_columns

    def fit(self, X, y):
        """Learn the per-level scores and the one-hot levels from ``X`` and ``y``."""
        # Store mode values for WOE columns
        if self.woe_columns:
            self.woe_columns_fill_na = X[self.woe_columns].mode().iloc[0].to_dict()
        else:
            # mode() of a frame without columns has no first row to take
            self.woe_columns_fill_na = {}

        # Calculate WOE and IV
        df_woe = X[self.woe_columns].copy()
        df_woe['SalePrice'] = y

        self.woe_mappings = {}
        self.iv_values = {}

        for col in self.woe_columns:
            # Group by category and calculate metrics.
            # NOTE(review): n_pos is the per-level sum of y and n_neg is
            # count - sum, which is the binary-target WOE formula; with a
            # continuous target (presumably sale prices) n_neg is negative —
            # confirm this score is what is intended.
            groups = df_woe.groupby(col)['SalePrice'].agg(['count', 'mean'])
            groups['n_pos'] = groups['mean'] * groups['count']
            groups['n_neg'] = groups['count'] - groups['n_pos']

            # Calculate proportions
            total_pos = groups['n_pos'].sum()
            total_neg = groups['n_neg'].sum()
            groups['prop_pos'] = groups['n_pos'] / total_pos
            groups['prop_neg'] = groups['n_neg'] / total_neg

            # Calculate WOE and IV
            groups['woe'] = np.log(groups['prop_pos'] / groups['prop_neg'])
            groups['iv'] = (groups['prop_pos'] - groups['prop_neg']) * groups['woe']

            # Clean infinite/NA values
            groups.replace([np.inf, -np.inf, np.nan], 0, inplace=True)

            # Store mappings
            self.woe_mappings[col] = groups['woe'].to_dict()
            self.iv_values[col] = groups['iv'].sum()

        # Remember the levels of every one-hot column. pd.Categorical orders
        # them the same way get_dummies does for a plain column, so the fitted
        # frame produces exactly the columns it produced before.
        self.one_hot_categories = {
            col: pd.Categorical(X[col]).categories
            for col in self.one_hot_columns
        }

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted encodings applied."""
        X_transformed = X.copy()

        # Apply WOE encoding
        for col in self.woe_columns:
            new_col = f'{col}_woe'
            # Map values and handle NAs in one step: anything without a mapping
            # (NaN or a level unseen in fit) gets the score of the fitted mode
            X_transformed[new_col] = (
                X_transformed[col]
                .map(self.woe_mappings[col])
                .fillna(self.woe_mappings[col].get(self.woe_columns_fill_na[col], 0))
            )

        # Pin the one-hot levels to those seen in fit. Without this the dummy
        # columns depend on which levels happen to occur in this particular
        # frame, so train and test matrices could end up with different
        # columns. Levels unseen in fit become NaN and land in the NaN dummy.
        for col in self.one_hot_columns:
            X_transformed[col] = pd.Categorical(
                X_transformed[col],
                categories=self.one_hot_categories[col]
            )

        # One-hot encode remaining categoricals
        X_transformed = pd.get_dummies(
            X_transformed,
            columns=self.one_hot_columns,
            drop_first=True,
            dummy_na=True
        )

        # Drop original columns safely
        cols_to_drop = [c for c in (self.woe_columns + self.one_hot_columns)
                       if c in X_transformed.columns]
        X_transformed = X_transformed.drop(columns=cols_to_drop)

        # Final NA clean-up: remaining NaNs (e.g. in numeric columns) become 0
        return X_transformed.fillna(0)

In [None]:
valid_one_hot_columns = [col for col in one_hot_columns if col in X_train.columns]
preprocessor = CustomPreprocessor(
    woe_columns=valid_woe_columns,
    one_hot_columns=valid_one_hot_columns
)

In [None]:
X_train_t = preprocessor.fit_transform(X_train, y_train)

In [None]:
X_test_t = preprocessor.transform(X_test)

In [None]:
X_test_t.shape

In [None]:
X_train_t.head()

# Feature selection

In [None]:
X_corr = X_train_t.copy()
X_corr['SalePrice'] = y_train

In [None]:
available_cols = [col for col in X_corr.columns if col != 'SalePrice']
corr_matrix = X_corr[available_cols + ['SalePrice']].corr().abs()

In [None]:
corr_matrix

In [None]:
def find_and_drop_high_correlations(df, target_col='SalePrice', threshold=0.8):
    """Drop one feature out of every highly correlated feature pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Features plus the target column.
    target_col : str, default 'SalePrice'
        Name of the target column. It is used only to decide which member of
        a pair to keep; it never takes part in the pair search and is never
        recommended for dropping.
    threshold : float, default 0.8
        Pairs with absolute correlation strictly above this value are reported.

    Returns
    -------
    (pandas.DataFrame, list)
        ``df`` without the dropped features, and the list of dropped feature
        names (empty when no pair exceeds the threshold).

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    """
    # `display` exists only inside IPython/Jupyter; fall back to print elsewhere
    try:
        from IPython.display import display
    except ImportError:
        display = print

    # Calculate correlations once and reuse them for pairs and target scores
    corr_matrix = df.corr().abs()
    target_corrs = corr_matrix[target_col]

    # Exclude the target from the pair search. Otherwise a feature that
    # correlates with the target above the threshold forms a "pair" with it
    # and, because the target's own score is 1, that feature gets dropped.
    feature_corr = corr_matrix.drop(index=target_col, columns=target_col)

    # Get upper triangle without diagonal so each pair appears exactly once
    mask = np.triu(np.ones(feature_corr.shape, dtype=bool), k=1)
    high_corr = feature_corr.where(mask).stack().reset_index()
    high_corr.columns = ['Feature1', 'Feature2', 'Correlation']

    # Filter strictly above threshold
    high_corr = high_corr[high_corr['Correlation'] > threshold]

    if high_corr.empty:
        print(f"No feature pairs with correlation > {threshold}")
        return df, []

    # Add target correlation info
    high_corr['Feature1_TargetCorr'] = high_corr['Feature1'].map(target_corrs)
    high_corr['Feature2_TargetCorr'] = high_corr['Feature2'].map(target_corrs)

    # Determine which to drop: the member less correlated with the target
    high_corr['Drop'] = high_corr.apply(
        lambda x: x['Feature1'] if x['Feature1_TargetCorr'] < x['Feature2_TargetCorr'] else x['Feature2'],
        axis=1
    )

    # Sort and display
    high_corr = high_corr.sort_values('Correlation', ascending=False)
    print(f"Features with correlation > {threshold}:")
    display(high_corr[['Feature1', 'Feature2', 'Correlation',
                      'Feature1_TargetCorr', 'Feature2_TargetCorr', 'Drop']])

    # Get unique features to drop
    features_to_drop = list(high_corr['Drop'].unique())
    print(f"\nRecommended features to drop: {features_to_drop}")

    return df.drop(columns=features_to_drop), features_to_drop
# Usage
X_train_reduced, dropped_features = find_and_drop_high_correlations(
    X_corr, 
    target_col='SalePrice'
)

In [None]:
# Remove one member of each highly correlated pair from both encoded splits.
# NOTE(review): this list is hard-coded instead of reusing `dropped_features`
# returned by find_and_drop_high_correlations above. If the upstream cleaning or
# encoding changes, a listed column may no longer exist and drop() will raise
# KeyError — keep it in sync or derive it from `dropped_features`.
features_to_drop =  ['GarageArea', 'Exterior1st_woe', '1stFlrSF', 'TotRmsAbvGrd', 
                     'SaleCondition_woe', 'MasVnrType_BrkFace']
X_filtered = X_train_t.drop(columns=features_to_drop)
X_test_new = X_test_t.drop(columns=features_to_drop)
print(f"Original shape: {X_train_t.shape}, New shape: {X_filtered.shape}")

In [None]:
X_filtered.shape

In [None]:
X_train.shape

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer, mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Recursive feature elimination with a linear model: for several target
# feature counts, select the features on the standardized training data and
# score the selection with 5-fold cross-validation (R² and RMSE).
# NOTE(review): the scaler and RFE are fitted on the full training split before
# cross-validation, so the CV scores are presumably somewhat optimistic.

# Standardize the data
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_filtered),
    columns=X_filtered.columns
)

# Prepare the model
model = LinearRegression()

# Store results
results = []

# Test different feature counts
feature_counts = [10, 15, 20, 25, 30, 35, 40]

for n_features in feature_counts:
    rfe = RFE(estimator=model, n_features_to_select=n_features, step=1)
    rfe.fit(X_train_scaled, y_train)

    # Get selected features
    selected_features = X_filtered.columns[rfe.support_].tolist()
    X_selected = X_train_scaled[selected_features]

    # One cross-validation pass yields both metrics on identical folds
    # (previously the same folds were fitted twice, once per metric)
    cv_scores = cross_validate(
        model, X_selected, y_train, cv=5,
        scoring=['r2', 'neg_root_mean_squared_error']
    )
    cv_r2 = cv_scores['test_r2'].mean()
    cv_rmse = -cv_scores['test_neg_root_mean_squared_error'].mean()

    # Store all results
    results.append({
        'n_features': n_features,
        'features': selected_features,
        'r2': cv_r2,
        'rmse': cv_rmse
    })

    print(f"RFE with {n_features} features selected:")
    print(f"R²: {cv_r2:.4f}, RMSE: {cv_rmse:.4f}")
    print(f"Features: {selected_features}\n")

# Convert results to DataFrame for easier analysis
results_df = pd.DataFrame(results)

# Find best features by R²
best_r2_idx = results_df['r2'].idxmax()
best_features_r2 = results_df.loc[best_r2_idx, 'features']
best_n_r2 = results_df.loc[best_r2_idx, 'n_features']

# Find best features by RMSE
best_rmse_idx = results_df['rmse'].idxmin()
best_features_rmse = results_df.loc[best_rmse_idx, 'features']
best_n_rmse = results_df.loc[best_rmse_idx, 'n_features']

print(f"\nBest by R² ({results_df.loc[best_r2_idx, 'r2']:.4f}):")
print(f"{best_n_r2} features: {best_features_r2}")

print(f"\nBest by RMSE ({results_df.loc[best_rmse_idx, 'rmse']:.4f}):")
print(f"{best_n_rmse} features: {best_features_rmse}")

# Plot the results
plt.figure(figsize=(12, 5))

# R² plot
plt.subplot(1, 2, 1)
plt.plot(results_df['n_features'], results_df['r2'], marker='o', linestyle='-', color='b')
plt.title('Cross-validated R² Score')
plt.xlabel('Number of Features Selected')
plt.ylabel('R² Score')
plt.axvline(x=best_n_r2, color='r', linestyle='--', alpha=0.3)
plt.grid(True)

# RMSE plot
plt.subplot(1, 2, 2)
plt.plot(results_df['n_features'], results_df['rmse'], marker='o', linestyle='-', color='g')
plt.title('Cross-validated RMSE')
plt.xlabel('Number of Features Selected')
plt.ylabel('RMSE')
plt.axvline(x=best_n_rmse, color='r', linestyle='--', alpha=0.3)
plt.grid(True)

plt.tight_layout()
plt.show()

# Store the best feature sets (later cells read these capitalised names)
Best_features_r2 = best_features_r2
Best_features_rmse = best_features_rmse
Best_n_r2 = best_n_r2
Best_n_rmse = best_n_rmse

In [None]:
Best_features_r2

# Training

In [None]:
!pip install dagshub mlflow

In [None]:
import dagshub
dagshub.init(repo_owner='tvani2', repo_name='assn1', mlflow=True)

# L1 (Lasso regression)

In [None]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score

# Start MLflow experiment and run
mlflow.set_experiment("House Price Regression")

# Build the search in this cell. It previously relied on a `grid_search` (and
# an `r2_score` import) that only a LATER cell creates, so the notebook failed
# on Restart & Run All.
# NOTE(review): the alpha grid of the originally logged "L1" run is not recorded
# anywhere in the notebook; the range below is a placeholder — adjust it to the
# grid that was actually used.
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('regressor', Lasso(random_state=42, max_iter=10000))
])
param_grid = {'regressor__alpha': np.logspace(-2, 2, 20)}
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=kfold,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    return_train_score=True
)

with mlflow.start_run(run_name="L1") as run:
    # Log basic parameters
    mlflow.log_params({
        "model": "Lasso",
        "cv_folds": 5,
        "scoring": "neg_root_mean_squared_error",
        "search_type": "GridSearchCV"
    })

    # Fit the model
    grid_search.fit(X_filtered, y_train)

    # Log best parameters
    mlflow.log_params(grid_search.best_params_)

    # Log best cross-validation RMSE (the scorer returns the negated RMSE)
    best_rmse_cv = -grid_search.best_score_
    mlflow.log_metric("cv_rmse", best_rmse_cv)

    # Test set evaluation (if available)
    if 'X_test_new' in locals():
        y_pred = grid_search.predict(X_test_new)
        rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
        r2_test = r2_score(y_test, y_pred)
        mlflow.log_metric("test_rmse", rmse_test)
        mlflow.log_metric("test_r2", r2_test)

    # Feature importance logging: coefficients of the refitted best Lasso
    best_lasso = grid_search.best_estimator_.named_steps['regressor']
    feature_importance = pd.DataFrame({
        'Feature': X_filtered.columns,
        'Coefficient': best_lasso.coef_,
        'Absolute_Coeff': np.abs(best_lasso.coef_)
    }).sort_values('Absolute_Coeff', ascending=False)

    # Save top 10 features to CSV and log as artifact
    top_features_path = "top_features.csv"
    feature_importance.head(10).to_csv(top_features_path, index=False)
    mlflow.log_artifact(top_features_path)

    # Log the full model
    mlflow.sklearn.log_model(grid_search.best_estimator_, "model")

In [None]:
# Lasso grid search over 20 log-spaced alphas in [1, 1000], tracked as the
# MLflow run "L1 new alphas": logs the best alpha, the CV RMSE, hold-out
# metrics, the ten largest coefficients and the refitted pipeline.
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score

# Set MLflow experiment
mlflow.set_experiment("House Price Regression")

with mlflow.start_run(run_name="L1 new alphas"):
    # Define pipeline
    pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('regressor', Lasso(random_state=42))
    ])

    # Hyperparameter grid
    alphas = np.logspace(0, 3, 20)
    param_grid = {
        # single candidate: the grid does not actually vary the scaler
        'scaler': [MinMaxScaler()],
        'regressor__alpha': alphas,
        'regressor__selection': ['random']
    }

    # K-Fold cross-validation
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    # Grid search
    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=kfold,
        scoring='neg_root_mean_squared_error',
        verbose=2,
        n_jobs=-1,
        return_train_score=True
    )

    # Fit the model
    grid_search.fit(X_filtered, y_train)

    # Log parameters as plain strings/numbers (the scaler object itself is not logged)
    mlflow.log_param("scaler", "MinMaxScaler")
    mlflow.log_param("selection", "random")
    # NOTE(review): hand-written copy of the `alphas` definition above — keep in sync
    mlflow.log_param("alpha_range", "logspace(0, 3, 20)")
    mlflow.log_param("best_alpha", grid_search.best_params_['regressor__alpha'])

    # Log best CV RMSE (the scorer returns the negated RMSE)
    best_rmse_cv = -grid_search.best_score_
    mlflow.log_metric("cv_rmse", best_rmse_cv)

    # Evaluate on test set
    if 'X_test_new' in locals():
        y_pred = grid_search.predict(X_test_new)
        rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
        r2_test = r2_score(y_test, y_pred)
        mlflow.log_metric("test_rmse", rmse_test)
        mlflow.log_metric("test_r2", r2_test)

    # Log feature importance: coefficients of the refitted best Lasso
    best_lasso = grid_search.best_estimator_.named_steps['regressor']
    feature_importance = pd.DataFrame({
        'Feature': X_filtered.columns,
        'Coefficient': best_lasso.coef_,
        'Absolute_Coeff': np.abs(best_lasso.coef_)
    }).sort_values('Absolute_Coeff', ascending=False)

    # Save and log top features
    top_features_path = "top_features_L1_new_alphas.csv"
    feature_importance.head(10).to_csv(top_features_path, index=False)
    mlflow.log_artifact(top_features_path)

    # Log model
    mlflow.sklearn.log_model(grid_search.best_estimator_, "model")

In [None]:
import matplotlib.pyplot as plt

# Tabulate the grid-search results and extract alpha and the (positive) CV RMSE
results = pd.DataFrame(grid_search.cv_results_)
alphas = results['param_regressor__alpha'].astype(float)
rmse = -results['mean_test_score']

# CV RMSE as a function of alpha, with alpha on a log axis
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(alphas, rmse, marker='o')
ax.set_xscale('log')
ax.set_xlabel('Alpha')
ax.set_ylabel('CV RMSE')
ax.set_title('Validation Curve')
ax.grid(True)
plt.show()

# L2 (Ridge regression)

In [None]:
# Ridge grid search over 50 log-spaced alphas and three scalers, tracked as the
# MLflow run "L2": logs the winning scaler and alpha, CV RMSE/R², hold-out
# metrics, the ten largest coefficients and the refitted pipeline.
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error, r2_score

# Set experiment
mlflow.set_experiment("House Price Regression")

with mlflow.start_run(run_name="L2"):
    # Define pipeline (the 'scaler' step is swapped by the grid below)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', Ridge(random_state=42))
    ])

    # Hyperparameter grid
    alphas = np.logspace(-3, 3, 50)
    param_grid = {
        'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
        'regressor__alpha': alphas
    }

    # K-Fold CV
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    # Scoring: two metrics are collected, 'rmse' decides the winner (see refit)
    scoring = {
        'rmse': 'neg_root_mean_squared_error',
        'r2': 'r2'
    }

    # Grid search
    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=kfold,
        scoring=scoring,
        refit='rmse',
        verbose=2,
        n_jobs=-1,
        return_train_score=True
    )

    # Fit model
    grid_search.fit(X_filtered, y_train)

    # Log best params (with scaler name converted to string)
    best_params = grid_search.best_params_
    mlflow.log_param("scaler", type(best_params['scaler']).__name__)
    mlflow.log_param("best_alpha", best_params['regressor__alpha'])

    # Log CV metrics; best_score_ is the negated RMSE of the refit metric
    best_rmse_cv = -grid_search.best_score_
    results_df = pd.DataFrame(grid_search.cv_results_)
    # R² of the same (RMSE-best) parameter combination
    best_r2_cv = results_df.loc[grid_search.best_index_, 'mean_test_r2']
    mlflow.log_metric("cv_rmse", best_rmse_cv)
    mlflow.log_metric("cv_r2", best_r2_cv)

    # Evaluate on test set
    if 'X_test_new' in locals():
        y_pred = grid_search.predict(X_test_new)
        rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
        r2_test = r2_score(y_test, y_pred)
        mlflow.log_metric("test_rmse", rmse_test)
        mlflow.log_metric("test_r2", r2_test)

    # Feature importance: coefficients of the refitted best Ridge
    best_ridge = grid_search.best_estimator_.named_steps['regressor']
    feature_importance = pd.DataFrame({
        'Feature': X_filtered.columns,
        'Coefficient': best_ridge.coef_,
        'Absolute_Coeff': np.abs(best_ridge.coef_)
    }).sort_values('Absolute_Coeff', ascending=False)

    # Save & log top 10 features
    top_features_path = "top_features_L2.csv"
    feature_importance.head(10).to_csv(top_features_path, index=False)
    mlflow.log_artifact(top_features_path)

    # Log model
    mlflow.sklearn.log_model(grid_search.best_estimator_, "model")

# ElasticNet

In [None]:
# ElasticNet grid search (scaler x alpha x l1_ratio x selection), tracked as the
# MLflow run "ElasticNet": logs the best parameters, CV RMSE/R², hold-out
# metrics, the ten largest coefficients and the refitted pipeline.
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error, r2_score

# Set experiment
mlflow.set_experiment("House Price Regression")

with mlflow.start_run(run_name="ElasticNet"):
    # Define pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Placeholder, replaced by the grid's 'scaler' candidates
        ('regressor', ElasticNet(random_state=42, max_iter=10000))
    ])

    # Parameter grid: 3 scalers x 10 alphas x 5 l1_ratios x 2 selection modes
    param_grid = {
        'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
        'regressor__alpha': np.logspace(-3, 3, 10),
        'regressor__l1_ratio': np.linspace(0.1, 0.9, 5),
        'regressor__selection': ['cyclic', 'random']
    }

    # Cross-validation
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    # Scoring: two metrics are collected, 'rmse' decides the winner (see refit)
    scoring = {
        'rmse': 'neg_root_mean_squared_error',
        'r2': 'r2'
    }

    # Grid Search
    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=kfold,
        scoring=scoring,
        refit='rmse',
        verbose=2,
        n_jobs=-1,
        return_train_score=True
    )

    # Fit model
    grid_search.fit(X_filtered, y_train)

    # Extract best params (scaler logged by class name, not as an object)
    best_params = grid_search.best_params_
    mlflow.log_param("scaler", type(best_params['scaler']).__name__)
    mlflow.log_param("alpha", best_params['regressor__alpha'])
    mlflow.log_param("l1_ratio", best_params['regressor__l1_ratio'])
    mlflow.log_param("selection", best_params['regressor__selection'])

    # Cross-validation results; best_score_ is the negated RMSE of the refit metric
    best_rmse_cv = -grid_search.best_score_
    results_df = pd.DataFrame(grid_search.cv_results_)
    # R² of the same (RMSE-best) parameter combination
    best_r2_cv = results_df.loc[grid_search.best_index_, 'mean_test_r2']

    mlflow.log_metric("cv_rmse", best_rmse_cv)
    mlflow.log_metric("cv_r2", best_r2_cv)

    print("\nBest parameters:", best_params)
    print("Best CV RMSE:", best_rmse_cv)
    print("Best CV R²:", best_r2_cv)

    # Evaluate on test set if available
    if 'X_test_new' in locals():
        y_pred = grid_search.predict(X_test_new)
        rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
        r2_test = r2_score(y_test, y_pred)

        mlflow.log_metric("test_rmse", rmse_test)
        mlflow.log_metric("test_r2", r2_test)

        print("\nTest set performance:")
        print(f"RMSE: {rmse_test:.4f}")
        print(f"R²: {r2_test:.4f}")

    # Feature importance: coefficients of the refitted best ElasticNet.
    # `best_model` is read again by the cell that follows this one.
    best_model = grid_search.best_estimator_.named_steps['regressor']
    feature_importance = pd.DataFrame({
        'Feature': X_filtered.columns,
        'Coefficient': best_model.coef_,
        'Absolute_Coeff': np.abs(best_model.coef_)
    }).sort_values('Absolute_Coeff', ascending=False)

    print("\nTop 10 most important features:")
    print(feature_importance.head(10))

    # Save top 10 features
    top_features_path = "top_features_elasticnet.csv"
    feature_importance.head(10).to_csv(top_features_path, index=False)
    mlflow.log_artifact(top_features_path)

    # Log the model
    mlflow.sklearn.log_model(grid_search.best_estimator_, "model")

In [None]:
# Compare the number of non-zero ElasticNet coefficients with the width of the
# matrix the model was fitted on (X_filtered). X_train is the pre-encoding
# frame, so its column count is not the right baseline for coef_.
print(f"Original shape: {X_train.shape}")
print(f"Model input shape: {X_filtered.shape}")
print(f"Effective features used: {(best_model.coef_ != 0).sum()}")

# L2 (Ridge) on the RFE-selected feature subset

In [None]:
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, cross_validate

# Ridge regression restricted to the RFE-selected feature subset: sweep alpha,
# cross-validate each value, then fit and log the model with the best-R² alpha.
# NOTE(review): the features were selected on standardized data, but the Ridge
# models here are fitted on the unscaled X_filtered columns — confirm intended.

# Set MLflow experiment
mlflow.set_experiment("House Price Regression")

# Select the best features
X_train_best = X_filtered[Best_features_r2]

# Define alpha values to test
alphas = np.logspace(-3, 3, 50)

# Store results
ridge_results = []

with mlflow.start_run(run_name="L2 with limited features"):
    # Loop through alpha values
    for alpha in alphas:
        ridge = Ridge(alpha=alpha)

        # One cross-validation pass yields both metrics on identical folds
        # (previously every alpha was cross-validated twice, once per metric)
        cv_scores = cross_validate(
            ridge, X_train_best, y_train, cv=5,
            scoring=['r2', 'neg_root_mean_squared_error']
        )
        cv_r2 = cv_scores['test_r2'].mean()
        cv_rmse = -cv_scores['test_neg_root_mean_squared_error'].mean()

        ridge_results.append({
            'alpha': alpha,
            'r2': cv_r2,
            'rmse': cv_rmse
        })

    # Convert results to DataFrame
    ridge_results_df = pd.DataFrame(ridge_results)

    # Find best alpha values
    best_alpha_r2 = ridge_results_df.loc[ridge_results_df['r2'].idxmax(), 'alpha']
    best_r2 = ridge_results_df['r2'].max()
    best_alpha_rmse = ridge_results_df.loc[ridge_results_df['rmse'].idxmin(), 'alpha']
    best_rmse = ridge_results_df['rmse'].min()

    # Log best parameters and metrics
    mlflow.log_param("best_alpha_r2", best_alpha_r2)
    mlflow.log_param("best_alpha_rmse", best_alpha_rmse)
    mlflow.log_metric("best_r2", best_r2)
    mlflow.log_metric("best_rmse", best_rmse)

    # Fit final model using best R² alpha
    final_ridge = Ridge(alpha=best_alpha_r2)
    final_ridge.fit(X_train_best, y_train)

    # Log model
    mlflow.sklearn.log_model(final_ridge, "ridge_model_limited_features")

    # Feature importances, ordered by absolute coefficient
    ridge_coefs = pd.DataFrame({
        'feature': Best_features_r2,
        'coefficient': final_ridge.coef_
    }).sort_values('coefficient', key=abs, ascending=False)

    # Log top 10 features
    top_features_str = ridge_coefs.head(10).to_string(index=False)
    mlflow.log_text(top_features_str, "top_10_features.txt")

    # Optional: Save and log the plots
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.semilogx(ridge_results_df['alpha'], ridge_results_df['r2'])
    plt.title('Ridge Regression R² vs Alpha')
    plt.xlabel('Alpha (log scale)')
    plt.ylabel('R² Score')
    plt.axvline(x=best_alpha_r2, color='r', linestyle='--', alpha=0.3)
    plt.grid(True)

    plt.subplot(1, 2, 2)
    plt.semilogx(ridge_results_df['alpha'], ridge_results_df['rmse'])
    plt.title('Ridge Regression RMSE vs Alpha')
    plt.xlabel('Alpha (log scale)')
    plt.ylabel('RMSE')
    plt.axvline(x=best_alpha_rmse, color='r', linestyle='--', alpha=0.3)
    plt.grid(True)

    plt.tight_layout()
    # Save before show(): the figure must still exist when it is written out
    plot_path = "ridge_alpha_tuning.png"
    plt.savefig(plot_path)
    mlflow.log_artifact(plot_path)
    plt.show()

    print(f"Best alpha by R²: {best_alpha_r2:.4f} (R²: {best_r2:.4f})")
    print(f"Best alpha by RMSE: {best_alpha_rmse:.4f} (RMSE: {best_rmse:.4f})")
    print("\nTop 10 most important features:")
    print(ridge_coefs.head(10))

# Non-linear models (decision tree and random forest)

In [None]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# cross_validate scores several metrics from a single set of CV fits
from sklearn.model_selection import cross_validate

# Set experiment
mlflow.set_experiment("House Price Regression")

# Hyperparameters to evaluate
depths = [3, 5, 10, None]
splits = [2, 5, 10]

# Metrics collected for every random-forest configuration; the keys become
# the 'test_r2' / 'test_rmse' entries of the cross_validate result.
rf_scoring = {'r2': 'r2', 'rmse': 'neg_root_mean_squared_error'}

# Begin MLflow run: one run holds the whole depth x split grid for both models
with mlflow.start_run(run_name="Non Linear"):
    mlflow.log_param("model_type", "DecisionTree & RandomForest")

    # --- Decision tree: 5-fold CV R² for every (max_depth, min_samples_split) ---
    print("Decision Tree Results:")
    dt_results = []
    for depth in depths:
        for split in splits:
            dt = DecisionTreeRegressor(max_depth=depth, min_samples_split=split, random_state=0)
            r2_scores = cross_val_score(dt, X_filtered, y_train, cv=5, scoring='r2')
            mean_r2 = np.mean(r2_scores)

            print(f"DT: max_depth={depth}, min_samples_split={split} → R² = {mean_r2:.4f}")

            # Log metrics to MLflow (one metric per grid point)
            mlflow.log_metric(f"DT_r2_depth_{depth}_split_{split}", mean_r2)

            dt_results.append({
                'model': 'DecisionTree',
                'max_depth': depth,
                'min_samples_split': split,
                'r2': mean_r2
            })

    # --- Random forest: 5-fold CV R² and RMSE for the same grid ---
    print("\nRandom Forest Results:")
    rf_results = []
    for depth in depths:
        for split in splits:
            rf = RandomForestRegressor(n_estimators=100, max_depth=depth,
                                       min_samples_split=split, random_state=0)

            # One cross-validation pass yields both metrics, so each forest is
            # fitted 5 times per grid point instead of 10.
            rf_cv = cross_validate(rf, X_filtered, y_train, cv=5, scoring=rf_scoring)
            r2_scores = rf_cv['test_r2']
            # The scorer returns negated RMSE; flip the sign back.
            rmse_scores = -rf_cv['test_rmse']

            mean_r2 = np.mean(r2_scores)
            mean_rmse = np.mean(rmse_scores)

            print(f"RF: max_depth={depth}, min_samples_split={split} → R² = {mean_r2:.4f}")
            print(f"RF: max_depth={depth}, min_samples_split={split} → RMSE = {mean_rmse:.4f}")

            # Log metrics to MLflow
            mlflow.log_metric(f"RF_r2_depth_{depth}_split_{split}", mean_r2)
            mlflow.log_metric(f"RF_rmse_depth_{depth}_split_{split}", mean_rmse)

            rf_results.append({
                'model': 'RandomForest',
                'max_depth': depth,
                'min_samples_split': split,
                'r2': mean_r2,
                'rmse': mean_rmse
            })

    # Save full results to file and log.
    # Decision-tree rows carry no 'rmse' key, so that column is NaN for them.
    all_results_df = pd.DataFrame(dt_results + rf_results)
    all_results_df.to_csv("non_linear_results.csv", index=False)
    mlflow.log_artifact("non_linear_results.csv")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Fit a random forest with max_depth=None and min_samples_split=2 on the
# filtered training features (fit returns the estimator itself).
rf = RandomForestRegressor(max_depth=None, min_samples_split=2, random_state=0).fit(
    X_filtered, y_train
)

# Predict once per split, then score each set of predictions
rf_train_pred = rf.predict(X_filtered)
rf_test_pred = rf.predict(X_test_new)

train_r2 = r2_score(y_train, rf_train_pred)
test_r2 = r2_score(y_test, rf_test_pred)

# Report both scores side by side to expose the train/test gap
print(f"Training R²: {train_r2:.4f}\nTest R²:     {test_r2:.4f}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
from sklearn.model_selection import learning_curve
from sklearn.ensemble import RandomForestRegressor

# Record the learning-curve analysis under the shared experiment
mlflow.set_experiment("House Price Regression")

with mlflow.start_run(run_name="Random Forest"):

    # Tag the run with the estimator family
    mlflow.log_param("model_type", "RandomForestRegressor")

    # Learning-curve inputs: the filtered training features and their target.
    # NOTE(review): this rebinds the notebook-level X / y names.
    X, y = X_filtered, y_train

    # Estimator whose learning behaviour is being inspected
    model = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)

    # Ten training-set fractions from 10% to 100%, each scored with 5-fold CV R²
    learning_curve_options = dict(
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=5,
        scoring='r2',
        n_jobs=-1,
        shuffle=True,
        random_state=42,
    )
    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, **learning_curve_options
    )

    # Collapse the per-fold axis into a mean and a spread for each train size
    train_mean, test_mean = train_scores.mean(axis=1), test_scores.mean(axis=1)
    train_std, test_std = train_scores.std(axis=1), test_scores.std(axis=1)

    # Draw each curve with a +/- one-std band around it
    plt.figure(figsize=(10, 6))
    for curve_mean, curve_std, curve_color, curve_label in (
        (train_mean, train_std, 'blue', 'Training R²'),
        (test_mean, test_std, 'green', 'Cross-validation R²'),
    ):
        plt.plot(train_sizes, curve_mean, 'o-', color=curve_color, label=curve_label)
        plt.fill_between(train_sizes, curve_mean - curve_std, curve_mean + curve_std,
                         alpha=0.1, color=curve_color)

    plt.xlabel('Training Set Size')
    plt.ylabel('R² Score')
    plt.title('Learning Curve')
    plt.grid(True)
    plt.legend(loc='best')
    plt.tight_layout()

    # Persist the figure as a run artifact; it is closed without being shown inline
    plot_path = "learning_curve_rf.png"
    plt.savefig(plot_path)
    mlflow.log_artifact(plot_path)
    plt.close()

    # Print a link to this run in the MLflow UI (address is hard-coded)
    run = mlflow.active_run()
    print("View run at:", f"http://127.0.0.1:5000/#/experiments/{run.info.experiment_id}/runs/{run.info.run_id}")

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

# Build the booster and fit it on the filtered training features
# (fit returns the estimator itself, so the two steps are chained).
gbr = GradientBoostingRegressor(
    random_state=42,
    n_estimators=300,         # boosting stages
    learning_rate=0.05,       # shrinkage applied to each tree
    max_depth=3,              # depth cap for individual trees
    min_samples_split=5,      # samples needed before a node may split
    min_samples_leaf=3,       # samples required in every leaf
    subsample=0.8,            # fraction of rows drawn for each tree
    max_features='sqrt',      # sqrt(n_features) candidates per split
).fit(X_filtered, y_train)

# R² on the data the model was fitted on and on the hold-out split
train_r2 = gbr.score(X_filtered, y_train)
test_r2 = gbr.score(X_test_new, y_test)

# 5-fold CV R² on the training split (cross_val_score fits fresh clones)
cv_r2 = cross_val_score(gbr, X_filtered, y_train, cv=5, scoring='r2')

print(
    f"Training R²: {train_r2:.4f}",
    f"Test R²:     {test_r2:.4f}",
    f"CV R² (mean): {cv_r2.mean():.4f} ± {cv_r2.std():.4f}",
    sep="\n",
)

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
import mlflow
import mlflow.sklearn

# Set experiment
mlflow.set_experiment("House Price Regression")

# Single source of truth for the booster's hyperparameters: the same dict
# builds the estimator and is logged to MLflow, so the two cannot drift apart
# and the seed is recorded with the run.
gbr_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 3,
    "min_samples_split": 5,
    "min_samples_leaf": 3,
    "subsample": 0.8,
    "max_features": "sqrt",
    "random_state": 42,
}

with mlflow.start_run(run_name="Booster"):

    # Initialize the model
    gbr = GradientBoostingRegressor(**gbr_params)

    # Log model parameters
    mlflow.log_param("model_type", "GradientBoostingRegressor")
    mlflow.log_params(gbr_params)

    # Fit the model
    gbr.fit(X_filtered, y_train)

    # Evaluate: R² on the training data, on the hold-out split, and 5-fold CV
    # R² on the training data
    train_r2 = gbr.score(X_filtered, y_train)
    test_r2 = gbr.score(X_test_new, y_test)
    cv_r2 = cross_val_score(gbr, X_filtered, y_train, cv=5, scoring='r2')

    # Log metrics
    mlflow.log_metric("train_r2", train_r2)
    mlflow.log_metric("test_r2", test_r2)
    mlflow.log_metric("cv_r2_mean", np.mean(cv_r2))
    mlflow.log_metric("cv_r2_std", np.std(cv_r2))

    # Store the fitted estimator with the run
    mlflow.sklearn.log_model(gbr, "model")

    # Optional: log CV scores as a CSV artifact
    pd.DataFrame({'cv_r2_scores': cv_r2}).to_csv("cv_r2_scores.csv", index=False)
    mlflow.log_artifact("cv_r2_scores.csv")

    # Output
    print(f"Training R²: {train_r2:.4f}")
    print(f"Test R²:     {test_r2:.4f}")
    print(f"CV R² (mean): {np.mean(cv_r2):.4f} ± {np.std(cv_r2):.4f}")

    # Direct link to this run in the MLflow UI (address is hard-coded)
    run = mlflow.active_run()
    print("View run at:", f"http://127.0.0.1:5000/#/experiments/{run.info.experiment_id}/runs/{run.info.run_id}")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# `gbr` is not created here; it must already be fitted by an earlier cell.

# Hold-out predictions and their signed error (predicted minus actual)
y_pred = gbr.predict(X_test_new)
errors = y_pred - y_test

# Collect actual / predicted / error per house, then renumber the rows 0..n-1
# so the bar positions do not depend on the original row labels
df_error = (
    pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    .assign(Error=errors)
    .reset_index(drop=True)
)

# Seaborn styling for this and later figures
sns.set(style="whitegrid")

# One bar per test house, with a dashed zero line as the reference
error_fig, error_ax = plt.subplots(figsize=(14, 6))
sns.barplot(x=df_error.index, y='Error', data=df_error, palette='coolwarm', ax=error_ax)
error_ax.axhline(0, color='black', linestyle='--')
error_ax.set_title('Prediction Error Per House (Predicted - Actual)')
error_ax.set_xlabel('House Index')
error_ax.set_ylabel('Error')
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error

# Track the Huber-loss booster under the shared experiment
mlflow.set_experiment("House Price Regression")

with mlflow.start_run(run_name="Booster loss = huber"):

    # Hyperparameters, defined once and reused for logging and construction
    model_params = dict(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=2,
        min_samples_split=15,
        min_samples_leaf=8,
        subsample=0.8,
        max_features='sqrt',
        loss='huber',
        random_state=42,
    )
    mlflow.log_params(model_params)

    # Build the booster from the logged parameters and fit it
    gbr = GradientBoostingRegressor(**model_params)
    gbr.fit(X_filtered, y_train)

    # In-sample, hold-out, and out-of-fold (5-fold) predictions
    y_train_pred = gbr.predict(X_filtered)
    y_test_pred = gbr.predict(X_test_new)
    cv_preds = cross_val_predict(gbr, X_filtered, y_train, cv=5)

    # R² for each view of the model; CV R² is summarised as mean and std
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    cv_r2_scores = cross_val_score(gbr, X_filtered, y_train, cv=5, scoring='r2')
    cv_r2_mean = cv_r2_scores.mean()
    cv_r2_std = cv_r2_scores.std()

    # Send the four summary metrics to MLflow
    for metric_name, metric_value in (
        ("train_r2", train_r2),
        ("test_r2", test_r2),
        ("cv_r2_mean", cv_r2_mean),
        ("cv_r2_std", cv_r2_std),
    ):
        mlflow.log_metric(metric_name, metric_value)

    print(
        f"Training R²: {train_r2:.4f}",
        f"Test R²:     {test_r2:.4f}",
        f"CV R² (mean): {cv_r2_mean:.4f} ± {cv_r2_std:.4f}",
        sep="\n",
    )

    # Side-by-side residual panels: in-sample (left) vs out-of-fold (right)
    plt.figure(figsize=(12, 6))
    house_positions = range(len(y_train))
    residual_panels = (
        (1, y_train_pred - y_train, "Training Residuals", {}),
        (2, cv_preds - y_train, "CV Residuals", {"color": 'orange'}),
    )
    for panel_index, panel_residuals, panel_title, panel_style in residual_panels:
        plt.subplot(1, 2, panel_index)
        plt.scatter(house_positions, panel_residuals, alpha=0.6, **panel_style)
        plt.axhline(0, linestyle='--', color='black')
        plt.title(panel_title)
        plt.xlabel("House #")
        plt.ylabel("Error (Predicted - Actual)")

    plt.tight_layout()

    # Save the figure, attach it to the run, then show it inline
    plot_path = "residuals_plot.png"
    plt.savefig(plot_path)
    mlflow.log_artifact(plot_path)
    plt.show()

    # Store the fitted estimator with the run
    mlflow.sklearn.log_model(gbr, "gradient_boosting_model")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Split training set for early stopping: 80% to fit, 20% to validate
X_train2, X_val, y_train2, y_val = train_test_split(X_filtered, y_train, test_size=0.2, random_state=42)

# Set up MLflow
mlflow.set_experiment("House Price Regression")

with mlflow.start_run(run_name="Booster early stopping"):

    base_params = {
        "n_estimators": 1000,
        "learning_rate": 0.05,
        "max_depth": 2,
        "min_samples_split": 15,
        "min_samples_leaf": 8,
        "subsample": 0.8,
        "max_features": 'sqrt',
        "loss": 'huber',
        "random_state": 42
    }

    mlflow.log_params(base_params)

    # Train with the full estimator budget; the best stage is picked afterwards
    gbr_es = GradientBoostingRegressor(**base_params)
    gbr_es.fit(X_train2, y_train2)

    # Validation MSE after every boosting stage. staged_predict yields the
    # ensemble prediction after each stage, so errors[i] belongs to a model
    # with i + 1 trees.
    errors = [
        mean_squared_error(y_val, stage_pred)
        for stage_pred in gbr_es.staged_predict(X_val)
    ]

    # Log all validation errors, using the tree count as the step
    for n_trees, val_mse in enumerate(errors, start=1):
        mlflow.log_metric("val_mse", val_mse, step=n_trees)

    # argmin is a 0-based stage index; convert it to a tree count. Using the
    # raw index would fit one tree too few (and n_estimators=0 when the very
    # first stage is best, which the estimator rejects).
    best_stage = int(np.argmin(errors))
    best_n = best_stage + 1
    best_val_mse = errors[best_stage]

    print(f"Best number of trees: {best_n}")
    mlflow.log_param("early_stopped_n_estimators", best_n)
    mlflow.log_metric("best_val_mse", best_val_mse)

    # Retrain with best_n trees on the full training data
    gbr_final = GradientBoostingRegressor(**{**base_params, "n_estimators": best_n})
    gbr_final.fit(X_filtered, y_train)

    train_r2 = gbr_final.score(X_filtered, y_train)
    test_r2 = gbr_final.score(X_test_new, y_test)

    print(f"Final Training R²: {train_r2:.4f}")
    print(f"Final Test R²:     {test_r2:.4f}")

    mlflow.log_metric("final_train_r2", train_r2)
    mlflow.log_metric("final_test_r2", test_r2)

    # Plot validation error against the number of trees (1-based, so the
    # x-axis agrees with the marked best_n)
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, len(errors) + 1), errors, label='Validation MSE')
    plt.axvline(best_n, linestyle='--', color='red', label=f'Best n = {best_n}')
    plt.xlabel("Number of Trees")
    plt.ylabel("Validation MSE")
    plt.title("Validation Error over Boosting Stages")
    plt.legend()
    plot_path = "early_stopping_curve.png"
    plt.savefig(plot_path)
    mlflow.log_artifact(plot_path)
    plt.close()

    # Log model
    mlflow.sklearn.log_model(gbr_final, "final_model")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Hold-out predictions from the retrained early-stopped booster
y_pred = gbr_final.predict(X_test_new)

# Signed residuals: positive where the model under-predicts the actual value
residuals = y_test - y_pred

# Scatter the residuals in row order, with a dashed zero line as the reference
residual_fig, residual_ax = plt.subplots(figsize=(10, 6))
residual_ax.scatter(range(len(residuals)), residuals, alpha=0.7, color='teal', edgecolors='k')
residual_ax.axhline(0, color='red', linestyle='--', linewidth=2)
residual_ax.set_title("Residual Plot (Actual - Predicted)", fontsize=14)
residual_ax.set_xlabel("House Index", fontsize=12)
residual_ax.set_ylabel("Residual Error", fontsize=12)
residual_ax.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()