In [None]:
def remove_features(X, features_to_remove):
    """
    Removes specified features from the DataFrame.

    Args:
    - X (pd.DataFrame): The original DataFrame.
    - features_to_remove (list): List of column names to remove from X.

    Returns:
    - pd.DataFrame: DataFrame after removing specified features.
    """
    #return X.drop(columns=features_to_remove, errors='ignore')


In [None]:
import pandas as pd
import numpy as np

def preprocess_data_remove_features2(df):
    # Remove specific columns

    cols_to_remove = ["Longitude", "Latitude", "Street", "Utilities", "Condition_2",
                      "Roof_Matl", "Heating", "Pool_QC", "Misc_Feature", "Low_Qual_Fin_SF", "Pool_Area"]
    X = df
    #print("X  before remove features", X.columns)
    X = df.drop(columns=cols_to_remove)

    #X = df
    #print("X  after remove features", X.columns)
    # Create quadratic features
    quad = X.select_dtypes(include=[np.number]) ** 2
    quad.columns = [str(col) + "_Q2" for col in quad.columns]

    # Create cubic features
    cube = X.select_dtypes(include=[np.number]) ** 3
    cube.columns = [str(col) + "_C3" for col in cube.columns]

    # Combine original data with quadratic and cubic features
    X = pd.concat([X, quad, cube], axis=1)

    # One-hot encoding for categorical columns
    X = pd.get_dummies(X, drop_first=True)
    #print("X  after adding dummies", X.columns)
    return X


In [None]:
def preprocess_data_remove_features(df, categorical_columns):
    """
    Preprocesses the dataframe by removing specific columns, creating quadratic
    and cubic features for numeric columns, and encoding specified categorical columns.

    Args:
    - df (pd.DataFrame): The original dataframe.
    - categorical_columns (list): List of categorical column names to be one-hot encoded.

    Returns:
    - pd.DataFrame: The processed dataframe with quadratic and cubic features and encoded categorical variables.
    """
    # Remove specific columns
    cols_to_remove = ["Longitude", "Latitude", "Street", "Utilities", "Condition_2",
                      "Roof_Matl", "Heating", "Pool_QC", "Misc_Feature", "Low_Qual_Fin_SF", "Pool_Area"]
    X = df.drop(columns=cols_to_remove, errors='ignore')

    # Create quadratic and cubic features for numeric columns only
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    quad = X[numeric_cols] ** 2
    quad.columns = [str(col) + "_Q2" for col in quad.columns]

    cube = X[numeric_cols] ** 3
    cube.columns = [str(col) + "_C3" for col in cube.columns]

    # Combine original data with quadratic and cubic features
    X_combined = pd.concat([X, quad, cube], axis=1)

    # Apply one-hot encoding only to the specified categorical columns
    X_encoded = pd.get_dummies(X_combined, columns=categorical_columns, drop_first=True)

    return X_encoded


In [None]:
from scipy.stats import mstats

# To store saved quantiles across folds
saved_quantiles = {}

def preprocess_data_winsorize_selected(df, fold, to_winsorize, train=True):
    global saved_quantiles

    # Initialize result as a copy of the input dataframe
    res = df.copy()

    if fold not in saved_quantiles:
        saved_quantiles[fold] = {}

    for col in to_winsorize:
        if col in res.columns:
            if train:
                # Calculate the 5th and 95th percentiles
                lower_quantile = res[col].quantile(0.05)
                upper_quantile = res[col].quantile(0.95)
                saved_quantiles[fold][col] = (lower_quantile, upper_quantile)
            else:
                # Use previously saved quantiles
                lower_quantile, upper_quantile = saved_quantiles[fold][col]

            # Apply winsorization
            res[col] = mstats.winsorize(res[col], limits=[0.05, 0.05])

    return res


In [None]:
import pandas as pd

def numeric_columns(df):
    # Select columns that are not categorical (i.e., numeric columns)
    categorical_vars = df.select_dtypes(include=['object']).columns
    df_numeric = df.drop(columns=categorical_vars)

    return df_numeric


In [None]:
def raise_pow(df, p):
    return df ** p


In [None]:
def changename(col):
    return col + "_A2"


Example usage

In [None]:
import pandas as pd

# Sample DataFrame
data = {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': ['cat', 'dog', 'mouse']}
df = pd.DataFrame(data)

# 1. Get only numeric columns
numeric_df = numeric_columns(df)

# 2. Raise numeric columns to the power of 2
quad_df = raise_pow(numeric_df, 2)

# 3. Change column names (append "_A2")
quad_df.columns = [changename(col) for col in quad_df.columns]

print(quad_df)


   A_A2  B_A2
0     1    16
1     4    25
2     9    36


In [None]:
import pandas as pd
import numpy as np

def preprocess_data_poly(df):
    # Remove specific columns
    cols_to_remove = ["Longitude", "Latitude", "Street", "Utilities", "Condition_2",
                      "Roof_Matl", "Heating", "Pool_QC", "Misc_Feature", "Low_Qual_Fin_SF", "Pool_Area"]

    X = df.drop(columns=cols_to_remove)
    """
    # Create quadratic features
    quad = X.select_dtypes(include=[np.number]) ** 2
    quad.columns = [str(col) + "_A2" for col in quad.columns]

    # Create cubic features
    cube = X.select_dtypes(include=[np.number]) ** 3
    cube.columns = [str(col) + "_A3" for col in cube.columns]

    # Combine original data with quadratic and cubic features
    X = pd.concat([X, quad, cube], axis=1)
    """
    # One-hot encoding for categorical columns
    X = pd.get_dummies(X, drop_first=True)
    print ("X", X)
    return X


In [None]:
from scipy.stats import mstats

# To store saved quantiles across folds
saved_quantiles = {}

def winsorize_selected(df, fold, to_winsorize, train=True):
    global saved_quantiles

    # Initialize result as a copy of the input dataframe
    res = df.copy()

    if fold not in saved_quantiles:
        saved_quantiles[fold] = {}

    for col in to_winsorize:
        if col in res.columns:
            if train:
                # Calculate the 5th and 95th percentiles
                lower_quantile = res[col].quantile(0.05)
                upper_quantile = res[col].quantile(0.95)
                saved_quantiles[fold][col] = (lower_quantile, upper_quantile)
            else:
                # Use previously saved quantiles
                lower_quantile, upper_quantile = saved_quantiles[fold][col]

            # Apply winsorization
            res[col] = mstats.winsorize(res[col], limits=[0.05, 0.05])

    return res



In [None]:
import pandas as pd

def numeric_columns(df):
    # Select columns that are not categorical (i.e., numeric columns)
    categorical_vars = df.select_dtypes(include=['object']).columns
    df_numeric = df.drop(columns=categorical_vars)

    return df_numeric

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from scipy import stats  # Importing for z-score calculation

# Helper functions
def preprocess_data(X_trn):
    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Fit the scaler on the training data and transform it
    X_trn_scaled = scaler.fit_transform(X_trn)

    # Return the scaled training data as a DataFrame (to preserve column names) and the scaler object
    X_trn_scaled_df = pd.DataFrame(X_trn_scaled, columns=X_trn.columns)

    return X_trn_scaled_df, scaler

def calc_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def adjusted_r_squared(r2, n, p):
    # Calculate adjusted R-squared
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

def process_fold(fold_num):
    print(f"\nProcessing fold{fold_num}...")

    # Load the train, test, and test_y datasets
    train_file_path = f'fold{fold_num}/train.csv'
    test_file_path = f'fold{fold_num}/test.csv'
    test_y_file_path = f'fold{fold_num}/test_y.csv'

    housing_data_train = pd.read_csv(train_file_path)
    housing_data_test = pd.read_csv(test_file_path)
    test_y_data = pd.read_csv(test_y_file_path)  # Load the actual Sale Price for the test set

    # Ensure that test_y_data and housing_data_test have matching PIDs
    merged_test_data = pd.merge(housing_data_test, test_y_data, on='PID', how='inner')


    # Identify categorical columns
    categorical_columns = ['MS_SubClass', 'MS_Zoning',  'Alley', 'Lot_Shape', 'Land_Contour',
                           'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1',  'Bldg_Type',
                           'House_Style', 'Overall_Qual', 'Overall_Cond', 'Roof_Style',  'Exterior_1st',
                           'Exterior_2nd', 'Mas_Vnr_Type', 'Exter_Qual', 'Exter_Cond', 'Foundation', 'Bsmt_Qual',
                           'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_Type_2',  'Heating_QC',
                           'Central_Air', 'Electrical', 'Kitchen_Qual', 'Functional', 'Fireplace_Qu', 'Garage_Type',
                           'Garage_Finish', 'Garage_Qual', 'Garage_Cond', 'Paved_Drive',  'Fence',
                           'Sale_Type', 'Sale_Condition']

    # Perform dummy encoding on both train and test datasets
    #housing_data_encoded_train = pd.get_dummies(housing_data_train, columns=categorical_columns, drop_first=True)
    #housing_data_encoded_test = pd.get_dummies(merged_test_data, columns=categorical_columns, drop_first=True)
    housing_data_encoded_train = preprocess_data_remove_features(housing_data_train, categorical_columns)
    housing_data_encoded_test = preprocess_data_remove_features(merged_test_data, categorical_columns)
    # Align columns of train and test after dummy encoding:
    all_columns = list(set(housing_data_encoded_train.columns) | set(housing_data_encoded_test.columns))
    all_columns.remove('Sale_Price')  # Remove 'Sale_Price' if it's in all_columns

    housing_data_encoded_train = housing_data_encoded_train.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    housing_data_encoded_test = housing_data_encoded_test.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    #print("housing_data_encoded_train shape", housing_data_encoded_train.shape)
    #print("housing_data_encoded_test shape", housing_data_encoded_test.shape)

    # Handle missing values by imputing them in train and test data
    imputer = SimpleImputer(strategy='median')
    numeric_columns = housing_data_encoded_train.select_dtypes(include=[np.number]).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col not in ['Sale_Price', 'PID']]
    housing_data_encoded_imputed_train = housing_data_encoded_train.copy()
    housing_data_encoded_imputed_train[numeric_columns] = imputer.fit_transform(housing_data_encoded_train[numeric_columns])

    housing_data_encoded_imputed_test = housing_data_encoded_test.copy()
    housing_data_encoded_imputed_test[numeric_columns] = imputer.transform(housing_data_encoded_test[numeric_columns])
    #print("housing_data_encoded_imputed_train shape", housing_data_encoded_imputed_train.shape)
    #print("housing_data_encoded_imputed_test shape", housing_data_encoded_imputed_test.shape)

    # Remove 'PID' from train and test datasets
    housing_data_encoded_imputed_train = housing_data_encoded_imputed_train.drop(columns=['PID'])
    if 'PID' in housing_data_encoded_imputed_test.columns:
        housing_data_encoded_imputed_test = housing_data_encoded_imputed_test.drop(columns=['PID'])

    # Separate predictors (X) and target variable (y) for training
    X_trn = housing_data_encoded_imputed_train.drop(columns='Sale_Price')
    y_trn = np.log(housing_data_encoded_imputed_train['Sale_Price'])

    # Scale the training data using preprocess_data function (returns DataFrame with column names)
    X_trn_scaled, scaler = preprocess_data(X_trn)

    # 1. Use Lasso regression for feature selection
    lasso_model = Lasso(alpha=0.01)  # Adjust the alpha value as necessary
    lasso_model.fit(X_trn_scaled, y_trn)
    selected_features = X_trn_scaled.columns[lasso_model.coef_ != 0]
    #print(f"Selected features after Lasso: {len(selected_features)} features")

    # Scale the test data using the same scaler
    X_tst_scaled = pd.DataFrame(scaler.transform(housing_data_encoded_imputed_test.drop(columns=['Sale_Price'])), columns=housing_data_encoded_imputed_test.drop(columns=['Sale_Price']).columns)
    #print("X_tst_scaled shape", X_tst_scaled.shape)

    # 2. Grid Search for Ridge Regression to find the best alpha
    alpha_values = np.exp(np.linspace(-5, 5, 100))
    param_grid = {'alpha': alpha_values}

    ridge_model = Ridge()
    grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_trn_scaled[selected_features], y_trn)
    best_alpha = grid_search.best_params_['alpha']
    #print("Best alpha for Ridge regression:", best_alpha)

    # 3. Train the final Ridge model with the best alpha
    final_ridge_model = Ridge(alpha=best_alpha)
    final_ridge_model.fit(X_trn_scaled[selected_features], y_trn)

    # Predict on the training set
    y_trn_pred = final_ridge_model.predict(X_trn_scaled[selected_features])
    r2_train = r2_score(y_trn, y_trn_pred)
    adj_r2_train = adjusted_r_squared(r2_train, len(y_trn), X_trn_scaled[selected_features].shape[1])
    train_rmse = calc_rmse(y_trn, y_trn_pred)
    #print(f"Training R-squared: {r2_train}")
    #print(f"Adjusted R-squared (Training): {adj_r2_train}")
    print(f"Training RMSE: {train_rmse}")

    # Predict on the test set
    y_tst_pred_log = final_ridge_model.predict(X_tst_scaled[selected_features])

    # Reverse the log-transformation to get predictions in the original scale
    y_tst_pred = np.exp(y_tst_pred_log)

    # Merge the predictions with actual sale prices from test_y.csv using 'PID'
    predictions = pd.DataFrame({
        'PID': merged_test_data['PID'],
        'Predicted_Sale_Price': y_tst_pred
    })
    merged_test_data = pd.merge(test_y_data, predictions, on='PID', how='inner')

    # Calculate RMSE and R-squared for the test set using actual sale prices
    test_rmse = calc_rmse(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    r2_test = r2_score(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    adj_r2_test = adjusted_r_squared(r2_test, len(merged_test_data), X_tst_scaled.shape[1])
    print(f"Test RMSE: {test_rmse}")
    #print(f"Test R-squared: {r2_test}")
    #print(f"Adjusted R-squared (Test): {adj_r2_test}")

    # Save the predictions to a CSV file
    predictions_file = f'fold{fold_num}_test_predictions.csv'
    predictions.to_csv(predictions_file, index=False)
    #print(f"Predictions for fold{fold_num} saved to '{predictions_file}'.")

def main():
    # Loop over folds 1 to 10
    for fold_num in range(1, 11):
        process_fold(fold_num)

if __name__ == "__main__":
    main()



Processing fold1...
Training RMSE: 0.09594796418915669
Test RMSE: 0.10470095736772288

Processing fold2...
Training RMSE: 0.09582772225902751
Test RMSE: 0.10919573341706115

Processing fold3...
Training RMSE: 0.09899734279687067
Test RMSE: 0.11067459176582921

Processing fold4...
Training RMSE: 0.09706450792910584
Test RMSE: 0.10137524410487105

Processing fold5...
Training RMSE: 0.0983135931549487
Test RMSE: 0.10311582332938206

Processing fold6...
Training RMSE: 0.09632112502074647
Test RMSE: 0.114918089189173

Processing fold7...
Training RMSE: 0.09489886531676978
Test RMSE: 0.1215892691168203

Processing fold8...
Training RMSE: 0.09892128410565307
Test RMSE: 0.10394924096076101

Processing fold9...
Training RMSE: 0.0982606024647681
Test RMSE: 0.11315701014386603

Processing fold10...
Training RMSE: 0.09836567892003986
Test RMSE: 0.11474662950842018


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from scipy import stats  # Importing for z-score calculation

# Helper functions
def preprocess_data(X_trn):
    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Fit the scaler on the training data and transform it
    X_trn_scaled = scaler.fit_transform(X_trn)

    # Return the scaled training data as a DataFrame (to preserve column names) and the scaler object
    X_trn_scaled_df = pd.DataFrame(X_trn_scaled, columns=X_trn.columns)

    return X_trn_scaled_df, scaler

def calc_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def adjusted_r_squared(r2, n, p):
    # Calculate adjusted R-squared
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

def process_fold(fold_num):
    print(f"\nProcessing fold{fold_num}...")

    # Load the train, test, and test_y datasets
    train_file_path = f'fold{fold_num}/train.csv'
    test_file_path = f'fold{fold_num}/test.csv'
    test_y_file_path = f'fold{fold_num}/test_y.csv'

    housing_data_train = pd.read_csv(train_file_path)
    housing_data_test = pd.read_csv(test_file_path)
    test_y_data = pd.read_csv(test_y_file_path)  # Load the actual Sale Price for the test set

    # Ensure that test_y_data and housing_data_test have matching PIDs
    merged_test_data = pd.merge(housing_data_test, test_y_data, on='PID', how='inner')


    # Identify categorical columns
    categorical_columns = ['MS_SubClass', 'MS_Zoning',  'Alley', 'Lot_Shape', 'Land_Contour',
                           'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1',  'Bldg_Type',
                           'House_Style', 'Overall_Qual', 'Overall_Cond', 'Roof_Style',  'Exterior_1st',
                           'Exterior_2nd', 'Mas_Vnr_Type', 'Exter_Qual', 'Exter_Cond', 'Foundation', 'Bsmt_Qual',
                           'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_Type_2',  'Heating_QC',
                           'Central_Air', 'Electrical', 'Kitchen_Qual', 'Functional', 'Fireplace_Qu', 'Garage_Type',
                           'Garage_Finish', 'Garage_Qual', 'Garage_Cond', 'Paved_Drive',  'Fence',
                           'Sale_Type', 'Sale_Condition']

    # Perform dummy encoding on both train and test datasets
    #housing_data_encoded_train = pd.get_dummies(housing_data_train, columns=categorical_columns, drop_first=True)
    #housing_data_encoded_test = pd.get_dummies(merged_test_data, columns=categorical_columns, drop_first=True)
    housing_data_encoded_train = preprocess_data_remove_features(housing_data_train, categorical_columns)
    housing_data_encoded_test = preprocess_data_remove_features(merged_test_data, categorical_columns)
    # Align columns of train and test after dummy encoding:
    all_columns = list(set(housing_data_encoded_train.columns) | set(housing_data_encoded_test.columns))
    all_columns.remove('Sale_Price')  # Remove 'Sale_Price' if it's in all_columns

    housing_data_encoded_train = housing_data_encoded_train.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    housing_data_encoded_test = housing_data_encoded_test.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    #print("housing_data_encoded_train shape", housing_data_encoded_train.shape)
    #print("housing_data_encoded_test shape", housing_data_encoded_test.shape)

    # Handle missing values by imputing them in train and test data
    imputer = SimpleImputer(strategy='median')
    numeric_columns = housing_data_encoded_train.select_dtypes(include=[np.number]).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col not in ['Sale_Price', 'PID']]
    housing_data_encoded_imputed_train = housing_data_encoded_train.copy()
    housing_data_encoded_imputed_train[numeric_columns] = imputer.fit_transform(housing_data_encoded_train[numeric_columns])

    housing_data_encoded_imputed_test = housing_data_encoded_test.copy()
    housing_data_encoded_imputed_test[numeric_columns] = imputer.transform(housing_data_encoded_test[numeric_columns])
    #print("housing_data_encoded_imputed_train shape", housing_data_encoded_imputed_train.shape)
    #print("housing_data_encoded_imputed_test shape", housing_data_encoded_imputed_test.shape)

    # Remove 'PID' from train and test datasets
    housing_data_encoded_imputed_train = housing_data_encoded_imputed_train.drop(columns=['PID'])
    if 'PID' in housing_data_encoded_imputed_test.columns:
        housing_data_encoded_imputed_test = housing_data_encoded_imputed_test.drop(columns=['PID'])

    # Separate predictors (X) and target variable (y) for training
    X_trn = housing_data_encoded_imputed_train.drop(columns='Sale_Price')
    y_trn = np.log(housing_data_encoded_imputed_train['Sale_Price'])

    # Scale the training data using preprocess_data function (returns DataFrame with column names)
    X_trn_scaled, scaler = preprocess_data(X_trn)

    # 1. Use Lasso regression for feature selection
    lasso_model = Lasso(alpha=0.01)  # Adjust the alpha value as necessary
    lasso_model.fit(X_trn_scaled, y_trn)
    selected_features = X_trn_scaled.columns[lasso_model.coef_ != 0]
    #print(f"Selected features after Lasso: {len(selected_features)} features")

    # Scale the test data using the same scaler
    X_tst_scaled = pd.DataFrame(scaler.transform(housing_data_encoded_imputed_test.drop(columns=['Sale_Price'])), columns=housing_data_encoded_imputed_test.drop(columns=['Sale_Price']).columns)
    #print("X_tst_scaled shape", X_tst_scaled.shape)

    # 2. Grid Search for Ridge Regression to find the best alpha
    alpha_values = np.exp(np.linspace(-5, 5, 100))
    param_grid = {'alpha': alpha_values}

    ridge_model = Ridge()
    grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_trn_scaled[selected_features], y_trn)
    best_alpha = grid_search.best_params_['alpha']
    #print("Best alpha for Ridge regression:", best_alpha)

    # 3. Train the final Ridge model with the best alpha
    final_ridge_model = Ridge(alpha=best_alpha)
    final_ridge_model.fit(X_trn_scaled[selected_features], y_trn)

    # Predict on the training set
    y_trn_pred = final_ridge_model.predict(X_trn_scaled[selected_features])
    r2_train = r2_score(y_trn, y_trn_pred)
    adj_r2_train = adjusted_r_squared(r2_train, len(y_trn), X_trn_scaled[selected_features].shape[1])
    train_rmse = calc_rmse(y_trn, y_trn_pred)
    #print(f"Training R-squared: {r2_train}")
    #print(f"Adjusted R-squared (Training): {adj_r2_train}")
    print(f"Training RMSE: {train_rmse}")

    # Predict on the test set
    y_tst_pred_log = final_ridge_model.predict(X_tst_scaled[selected_features])

    # Reverse the log-transformation to get predictions in the original scale
    y_tst_pred = np.exp(y_tst_pred_log)

    # Merge the predictions with actual sale prices from test_y.csv using 'PID'
    predictions = pd.DataFrame({
        'PID': merged_test_data['PID'],
        'Predicted_Sale_Price': y_tst_pred
    })
    merged_test_data = pd.merge(test_y_data, predictions, on='PID', how='inner')

    # Calculate RMSE and R-squared for the test set using actual sale prices
    test_rmse = calc_rmse(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    r2_test = r2_score(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    adj_r2_test = adjusted_r_squared(r2_test, len(merged_test_data), X_tst_scaled.shape[1])
    print(f"Test RMSE: {test_rmse}")
    #print(f"Test R-squared: {r2_test}")
    #print(f"Adjusted R-squared (Test): {adj_r2_test}")

    # Save the predictions to a CSV file
    predictions_file = f'fold{fold_num}_test_predictions.csv'
    predictions.to_csv(predictions_file, index=False)
    #print(f"Predictions for fold{fold_num} saved to '{predictions_file}'.")

def main():
    # Loop over folds 1 to 10
    for fold_num in range(1, 11):
        process_fold(fold_num)

if __name__ == "__main__":
    main()



Processing fold1...
Training RMSE: 0.09594588428518307
Test RMSE: 0.10469223283328424

Processing fold2...
Training RMSE: 0.09582819397381018
Test RMSE: 0.10917123031397012

Processing fold3...
Training RMSE: 0.09899029824624016
Test RMSE: 0.11063167697189338

Processing fold4...
Training RMSE: 0.09706420392046199
Test RMSE: 0.10138446890311324

Processing fold5...
Training RMSE: 0.09831349136602181
Test RMSE: 0.10311477433681965

Processing fold6...
Training RMSE: 0.09632037026795903
Test RMSE: 0.11490563249669387

Processing fold7...
Training RMSE: 0.0948991434075486
Test RMSE: 0.12160155025132481

Processing fold8...
Training RMSE: 0.09876341419037267
Test RMSE: 0.10385505305489069

Processing fold9...
Training RMSE: 0.09825948927589798
Test RMSE: 0.11316541032209632

Processing fold10...
Training RMSE: 0.098399397559317
Test RMSE: 0.1142611021048103


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from scipy import stats  # Importing for z-score calculation

# Helper functions
def preprocess_data(X_trn):
    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Fit the scaler on the training data and transform it
    X_trn_scaled = scaler.fit_transform(X_trn)

    # Return the scaled training data as a DataFrame (to preserve column names) and the scaler object
    X_trn_scaled_df = pd.DataFrame(X_trn_scaled, columns=X_trn.columns)

    return X_trn_scaled_df, scaler

def calc_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def adjusted_r_squared(r2, n, p):
    # Calculate adjusted R-squared
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

def process_fold(fold_num):
    print(f"\nProcessing fold{fold_num}...")

    # Load the train, test, and test_y datasets
    train_file_path = f'fold{fold_num}/train.csv'
    test_file_path = f'fold{fold_num}/test.csv'
    test_y_file_path = f'fold{fold_num}/test_y.csv'

    # Define the list of features to remove (this is just an example, adjust as needed)
    features_to_remove = ["Longitude", "Latitude", "Street", "Utilities", "Condition_2",
                      "Roof_Matl", "Heating", "Pool_QC", "Misc_Feature",  "Misc_Val","Low_Qual_Fin_SF", "Sale_Type", "Pool_Area"]  # Specify the features you want to remove



    housing_data_train = pd.read_csv(train_file_path)
    housing_data_test = pd.read_csv(test_file_path)
    test_y_data = pd.read_csv(test_y_file_path)  # Load the actual Sale Price for the test set

    # Call the remove_features function to remove unwanted features from X_trn
    housing_data_train = remove_features(housing_data_train, features_to_remove)

    # Ensure that test_y_data and housing_data_test have matching PIDs
    merged_test_data = pd.merge(housing_data_test, test_y_data, on='PID', how='inner')

    # Identify categorical columns
    categorical_columns_test = ['MS_SubClass', 'MS_Zoning', 'Street', 'Alley', 'Lot_Shape', 'Land_Contour', 'Utilities',
                           'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1', 'Condition_2', 'Bldg_Type',
                           'House_Style', 'Overall_Qual', 'Overall_Cond', 'Roof_Style', 'Roof_Matl', 'Exterior_1st',
                           'Exterior_2nd', 'Mas_Vnr_Type', 'Exter_Qual', 'Exter_Cond', 'Foundation', 'Bsmt_Qual',
                           'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_Type_2', 'Heating', 'Heating_QC',
                           'Central_Air', 'Electrical', 'Kitchen_Qual', 'Functional', 'Fireplace_Qu', 'Garage_Type',
                           'Garage_Finish', 'Garage_Qual', 'Garage_Cond', 'Paved_Drive', 'Pool_QC', 'Fence',
                           'Misc_Feature', 'Sale_Type', 'Sale_Condition']

    categorical_columns_train = ['MS_SubClass', 'MS_Zoning', 'Alley', 'Lot_Shape', 'Land_Contour',
                            'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1', 'Bldg_Type',
                            'House_Style', 'Overall_Qual', 'Overall_Cond', 'Roof_Style',
                            'Exterior_1st', 'Exterior_2nd', 'Mas_Vnr_Type', 'Exter_Qual',
                            'Exter_Cond', 'Foundation', 'Bsmt_Qual', 'Bsmt_Cond', 'Bsmt_Exposure',
                            'BsmtFin_Type_1', 'BsmtFin_Type_2', 'Heating_QC', 'Central_Air',  'Electrical',
                            'Kitchen_Qual', 'Functional', 'Fireplace_Qu', 'Garage_Type', 'Garage_Finish',
                            'Garage_Qual', 'Garage_Cond', 'Paved_Drive', 'Fence',
                            'Sale_Condition']


    # Perform dummy encoding on both train and test datasets
    housing_data_encoded_train = pd.get_dummies(housing_data_train, columns=categorical_columns_train, drop_first=True)
    housing_data_encoded_test = pd.get_dummies(merged_test_data, columns=categorical_columns_test, drop_first=True)

    # Align columns of train and test after dummy encoding:
    all_columns = list(set(housing_data_encoded_train.columns) | set(housing_data_encoded_test.columns))
    all_columns.remove('Sale_Price')  # Remove 'Sale_Price' if it's in all_columns

    housing_data_encoded_train = housing_data_encoded_train.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    housing_data_encoded_test = housing_data_encoded_test.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    print("housing_data_encoded_train shape", housing_data_encoded_train.shape)
    print("housing_data_encoded_test shape", housing_data_encoded_test.shape)

    # Handle missing values by imputing them in train and test data
    imputer = SimpleImputer(strategy='median')
    numeric_columns = housing_data_encoded_train.select_dtypes(include=[np.number]).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col not in ['Sale_Price', 'PID']]
    housing_data_encoded_imputed_train = housing_data_encoded_train.copy()
    housing_data_encoded_imputed_train[numeric_columns] = imputer.fit_transform(housing_data_encoded_train[numeric_columns])

    housing_data_encoded_imputed_test = housing_data_encoded_test.copy()
    housing_data_encoded_imputed_test[numeric_columns] = imputer.transform(housing_data_encoded_test[numeric_columns])
    print("housing_data_encoded_imputed_train shape", housing_data_encoded_imputed_train.shape)
    print("housing_data_encoded_imputed_test shape", housing_data_encoded_imputed_test.shape)

    # Remove 'PID' from train and test datasets
    housing_data_encoded_imputed_train = housing_data_encoded_imputed_train.drop(columns=['PID'])
    if 'PID' in housing_data_encoded_imputed_test.columns:
        housing_data_encoded_imputed_test = housing_data_encoded_imputed_test.drop(columns=['PID'])

    # Separate predictors (X) and target variable (y) for training
    X_trn = housing_data_encoded_imputed_train.drop(columns='Sale_Price')
    y_trn = np.log(housing_data_encoded_imputed_train['Sale_Price'])

    # Scale the training data using preprocess_data function (returns DataFrame with column names)
    X_trn_scaled, scaler = preprocess_data(X_trn)

    # 1. Use Lasso regression for feature selection
    lasso_model = Lasso(alpha=0.01)  # Adjust the alpha value as necessary
    lasso_model.fit(X_trn_scaled, y_trn)
    selected_features = X_trn_scaled.columns[lasso_model.coef_ != 0]
    print(f"Selected features after Lasso: {len(selected_features)} features")

    # Scale the test data using the same scaler
    X_tst_scaled = pd.DataFrame(scaler.transform(housing_data_encoded_imputed_test.drop(columns=['Sale_Price'])), columns=housing_data_encoded_imputed_test.drop(columns=['Sale_Price']).columns)
    print("X_tst_scaled shape", X_tst_scaled.shape)

    # 2. Grid Search for Ridge Regression to find the best alpha
    alpha_values = np.exp(np.linspace(-5, 5, 100))
    param_grid = {'alpha': alpha_values}

    ridge_model = Ridge()
    grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_trn_scaled[selected_features], y_trn)
    best_alpha = grid_search.best_params_['alpha']
    print("Best alpha for Ridge regression:", best_alpha)

    # 3. Train the final Ridge model with the best alpha
    final_ridge_model = Ridge(alpha=best_alpha)
    final_ridge_model.fit(X_trn_scaled[selected_features], y_trn)

    # Predict on the training set
    y_trn_pred = final_ridge_model.predict(X_trn_scaled[selected_features])
    r2_train = r2_score(y_trn, y_trn_pred)
    adj_r2_train = adjusted_r_squared(r2_train, len(y_trn), X_trn_scaled[selected_features].shape[1])
    train_rmse = calc_rmse(y_trn, y_trn_pred)
    print(f"Training R-squared: {r2_train}")
    print(f"Adjusted R-squared (Training): {adj_r2_train}")
    print(f"Training RMSE: {train_rmse}")

    # Predict on the test set
    y_tst_pred_log = final_ridge_model.predict(X_tst_scaled[selected_features])

    # Reverse the log-transformation to get predictions in the original scale
    y_tst_pred = np.exp(y_tst_pred_log)

    # Merge the predictions with actual sale prices from test_y.csv using 'PID'
    predictions = pd.DataFrame({
        'PID': merged_test_data['PID'],
        'Predicted_Sale_Price': y_tst_pred
    })
    merged_test_data = pd.merge(test_y_data, predictions, on='PID', how='inner')

    # Calculate RMSE and R-squared for the test set using actual sale prices
    test_rmse = calc_rmse(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    r2_test = r2_score(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    adj_r2_test = adjusted_r_squared(r2_test, len(merged_test_data), X_tst_scaled.shape[1])
    print(f"Test RMSE: {test_rmse}")
    print(f"Test R-squared: {r2_test}")
    print(f"Adjusted R-squared (Test): {adj_r2_test}")

    # Save the predictions to a CSV file
    predictions_file = f'fold{fold_num}_test_predictions.csv'
    predictions.to_csv(predictions_file, index=False)
    print(f"Predictions for fold{fold_num} saved to '{predictions_file}'.")

def main():
    # Loop over folds 1 to 10
    for fold_num in range(1, 11):
        process_fold(fold_num)

if __name__ == "__main__":
    main()



Processing fold1...


TypeError: object of type 'NoneType' has no len()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from scipy import stats  # Importing for z-score calculation

# Helper functions
def preprocess_data(X_trn):
    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Fit the scaler on the training data and transform it
    X_trn_scaled = scaler.fit_transform(X_trn)

    # Return the scaled training data as a DataFrame (to preserve column names) and the scaler object
    X_trn_scaled_df = pd.DataFrame(X_trn_scaled, columns=X_trn.columns)

    return X_trn_scaled_df, scaler

def calc_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def adjusted_r_squared(r2, n, p):
    # Calculate adjusted R-squared
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

def process_fold(fold_num):
    print(f"\nProcessing fold{fold_num}...")

    # Load the train, test, and test_y datasets
    train_file_path = f'fold{fold_num}/train.csv'
    test_file_path = f'fold{fold_num}/test.csv'
    test_y_file_path = f'fold{fold_num}/test_y.csv'

    # Define the list of features to remove (this is just an example, adjust as needed)
    features_to_remove = ["Longitude", "Latitude", "Street", "Utilities", "Condition_2",
                      "Roof_Matl", "Heating", "Pool_QC", "Misc_Feature",  "Misc_Val","Low_Qual_Fin_SF", "Sale_Type", "Pool_Area"]  # Specify the features you want to remove



    housing_data_train = pd.read_csv(train_file_path)
    housing_data_test = pd.read_csv(test_file_path)
    test_y_data = pd.read_csv(test_y_file_path)  # Load the actual Sale Price for the test set

    # Call the remove_features function to remove unwanted features from X_trn
    housing_data_train = remove_features(housing_data_train, features_to_remove)

    # Ensure that test_y_data and housing_data_test have matching PIDs
    merged_test_data = pd.merge(housing_data_test, test_y_data, on='PID', how='inner')

    # Identify categorical columns
    categorical_columns_test = ['MS_SubClass', 'MS_Zoning', 'Street', 'Alley', 'Lot_Shape', 'Land_Contour', 'Utilities',
                           'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1', 'Condition_2', 'Bldg_Type',
                           'House_Style', 'Overall_Qual', 'Overall_Cond', 'Roof_Style', 'Roof_Matl', 'Exterior_1st',
                           'Exterior_2nd', 'Mas_Vnr_Type', 'Exter_Qual', 'Exter_Cond', 'Foundation', 'Bsmt_Qual',
                           'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_Type_2', 'Heating', 'Heating_QC',
                           'Central_Air', 'Electrical', 'Kitchen_Qual', 'Functional', 'Fireplace_Qu', 'Garage_Type',
                           'Garage_Finish', 'Garage_Qual', 'Garage_Cond', 'Paved_Drive', 'Pool_QC', 'Fence',
                           'Misc_Feature', 'Sale_Type', 'Sale_Condition']

    categorical_columns_train = ['MS_SubClass', 'MS_Zoning', 'Alley', 'Lot_Shape', 'Land_Contour',
                            'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1', 'Bldg_Type',
                            'House_Style', 'Overall_Qual', 'Overall_Cond', 'Roof_Style',
                            'Exterior_1st', 'Exterior_2nd', 'Mas_Vnr_Type', 'Exter_Qual',
                            'Exter_Cond', 'Foundation', 'Bsmt_Qual', 'Bsmt_Cond', 'Bsmt_Exposure',
                            'BsmtFin_Type_1', 'BsmtFin_Type_2', 'Heating_QC', 'Central_Air',  'Electrical',
                            'Kitchen_Qual', 'Functional', 'Fireplace_Qu', 'Garage_Type', 'Garage_Finish',
                            'Garage_Qual', 'Garage_Cond', 'Paved_Drive', 'Fence',
                            'Sale_Condition']


    # Perform dummy encoding on both train and test datasets
    housing_data_encoded_train = pd.get_dummies(housing_data_train, columns=categorical_columns_train, drop_first=True)
    housing_data_encoded_test = pd.get_dummies(merged_test_data, columns=categorical_columns_test, drop_first=True)

    # Align columns of train and test after dummy encoding:
    all_columns = list(set(housing_data_encoded_train.columns) | set(housing_data_encoded_test.columns))
    all_columns.remove('Sale_Price')  # Remove 'Sale_Price' if it's in all_columns

    housing_data_encoded_train = housing_data_encoded_train.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    housing_data_encoded_test = housing_data_encoded_test.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    print("housing_data_encoded_train shape", housing_data_encoded_train.shape)
    print("housing_data_encoded_test shape", housing_data_encoded_test.shape)

    # Handle missing values by imputing them in train and test data
    imputer = SimpleImputer(strategy='median')
    numeric_columns = housing_data_encoded_train.select_dtypes(include=[np.number]).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col not in ['Sale_Price', 'PID']]
    housing_data_encoded_imputed_train = housing_data_encoded_train.copy()
    housing_data_encoded_imputed_train[numeric_columns] = imputer.fit_transform(housing_data_encoded_train[numeric_columns])

    housing_data_encoded_imputed_test = housing_data_encoded_test.copy()
    housing_data_encoded_imputed_test[numeric_columns] = imputer.transform(housing_data_encoded_test[numeric_columns])
    print("housing_data_encoded_imputed_train shape", housing_data_encoded_imputed_train.shape)
    print("housing_data_encoded_imputed_test shape", housing_data_encoded_imputed_test.shape)

    # Remove 'PID' from train and test datasets
    housing_data_encoded_imputed_train = housing_data_encoded_imputed_train.drop(columns=['PID'])
    if 'PID' in housing_data_encoded_imputed_test.columns:
        housing_data_encoded_imputed_test = housing_data_encoded_imputed_test.drop(columns=['PID'])

    # Separate predictors (X) and target variable (y) for training
    X_trn = housing_data_encoded_imputed_train.drop(columns='Sale_Price')
    y_trn = np.log(housing_data_encoded_imputed_train['Sale_Price'])

    # Scale the training data using preprocess_data function (returns DataFrame with column names)
    X_trn_scaled, scaler = preprocess_data(X_trn)

    # 1. Use Lasso regression for feature selection
    lasso_model = Lasso(alpha=0.01)  # Adjust the alpha value as necessary
    lasso_model.fit(X_trn_scaled, y_trn)
    selected_features = X_trn_scaled.columns[lasso_model.coef_ != 0]
    print(f"Selected features after Lasso: {len(selected_features)} features")

    # Scale the test data using the same scaler
    X_tst_scaled = pd.DataFrame(scaler.transform(housing_data_encoded_imputed_test.drop(columns=['Sale_Price'])), columns=housing_data_encoded_imputed_test.drop(columns=['Sale_Price']).columns)
    print("X_tst_scaled shape", X_tst_scaled.shape)

    # 2. Grid Search for Ridge Regression to find the best alpha
    alpha_values = np.exp(np.linspace(-5, 5, 100))
    param_grid = {'alpha': alpha_values}

    ridge_model = Ridge()
    grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_trn_scaled[selected_features], y_trn)
    best_alpha = grid_search.best_params_['alpha']
    print("Best alpha for Ridge regression:", best_alpha)

    # 3. Train the final Ridge model with the best alpha
    final_ridge_model = Ridge(alpha=best_alpha)
    final_ridge_model.fit(X_trn_scaled[selected_features], y_trn)

    # Predict on the training set
    y_trn_pred = final_ridge_model.predict(X_trn_scaled[selected_features])
    r2_train = r2_score(y_trn, y_trn_pred)
    adj_r2_train = adjusted_r_squared(r2_train, len(y_trn), X_trn_scaled[selected_features].shape[1])
    train_rmse = calc_rmse(y_trn, y_trn_pred)
    print(f"Training R-squared: {r2_train}")
    print(f"Adjusted R-squared (Training): {adj_r2_train}")
    print(f"Training RMSE: {train_rmse}")

    # Predict on the test set
    y_tst_pred_log = final_ridge_model.predict(X_tst_scaled[selected_features])

    # Reverse the log-transformation to get predictions in the original scale
    y_tst_pred = np.exp(y_tst_pred_log)

    # Merge the predictions with actual sale prices from test_y.csv using 'PID'
    predictions = pd.DataFrame({
        'PID': merged_test_data['PID'],
        'Predicted_Sale_Price': y_tst_pred
    })
    merged_test_data = pd.merge(test_y_data, predictions, on='PID', how='inner')

    # Calculate RMSE and R-squared for the test set using actual sale prices
    test_rmse = calc_rmse(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    r2_test = r2_score(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    adj_r2_test = adjusted_r_squared(r2_test, len(merged_test_data), X_tst_scaled.shape[1])
    print(f"Test RMSE: {test_rmse}")
    print(f"Test R-squared: {r2_test}")
    print(f"Adjusted R-squared (Test): {adj_r2_test}")

    # Save the predictions to a CSV file
    predictions_file = f'fold{fold_num}_test_predictions.csv'
    predictions.to_csv(predictions_file, index=False)
    print(f"Predictions for fold{fold_num} saved to '{predictions_file}'.")

def main():
    # Loop over folds 1 to 10
    for fold_num in range(1, 11):
        process_fold(fold_num)

if __name__ == "__main__":
    main()



Processing fold1...


TypeError: object of type 'NoneType' has no len()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from scipy import stats  # Importing for z-score calculation

# Helper functions
def preprocess_data(X_trn):
    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Fit the scaler on the training data and transform it
    X_trn_scaled = scaler.fit_transform(X_trn)

    # Return the scaled training data as a DataFrame (to preserve column names) and the scaler object
    X_trn_scaled_df = pd.DataFrame(X_trn_scaled, columns=X_trn.columns)

    return X_trn_scaled_df, scaler

def calc_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def adjusted_r_squared(r2, n, p):
    # Calculate adjusted R-squared
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

def process_fold(fold_num):
    print(f"\nProcessing fold{fold_num}...")

    # Load the train, test, and test_y datasets
    train_file_path = f'fold{fold_num}/train.csv'
    test_file_path = f'fold{fold_num}/test.csv'
    test_y_file_path = f'fold{fold_num}/test_y.csv'

    # Define the list of features to remove (this is just an example, adjust as needed)
    features_to_remove = ["Longitude", "Latitude", "Street", "Utilities", "Condition_2",
                      "Roof_Matl", "Heating", "Pool_QC", "Misc_Feature",  "Misc_Val","Low_Qual_Fin_SF", "Sale_Type", "Pool_Area"]  # Specify the features you want to remove



    housing_data_train = pd.read_csv(train_file_path)
    housing_data_test = pd.read_csv(test_file_path)
    test_y_data = pd.read_csv(test_y_file_path)  # Load the actual Sale Price for the test set

    # Call the remove_features function to remove unwanted features from X_trn
    housing_data_train = remove_features(housing_data_train, features_to_remove)

    # Ensure that test_y_data and housing_data_test have matching PIDs
    merged_test_data = pd.merge(housing_data_test, test_y_data, on='PID', how='inner')

    # Identify categorical columns
    categorical_columns_test = ['MS_SubClass', 'MS_Zoning', 'Street', 'Alley', 'Lot_Shape', 'Land_Contour', 'Utilities',
                           'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1', 'Condition_2', 'Bldg_Type',
                           'House_Style', 'Overall_Qual', 'Overall_Cond', 'Roof_Style', 'Roof_Matl', 'Exterior_1st',
                           'Exterior_2nd', 'Mas_Vnr_Type', 'Exter_Qual', 'Exter_Cond', 'Foundation', 'Bsmt_Qual',
                           'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_Type_2', 'Heating', 'Heating_QC',
                           'Central_Air', 'Electrical', 'Kitchen_Qual', 'Functional', 'Fireplace_Qu', 'Garage_Type',
                           'Garage_Finish', 'Garage_Qual', 'Garage_Cond', 'Paved_Drive', 'Pool_QC', 'Fence',
                           'Misc_Feature', 'Sale_Type', 'Sale_Condition']

    categorical_columns_train = ['MS_SubClass', 'MS_Zoning', 'Alley', 'Lot_Shape', 'Land_Contour',
                            'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1', 'Bldg_Type',
                            'House_Style', 'Overall_Qual', 'Overall_Cond', 'Roof_Style',
                            'Exterior_1st', 'Exterior_2nd', 'Mas_Vnr_Type', 'Exter_Qual',
                            'Exter_Cond', 'Foundation', 'Bsmt_Qual', 'Bsmt_Cond', 'Bsmt_Exposure',
                            'BsmtFin_Type_1', 'BsmtFin_Type_2', 'Heating_QC', 'Central_Air',  'Electrical',
                            'Kitchen_Qual', 'Functional', 'Fireplace_Qu', 'Garage_Type', 'Garage_Finish',
                            'Garage_Qual', 'Garage_Cond', 'Paved_Drive', 'Fence',
                            'Sale_Type', 'Sale_Condition']


    # Perform dummy encoding on both train and test datasets
    housing_data_encoded_train = pd.get_dummies(housing_data_train, columns=categorical_columns_train, drop_first=True)
    housing_data_encoded_test = pd.get_dummies(merged_test_data, columns=categorical_columns_test, drop_first=True)

    # Align columns of train and test after dummy encoding:
    all_columns = list(set(housing_data_encoded_train.columns) | set(housing_data_encoded_test.columns))
    all_columns.remove('Sale_Price')  # Remove 'Sale_Price' if it's in all_columns

    housing_data_encoded_train = housing_data_encoded_train.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    housing_data_encoded_test = housing_data_encoded_test.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    print("housing_data_encoded_train shape", housing_data_encoded_train.shape)
    print("housing_data_encoded_test shape", housing_data_encoded_test.shape)

    # Handle missing values by imputing them in train and test data
    imputer = SimpleImputer(strategy='median')
    numeric_columns = housing_data_encoded_train.select_dtypes(include=[np.number]).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col not in ['Sale_Price', 'PID']]
    housing_data_encoded_imputed_train = housing_data_encoded_train.copy()
    housing_data_encoded_imputed_train[numeric_columns] = imputer.fit_transform(housing_data_encoded_train[numeric_columns])

    housing_data_encoded_imputed_test = housing_data_encoded_test.copy()
    housing_data_encoded_imputed_test[numeric_columns] = imputer.transform(housing_data_encoded_test[numeric_columns])
    print("housing_data_encoded_imputed_train shape", housing_data_encoded_imputed_train.shape)
    print("housing_data_encoded_imputed_test shape", housing_data_encoded_imputed_test.shape)

    # Remove 'PID' from train and test datasets
    housing_data_encoded_imputed_train = housing_data_encoded_imputed_train.drop(columns=['PID'])
    if 'PID' in housing_data_encoded_imputed_test.columns:
        housing_data_encoded_imputed_test = housing_data_encoded_imputed_test.drop(columns=['PID'])

    # Separate predictors (X) and target variable (y) for training
    X_trn = housing_data_encoded_imputed_train.drop(columns='Sale_Price')
    y_trn = np.log(housing_data_encoded_imputed_train['Sale_Price'])

    # Scale the training data using preprocess_data function (returns DataFrame with column names)
    X_trn_scaled, scaler = preprocess_data(X_trn)

    # 1. Use Lasso regression for feature selection
    lasso_model = Lasso(alpha=0.01)  # Adjust the alpha value as necessary
    lasso_model.fit(X_trn_scaled, y_trn)
    selected_features = X_trn_scaled.columns[lasso_model.coef_ != 0]
    print(f"Selected features after Lasso: {len(selected_features)} features")

    # Scale the test data using the same scaler
    X_tst_scaled = pd.DataFrame(scaler.transform(housing_data_encoded_imputed_test.drop(columns=['Sale_Price'])), columns=housing_data_encoded_imputed_test.drop(columns=['Sale_Price']).columns)
    print("X_tst_scaled shape", X_tst_scaled.shape)

    # 2. Grid Search for Ridge Regression to find the best alpha
    alpha_values = np.exp(np.linspace(-5, 5, 100))
    param_grid = {'alpha': alpha_values}

    ridge_model = Ridge()
    grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_trn_scaled[selected_features], y_trn)
    best_alpha = grid_search.best_params_['alpha']
    print("Best alpha for Ridge regression:", best_alpha)

    # 3. Train the final Ridge model with the best alpha
    final_ridge_model = Ridge(alpha=best_alpha)
    final_ridge_model.fit(X_trn_scaled[selected_features], y_trn)

    # Predict on the training set
    y_trn_pred = final_ridge_model.predict(X_trn_scaled[selected_features])
    r2_train = r2_score(y_trn, y_trn_pred)
    adj_r2_train = adjusted_r_squared(r2_train, len(y_trn), X_trn_scaled[selected_features].shape[1])
    train_rmse = calc_rmse(y_trn, y_trn_pred)
    print(f"Training R-squared: {r2_train}")
    print(f"Adjusted R-squared (Training): {adj_r2_train}")
    print(f"Training RMSE: {train_rmse}")

    # Predict on the test set
    y_tst_pred_log = final_ridge_model.predict(X_tst_scaled[selected_features])

    # Reverse the log-transformation to get predictions in the original scale
    y_tst_pred = np.exp(y_tst_pred_log)

    # Merge the predictions with actual sale prices from test_y.csv using 'PID'
    predictions = pd.DataFrame({
        'PID': merged_test_data['PID'],
        'Predicted_Sale_Price': y_tst_pred
    })
    merged_test_data = pd.merge(test_y_data, predictions, on='PID', how='inner')

    # Calculate RMSE and R-squared for the test set using actual sale prices
    test_rmse = calc_rmse(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    r2_test = r2_score(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    adj_r2_test = adjusted_r_squared(r2_test, len(merged_test_data), X_tst_scaled.shape[1])
    print(f"Test RMSE: {test_rmse}")
    print(f"Test R-squared: {r2_test}")
    print(f"Adjusted R-squared (Test): {adj_r2_test}")

    # Save the predictions to a CSV file
    predictions_file = f'fold{fold_num}_test_predictions.csv'
    predictions.to_csv(predictions_file, index=False)
    print(f"Predictions for fold{fold_num} saved to '{predictions_file}'.")

def main():
    # Loop over folds 1 to 10
    for fold_num in range(1, 11):
        process_fold(fold_num)

if __name__ == "__main__":
    main()



Processing fold1...


TypeError: object of type 'NoneType' has no len()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from scipy import stats  # Importing for z-score calculation

# Helper functions
def preprocess_data(X_trn):
    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Fit the scaler on the training data and transform it
    X_trn_scaled = scaler.fit_transform(X_trn)

    # Return the scaled training data as a DataFrame (to preserve column names) and the scaler object
    X_trn_scaled_df = pd.DataFrame(X_trn_scaled, columns=X_trn.columns)

    return X_trn_scaled_df, scaler

def calc_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def adjusted_r_squared(r2, n, p):
    # Calculate adjusted R-squared
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

def main():
    # Load the train, test, and test_y datasets
    train_file_path = 'fold8/train.csv'
    test_file_path = 'fold8/test.csv'
    test_y_file_path = 'fold8/test_y.csv'
    housing_data_train = pd.read_csv(train_file_path)
    housing_data_test = pd.read_csv(test_file_path)
    test_y_data = pd.read_csv(test_y_file_path)  # Load the actual Sale Price for the test set

    # Ensure that test_y_data and housing_data_test have matching PIDs
    merged_test_data = pd.merge(housing_data_test, test_y_data, on='PID', how='inner')

    # Identify categorical columns
    categorical_columns = ['MS_SubClass', 'MS_Zoning', 'Street', 'Alley', 'Lot_Shape', 'Land_Contour', 'Utilities',
                           'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1', 'Condition_2', 'Bldg_Type',
                           'House_Style', 'Overall_Qual', 'Overall_Cond', 'Roof_Style', 'Roof_Matl', 'Exterior_1st',
                           'Exterior_2nd', 'Mas_Vnr_Type', 'Exter_Qual', 'Exter_Cond', 'Foundation', 'Bsmt_Qual',
                           'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_Type_2', 'Heating', 'Heating_QC',
                           'Central_Air', 'Electrical', 'Kitchen_Qual', 'Functional', 'Fireplace_Qu', 'Garage_Type',
                           'Garage_Finish', 'Garage_Qual', 'Garage_Cond', 'Paved_Drive', 'Pool_QC', 'Fence',
                           'Misc_Feature', 'Sale_Type', 'Sale_Condition']

    # Perform dummy encoding on both train and test datasets
    housing_data_encoded_train = pd.get_dummies(housing_data_train, columns=categorical_columns, drop_first=True)
    housing_data_encoded_test = pd.get_dummies(merged_test_data, columns=categorical_columns, drop_first=True)

    # Align columns of train and test after dummy encoding:
    all_columns = list(set(housing_data_encoded_train.columns) | set(housing_data_encoded_test.columns))
    all_columns.remove('Sale_Price')  # Remove 'Sale_Price' if it's in all_columns

    housing_data_encoded_train = housing_data_encoded_train.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    housing_data_encoded_test = housing_data_encoded_test.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    print("housing_data_encoded_train shape", housing_data_encoded_train.shape)
    print("housing_data_encoded_test shape", housing_data_encoded_test.shape)

    # Handle missing values by imputing them in train and test data
    imputer = SimpleImputer(strategy='median')
    numeric_columns = housing_data_encoded_train.select_dtypes(include=[np.number]).columns.tolist()
    #numeric_columns = [col for col in numeric_columns if col != 'Sale_Price']
    # Ensure 'Sale_Price' and 'PID' are excluded from numeric_columns
    numeric_columns = [col for col in numeric_columns if col not in ['Sale_Price', 'PID']]
    housing_data_encoded_imputed_train = housing_data_encoded_train.copy()
    housing_data_encoded_imputed_train[numeric_columns] = imputer.fit_transform(housing_data_encoded_train[numeric_columns])

    housing_data_encoded_imputed_test = housing_data_encoded_test.copy()
    housing_data_encoded_imputed_test[numeric_columns] = imputer.transform(housing_data_encoded_test[numeric_columns])
    print("housing_data_encoded_imputed_train shape", housing_data_encoded_imputed_train.shape)
    print("housing_data_encoded_imputed_test shape", housing_data_encoded_imputed_test.shape)

    # Remove 'PID' from train and test datasets
    housing_data_encoded_imputed_train = housing_data_encoded_imputed_train.drop(columns=['PID'])
    if 'PID' in housing_data_encoded_imputed_test.columns:
        housing_data_encoded_imputed_test = housing_data_encoded_imputed_test.drop(columns=['PID'])

    # Separate predictors (X) and target variable (y) for training
    X_trn = housing_data_encoded_imputed_train.drop(columns='Sale_Price')
    y_trn = np.log(housing_data_encoded_imputed_train['Sale_Price'])


    # EXperiment starts

    # Experiment ends
    # Scale the training data using preprocess_data function (returns DataFrame with column names)
    X_trn_scaled, scaler = preprocess_data(X_trn)

    # 1. Use Lasso regression for feature selection
    lasso_model = Lasso(alpha=0.01)  # Adjust the alpha value as necessary
    #lasso_model.fit(X_trn_scaled, y_trn)
    lasso_model.fit(X_trn_scaled, y_trn)
    # Select the features with non-zero coefficients
    selected_features = X_trn_scaled.columns[lasso_model.coef_ != 0]
    print(f"Selected features after Lasso: {len(selected_features)} features")

    # Scale the test data using the same scaler
    X_tst_scaled = pd.DataFrame(scaler.transform(housing_data_encoded_imputed_test.drop(columns=['Sale_Price'])), columns=housing_data_encoded_imputed_test.drop(columns=['Sale_Price']).columns)
    print("X_tst_scaled shape", X_tst_scaled.shape)



    # 2. Grid Search for Ridge Regression to find the best alpha
    #param_grid = {'alpha': [0.01, 0.1, 1.0, 10, 100]}

    # Generate 100 alpha values using np.exp and np.linspace
    alpha_values = np.exp(np.linspace(-5, 5, 100))

    # Set the alpha values in the parameter grid for Lasso
    param_grid = {'alpha': alpha_values}


    ridge_model = Ridge()
    grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
    #grid_search.fit(X_trn_scaled[selected_features], y_trn)
    grid_search.fit(X_trn_scaled[selected_features], y_trn)
    # Get the best alpha value from grid search
    best_alpha = grid_search.best_params_['alpha']
    print("Best alpha for Ridge regression:", best_alpha)

    # 3. Train the final Ridge model with the best alpha
    final_ridge_model = Ridge(alpha=best_alpha)
    #final_ridge_model.fit(X_trn_scaled[selected_features], y_trn)
    final_ridge_model.fit(X_trn_scaled[selected_features], y_trn)
    # Predict on the training set
    y_trn_pred = final_ridge_model.predict(X_trn_scaled[selected_features])

    # Calculate R-squared and RMSE for training data
    #r2_train = r2_score(y_trn, y_trn_pred)
    r2_train = r2_score(y_trn, y_trn_pred)
    adj_r2_train = adjusted_r_squared(r2_train, len(y_trn), X_trn_scaled[selected_features].shape[1])
    train_rmse = calc_rmse(y_trn, y_trn_pred)
    print(f"Training R-squared: {r2_train}")
    print(f"Adjusted R-squared (Training): {adj_r2_train}")
    print(f"Training RMSE: {train_rmse}")

    # Predict on the test set
    y_tst_pred_log = final_ridge_model.predict(X_tst_scaled[selected_features])

    # Reverse the log-transformation to get predictions in the original scale
    y_tst_pred = np.exp(y_tst_pred_log)

    # Merge the predictions with actual sale prices from test_y.csv using 'PID'
    predictions = pd.DataFrame({
        'PID': merged_test_data['PID'],
        'Predicted_Sale_Price': y_tst_pred
    })
    merged_test_data = pd.merge(test_y_data, predictions, on='PID', how='inner')

    # Calculate RMSE and R-squared for the test set using actual sale prices
    test_rmse = calc_rmse(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    r2_test = r2_score(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    adj_r2_test = adjusted_r_squared(r2_test, len(merged_test_data), X_tst_scaled.shape[1])
    print(f"Test RMSE: {test_rmse}")
    print(f"Test R-squared: {r2_test}")
    print(f"Adjusted R-squared (Test): {adj_r2_test}")

    # Save the predictions to a CSV file
    predictions.to_csv('test_predictions.csv', index=False)
    print("Predictions for test set saved to 'test_predictions.csv'.")

if __name__ == "__main__":
    main()


housing_data_encoded_train shape (2051, 307)
housing_data_encoded_test shape (879, 307)
housing_data_encoded_imputed_train shape (2051, 307)
housing_data_encoded_imputed_test shape (879, 307)
Selected features after Lasso: 69 features
X_tst_scaled shape (879, 305)
Best alpha for Ridge regression: 148.4131591025766
Training R-squared: 0.9162166366964976
Adjusted R-squared (Training): 0.9132983872931953
Training RMSE: 0.11969708675889157
Test RMSE: 0.13213083693906497
Test R-squared: 0.8868549781982086
Adjusted R-squared (Test): 0.8266294430332062
Predictions for test set saved to 'test_predictions.csv'.


Run in a For loop

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from scipy import stats  # Importing for z-score calculation

# Helper functions
def preprocess_data(X_trn):
    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Fit the scaler on the training data and transform it
    X_trn_scaled = scaler.fit_transform(X_trn)

    # Return the scaled training data as a DataFrame (to preserve column names) and the scaler object
    X_trn_scaled_df = pd.DataFrame(X_trn_scaled, columns=X_trn.columns)

    return X_trn_scaled_df, scaler

def calc_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def adjusted_r_squared(r2, n, p):
    # Calculate adjusted R-squared
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

def process_fold(fold_num):
    print(f"\nProcessing fold{fold_num}...")

    # Load the train, test, and test_y datasets
    train_file_path = f'fold{fold_num}/train.csv'
    test_file_path = f'fold{fold_num}/test.csv'
    test_y_file_path = f'fold{fold_num}/test_y.csv'

    housing_data_train = pd.read_csv(train_file_path)
    housing_data_test = pd.read_csv(test_file_path)
    test_y_data = pd.read_csv(test_y_file_path)  # Load the actual Sale Price for the test set

    # Ensure that test_y_data and housing_data_test have matching PIDs
    merged_test_data = pd.merge(housing_data_test, test_y_data, on='PID', how='inner')

    # Identify categorical columns
    categorical_columns = ['MS_SubClass', 'MS_Zoning', 'Street', 'Alley', 'Lot_Shape', 'Land_Contour', 'Utilities',
                           'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1', 'Condition_2', 'Bldg_Type',
                           'House_Style', 'Overall_Qual', 'Overall_Cond', 'Roof_Style', 'Roof_Matl', 'Exterior_1st',
                           'Exterior_2nd', 'Mas_Vnr_Type', 'Exter_Qual', 'Exter_Cond', 'Foundation', 'Bsmt_Qual',
                           'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_Type_2', 'Heating', 'Heating_QC',
                           'Central_Air', 'Electrical', 'Kitchen_Qual', 'Functional', 'Fireplace_Qu', 'Garage_Type',
                           'Garage_Finish', 'Garage_Qual', 'Garage_Cond', 'Paved_Drive', 'Pool_QC', 'Fence',
                           'Misc_Feature', 'Sale_Type', 'Sale_Condition']

    # Perform dummy encoding on both train and test datasets
    housing_data_encoded_train = pd.get_dummies(housing_data_train, columns=categorical_columns, drop_first=True)
    housing_data_encoded_test = pd.get_dummies(merged_test_data, columns=categorical_columns, drop_first=True)

    # Align columns of train and test after dummy encoding:
    all_columns = list(set(housing_data_encoded_train.columns) | set(housing_data_encoded_test.columns))
    all_columns.remove('Sale_Price')  # Remove 'Sale_Price' if it's in all_columns

    housing_data_encoded_train = housing_data_encoded_train.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    housing_data_encoded_test = housing_data_encoded_test.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    print("housing_data_encoded_train shape", housing_data_encoded_train.shape)
    print("housing_data_encoded_test shape", housing_data_encoded_test.shape)

    # Handle missing values by imputing them in train and test data
    imputer = SimpleImputer(strategy='median')
    numeric_columns = housing_data_encoded_train.select_dtypes(include=[np.number]).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col not in ['Sale_Price', 'PID']]
    housing_data_encoded_imputed_train = housing_data_encoded_train.copy()
    housing_data_encoded_imputed_train[numeric_columns] = imputer.fit_transform(housing_data_encoded_train[numeric_columns])

    housing_data_encoded_imputed_test = housing_data_encoded_test.copy()
    housing_data_encoded_imputed_test[numeric_columns] = imputer.transform(housing_data_encoded_test[numeric_columns])
    print("housing_data_encoded_imputed_train shape", housing_data_encoded_imputed_train.shape)
    print("housing_data_encoded_imputed_test shape", housing_data_encoded_imputed_test.shape)

    # Remove 'PID' from train and test datasets
    housing_data_encoded_imputed_train = housing_data_encoded_imputed_train.drop(columns=['PID'])
    if 'PID' in housing_data_encoded_imputed_test.columns:
        housing_data_encoded_imputed_test = housing_data_encoded_imputed_test.drop(columns=['PID'])

    # Separate predictors (X) and target variable (y) for training
    X_trn = housing_data_encoded_imputed_train.drop(columns='Sale_Price')
    y_trn = np.log(housing_data_encoded_imputed_train['Sale_Price'])

    # Scale the training data using preprocess_data function (returns DataFrame with column names)
    X_trn_scaled, scaler = preprocess_data(X_trn)

    # 1. Use Lasso regression for feature selection
    lasso_model = Lasso(alpha=0.01)  # Adjust the alpha value as necessary
    lasso_model.fit(X_trn_scaled, y_trn)
    selected_features = X_trn_scaled.columns[lasso_model.coef_ != 0]
    print(f"Selected features after Lasso: {len(selected_features)} features")

    # Scale the test data using the same scaler
    X_tst_scaled = pd.DataFrame(scaler.transform(housing_data_encoded_imputed_test.drop(columns=['Sale_Price'])), columns=housing_data_encoded_imputed_test.drop(columns=['Sale_Price']).columns)
    print("X_tst_scaled shape", X_tst_scaled.shape)

    # 2. Grid Search for Ridge Regression to find the best alpha
    alpha_values = np.exp(np.linspace(-5, 5, 100))
    param_grid = {'alpha': alpha_values}

    ridge_model = Ridge()
    grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_trn_scaled[selected_features], y_trn)
    best_alpha = grid_search.best_params_['alpha']
    print("Best alpha for Ridge regression:", best_alpha)

    # 3. Train the final Ridge model with the best alpha
    final_ridge_model = Ridge(alpha=best_alpha)
    final_ridge_model.fit(X_trn_scaled[selected_features], y_trn)

    # Predict on the training set
    y_trn_pred = final_ridge_model.predict(X_trn_scaled[selected_features])
    r2_train = r2_score(y_trn, y_trn_pred)
    adj_r2_train = adjusted_r_squared(r2_train, len(y_trn), X_trn_scaled[selected_features].shape[1])
    train_rmse = calc_rmse(y_trn, y_trn_pred)
    print(f"Training R-squared: {r2_train}")
    print(f"Adjusted R-squared (Training): {adj_r2_train}")
    print(f"Training RMSE: {train_rmse}")

    # Predict on the test set
    y_tst_pred_log = final_ridge_model.predict(X_tst_scaled[selected_features])

    # Reverse the log-transformation to get predictions in the original scale
    y_tst_pred = np.exp(y_tst_pred_log)

    # Merge the predictions with actual sale prices from test_y.csv using 'PID'
    predictions = pd.DataFrame({
        'PID': merged_test_data['PID'],
        'Predicted_Sale_Price': y_tst_pred
    })
    merged_test_data = pd.merge(test_y_data, predictions, on='PID', how='inner')

    # Calculate RMSE and R-squared for the test set using actual sale prices
    test_rmse = calc_rmse(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    r2_test = r2_score(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    adj_r2_test = adjusted_r_squared(r2_test, len(merged_test_data), X_tst_scaled.shape[1])
    print(f"Test RMSE: {test_rmse}")
    print(f"Test R-squared: {r2_test}")
    print(f"Adjusted R-squared (Test): {adj_r2_test}")

    # Save the predictions to a CSV file
    predictions_file = f'fold{fold_num}_test_predictions.csv'
    predictions.to_csv(predictions_file, index=False)
    print(f"Predictions for fold{fold_num} saved to '{predictions_file}'.")

def main():
    # Loop over folds 1 to 10
    for fold_num in range(1, 11):
        process_fold(fold_num)

if __name__ == "__main__":
    main()



Processing fold1...
housing_data_encoded_train shape (2051, 307)
housing_data_encoded_test shape (879, 307)
housing_data_encoded_imputed_train shape (2051, 307)
housing_data_encoded_imputed_test shape (879, 307)
Selected features after Lasso: 62 features
X_tst_scaled shape (879, 305)
Best alpha for Ridge regression: 99.08316975251431
Training R-squared: 0.9295738799235193
Adjusted R-squared (Training): 0.9273774918728444
Training RMSE: 0.11054379841181113
Test RMSE: 0.15655045343161975
Test R-squared: 0.8350420203182289
Adjusted R-squared (Test): 0.7472371620233944
Predictions for fold1 saved to 'fold1_test_predictions.csv'.

Processing fold2...
housing_data_encoded_train shape (2051, 307)
housing_data_encoded_test shape (879, 307)
housing_data_encoded_imputed_train shape (2051, 307)
housing_data_encoded_imputed_test shape (879, 307)
Selected features after Lasso: 71 features
X_tst_scaled shape (879, 305)
Best alpha for Ridge regression: 148.4131591025766
Training R-squared: 0.9205382

Preprocess to remove features

In [None]:
def combine_rare_categories(df, categorical_columns, threshold=0.02):
    """
    Combine rare categories into 'Other' for categorical columns in the dataframe.

    Args:
    df: pandas DataFrame - The dataset.
    categorical_columns: list - List of categorical columns to process.
    threshold: float - The minimum proportion a category should have to not be grouped into 'Other'.

    Returns:
    df: pandas DataFrame - The dataset with rare categories combined into 'Other'.
    """
    for col in categorical_columns:
        # Calculate the frequency of each category in the column
        freq = df[col].value_counts(normalize=True)

        # Categories to be replaced with 'Other'
        rare_categories = freq[freq < threshold].index

        # Replace rare categories with 'Other'
        df[col] = df[col].replace(rare_categories, 'Other')
    print("df", df)
    return df
def remove_high_nan_columns(df, threshold=0.5):
    """
    Remove columns from the DataFrame that have more than the specified threshold of NaN values.

    Args:
    df: pandas DataFrame - The dataset.
    threshold: float - The proportion of NaN values required to drop the column (default is 0.5 for 50%).

    Returns:
    df: pandas DataFrame - The dataset with high NaN columns removed.
    """
    # Calculate the proportion of NaN values for each column
    nan_proportions = df.isnull().mean()

    print("NaN Proportions for each column:")
    print(nan_proportions)
    # Identify columns with NaN proportions greater than the threshold
    columns_to_drop = nan_proportions[nan_proportions > threshold].index

    # Drop the columns with high NaN values
    df = df.drop(columns=columns_to_drop)

    print(f"Columns dropped due to high NaN values: {list(columns_to_drop)}")

    return df

# Usage:


def process_fold(fold_num):
    print(f"\nProcessing fold{fold_num}...")

    # Load the train, test, and test_y datasets
    train_file_path = f'fold{fold_num}/train.csv'
    test_file_path = f'fold{fold_num}/test.csv'
    test_y_file_path = f'fold{fold_num}/test_y.csv'

    housing_data_train = pd.read_csv(train_file_path)
    housing_data_test = pd.read_csv(test_file_path)
    test_y_data = pd.read_csv(test_y_file_path)  # Load the actual Sale Price for the test set

    # Ensure that test_y_data and housing_data_test have matching PIDs
    merged_test_data = pd.merge(housing_data_test, test_y_data, on='PID', how='inner')

    # Identify categorical columns
    categorical_columns = ['MS_SubClass', 'MS_Zoning', 'Street', 'Alley', 'Lot_Shape', 'Land_Contour', 'Utilities',
                           'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1', 'Condition_2', 'Bldg_Type',
                           'House_Style', 'Overall_Qual', 'Overall_Cond', 'Roof_Style', 'Roof_Matl', 'Exterior_1st',
                           'Exterior_2nd', 'Mas_Vnr_Type', 'Exter_Qual', 'Exter_Cond', 'Foundation', 'Bsmt_Qual',
                           'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_Type_2', 'Heating', 'Heating_QC',
                           'Central_Air', 'Electrical', 'Kitchen_Qual', 'Functional', 'Fireplace_Qu', 'Garage_Type',
                           'Garage_Finish', 'Garage_Qual', 'Garage_Cond', 'Paved_Drive', 'Pool_QC', 'Fence',
                           'Sale_Type', 'Sale_Condition']

    # Combine rare categories in both train and test datasets
    housing_data_train = combine_rare_categories(housing_data_train, categorical_columns)
    housing_data_test = combine_rare_categories(housing_data_test, categorical_columns)

    # Remove "Misc_Feature" only from the train dataset
    housing_data_train = housing_data_train.drop(columns=['Misc_Feature'], errors='ignore')
    housing_data_test = housing_data_test.drop(columns=['Misc_Feature'], errors='ignore')
    # Perform dummy encoding on both train and test datasets
    housing_data_encoded_train = pd.get_dummies(housing_data_train, columns=categorical_columns, drop_first=True)
    housing_data_encoded_test = pd.get_dummies(merged_test_data, columns=categorical_columns, drop_first=True)

    # Align columns of train and test after dummy encoding:
    all_columns = list(set(housing_data_encoded_train.columns) | set(housing_data_encoded_test.columns))
    all_columns.remove('Sale_Price')  # Remove 'Sale_Price' if it's in all_columns

    housing_data_encoded_train = housing_data_encoded_train.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    housing_data_encoded_test = housing_data_encoded_test.reindex(columns=all_columns + ['Sale_Price'], fill_value=0)
    print("housing_data_encoded_train shape", housing_data_encoded_train.shape)
    print("housing_data_encoded_test shape", housing_data_encoded_test.shape)

    # Handle missing values by imputing them in train and test data
    imputer = SimpleImputer(strategy='median')
    numeric_columns = housing_data_encoded_train.select_dtypes(include=[np.number]).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col not in ['Sale_Price', 'PID']]
    housing_data_encoded_imputed_train = housing_data_encoded_train.copy()
    housing_data_encoded_imputed_train[numeric_columns] = imputer.fit_transform(housing_data_encoded_train[numeric_columns])

    # Coding starts for removing NaN features

    # Assuming housing_data_encoded_imputed_train is your DataFrame
    threshold = 0.5  # For example, set a threshold of 50% missing values
    housing_data_encoded_imputed_train_cleaned = remove_high_nan_columns(housing_data_encoded_imputed_train, threshold)

    print("Cleaned dataset shape:", housing_data_encoded_imputed_train_cleaned.shape)



    # Coding ends for removing NAn features


    housing_data_encoded_imputed_test = housing_data_encoded_test.copy()
    housing_data_encoded_imputed_test[numeric_columns] = imputer.transform(housing_data_encoded_test[numeric_columns])
    print("housing_data_encoded_imputed_train shape", housing_data_encoded_imputed_train.shape)
    print("housing_data_encoded_imputed_test shape", housing_data_encoded_imputed_test.shape)

    # Remove 'PID' from train and test datasets
    housing_data_encoded_imputed_train = housing_data_encoded_imputed_train.drop(columns=['PID'])
    if 'PID' in housing_data_encoded_imputed_test.columns:
        housing_data_encoded_imputed_test = housing_data_encoded_imputed_test.drop(columns=['PID'])

    # Separate predictors (X) and target variable (y) for training
    X_trn = housing_data_encoded_imputed_train.drop(columns='Sale_Price')
    y_trn = np.log(housing_data_encoded_imputed_train['Sale_Price'])

    # Scale the training data using preprocess_data function (returns DataFrame with column names)
    X_trn_scaled, scaler = preprocess_data(X_trn)

    # 1. Use Lasso regression for feature selection
    lasso_model = Lasso(alpha=0.01)  # Adjust the alpha value as necessary
    lasso_model.fit(X_trn_scaled, y_trn)
    selected_features = X_trn_scaled.columns[lasso_model.coef_ != 0]
    print(f"Selected features after Lasso: {len(selected_features)} features")

    # Scale the test data using the same scaler
    X_tst_scaled = pd.DataFrame(scaler.transform(housing_data_encoded_imputed_test.drop(columns=['Sale_Price'])), columns=housing_data_encoded_imputed_test.drop(columns=['Sale_Price']).columns)
    print("X_tst_scaled shape", X_tst_scaled.shape)

    # 2. Grid Search for Ridge Regression to find the best alpha
    alpha_values = np.exp(np.linspace(-5, 5, 100))
    param_grid = {'alpha': alpha_values}

    ridge_model = Ridge()
    grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_trn_scaled[selected_features], y_trn)
    best_alpha = grid_search.best_params_['alpha']
    print("Best alpha for Ridge regression:", best_alpha)

    # 3. Train the final Ridge model with the best alpha
    final_ridge_model = Ridge(alpha=best_alpha)
    final_ridge_model.fit(X_trn_scaled[selected_features], y_trn)

    # Predict on the training set
    y_trn_pred = final_ridge_model.predict(X_trn_scaled[selected_features])
    r2_train = r2_score(y_trn, y_trn_pred)
    adj_r2_train = adjusted_r_squared(r2_train, len(y_trn), X_trn_scaled[selected_features].shape[1])
    train_rmse = calc_rmse(y_trn, y_trn_pred)
    print(f"Training R-squared: {r2_train}")
    print(f"Adjusted R-squared (Training): {adj_r2_train}")
    print(f"Training RMSE: {train_rmse}")

    # Predict on the test set
    y_tst_pred_log = final_ridge_model.predict(X_tst_scaled[selected_features])

    # Reverse the log-transformation to get predictions in the original scale
    y_tst_pred = np.exp(y_tst_pred_log)

    # Merge the predictions with actual sale prices from test_y.csv using 'PID'
    predictions = pd.DataFrame({
        'PID': merged_test_data['PID'],
        'Predicted_Sale_Price': y_tst_pred
    })
    merged_test_data = pd.merge(test_y_data, predictions, on='PID', how='inner')

    # Calculate RMSE and R-squared for the test set using actual sale prices
    test_rmse = calc_rmse(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    r2_test = r2_score(np.log(merged_test_data['Sale_Price']), np.log(merged_test_data['Predicted_Sale_Price']))
    adj_r2_test = adjusted_r_squared(r2_test, len(merged_test_data), X_tst_scaled.shape[1])
    print(f"Test RMSE: {test_rmse}")
    print(f"Test R-squared: {r2_test}")
    print(f"Adjusted R-squared (Test): {adj_r2_test}")

    # Save the predictions to a CSV file
    predictions_file = f'fold{fold_num}_test_predictions.csv'
    predictions.to_csv(predictions_file, index=False)
    print(f"Predictions for fold{fold_num} saved to '{predictions_file}'.")

def main():
    # Loop over folds 1 to 10
    for fold_num in range(1, 11):
        process_fold(fold_num)

if __name__ == "__main__":
    main()



Processing fold1...
df             PID                          MS_SubClass                MS_Zoning  \
0     526350040  One_Story_1946_and_Newer_All_Styles                    Other   
1     526351010  One_Story_1946_and_Newer_All_Styles  Residential_Low_Density   
2     527105010             Two_Story_1946_and_Newer  Residential_Low_Density   
3     527105030             Two_Story_1946_and_Newer  Residential_Low_Density   
4     527127150         One_Story_PUD_1946_and_Newer  Residential_Low_Density   
...         ...                                  ...                      ...   
2046  923275080                  Split_or_Multilevel  Residential_Low_Density   
2047  923276100  One_Story_1946_and_Newer_All_Styles  Residential_Low_Density   
2048  923400125                                Other  Residential_Low_Density   
2049  924100070  One_Story_1946_and_Newer_All_Styles  Residential_Low_Density   
2050  924151050             Two_Story_1946_and_Newer  Residential_Low_Density   

   

ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'Shed'