In [46]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GridSearchCV, train_test_split


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import os
import xgboost as xgb

# Function to calculate correlation after imputation
def evaluate_imputation(data, column, target, strategy, constant_value=None):
    """
    Evaluate the correlation between a column and the target column after imputation.

    Parameters:
        data (pd.DataFrame): The dataset containing the column and target.
        column (str): The name of the column to impute.
        target (str): The name of the target column.
        strategy (str): The imputation strategy ('mean', 'constant').
        constant_value: The value to use for constant imputation (only needed if strategy is 'constant').

    Returns:
        float: Correlation between the imputed column and the target column.
    """
    imputer = None
    if strategy == 'constant':
        if constant_value is None:
            raise ValueError("Constant value must be provided for 'constant' strategy.")
        imputer = SimpleImputer(strategy='constant', fill_value=constant_value)
    elif strategy == 'mean':
        imputer = SimpleImputer(strategy='mean')
    else:
        raise ValueError(f"Unsupported strategy: {strategy}")

    # Impute the missing values
    imputed_column = imputer.fit_transform(data[[column]])
    
    # Replace original column with imputed values for correlation calculation
    data_imputed = data.copy()
    data_imputed[column] = imputed_column
    
    # Calculate correlation
    correlation = data_imputed[column].corr(data_imputed[target])
    return correlation
    
def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

def clean_data(df):
    """
    Cleans the dataset by handling missing values, infinite values, and preparing it for analysis.
    
    Parameters:
    - df: pd.DataFrame - The dataset to clean.
    
    Returns:
    - cleaned_df: pd.DataFrame - The cleaned dataset.
    """
    print("\nCleaning Data...")
    # Replace infinities with NaN
    df.replace([float('inf'), -float('inf')], float('nan'), inplace=True)
    
    # Report missing values before cleaning
    missing_before = df.isnull().sum().sum()
    print(f"Missing values before cleaning: {missing_before}")
    
    # Fill NaN values with column means for numerical features
    df.fillna(df.mean(numeric_only=True), inplace=True)
    
    # Report missing values after cleaning
    missing_after = df.isnull().sum().sum()
    print(f"Missing values after cleaning: {missing_after}")
    
    return df
    
def analyze_data(df, target_column=None, correlation_columns=None, plot_columns=None, calculate_correlations=False, calculate_mi=False):
    """
    Analyze data by calculating statistics, correlations, unique values, empty values, and creating visualizations.
    
    Parameters:
    - df: pd.DataFrame - The dataset to analyze.
    - target_column: str - The target column for correlation analysis.
    - correlation_columns: list - List of columns to calculate correlations with the target column.
    - plot_columns: list - List of columns to visualize trends or distributions.
    
    Returns:
    - stats: pd.DataFrame - Summary statistics of the dataset.
    - correlations: pd.Series or None - Correlations with the target column, if specified.
    - unique_values: pd.Series - Count of unique values per column.
    - empty_values: pd.Series - Count of empty (NaN) values per column.
    """

    print("\nShape:")
    print(df.shape)
    
    print("\nInfo:")
    df.info()
    
    pd.set_option('display.max_columns', None)
    
    # 1. Display descriptive statistics
    print("\nDescriptive Statistics:")
    stats = df.describe(include='all')
    with pd.option_context('display.max_rows', None):  # Show all rows
        print(stats)
    
    # 2. Check unique values
    print("\nUnique Values per Column:")
    unique_values = df.nunique()
    with pd.option_context('display.max_rows', None):  # Show all rows
        print(unique_values)
    
    # 3. Check empty (NaN) values
    print("\nEmpty (NaN) Values per Column:")
    empty_values = df.isnull().sum()
    with pd.option_context('display.max_rows', None):  # Show all rows
        print(empty_values)
    
    # 4. Calculate correlations
    correlations = None
    if calculate_correlations and target_column:
        if correlation_columns is None:
            correlation_columns = [col for col in df.columns if col != target_column]
        print(f"\nCorrelations with '{target_column}':")
        correlations = df[correlation_columns].corrwith(df[target_column]).abs().sort_values(ascending=False)
        print(correlations)
    
    # 5. Visualize data
    if plot_columns:
        print("\nVisualizing data:")
        for column in plot_columns:
            if pd.api.types.is_numeric_dtype(df[column]):
                # Histogram for numerical data
                sns.histplot(df[column], kde=True)
                plt.title(f"Distribution of {column}")
                plt.show()
            else:
                # Bar plot for categorical data
                sns.countplot(data=df, x=column)
                plt.title(f"Countplot of {column}")
                plt.show()
    
    # 6. Calculate MI scores
    mi_scores = None
    if calculate_mi and target_column:
        print("\nMutual Information (MI) Scores:")
        df_encoded = pd.get_dummies(df, drop_first=True, dtype=int)
        X = df_encoded.drop(columns=[target_column])
        y = df_encoded[target_column]
        mi_scores_array = mutual_info_regression(X, y, random_state=42)
        mi_scores = pd.Series(mi_scores_array, index=X.columns).sort_values(ascending=False)
        
        with pd.option_context('display.max_rows', None):  # Show all rows
            print(mi_scores)
     
    return stats, correlations, unique_values, empty_values

Load and check the data 

In [47]:
# Any results you write to the current directory are saved as output.

train_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
train_data.head()
test_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
test_data.head()

df = pd.DataFrame(train_data)

stats, correlations, unique_values, empty_values = analyze_data(
    df,
    target_column="SalePrice",  # Example target column
    calculate_mi = False
)


Shape:
(1460, 81)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-nu

In [48]:
td = pd.concat([train_data, test_data], ignore_index=True, sort  = False)

columns_to_drop = ["MoSold", "YrSold", "MiscFeature", "3SsnPorch", "RoofMatl", "BsmtHalfBath", "LowQualFinSF", "Condition2", "LandSlope","Street", "Alley","BsmtFinSF2","GarageQual"]
td = td.drop(columns=columns_to_drop)

#OneHotEncoder
#td = pd.get_dummies(td, drop_first = True)

print("\nFINAL DATA\n")
print(td.head())
#td still have SalePrice

X_test = td[td.SalePrice.isnull()]


FINAL DATA

   Id  MSSubClass MSZoning  LotFrontage  LotArea LotShape LandContour  \
0   1          60       RL         65.0     8450      Reg         Lvl   
1   2          20       RL         80.0     9600      Reg         Lvl   
2   3          60       RL         68.0    11250      IR1         Lvl   
3   4          70       RL         60.0     9550      IR1         Lvl   
4   5          60       RL         84.0    14260      IR1         Lvl   

  Utilities LotConfig Neighborhood Condition1 BldgType HouseStyle  \
0    AllPub    Inside      CollgCr       Norm     1Fam     2Story   
1    AllPub       FR2      Veenker      Feedr     1Fam     1Story   
2    AllPub    Inside      CollgCr       Norm     1Fam     2Story   
3    AllPub    Corner      Crawfor       Norm     1Fam     2Story   
4    AllPub       FR2      NoRidge       Norm     1Fam     2Story   

   OverallQual  OverallCond  YearBuilt  YearRemodAdd RoofStyle Exterior1st  \
0            7            5       2003          2003   

1. Drop not important column
2. update columns
   1. Preprocessing for numerical data, if empty, then numberic column change to median value
   2. for string, if it's empty then set to most frequently use
   3. Apply one hot 

proprocessor manually

In [49]:
X = td[td.SalePrice.notnull()]
y = X[[ 'SalePrice']].copy()
y_aligned = y.iloc[1:]  # Removes the first row

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("Original Data")
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X.columns if
                    X[cname].nunique() < 10 and 
                    X[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X.columns if 
                X[cname].dtype in ['int64', 'float64']]

special_na_zero_cols = ["MasVnrArea"]

#remove special_na_zero_cols from numerical_cols
numerical_cols = [col for col in numerical_cols if col not in special_na_zero_cols]

# Preprocessing for numerical data, if empty, then numberic column change to median value, median should be better
numerical_transformer = SimpleImputer(strategy='median')
numeric_na_zero_transformer = SimpleImputer(strategy='constant', fill_value=0)  # For the special column

# Preprocessing for categorical data
#imputer change null to numeric
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('special', numeric_na_zero_transformer, special_na_zero_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Apply preprocessing manually
preprocessor.fit(X_train)  # Fit the preprocessor on training data
X_train_preprocessed = preprocessor.transform(X_train)  # Transform training data
X_val_preprocessed = preprocessor.transform(X_val)      # Transform validation data
X_test_preprocessed = preprocessor.transform(X_test)

print("After processing Data")
print(X_train_preprocessed.shape)
print(X_val_preprocessed.shape)
print(X_test_preprocessed.shape)
print(X_test_preprocessed)
print(y.shape)
print(y.head())

# Convert transformed data to DataFrame if needed (optional)
# This step is useful if you want feature names for debugging.
X_train_preprocessed = pd.DataFrame(X_train_preprocessed)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed)

X_train_preprocessed.to_csv('transformed_data_train.csv', index=False)

Original Data
(1168, 68)
(292, 68)
(1168, 1)
(292, 1)
After processing Data
(1168, 194)
(292, 194)
(1459, 194)
[[1.461e+03 2.000e+01 8.000e+01 ... 0.000e+00 1.000e+00 0.000e+00]
 [1.462e+03 2.000e+01 8.100e+01 ... 0.000e+00 1.000e+00 0.000e+00]
 [1.463e+03 6.000e+01 7.400e+01 ... 0.000e+00 1.000e+00 0.000e+00]
 ...
 [2.917e+03 2.000e+01 1.600e+02 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.918e+03 8.500e+01 6.200e+01 ... 0.000e+00 1.000e+00 0.000e+00]
 [2.919e+03 6.000e+01 7.400e+01 ... 0.000e+00 1.000e+00 0.000e+00]]
(1460, 1)
   SalePrice
0   208500.0
1   181500.0
2   223500.0
3   140000.0
4   250000.0


In [50]:
#, early_stopping_rounds=50
model = xgb.XGBRegressor(n_estimators=40000, learning_rate=0.1, max_depth=8)
# Define the simplified pipeline with only the model
my_pipeline = Pipeline(steps=[
    ('model', model)
])

for col in categorical_cols:
    X_train[col] = X_train[col].astype('category')
    X_val[col] = X_val[col].astype('category')
    

my_pipeline.fit(
    X_train_preprocessed, y_train,
    model__eval_set=[(X_val_preprocessed, y_val)],  # Pass preprocessed validation data
    model__early_stopping_rounds=50,               # Set early stopping rounds
    model__verbose=True                            # Enable verbose output for training progress
)

# Parameter grid for GridSearchCV
param_grid = {
    'model__n_estimators': [40000],
    'model__max_depth': [8],
    'model__learning_rate': [0.01,0.05,0.1,0.2],
}

# GridSearchCV
grid_search = GridSearchCV(my_pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)

# Fit the pipeline with GridSearchCV
#grid_search.fit(X_train, y_train)

# Best parameters and score
#print(f"Best parameters: {grid_search.best_params_}")
#print(f"Best score: {grid_search.best_score_}")
#Best parameters: {'model__max_depth': 8, 'model__n_estimators': 20000}
#Best score: -35096476.62818256



# Preprocessing of validation data, get predictions
predictions_xgb = my_pipeline.predict(X_test_preprocessed)

score = mean_absolute_error(y_aligned, predictions_xgb)
print('MAE:', score)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:MSZoning: category, LotShape: category, LandContour: category, Utilities: category, LotConfig: category, Neighborhood: object, Condition1: category, BldgType: category, HouseStyle: category, RoofStyle: category, Exterior1st: object, Exterior2nd: object, MasVnrType: category, ExterQual: category, ExterCond: category, Foundation: category, BsmtQual: category, BsmtCond: category, BsmtExposure: category, BsmtFinType1: category, BsmtFinType2: category, Heating: category, HeatingQC: category, CentralAir: category, Electrical: category, KitchenQual: category, Functional: category, FireplaceQu: category, GarageType: category, GarageFinish: category, GarageCond: category, PavedDrive: category, PoolQC: category, Fence: category, SaleType: category, SaleCondition: category

In [None]:
output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': predictions_xgb})
output.SalePrice = output.SalePrice.astype(int)
print(output.shape)

output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

print(output.head())