In [15]:
import pandas as pd
import numpy as np

try:
    import matplotlib.pyplot as plt
    print("Matplotlib successfully imported.")
except ImportError:
    print("Matplotlib not found. Please install it using: conda install matplotlib")

try:
    import seaborn as sns
    print("Seaborn successfully imported.")
except ImportError:
    print("Seaborn not found. Please install it using: conda install seaborn")

try:
    from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
    from sklearn.impute import KNNImputer, SimpleImputer
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.model_selection import train_test_split, cross_val_score
    print("Scikit-learn modules successfully imported.")
except ImportError:
    print("Scikit-learn not found or incomplete. Please install it using: conda install scikit-learn")

print("All necessary libraries imported successfully!")

Matplotlib successfully imported.
Seaborn successfully imported.
Scikit-learn modules successfully imported.
All necessary libraries imported successfully!


In [23]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

try:
    import lime
    import lime.lime_tabular
    print("LIME imported successfully")
    
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X_train.values, 
        feature_names=X.columns, 
        mode="regression"
    )
    exp = explainer.explain_instance(X_test.iloc[0], rf_model.predict)
    print("\nLIME explanation for the first instance:")
    for feature, value in exp.as_list():
        print(f"{feature}: {value:.4f}")
    print("LIME analysis completed successfully!")
except Exception as e:
    print(f"Error in LIME analysis: {str(e)}")

warnings.resetwarnings()

LIME imported successfully

LIME explanation for the first instance:
0.49 < E <= 0.76: -0.0617
0.51 < A <= 0.76: -0.0481
C <= 0.26: -0.0359
B <= 0.23: 0.0297
0.23 < D <= 0.48: -0.0062
LIME analysis completed successfully!


In [24]:
# Data cleaning and engineering

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Removed: import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score


class OutlierCapper(BaseEstimator, TransformerMixin):
    def __init__(self, lower_quantile=0.01, upper_quantile=0.99):
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile
        self.lower_bounds = None
        self.upper_bounds = None

    def fit(self, X, y=None):
        self.lower_bounds = np.quantile(X, self.lower_quantile, axis=0)
        self.upper_bounds = np.quantile(X, self.upper_quantile, axis=0)
        return self

    def transform(self, X):
        return np.clip(X, self.lower_bounds, self.upper_bounds)

class YearConverter(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.min_year = None
        self.max_year = None

    def fit(self, X, y=None):
        X_flat = X.ravel() if X.ndim > 1 else X
        self.min_year = np.min(X_flat)
        self.max_year = min(np.max(X_flat), pd.Timestamp.now().year)
        return self

    def transform(self, X):
        X_numeric = pd.to_numeric(X.ravel() if X.ndim > 1 else X, errors='coerce')
        X_clipped = np.clip(X_numeric, self.min_year, self.max_year)
        return X_clipped.reshape(X.shape)

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        # Create new features
        X['TotalSF'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF']
        
        current_year = pd.Timestamp.now().year
        X['HouseAge'] = current_year - X['YearBuilt']
        X['TimeSinceRemodel'] = current_year - X['YearRemodAdd']
        
        X['TotalBathrooms'] = X['FullBath'] + (0.5 * X['HalfBath']) + X['BsmtFullBath'] + (0.5 * X['BsmtHalfBath'])
        X['IsNewHouse'] = (X['YearBuilt'] == X['YrSold']).astype(int)
        X['HasPool'] = (X['PoolArea'] > 0).astype(int)
        X['TotalPorchSF'] = X['OpenPorchSF'] + X['EnclosedPorch'] + X['3SsnPorch'] + X['ScreenPorch']
        X['OverallHouseCondition'] = X['OverallQual'] * X['OverallCond']
        
        # Create interaction features
        X['TotalSF_OverallQual'] = X['TotalSF'] * X['OverallQual']
        X['GrLivArea_TotRmsAbvGrd'] = X['GrLivArea'] * X['TotRmsAbvGrd']
        X['HouseAge_OverallQual'] = X['HouseAge'] * X['OverallQual']
        X['GarageArea_GarageCars'] = X['GarageArea'] * X['GarageCars']
        X['YearBuilt_YearRemodAdd'] = X['YearBuilt'] * X['YearRemodAdd']
        X['TotalSF_HouseAge'] = X['TotalSF'] * X['HouseAge']
        X['1stFlrSF_2ndFlrSF'] = X['1stFlrSF'] * X['2ndFlrSF']
        X['TotalSF_OverallCond'] = X['TotalSF'] * X['OverallCond']
        
        # Interaction with categorical variable (requires encoding)
        X['GrLivArea_Neighborhood'] = X['GrLivArea'] * pd.factorize(X['Neighborhood'])[0]
        
        return X

def pandas_to_numpy(X):
    return X.to_numpy() if isinstance(X, pd.DataFrame) else X

def preprocess_and_engineer(X):
    # Apply FeatureEngineer first
    feature_engineer = FeatureEngineer()
    X_engineered = feature_engineer.fit_transform(X.copy())
    
    # Identify numeric, categorical, and year columns
    numeric_features = X_engineered.select_dtypes(include=['int64', 'float64']).columns.drop(['YearBuilt', 'YearRemodAdd', 'YrSold'])
    categorical_features = X_engineered.select_dtypes(include=['object']).columns
    year_features = ['YearBuilt', 'YearRemodAdd', 'YrSold']
    
    print("Number of features before preprocessing:")
    print(f"Numeric: {len(numeric_features)}")
    print(f"Categorical: {len(categorical_features)}")
    print(f"Year: {len(year_features)}")
    
    # Create preprocessing steps
    numeric_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('outlier_capper', OutlierCapper()),
        ('scaler', StandardScaler()),
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore', max_categories=10)),
    ])

    year_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('converter', YearConverter()),
    ])

    # Create and fit the preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
            ('year', year_transformer, year_features)
        ])
    
    X_preprocessed = preprocessor.fit_transform(X_engineered)
    
    # Generate feature names
    numeric_feature_names = list(numeric_features)
    categorical_feature_names = []
    onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
    
    print("\nCategorical feature encoding details:")
    for i, feature in enumerate(categorical_features):
        categories = onehot_encoder.categories_[i]
        n_categories = min(len(categories), 10)  # Account for max_categories=10
        n_encoded = n_categories - 1  # Subtract 1 due to drop='first'
        print(f"{feature}: {n_categories} categories, {n_encoded} encoded features")
        categorical_feature_names.extend([f"{feature}_{cat}" for cat in categories[1:n_categories]])
    
    year_feature_names = list(year_features)
    
    feature_names = (numeric_feature_names + 
                     categorical_feature_names + 
                     year_feature_names)
    
    print("\nNumber of features after preprocessing:")
    print(f"Numeric: {len(numeric_feature_names)}")
    print(f"Categorical (one-hot encoded): {len(categorical_feature_names)}")
    print(f"Year: {len(year_feature_names)}")
    
    print(f"\nTotal number of features: {len(feature_names)}")
    print(f"Number of columns in preprocessed data: {X_preprocessed.shape[1]}")
    
    # Ensure the number of feature names matches the number of columns in X_preprocessed
    if len(feature_names) != X_preprocessed.shape[1]:
        print(f"\nWarning: Number of feature names ({len(feature_names)}) "
              f"does not match number of columns in preprocessed data ({X_preprocessed.shape[1]})")
        print("Adjusting feature names...")
        if len(feature_names) > X_preprocessed.shape[1]:
            feature_names = feature_names[:X_preprocessed.shape[1]]
        else:
            feature_names += [f'Unknown_{i}' for i in range(X_preprocessed.shape[1] - len(feature_names))]
    
    # Store feature names as an attribute of the DataFrame
    df = pd.DataFrame(X_preprocessed, columns=feature_names, index=X.index)
    df.attrs['feature_names'] = feature_names
    
    return df

# Load the data
df = pd.read_csv('/Users/ttanaka/Desktop/Website/house-prices-advanced-regression-techniques/train.csv')

# Separate features and target
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

# Full pipeline
full_pipeline = Pipeline([
    ('preprocess_and_engineer', FunctionTransformer(preprocess_and_engineer, validate=False)),
    ('to_numpy', FunctionTransformer(pandas_to_numpy))
])

# Apply the pipeline
X_processed = full_pipeline.fit_transform(X)

# Validation
print("\nFinal validation:")
print("Shape after preprocessing:", X_processed.shape)
print("Missing values after preprocessing:", np.isnan(X_processed).sum())

# Split the processed data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Access feature names from the intermediate DataFrame
intermediate_df = full_pipeline.named_steps['preprocess_and_engineer'].transform(X)
feature_names = intermediate_df.attrs.get('feature_names', [])
print("Number of features:", len(feature_names))
print("First 10 feature names:", feature_names[:10])
print("Last 10 feature names:", feature_names[-10:])

Number of features before preprocessing:
Numeric: 51
Categorical: 43
Year: 3

Categorical feature encoding details:
MSZoning: 5 categories, 4 encoded features
Street: 2 categories, 1 encoded features
Alley: 3 categories, 2 encoded features
LotShape: 4 categories, 3 encoded features
LandContour: 4 categories, 3 encoded features
Utilities: 2 categories, 1 encoded features
LotConfig: 5 categories, 4 encoded features
LandSlope: 3 categories, 2 encoded features
Neighborhood: 10 categories, 9 encoded features
Condition1: 9 categories, 8 encoded features
Condition2: 8 categories, 7 encoded features
BldgType: 5 categories, 4 encoded features
HouseStyle: 8 categories, 7 encoded features
RoofStyle: 6 categories, 5 encoded features
RoofMatl: 8 categories, 7 encoded features
Exterior1st: 10 categories, 9 encoded features
Exterior2nd: 10 categories, 9 encoded features
MasVnrType: 4 categories, 3 encoded features
ExterQual: 4 categories, 3 encoded features
ExterCond: 5 categories, 4 encoded features

In [21]:
import os
os.environ['USE_NUMBA'] = '0'  # Disable Numba

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Create dummy data
X = pd.DataFrame(np.random.rand(100, 5), columns=['A', 'B', 'C', 'D', 'E'])
y = np.random.rand(100)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# SHAP
try:
    import shap
    print(f"SHAP version: {shap.__version__}")
    explainer = shap.TreeExplainer(rf_model)
    shap_values = explainer.shap_values(X_test)
    print("\nSHAP values for the first instance:")
    for feature, value in zip(X.columns, shap_values[0]):
        print(f"{feature}: {value:.4f}")
    print("SHAP analysis completed successfully!")
except Exception as e:
    print(f"Error in SHAP analysis: {str(e)}")

# LIME
try:
    import lime
    import lime.lime_tabular
    print(f"\nLIME version: {lime.__version__}")
    explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values, feature_names=X.columns, mode="regression")
    exp = explainer.explain_instance(X_test.iloc[0], rf_model.predict)
    print("\nLIME explanation for the first instance:")
    for feature, value in exp.as_list():
        print(f"{feature}: {value:.4f}")
    print("LIME analysis completed successfully!")
except Exception as e:
    print(f"Error in LIME analysis: {str(e)}")

# ELI5
try:
    import eli5
    from eli5.sklearn import PermutationImportance
    print(f"\nELI5 version: {eli5.__version__}")
    perm = PermutationImportance(rf_model, random_state=42).fit(X_test, y_test)
    print("\nELI5 feature importance:")
    print(eli5.format_as_text(eli5.explain_weights(perm, feature_names=X.columns.tolist())))
    print("ELI5 analysis completed successfully!")
except Exception as e:
    print(f"Error in ELI5 analysis: {str(e)}")

# InterpretML
try:
    from interpret import show
    from interpret.blackbox import ShapKernel
    print("\nInterpretML analysis:")
    explainer = ShapKernel(rf_model.predict, X_train)
    shap_values = explainer.explain_local(X_test[:5])
    print("InterpretML SHAP values for the first 5 instances:")
    print(shap_values.data()[:5])
    print("InterpretML analysis completed successfully!")
except Exception as e:
    print(f"Error in InterpretML analysis: {str(e)}")

print("\nInstalled packages:")
import pkg_resources
installed_packages = [d for d in pkg_resources.working_set]
for package in sorted(installed_packages, key=lambda x: x.key):
    print(f"{package.key}=={package.version}")

SHAP version: 0.45.1

SHAP values for the first instance:
A: -0.0485
B: 0.0105
C: -0.0019
D: -0.0155
E: -0.0706
SHAP analysis completed successfully!
Error in LIME analysis: module 'lime' has no attribute '__version__'
Error in ELI5 analysis: cannot import name 'if_delegate_has_method' from 'sklearn.utils.metaestimators' (/Users/ttanaka/miniconda3/envs/fresh_env/lib/python3.9/site-packages/sklearn/utils/metaestimators.py)

InterpretML analysis:


  0%|          | 0/5 [00:00<?, ?it/s]

InterpretML SHAP values for the first 5 instances:
Error in InterpretML analysis: 'NoneType' object is not subscriptable

Installed packages:
anyio==4.2.0
appnope==0.1.2
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.0.5
async-lru==2.0.4
attrs==23.1.0
autocommand==2.2.2
babel==2.11.0
backcall==0.2.0
backports.tarfile==1.2.0
beautifulsoup4==4.12.3
bleach==4.1.0
bottleneck==1.3.7
brotli==1.0.9
certifi==2024.7.4
cffi==1.16.0
charset-normalizer==3.3.2
cloudpickle==3.0.0
colorama==0.4.6
comm==0.2.1
contourpy==1.2.1
cycler==0.12.1
debugpy==1.6.7
decorator==5.1.1
defusedxml==0.7.1
eli5==0.13.0
exceptiongroup==1.2.0
executing==0.8.3
fastjsonschema==2.16.2
fonttools==4.53.1
graphviz==0.20.3
idna==3.7
imagecodecs==2023.1.23
imageio==2.34.2
importlib-metadata==7.0.1
importlib-resources==6.4.2
inflect==7.3.1
ipykernel==6.28.0
ipython==8.15.0
ipywidgets==8.1.2
jaraco.context==5.3.0
jaraco.functools==4.0.1
jaraco.text==3.12.1
jedi==0.19.1
jinja2==3.1.4
joblib==1.4.2
json5==0.9.6
jsons

