In [41]:
!pip install scikit-learn==1.5.2
!pip install "vegafusion[embed]>=1.5.0"
!pip install "vl-convert-python>=1.6.0"



In [42]:
import pandas as pd
import altair as alt
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# Switch Altair to the VegaFusion data transformer before any chart is built.
alt.data_transformers.enable("vegafusion")

# Source CSV for the housing data set, read straight from GitHub.
HOUSING_CSV_URL = (
    'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv'
)

housing = pd.read_csv(HOUSING_CSV_URL)
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             20000 non-null  int64  
 1   date           20000 non-null  object 
 2   bedrooms       20000 non-null  int64  
 3   bathrooms      20000 non-null  float64
 4   sqft_living    20000 non-null  int64  
 5   sqft_lot       20000 non-null  int64  
 6   floors         20000 non-null  float64
 7   waterfront     20000 non-null  int64  
 8   view           20000 non-null  int64  
 9   condition      20000 non-null  int64  
 10  grade          20000 non-null  int64  
 11  sqft_above     20000 non-null  int64  
 12  sqft_basement  20000 non-null  int64  
 13  yr_built       20000 non-null  int64  
 14  yr_renovated   20000 non-null  int64  
 15  zipcode        20000 non-null  int64  
 16  lat            20000 non-null  float64
 17  long           20000 non-null  float64
 18  sqft_l

In [45]:
def cleanData(df, n_clusters=5, random_state=42):
  """Return a copy of ``df`` with engineered housing features added.

  Added columns:
    quality       -- ``grade`` binned into five labelled bands.
    year_sold     -- first four characters of ``date`` as an int.
    bed_bath      -- ``bedrooms + bathrooms``.
    quality_sqft  -- ``sqft_living / grade``.
    zip_cluster   -- KMeans cluster id of the row's zip code, clustered on
                     per-zip price and living-area statistics.

  Args:
    df: DataFrame with at least the columns ``grade``, ``date``, ``bedrooms``,
      ``bathrooms``, ``sqft_living``, ``zipcode`` and ``price``.
    n_clusters: Number of zip-code clusters to request (capped at the number
      of distinct zip codes present).
    random_state: Seed passed to KMeans so cluster ids are reproducible.

  Returns:
    A new DataFrame; the frame passed in is left unmodified.

  Note:
    ``zip_cluster`` is derived from ``price`` over every row passed in. If
    ``price`` is also the modelling target and this runs before the
    train/test split, held-out prices influence a training feature.
  """
  # Work on a copy so re-running the calling cell never stacks changes onto
  # the caller's frame.
  df = df.copy()

  # Bin house grade. pd.cut needs an ordered sequence of edges; a set literal
  # is unordered and is not a valid `bins` argument.
  bins = [0, 3, 6, 7, 10, 13]
  labels = ["fail", "below average", "average", "above average", "highest quality"]

  # Bin the frame that was passed in, not the notebook-level `housing` global.
  df['quality'] = pd.cut(df['grade'], bins=bins, labels=labels)

  # Get date of sale in year
  df['year_sold'] = df['date'].astype(str).str[:4].astype(int)

  # Combine bedrooms & bathrooms
  df['bed_bath'] = df['bedrooms'] + df['bathrooms']

  # Living area per grade point
  df['quality_sqft'] = df['sqft_living'] / df['grade']

  # Cluster zip codes by like stats. Named aggregation keeps the columns flat
  # while preserving the same seven features, in the same order, for scaling.
  zip_stats = df.groupby("zipcode").agg(
      price_mean=("price", "mean"),
      price_median=("price", "median"),
      price_std=("price", "std"),
      price_count=("price", "count"),
      sqft_living_mean=("sqft_living", "mean"),
      sqft_living_median=("sqft_living", "median"),
  )
  zip_stats["price_per_sqft"] = (
      zip_stats["price_median"] / zip_stats["sqft_living_median"]
  )

  # A zip code with a single sale has an undefined (NaN) std, which KMeans
  # rejects; treat it as zero spread.
  zip_stats["price_std"] = zip_stats["price_std"].fillna(0.0)

  X_zip = StandardScaler().fit_transform(zip_stats)

  # KMeans cannot form more clusters than there are zip codes.
  k = min(n_clusters, len(zip_stats))
  kmeans = KMeans(n_clusters=k, random_state=random_state)
  zip_stats["zip_cluster"] = kmeans.fit_predict(X_zip)

  df['zip_cluster'] = df['zipcode'].map(zip_stats['zip_cluster'])

  return df

In [46]:
# Rebind `housing` to the frame returned by cleanData, which carries the
# engineered columns (quality, year_sold, bed_bath, quality_sqft, zip_cluster)
# that the modelling and chart cells below read.
housing = cleanData(housing)
# Confirm the added columns, their dtypes and non-null counts.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   id             20000 non-null  int64   
 1   date           20000 non-null  object  
 2   bedrooms       20000 non-null  int64   
 3   bathrooms      20000 non-null  float64 
 4   sqft_living    20000 non-null  int64   
 5   sqft_lot       20000 non-null  int64   
 6   floors         20000 non-null  float64 
 7   waterfront     20000 non-null  int64   
 8   view           20000 non-null  int64   
 9   condition      20000 non-null  int64   
 10  grade          20000 non-null  int64   
 11  sqft_above     20000 non-null  int64   
 12  sqft_basement  20000 non-null  int64   
 13  yr_built       20000 non-null  int64   
 14  yr_renovated   20000 non-null  int64   
 15  zipcode        20000 non-null  int64   
 16  lat            20000 non-null  float64 
 17  long           20000 non-null  

In [None]:
# Columns fed to the model. The cluster column created by cleanData is named
# 'zip_cluster' (singular); 'zip_clusters' raised a KeyError on housing[features].
features = ['grade', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'view', 'condition', 'yr_renovated', 'zip_cluster', 'year_sold', 'quality_sqft', 'quality']
categorical_cols = ['quality']
numeric_cols = [c for c in features if c not in categorical_cols]

# Fail early with a readable message if a feature column is absent.
missing_cols = [c for c in features if c not in housing.columns]
assert not missing_cols, f"missing feature columns: {missing_cols}"

X = housing[features]
y = housing['price']

# Seed the split so the reported metrics are reproducible across reruns.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# One-hot encode the categorical quality band; numeric columns pass through unchanged.
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', 'passthrough', numeric_cols)
])

# Fit the encoder on the training rows only, then apply it to the test rows.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [None]:
# Early stopping needs a validation set to monitor. Carve one out of the
# training data so the test set stays untouched until the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_processed, y_train, test_size=0.2, random_state=42
)

# eval_metric and early_stopping_rounds are estimator parameters in current
# XGBoost releases; passing them to fit() is no longer accepted.
# With several metrics, the last one listed ('mae') drives early stopping.
model = XGBRegressor(
    tree_method='approx',
    learning_rate=.05,
    n_estimators=3000,
    max_depth=4,
    eval_metric=['rmse', 'mae'],
    early_stopping_rounds=100,
    random_state=42,
)
# Other knobs to try: subsample 0.7-0.9, colsample_bytree 0.6-0.9,
# min_child_weight 2-8, reg_alpha 0-5, reg_lambda 1-3
# verbose=False keeps the per-round evaluation log out of the cell output.
model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)
predictions = model.predict(X_test_processed)

result_rmse = root_mean_squared_error(y_test, predictions)
result_mae = mean_absolute_error(y_test, predictions)

print(result_rmse, result_mae)

In [None]:
# Score the fitted model on the held-out test rows and show the value.
test_score = model.score(X_test_processed, y_test)
test_score

In [None]:
# Box plot: distribution of price for each grade value.
price_by_grade = alt.Chart(housing).mark_boxplot().encode(
    alt.X('grade:O'),
    alt.Y('price:Q'),
)
price_by_grade

In [None]:
# Mean price per zip code, with zip codes ordered by descending mean price.
zip_axis = alt.X("zipcode:N", sort="-y", title="ZIP Code")
avg_price_axis = alt.Y("mean(price):Q", title="Average Price")
hover_fields = [
    alt.Tooltip("zipcode:N", title="ZIP Code "),
    alt.Tooltip("mean(price):Q", title="Avg Price"),
]

alt.Chart(housing).mark_line(point=True).encode(
    x=zip_axis,
    y=avg_price_axis,
    tooltip=hover_fields,
)

In [None]:
# Price against grade, coloured by the binned quality band.
# `quality` holds text labels, so the tooltip must treat it as ordinal like the
# colour encoding does; a quantitative (:Q) tooltip cannot format the labels.
# NOTE(review): price is not aggregated, so every row becomes a point on the
# line; use mean(price) or median(price) if a trend line is what is wanted.
alt.Chart(housing).mark_line(point=True).encode(
  x=alt.X("grade:Q", title="Grade"),
  y=alt.Y("price:Q", title="Price"),
  color=alt.Color("quality:O", title="Quality"),
  tooltip=[alt.Tooltip("quality:O", title="Quality"), alt.Tooltip("price:Q", title="Price")]
)

Ideas


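- Tune more hyperparameters in the `XGBRegressor(...)` call, e.g. `subsample`, `colsample_bytree`, `min_child_weight`, `reg_alpha` and `reg_lambda`.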