In [9]:
# Imports
import uuid
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
#  import seaborn as sns


from datetime import datetime

## sklearn 
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,mean_squared_log_error
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import set_config
from sklearn.model_selection import train_test_split

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, LassoLars, SGDRegressor, Ridge, LogisticRegression, BayesianRidge
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor



In [7]:
# Installs scikit-learn into the notebook environment (a bare `pip` line runs via IPython automagic).
# NOTE(review): the imports cell above needs scikit-learn, so on a fresh kernel run this cell first.
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
Collecting joblib>=1.1.1 (from scikit-learn)
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.2 threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
## Arize API and Space Keys
SPACE_KEY  = ""               # CTB / Kaggle: Wide LLMs
API_KEY    = ""

# INSTALL 
!pip install -q arize[AutoEmbeddings]

from arize.pandas.logger import Client
from arize.utils.types import Environments, ModelTypes, EmbeddingColumnNames, Schema

# Arize Client
arize_client = Client(space_key=SPACE_KEY, api_key=API_KEY)


model_type = ModelTypes.REGRESSION
if SPACE_KEY == "SPACE_KEY" or API_KEY == "API_KEY":
    raise ValueError("❌ NEED TO CHANGE SPACE AND/OR API_KEY")
else:
    print("✅ Import and Setup Arize Client Done! Now we can start using Arize!")



# Create generator for embedding vector
# `generator` creates a vector from a prompt in the LLMs response surface.
from arize.pandas.embeddings.tabular_generators import EmbeddingGeneratorForTabularFeatures
import arize.pandas.embeddings.base_generators

# EmbeddingGeneratorForTabularFeatures.list_pretrained_models()

generator = EmbeddingGeneratorForTabularFeatures(
    model_name="distilbert-base-uncased",
    tokenizer_max_length=512,
    #, dropout=0                                                   # Remove Drop-out
)

✅ Import and Setup Arize Client Done! Now we can start using Arize!
[38;21m  arize.utils.logging | INFO | Downloading pre-trained model 'distilbert-base-uncased'[0m


Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

[38;21m  arize.utils.logging | INFO | Downloading tokenizer for 'distilbert-base-uncased'[0m


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
# Important Columns
y_col = 'SalePrice'  # Target column name, as it appears in the data
id_col = 'Id'        # Row-identifier column name, as it appears in the data

# NOTE(review): work_dir is not referenced by any visible cell; the read_csv
# calls below use bare file names. Confirm whether the CSVs should be read from here.
work_dir = 'house-prices-advanced-regression-techniques/'

In [12]:
# Read Data
# Training data: the Id column is dropped as soon as the file is loaded.
train = pd.read_csv("train.csv").drop(columns='Id')

# test = Submission data (e.g. Id, without SalePrice)
test = pd.read_csv("test.csv")


In [13]:
def camel_case_split(s):
  '''Insert spaces into a CamelCase string at every upper/lower case boundary.

  Runs of capitals stay together ("MSSubClass" -> "MS Sub Class"). Characters
  that are not upper case (digits, spaces, lower case) all count as "lower".
  '''
  upper_flags = [ch.isupper() for ch in s]

  # Collect the cut positions, always starting at 0 and ending at len(s)
  cuts = [0]
  for pos in range(len(s) - 1):
    cur, nxt = upper_flags[pos], upper_flags[pos + 1]
    if cur == nxt:
      continue  # no change of case between pos and pos + 1
    # "Ul": cut before the capital; "lU": cut before the next capital
    cuts.append(pos if cur else pos + 1)
  cuts.append(len(s))

  # A capital followed by lower case is recorded twice ("lUl"), which yields
  # empty slices; those are skipped here.
  pieces = []
  for start, stop in zip(cuts, cuts[1:]):
    if start < stop:
      pieces.append(s[start:stop])
  return " ".join(pieces)

In [14]:
# Applies camel case split to every column name, e.g. HousePrices -> "House Prices".
# The function itself is passed as the mapper, so all columns are renamed in
# one pass from their ORIGINAL names (no per-column DataFrame rebuild, and a
# freshly renamed column can never be renamed a second time).
# Note: re-running this cell splits the already-spaced names again (extra
# spaces appear), so run it once per fresh load of the data.
train = train.rename(columns=camel_case_split)
test  = test.rename(columns=camel_case_split)

# Also our identifiers, so they keep matching the renamed columns
y_col = camel_case_split(y_col)
id_col = camel_case_split(id_col)

In [15]:
# Define train_y, train_X, test_X

# TRAIN: features are every column except the target
train_X = train.drop(columns=y_col)
train_y = train[y_col]                       # This will be split later

# TEST: same features, identifier column removed
test_X = test.drop(columns=id_col)           # Used for scoring and submission



In [16]:
# Save names of columns: a snapshot of the feature names taken before any
# embedding columns are added to train_X
tabular_columns = train_X.columns.tolist()

In [17]:
# A whole row does not fit in the *context window*, so the columns are split
# into 4 groups
split_prompt_n = 4  # Number of sets of columns

the_cols    = list(train_X.columns)              # list of column names
prompt_ln   = len(the_cols) // split_prompt_n    # Number of cols per split
cols_per    = {}   # Dict key: str "0".."n-1"  value: list of cols in group

for i in range(split_prompt_n):
  start = prompt_ln * i
  # The last group also takes whatever is left over after the even split
  stop = len(the_cols) if i == split_prompt_n - 1 else start + prompt_ln
  cols_per[str(i)] = the_cols[start:stop]


In [18]:
# Avoid potential index errors: give both frames a fresh 0..n-1 index
# See note at: https://docs.arize.com/arize/embeddings/let-arize-generate-your-embeddings
train_X, test_X = (frame.reset_index(drop=True) for frame in (train_X, test_X))

In [19]:
# Generate one set of embeddings per column group. The prompt window is 512,
# but there are a decent number of columns, so the work is split over 4 groups.

tabular_vector_columns = []  # names of the tabular-vector columns
prompt_columns         = []  # names of the prompt columns

# Iterate over each column_set
for i in range(split_prompt_n):
  tab_vec_col_name_i = f'tabular_vector_{i}'
  prompt_col_name_i = f'prompts_{i}'
  tabular_vector_columns.append(tab_vec_col_name_i)
  prompt_columns.append(prompt_col_name_i)

  # Same call for train_X first, then test_X; each frame receives the
  # embedding vectors and the prompt text as two new columns.
  for frame in (train_X, test_X):
    vectors, prompts = generator.generate_embeddings(
        frame,
        selected_columns  = cols_per[str(i)],
        return_prompt_col = True
    )
    frame[tab_vec_col_name_i] = vectors
    frame[prompt_col_name_i] = prompts


[38;21m  arize.utils.logging | INFO | Generating embedding vectors[0m


Map:   0%|          | 0/1460 [00:00<?, ? examples/s]

[38;21m  arize.utils.logging | INFO | Generating embedding vectors[0m


Map:   0%|          | 0/1459 [00:00<?, ? examples/s]

[38;21m  arize.utils.logging | INFO | Generating embedding vectors[0m


Map:   0%|          | 0/1460 [00:00<?, ? examples/s]

[38;21m  arize.utils.logging | INFO | Generating embedding vectors[0m


Map:   0%|          | 0/1459 [00:00<?, ? examples/s]

[38;21m  arize.utils.logging | INFO | Generating embedding vectors[0m


Map:   0%|          | 0/1460 [00:00<?, ? examples/s]

[38;21m  arize.utils.logging | INFO | Generating embedding vectors[0m


Map:   0%|          | 0/1459 [00:00<?, ? examples/s]

[38;21m  arize.utils.logging | INFO | Generating embedding vectors[0m


Map:   0%|          | 0/1460 [00:00<?, ? examples/s]

[38;21m  arize.utils.logging | INFO | Generating embedding vectors[0m


Map:   0%|          | 0/1459 [00:00<?, ? examples/s]

In [20]:
def explode( col, prefix ):
  '''Expand a Series of equal-length vectors into one DataFrame column per dimension.

  Parameters
  ----------
  col : pandas.Series
      Each element is a sequence (list, array, ...) of the same length.
  prefix : str
      Column-name prefix; dimension ``i`` becomes column ``prefix + str(i)``.

  Returns
  -------
  pandas.DataFrame
      One row per element of ``col``, carrying the same index as ``col`` so the
      result aligns with the frame the Series came from. An empty ``col``
      gives an empty frame with no columns.
  '''
  if len(col) == 0:
    # There is no first row to read the vector length from
    return pd.DataFrame(index=col.index)

  # Positional access: col[0] is a label lookup and fails when the index does
  # not contain the label 0 (e.g. after a shuffle or split).
  n_cols = len( col.iloc[0] )
  col_names = [ prefix + str(i) for i in range(n_cols) ]

  return pd.DataFrame( col.to_list(), columns=col_names, index=col.index )


In [23]:
# Creates a column per embedding dimension for modeling
# The template is vec_N_n, where N is the tabular vector, n is each dimension
# of the tabular vector.

# Adds columns: vec_N_n to DataFrames
# All columns of one vector are attached with a single concat rather than being
# inserted one by one. Any vec_N_n columns left over from an earlier run are
# dropped first, so re-running the cell replaces them instead of duplicating.

for i in range(split_prompt_n):
  tab_vec_name = 'tabular_vector_' + str(i)
  prefix = "vec_" + str(i) + "_"

  # train_X
  exploded = explode( train_X[ tab_vec_name], prefix )
  exploded.index = train_X.index   # one exploded row per source row, in order
  train_X = pd.concat(
      [train_X.drop(columns=exploded.columns, errors='ignore'), exploded],
      axis=1
  )

  # test_X
  exploded = explode( test_X[ tab_vec_name], prefix )
  exploded.index = test_X.index    # one exploded row per source row, in order
  test_X = pd.concat(
      [test_X.drop(columns=exploded.columns, errors='ignore'), exploded],
      axis=1
  )


In [24]:
# Warning: this hold-out split exists only for internal evaluation.
#   Do not use it for the final model submission; train on the full data.

(train_X_sp, eval_X_sp,
 train_y_sp, eval_y_sp) = train_test_split(
    train_X,
    train_y,
    test_size=0.2,     # 20% of the rows are held out for evaluation
    random_state=0,    # fixed seed so the split is repeatable
)

In [26]:
# Collect the names of the exploded-out tabular-vector columns (of the form
# vec_N_n); every other column is left out of the model features.

import re

def get_matching_cols(df, regex):
  '''Return, as a list, the column names of `df` that `regex` matches.

  Matching uses `re.match`, so the pattern is anchored at the start of the
  name only; trailing characters after the match are allowed.
  '''
  pattern = re.compile(regex)
  return [name for name in df.columns if pattern.match(name)]
  
def get_embedding_cols(df):
  '''Return the column names of `df` that start with the vec_N_n pattern
  (N and n being one or more digits).'''
  # Raw string: "\d" inside a plain literal is an invalid escape sequence,
  # which Python warns about. The pattern itself is unchanged.
  return( get_matching_cols(df, r"vec_\d+_\d+") )


# Names of the exploded embedding columns found in train_X; used below as the
# model's feature list (X_cols).
embed_cols = get_embedding_cols( train_X )


In [27]:
# Which data is being used to fit?  train_X or train_X_sp?

final = True  # If final is `True`, the model is fit on ALL training rows

X_cols = embed_cols

# Full data for the final model, the training part of the split otherwise
fit_X = (train_X if final else train_X_sp)[ X_cols ]
fit_y = train_y if final else train_y_sp

# NOTE: the evaluation set is always the hold-out part of the split. With
# final = True those rows are also part of the fit data, so scores computed
# on eval_X / eval_y are not a genuine hold-out estimate.
eval_X = eval_X_sp[ X_cols ]
eval_y = eval_y_sp

In [28]:
# Define Evaluation metrics 
def evaluate(y_true, y_pred):
    '''Compute regression metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    tuple
        ``(mae, mse, rsquare, rmse, rmsle)``. ``rmsle`` is ``np.nan`` when the
        log error cannot be computed (scikit-learn raises ``ValueError``, e.g.
        for negative values).

    Notes
    -----
    RMSE and RMSLE are the square roots of the corresponding mean errors. For
    single-output targets this equals the ``squared=False`` form, which newer
    scikit-learn releases no longer accept.
    '''
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rsquare = r2_score(y_true, y_pred)
    rmse = np.sqrt(mse)
    try:
        rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
    except ValueError:
        # Only the "metric is undefined for these values" case is tolerated;
        # any other failure should surface instead of silently becoming NaN.
        rmsle = np.nan
    return mae, mse, rsquare, rmse, rmsle

def evaluate_show(y_true, y_pred):
  '''Print a one-row table of the evaluation metrics, rounded to 5 decimals.

  The columns are MAE, MSE, R2 Square, RMSE and RMSLE, as computed by
  `evaluate`. Returns an empty tuple, as this function always has.
  '''
  score = evaluate( y_true, y_pred )

  # Build the one-row frame directly with its column labels. The previous
  # Styler call was removed: its result was never used, so it had no effect
  # on what is printed.
  score_df = pd.DataFrame(
      [score],
      columns=['MAE','MSE','R2 Square','RMSE','RMSLE']
  ).round(5)

  print(score_df)
  return ()
 

In [32]:
# XGBoost (Est. 15m)
import xgboost as xgb

# Many shallow-ish trees with a small learning rate; rows and columns are
# subsampled for every tree and both L1 and L2 regularisation are applied.
model_xgb = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=10000,
    learning_rate=0.01,
    max_depth=6,
    min_child_weight=1.5,
    subsample=0.6,
    colsample_bytree=0.2,
    reg_alpha=0.75,
    reg_lambda=0.45,
)

In [None]:
# Train Model
# Fits the XGBoost regressor on the selected features/target; `model` holds
# whatever fit() returns and is the object saved to disk afterwards.
model = model_xgb.fit(fit_X,fit_y)   # est. 15m

In [None]:
import joblib
# Persist the fitted model to 'xgboost_model.joblib' (relative path, so it is
# written to the current working directory).
joblib.dump(model, 'xgboost_model.joblib')
