#### Unused Models

#### Ridge With Regularization Removed

Closed form solution - inverse fails

In [None]:
def ridge_unreg(X,y,lam, unreg_idx):
    eye_len = len(X.columns)
    X = X.to_numpy()
    y = y.to_numpy()
    unreg_eye = np.eye(eye_len)
    unreg_eye[unreg_idx][unreg_idx] = 0
    B = np.linalg.inv(X.T @ X + lam * (unreg_eye)) @ X.T @ y
    return B

#### Modified Ridge (Don't Regularize Land Value)

Start with the coefficients from Sci-Kit's Ridge model to speed up convergence

In [None]:
class LVTRidge():
    def __init__(self, unreg_idx, alpha=.01, lam=.1, T=100, theta_init=None):
        self.unreg_idx = unreg_idx
        self.alpha = alpha
        self.lam = lam
        self.T = T
        self.theta_init = theta_init

    def loss(self, X, y, theta):
        """
        Return the loss for given inputs X, targets y, parameters theta,
        and regularization coefficient lam.
        """
        unreg_theta = theta.copy()
        unreg_theta[self.unreg_idx] = 0
        yhat = X @ theta
        return (((y-yhat)**2).sum() + (self.lam*(unreg_theta)**2)).sum()/len(y)

    def gradient_descent(self, X, y):
        if self.theta_init is not None:
            theta = self.theta_init.copy()
        else:
            theta = np.zeros(X.shape[1] + 1)
        m = len(y)
        
        # keep track of losses
        losses = np.zeros(self.T)

        for t in range(T):
            theta = theta - self.alpha * (-2 / m * (X.T @ (y - X @ theta)) - 2 * self.lam * theta)
            losses[t] = self.loss(X, y, theta)
        
        return theta, losses

    def fit(self, X, y):
        self.theta, losses = self.gradient_descent(X,y)

    def predict(self, X):
        yhat = X @ self.theta 
        return yhat

In [None]:
alpha = .01
lam = .1
T = 100
theta_sk = model_ridge_sk.coef_ # use the coefficients from the Ridge model as a starting point
unreg_col = XX.columns.get_loc("kernel_land_price_per_ft_log")

model_ridge_unreg = LVTRidge(unreg_col, alpha, lam, T, theta_sk)

In [None]:
results = cross_validate(XX,y,model_ridge_unreg)

In [None]:
results

#### Tab Transformer

In [None]:
import torch
import torch.nn as nn
from tab_transformer_pytorch import TabTransformer

cont_mean_std = torch.randn(10, 2)

model = TabTransformer(
    categories = (10, 5, 6, 5, 8),      # tuple containing the number of unique values within each category
    num_continuous = 10,                # number of continuous values
    dim = 32,                           # dimension, paper set at 32
    dim_out = 1,                        # binary prediction, but could be anything
    depth = 6,                          # depth, paper recommended 6
    heads = 8,                          # heads, paper recommends 8
    attn_dropout = 0.1,                 # post-attention dropout
    ff_dropout = 0.1,                   # feed forward dropout
    mlp_hidden_mults = (4, 2),          # relative multiples of each hidden dimension of the last mlp to logits
    mlp_act = nn.ReLU(),                # activation for final mlp, defaults to relu, but could be anything else (selu etc)
    continuous_mean_std = cont_mean_std # (optional) - normalize the continuous values before layer norm
)

x_categ = torch.randint(0, 5, (1, 5))     # category values, from 0 - max number of categories, in the order as passed into the constructor above
x_cont = torch.randn(1, 10)               # assume continuous values are already normalized individually

pred = model(x_categ, x_cont) # (1, 1)

In [None]:
pred

## OLD CODE

In [1]:
def get_building_cat(df,cat):
    return df[df[cat]==1]

def sale_ward_xy(df,ward):
    df = df[df['geographic_ward']==ward]
    return sale_xy(df)

def sale_cat_xy(df,cat):
    df = df[df['category_code']==cat]
    return sale_xy(df)

#### Sci-Kit Ridge

In [None]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import MinMaxScaler

In [None]:
alpha = .1
model = Ridge(alpha=alpha)
scaler = MinMaxScaler()
feature_selector = SequentialFeatureSelector(model,scoring='neg_mean_absolute_percentage_error',direction='backward')

In [None]:
ward_results = {}
ward_scores = {}
ward_results_trim = {}
ward_scores_trim = {}

TRIM THE TOP 5% AND BOTTOM 1%?

In [None]:
# for running smaller subsets of categories and wards
wards = sorted(X.geographic_ward.unique())
wards = [1,2]

cats = sorted(X.category_code.unique())
cats = [1,4,6]

cat_ward_cols = ['geographic_ward','category_code']

for ward in wards:
    for cat in cats:
            XX = X[(X.geographic_ward == ward) & (X.category_code == cat)].drop(cat_ward_cols,axis=1)
            yy = y[y.index.isin(XX.index)]
            if len(XX) > 10:
                XXs = scaler.fit_transform(XX)
                
                feature_selector.fit(XXs,yy)
                features = XX.columns[feature_selector.get_support()]
                XXss = feature_selector.transform(XXs)
                
                model.fit(XXss,yy)
                yyhat = model.predict(XXss)
                scores = np.exp(yyhat)/np.exp(yy)
                
                mean = np.mean(scores)
                med = np.median(scores)
                cod = np.mean((scores-med)/med)
                results = {"mean": mean,
                           "median": med,
                           "cod": cod,
                           "features": features}

                ward_results[(ward,cat)] = results
                ward_scores[(ward,cat)] = scores
            
                # remove top and bottom 5% as outliers
                outliers = len(scores)//20
                scores_trim = np.sort(scores)[outliers:-outliers]
                
                mean = np.mean(scores_trim)
                med = np.median(scores_trim)
                cod = np.mean((scores_trim-med)/med)
                
                results_trim = {"mean": mean,
                           "median": med,
                           "cod": cod,
                           "features": features}
                
                ward_results_trim[(ward,cat)] = results_trim
                ward_scores_trim[(ward,cat)] = scores_trim
        

In [None]:
outliers = len(scores)//20

In [None]:
len(scores)

243

In [None]:
len(np.sort(scores)[outliers:-outliers])

219

In [None]:
ward_scores_trim

{(1, 1): 1.2239863836318903,
 (1, 4): 1.1148891237261247,
 (1, 6): 2.5611982007188763,
 (2, 1): 1.147762745627715,
 (2, 4): 1.3402442207150163,
 (2, 6): 2.3172168970622553}

In [None]:
ward_results

In [None]:
ward_results_trim

In [None]:
results_trim

##### XGBoost

In [None]:
import xgboost as xgb
model = xgb.XGBRegressor(learning_rate=.1,max_depth=8,n_estimators=180)

In [None]:
model.fit(X.drop[['geographic_ward','category_code'],axis=1],y)

SyntaxError: invalid syntax (3072173994.py, line 1)

In [None]:
yhat = model.predict(X_log)

In [None]:
scores = abs(np.exp(yhat) - np.exp(y))/np.exp(y)
scores

objectid
75913755    0.325172
76138460    0.546302
75760180    0.550956
76128800    0.217778
76023304    0.368448
              ...   
75783517    1.617735
75937986    0.358455
76043115    0.050435
75799362    0.334209
75795261    0.636706
Name: sale_price, Length: 297903, dtype: float64

In [None]:
MAPE = scores.sum()/len(scores)
MAPE

1.014869961282009

In [None]:
from sklearn.linear_model import Lasso
alpha = .01
model = Lasso(alpha=alpha,max_iter=5000)

In [None]:
model.fit(X_log,y)

Lasso(alpha=0.01, max_iter=5000)

In [None]:
yhat = model.predict(X_log)

In [None]:
scores = abs(np.exp(yhat) - np.exp(y))/np.exp(y)

In [None]:
MAPE = scores.sum()/len(scores)
MAPE

1.3775764546037044

#### Cross-Validation

In [None]:
from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

In [None]:
X,y = sale_xy(df)
X.head()

Unnamed: 0_level_0,geographic_ward,category_code,total_area,garage_spaces,off_street_open,garage_type_1,garage_type_2,garage_type_3,garage_type_A,garage_type_B,...,central_air_1,central_air_N,central_air_Y,view_type_A,view_type_B,view_type_C,view_type_D,view_type_E,view_type_H,view_type_I
objectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
75913755,65,1,1600.0,1.0,0.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
76138460,58,1,7456.0,1.0,0.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
75760180,8,1,848.0,1.0,0.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
76128800,26,1,644.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
76023304,42,1,586.8,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
X = X.drop(['geographic_ward','category_code', 'total_area'],axis=1)

In [None]:
X_s = scaler.fit_transform(X)

#### Lasso

In [None]:
from sklearn.model_selection import cross_val_score

ward_results = {}

for ward in df.geographic_ward.unique():
    if ward is not np.NaN:
        X,y = sale_ward_xy(df, ward)
        X = scaler.fit_transform(X)
        scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
        ward_results[ward] = scores


sorted_ward = [int(ward) for ward in ward_results.keys()]
sorted_ward.sort()

for ward in sorted_ward:
    scores = np.absolute(ward_results[ward])
    print(f"#### Geographic Ward {ward} ####")
    print(f'Mean MAE: {scores.mean():.1f} ({scores.std():.1f})')
    print(f'Max Error: {scores.max():.1f}')
    print(f'Median Error: {np.median(scores):.1f}')

##### Lasso on Building Categories

In [None]:
df.category_code.value_counts()

1     369532
6      35103
2      33920
3      11549
4      11145
5       3481
13        15
14        15
8          6
12         3
7          1
15         1
11         0
Name: category_code, dtype: int64

In [None]:
sparse_cats = [13,14,8,12,7,15,11]

In [None]:
cat_results = {}

for cat in df.category_code.unique():
    if cat is not np.NaN and cat not in sparse_cats:
        X,y = sale_cat_xy(df.drop('geographic_ward',axis=1), cat)
        X = scaler.fit_transform(X)
        scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
        cat_results[cat] = scores

In [None]:
sorted_cat = [int(cat) for cat in cat_results.keys()]
sorted_cat.sort()

for cat in sorted_cat:
    scores = np.absolute(cat_results[cat])
    print(f"#### Building Category {cat} ####")
    print(f'Mean MAE: {scores.mean():.1f} ({scores.std():.1f})')
    print(f'Max Error: {scores.max():.1f}')
    print(f'Median Error: {np.median(scores):.1f}')

#### Building Category 1 ####
Mean MAE: 750832.3 (63084.3)
Max Error: 941281.9
Median Error: 751788.0
#### Building Category 2 ####
Mean MAE: 406780.1 (33546.2)
Max Error: 491624.5
Median Error: 398721.5
#### Building Category 3 ####
Mean MAE: 169647.1 (18932.9)
Max Error: 225769.2
Median Error: 164257.4
#### Building Category 4 ####
Mean MAE: 3181874.2 (479806.0)
Max Error: 4026616.9
Median Error: 3082784.0
#### Building Category 5 ####
Mean MAE: 941291.0 (158330.4)
Max Error: 1371121.4
Median Error: 912111.0
#### Building Category 6 ####
Mean MAE: 317367.6 (52934.4)
Max Error: 430713.4
Median Error: 317795.0


##### Lasso on Wards and Categories

In [None]:
ward_cat_results = {}

cat_results = {}

for ward in df.geographic_ward.unique():
    if ward is not np.NaN:
        for cat in df.category_code.unique():
            if cat is not np.NaN and cat not in sparse_cats:
                X,y = sale_cat_xy(df[df['geographic_ward']==ward], cat)
                if X.shape[0] >= 10:
                    X = scaler.fit_transform(X)
                    scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
                    ward_cat_results[(ward,cat)] = scores

In [None]:
sorted_ward_cat = [_ for _ in ward_cat_results.keys()]
sorted_ward_cat.sort()

for ward_cat in sorted_ward_cat:
    scores = np.absolute(ward_cat_results[ward_cat])
    print(f"#### Ward {ward_cat[0]} Building Category {ward_cat[1]} ####")
    print(f'Mean MAE: {scores.mean():.1f} ({scores.std():.1f})')
    print(f'Max Error: {scores.max():.1f}')
    print(f'Median Error: {np.median(scores):.1f}')

#### Ridge

#### XGBoost

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score

### Run Models For Different Wards & Property Types