In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [2]:
data = pd.read_csv('./AB_NYC_2019.csv')

In [3]:
excluding_list = [
    'price', 'id', 'latitude', 
    'longitude', 'host_id', 
    'last_review', 'name',
    'host_name'
    ]

categorical = [
    'neighbourhood_group',
    'neighbourhood',
    'room_type'
    ]

continuous = [
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365'
    ] 


low_card_categorical = [
    'neighbourhood_group',
    'room_type'
]

high_card_categorical = [
    'neighbourhood',
]

In [4]:
target_mean = (data["price"] > data["price"].mean()).astype(int)
target_median = (data["price"] > data["price"].median()).astype(int)
target_multiclass = pd.qcut(data["price"], q=5, labels=False)
target_regression = data["price"]

In [5]:
categorical_onehot = OneHotEncoder(handle_unknown='ignore')
categorical_ordinal = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
numeric_passthrough = SimpleImputer(strategy='constant', fill_value=0)
column_transform = ColumnTransformer(
    [
        ('low_card_categories', categorical_onehot, low_card_categorical),
        ('hugh_card_categories', categorical_ordinal, high_card_categorical),
        ('numeric', numeric_passthrough, continuous)
    ],
    remainder='drop', # Dropping any remaining unprocessed columns
    verbose_feature_names_out=False,
    sparse_threshold=0.0
)

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import KFold, cross_validate

In [11]:
accuracy = make_scorer(accuracy_score)
cv = KFold(n_splits=5, shuffle=True, random_state=0)

column_transform = ColumnTransformer(
    [
        ('low_card_categories', categorical_onehot, low_card_categorical),
        ('numeric', numeric_passthrough, continuous)
    ],
    remainder='drop', # Dropping any remaining unprocessed columns
    verbose_feature_names_out=False,
    sparse_threshold=0.0
)

model = DecisionTreeClassifier(random_state=0)
model_pipeline = Pipeline(
    steps=[
        ('processing', column_transform),
        ('model', model)
    ]
)

cv_scores = cross_validate(
    estimator=model_pipeline,
    X=data,
    y=target_median,
    scoring=accuracy,
    cv=cv,
    return_train_score=True,
    return_estimator=True,
)

train_cv = np.mean(cv_scores['train_score'])
mean_cv = np.mean(cv_scores['test_score'])
std_cv = np.std(cv_scores['test_score'])    
fit_time = np.mean(cv_scores['fit_time'])
score_time = np.mean(cv_scores['score_time'])   
print(f"Train: {train_cv:.3f} |",
      f"Test: {mean_cv:0.3f} ({std_cv:0.3f}) |", 
      f"fit: {fit_time:0.2f}", f"secs pred: {score_time:0.2f} secs")

Train: 0.965 | Test: 0.761 (0.005) | fit: 0.14 secs pred: 0.01 secs


### Bagging and Sampling

In [12]:
from sklearn.ensemble import BaggingClassifier

In [13]:
accuracy = make_scorer(accuracy_score)
cv = KFold(n_splits=5, shuffle=True, random_state=0)

column_transform = ColumnTransformer(
    [
        ('low_card_categories', categorical_onehot, low_card_categorical),
        ('numeric', numeric_passthrough, continuous)
    ],
    remainder='drop', # Dropping any remaining unprocessed columns
    verbose_feature_names_out=False,
    sparse_threshold=0.0
)

model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=300,
    bootstrap=True,
    max_samples=1.0,
    max_features=1.0,
    random_state=0
)
model_pipeline = Pipeline(
    steps=[
        ('processing', column_transform),
        ('model', model)
    ]
)

cv_scores = cross_validate(
    estimator=model_pipeline,
    X=data,
    y=target_median,
    scoring=accuracy,
    cv=cv,
    return_train_score=True,
    return_estimator=True,
)

train_cv = np.mean(cv_scores['train_score'])
mean_cv = np.mean(cv_scores['test_score'])
std_cv = np.std(cv_scores['test_score'])    
fit_time = np.mean(cv_scores['fit_time'])
score_time = np.mean(cv_scores['score_time'])   
print(f"Train: {train_cv:.3f} |",
      f"Test: {mean_cv:0.3f} ({std_cv:0.3f}) |", 
      f"fit: {fit_time:0.2f}", f"secs pred: {score_time:0.2f} secs")

Train: 0.965 | Test: 0.809 (0.004) | fit: 26.01 secs pred: 0.56 secs


### Random Forest Classifier

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
accuracy = make_scorer(accuracy_score)
cv = KFold(n_splits=5, shuffle=True, random_state=0)

column_transform = ColumnTransformer(
    [
        ('low_card_categories', categorical_onehot, low_card_categorical),
        ('numeric', numeric_passthrough, continuous)
    ],
    remainder='drop', # Dropping any remaining unprocessed columns
    verbose_feature_names_out=False,
    sparse_threshold=0.0
)

model = RandomForestClassifier(
    n_estimators=300,
    min_samples_leaf=3,
    random_state=0
)
model_pipeline = Pipeline(
    steps=[
        ('processing', column_transform),
        ('model', model)
    ]
)

cv_scores = cross_validate(
    estimator=model_pipeline,
    X=data,
    y=target_median,
    scoring=accuracy,
    cv=cv,
    return_train_score=True,
    return_estimator=True,
)

train_cv = np.mean(cv_scores['train_score'])
mean_cv = np.mean(cv_scores['test_score'])
std_cv = np.std(cv_scores['test_score'])    
fit_time = np.mean(cv_scores['fit_time'])
score_time = np.mean(cv_scores['score_time'])   
print(f"Train: {train_cv:.3f} |",
      f"Test: {mean_cv:0.3f} ({std_cv:0.3f}) |", 
      f"fit: {fit_time:0.2f}", f"secs pred: {score_time:0.2f} secs")

Train: 0.883 | Test: 0.826 (0.004) | fit: 10.17 secs pred: 0.48 secs


### Extra Tree Classifier

In [24]:
from sklearn.ensemble import ExtraTreesClassifier

In [27]:
accuracy = make_scorer(accuracy_score)
cv = KFold(n_splits=5, shuffle=True, random_state=0)

column_transform = ColumnTransformer(
    [
        ('low_card_categories', categorical_onehot, low_card_categorical),
        ('numeric', numeric_passthrough, continuous)
    ],
    remainder='drop', # Dropping any remaining unprocessed columns
    verbose_feature_names_out=False,
    sparse_threshold=0.0
)

model = ExtraTreesClassifier(
    n_estimators=300,
    min_samples_leaf=3,
    random_state=0
)
model_pipeline = Pipeline(
    steps=[
        ('processing', column_transform),
        ('model', model)
    ]
)

cv_scores = cross_validate(
    estimator=model_pipeline,
    X=data,
    y=target_median,
    scoring=accuracy,
    cv=cv,
    return_train_score=True,
    return_estimator=True,
)

train_cv = np.mean(cv_scores['train_score'])
mean_cv = np.mean(cv_scores['test_score'])
std_cv = np.std(cv_scores['test_score'])    
fit_time = np.mean(cv_scores['fit_time'])
score_time = np.mean(cv_scores['score_time'])   
print(f"Train: {train_cv:.3f} |",
      f"Test: {mean_cv:0.3f} ({std_cv:0.3f}) |", 
      f"fit: {fit_time:0.2f}", f"secs pred: {score_time:0.2f} secs")

Train: 0.828 | Test: 0.823 (0.004) | fit: 2.84 secs pred: 0.23 secs


### Custom Gradient Boosting

In [6]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np

In [7]:
class GradientBoosting():
    def __init__(self, n_estimators=100, learning_rate=0.1, **params):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.params = params
        self.trees = list()
    
    def sigmoid(self, x):
        x = np.clip(x, -100, 100)
        return 1 / (1 + np.exp(-x))
    
    def logit(self, x, eps=1e-6):
        xp = np.clip(x, eps, 1-eps)
        return np.log(xp / (1 - xp))
    
    def gradient(self, y, y_pred):
        gradient = y_pred - y
        return gradient
    
    def fit(self, X, y):
        self.init = self.logit(np.mean(y))
        y_pred = self.init * np.ones((X.shape[0], ))
        for k in range(self.n_estimators):
            gradient = self.gradient(self.logit(y), y_pred)
            tree = DecisionTreeRegressor(**self.params)
            tree.fit(X, -gradient)
            self.trees.append(tree)
            y_pred += self.learning_rate * tree.predict(X)
    
    def predict_proba(self, X):
        y_pred = self.init * np.ones((X.shape[0], ))
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)
        return self.sigmoid(y_pred)
    
    def predict(self, X, threshold=0.5):
        proba = self.predict_proba(X)
        return np.where(proba >= threshold, 1, 0)

In [10]:
categorical_onehot = OneHotEncoder(handle_unknown='ignore')
categorical_ordinal = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
numeric_passthrough = SimpleImputer(strategy='constant', fill_value=0)

column_transform = ColumnTransformer(
    [
        ('low_card_categories', categorical_onehot, low_card_categorical),
        ('numeric', numeric_passthrough, continuous)
    ],
    remainder='drop', # Dropping any remaining unprocessed columns
    verbose_feature_names_out=False,
    sparse_threshold=0.0
)

In [12]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(range(len(data)), test_size=0.2, random_state=0)

cls = GradientBoosting(n_estimators=300, learning_rate=0.1, min_samples_leaf=3, max_depth=4)
X = column_transform.fit_transform(data.iloc[train])
y = target_median[train]

cls.fit(X, y)
Xt = column_transform.transform(data.iloc[test])
yt = target_median[test]
ypred = cls.predict(Xt)
score = accuracy_score(yt, ypred)
print(f"Gradient Boosting Classifier Accuracy: {score:.5f}")

Gradient Boosting Classifier Accuracy: 0.82493


### Boosting in SkLearn

In [6]:
categorical_onehot = OneHotEncoder(handle_unknown='ignore')
categorical_ordinal = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
numeric_passthrough = SimpleImputer(strategy='constant', fill_value=0)

column_transform = ColumnTransformer(
    [
        ('low_card_categories', categorical_onehot, low_card_categorical),
        ('numeric', numeric_passthrough, continuous)
    ],
    remainder='drop', # Dropping any remaining unprocessed columns
    verbose_feature_names_out=False,
    sparse_threshold=0.0
)

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

accuracy = make_scorer(accuracy_score)
cv = KFold(n_splits=5, shuffle=True, random_state=0)

model = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=4,
    min_samples_leaf=3,
    random_state=0
)

model_pipeline = Pipeline(
    steps=[
        ('processing', column_transform),
        ('model', model)
    ]
)

cv_scores = cross_validate(
    estimator=model_pipeline,
    X=data,
    y=target_median,
    scoring=accuracy,
    cv=cv,
    return_train_score=True,
    return_estimator=True,
)

train_cv = np.mean(cv_scores['train_score'])
mean_cv = np.mean(cv_scores['test_score'])
std_cv = np.std(cv_scores['test_score'])
fit_time = np.mean(cv_scores['fit_time'])
score_time = np.mean(cv_scores['score_time'])
print(f"Train: {train_cv:.3f} |",
      f"Test: {mean_cv:0.3f} ({std_cv:0.3f}) |", 
      f"fit: {fit_time:0.2f}", f"secs pred: {score_time:0.2f} secs")

Train: 0.839 | Test: 0.826 (0.004) | fit: 8.38 secs pred: 0.04 secs


In [7]:
from sklearn.ensemble import GradientBoostingClassifier

accuracy = make_scorer(accuracy_score)
cv = KFold(n_splits=5, shuffle=True, random_state=0)

model = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    validation_fraction=0.2,
    n_iter_no_change=10,
    max_depth=4,
    min_samples_leaf=3,
    random_state=0
)

model_pipeline = Pipeline(
    steps=[
        ('processing', column_transform),
        ('model', model)
    ]
)

cv_scores = cross_validate(
    estimator=model_pipeline,
    X=data,
    y=target_median,
    scoring=accuracy,
    cv=cv,
    return_train_score=True,
    return_estimator=True,
)

train_cv = np.mean(cv_scores['train_score'])
mean_cv = np.mean(cv_scores['test_score'])
std_cv = np.std(cv_scores['test_score'])
fit_time = np.mean(cv_scores['fit_time'])
score_time = np.mean(cv_scores['score_time'])
print(f"Train: {train_cv:.3f} |",
      f"Test: {mean_cv:0.3f} ({std_cv:0.3f}) |", 
      f"fit: {fit_time:0.2f}", f"secs pred: {score_time:0.2f} secs")

Train: 0.833 | Test: 0.826 (0.005) | fit: 7.87 secs pred: 0.05 secs


In [8]:
iters = [cv_scores["estimator"][i].named_steps["model"].n_estimators_ for i in range(5)] 
print(iters) 

[145, 109, 115, 163, 159]


In [None]:
np.mean(iters)
print(f"Number of iterations to use in full training: {1.2 * np.mean(iters):.0f}") # Increase the number of iterations by 20% since it is a validation fraction in CV.

Number of iterations to use in full training: 166


### XGBoost

In [11]:
from xgboost import XGBClassifier

In [12]:
accuracy = make_scorer(accuracy_score)
cv = KFold(n_splits=5, shuffle=True, random_state=0)
xgb = XGBClassifier(
    booster='gbtree',
    objective='reg:logistic',
    n_estimators=300,
    max_depth=4,
    min_child_weight=3, # equivalent to min_samples_leaf of SKLearn
)

model_pipeline = Pipeline(
    steps=[
        ('processing', column_transform),
        ('model', xgb)
    ]
)

cv_scores = cross_validate(
    estimator=model_pipeline,
    X=data,
    y=target_median,
    scoring=accuracy,
    cv=cv,
    return_train_score=True,
    return_estimator=True,
)

train_cv = np.mean(cv_scores['train_score'])
mean_cv = np.mean(cv_scores['test_score'])
std_cv = np.std(cv_scores['test_score'])
fit_time = np.mean(cv_scores['fit_time'])
score_time = np.mean(cv_scores['score_time'])
print(f"Train: {train_cv:.3f} |",
      f"Test: {mean_cv:0.3f} ({std_cv:0.3f}) |", 
      f"fit: {fit_time:0.2f}", f"secs pred: {score_time:0.2f} secs")

Train: 0.849 | Test: 0.826 (0.004) | fit: 0.27 secs pred: 0.02 secs


### Early Stopping in XGBoost

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train, test = train_test_split(
    range(len(data)),
    test_size=0.2,
    random_state=0
)

train, valid = train_test_split(
    train,
    test_size=0.2,
    random_state=0
)

xgb = XGBClassifier(
    booster='gbtree',
    objective='reg:logistic',
    n_estimators=1000,
    max_depth=4,
    min_child_weight=3, # equivalent to min_samples_leaf of SKLearn
    early_stopping_rounds=100,
    eval_metric='error',
)

X = column_transform.fit_transform(data.iloc[train])
y = target_median[train]
Xv = column_transform.transform(data.iloc[valid])
yv = target_median[valid]

xgb.fit(X, y, eval_set=[(Xv, yv)], verbose=False)

Xt = column_transform.transform(data.iloc[test])
yt = target_median[test]

preds = xgb.predict(Xt)
score = accuracy_score(yt, preds)
print(f"XGBoost Classifier Accuracy: {score:.5f}")

XGBoost Classifier Accuracy: 0.82657


In [16]:
xgb.best_iteration

125

### LightGBM

In [20]:
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [19]:
from lightgbm import LGBMClassifier

accuracy = make_scorer(accuracy_score)
cv = KFold(n_splits=5, shuffle=True, random_state=0)
lgb = LGBMClassifier(
    boosting_type='gbdt',
    n_estimators=300,
    max_depth=-1,
    min_child_samples=3,
    force_col_wise=True,
    verbosity=0,
)

model_pipeline = Pipeline(
    steps=[
        ('processing', column_transform),
        ('model', lgb)
    ]
)

cv_scores = cross_validate(
    estimator=model_pipeline,
    X=data,
    y=target_median,
    scoring=accuracy,
    cv=cv,
    return_train_score=True,
    return_estimator=True,
)

train_cv = np.mean(cv_scores['train_score'])
mean_cv = np.mean(cv_scores['test_score'])
std_cv = np.std(cv_scores['test_score'])
fit_time = np.mean(cv_scores['fit_time'])
score_time = np.mean(cv_scores['score_time'])
print(f"Train: {train_cv:.3f} |",
      f"Test: {mean_cv:0.3f} ({std_cv:0.3f}) |", 
      f"fit: {fit_time:0.2f}", f"secs pred: {score_time:0.2f} secs")



Train: 0.859 | Test: 0.826 (0.004) | fit: 0.62 secs pred: 0.04 secs


### LGBM Early Stopping

In [23]:
from lightgbm import log_evaluation

In [26]:
train, test = train_test_split(
    range(len(data)),
    test_size=0.2,
    random_state=0
)
train, valid = train_test_split(
    train,
    test_size=0.2,
    random_state=0
)

lgbm = LGBMClassifier(
    boosting_type='gbdt',
    early_stopping_round=150,
    n_estimators=1000,
    max_depth=-1,
    min_child_samples=3, # equivalent to min_samples_leaf of SKLearn
    force_col_wise=True,
    verbosity=0,
)
X = column_transform.fit_transform(data.iloc[train])
y = target_median[train]
Xv = column_transform.transform(data.iloc[valid])
yv = target_median[valid]

lgbm.fit(
    X, y, eval_set=[(Xv, yv)],
    eval_metric='accuracy',
    callbacks=[log_evaluation(0)]
)

Xt = column_transform.transform(data.iloc[test])
yt = target_median[test]
preds = lgbm.predict(Xt)
score = accuracy_score(yt, preds)   
print(f"LightGBM Classifier Accuracy: {score:.5f}")

LightGBM Classifier Accuracy: 0.82585


