In [None]:
# Stack the per-model result frames into one table, then display it ranked by
# NDCG score, best first.
# NOTE(review): if 'NDCG Score' holds strings such as '0.8496 +/- 0.0006', this
# ordering is lexicographic rather than numeric — confirm the column dtype.
modeling_result = pd.concat(
    [
        lr_result,
        dt_result,
        rf_result,
        et_result,
        ad_result,
        xgb_result,
        lgbm_result,
    ]
)
modeling_result.sort_values(by='NDCG Score', ascending=False)

      Model	                NDCG Score
0	LGBMClassifier	        0.8496 +/- 0.0006
0	XGBClassifier	        0.8482 +/- 0.0004
0	RandomForestClassifier	0.8451 +/- 0.0006
0	AdaBoostClassifier	    0.8429 +/- 0.0019
0	ExtraTreesClassifier	0.839 +/- 0.0008
0	LogisticRegression	    0.8378 +/- 0.001
0	DecisionTreeClassifier	0.7242 +/- 0.0023

The <b>Light GBM Classifier</b> model was chosen for hyperparameter tuning, since it's fast to train and tune, while also being the one with the best result without any tuning. In addition, it's much better for deployment, as it's much lighter than an XGBoost or Random Forest model, for instance, especially given that we're using a free deployment cloud.

https://github.com/brunodifranco/project-airbnb-classification/blob/main/airbnb.ipynb

An important task in ML modeling is Hyperparameter Tuning, in which the goal is to find the <b>best possible combination of model hyperparameters</b>. This task will be performed by fitting the model to the training data and evaluating it on the test data, which was originally split in section 5.2. But first, the test dataset has to go through all the transformations the training dataset went through:

In [None]:
# Prepare the hold-out split: encode the target, then apply the encoding and
# scaling steps to the features and keep only the selected columns.
#
# NOTE(review): every encoder/scaler in this cell is fitted on the test split
# itself (fit_transform). The test data is then on a different scale than the
# training data unless both splits share the same statistics — presumably the
# objects fitted on the training split should be reused with .transform();
# they are not visible from this cell, so confirm and switch if available.
test_le = LabelEncoder()
df_test['country_destination'] = test_le.fit_transform(df_test['country_destination'])

X_test = df_test.drop('country_destination', axis=1).copy()
y_test = df_test['country_destination'].copy()

# One Hot Encoding
# One call over all columns; the default prefix is the source column name, so
# the resulting dummy columns are named '<column>_<category>' as before.
X_test = pd.get_dummies(
    X_test,
    columns=[
        'gender',
        'signup_method',
        'signup_app',
        'affiliate_channel',
        'first_affiliate_tracked',
        'first_device_type',
    ],
)

# FrequencyEncoder: replace each category by its relative frequency in X_test
fe_language = (X_test.groupby('language').size()) / len(X_test)
X_test['language'] = X_test['language'].apply(lambda x : fe_language[x])

fe_action_type_most_common = (X_test.groupby('action_type_most_common').size()) / len(X_test)
X_test['action_type_most_common'] = X_test['action_type_most_common'].apply(lambda x : fe_action_type_most_common[x])

# RobustScaler
rs_age = RobustScaler()
rs_signup_flow = RobustScaler()
rs_secs_elapsed_median = RobustScaler()
X_test['age'] = rs_age.fit_transform(X_test[['age']].values)
X_test['signup_flow'] = rs_signup_flow.fit_transform(X_test[['signup_flow']].values)
X_test['secs_elapsed_median'] = rs_secs_elapsed_median.fit_transform(X_test[['secs_elapsed_median']].values)

# MinMaxScaler
mm_secs_elapsed_max = MinMaxScaler()
mm_secs_elapsed_mean = MinMaxScaler()
mm_secs_elapsed_sum = MinMaxScaler()
mm_secs_elapsed_std = MinMaxScaler()
mm_amount_of_sessions = MinMaxScaler()
X_test['secs_elapsed_max'] = mm_secs_elapsed_max.fit_transform(X_test[['secs_elapsed_max']].values)
X_test['secs_elapsed_mean'] = mm_secs_elapsed_mean.fit_transform(X_test[['secs_elapsed_mean']].values)
X_test['secs_elapsed_sum'] = mm_secs_elapsed_sum.fit_transform(X_test[['secs_elapsed_sum']].values)
X_test['secs_elapsed_std'] = mm_secs_elapsed_std.fit_transform(X_test[['secs_elapsed_std']].values)
X_test['amount_of_sessions'] = mm_amount_of_sessions.fit_transform(X_test[['amount_of_sessions']].values)

# StandardScaler
ss = StandardScaler()
X_test['action_type_unique'] = ss.fit_transform(X_test[['action_type_unique']].values)

# Transformation: periodic columns and the cycle length handed to nature_encode
# NOTE(review): day_first_active uses a cycle of 30 — confirm it matches the
# value used when the training data was transformed.
cols = {'day_of_week_first_active': 7,
        'month_account_created' : 12,
        'day_first_active': 30,
        'week_of_year_account_created': 52}

for period, cycle in cols.items():
    nature_encode(X_test, period, cycle)

# final X_test
# A category that never occurs in the test split yields no dummy column, so a
# plain X_test[cols_selected_rf] would raise KeyError. reindex keeps the same
# column order and adds any such missing column filled with zeros.
X_test = X_test.reindex(columns=cols_selected_rf, fill_value=0)

Out of the tested hyperparameters, Bayesian Optimization with Optuna provided the following as the best ones:

| Hyperparameter | Definition | Best Value |
|:---:|---|:---:|
| n_estimators | Number of boosting iterations | 300 |
| learning_rate | Shrinkage rate | 0.02 |
| num_leaves | max number of leaves in one tree | 45 |
| max_depth | Limit the max depth for tree model | 8 |
| min_child_samples | Minimal number of data in one leaf | 55 |
| min_child_weight | Minimal sum hessian in one leaf | 0.04 |
| subsample | Used to randomly select part of data without resampling| 0.8 |
| colsample_bytree| Used to randomly select a subset of features on each iteration| 0.85 |

In [None]:
# Hyperparameter values reported as best by the tuning run
# NOTE(review): per the LightGBM docs, subsample (bagging_fraction) only takes
# effect when subsample_freq > 0, which is not set here — confirm it is intended.
best_param = dict(
    n_estimators=300,
    learning_rate=0.02,
    num_leaves=45,
    max_depth=8,
    min_child_samples=55,
    min_child_weight=0.04,
    subsample=0.8,
    colsample_bytree=0.85,
)

# Final model built from the tuned values
model_lgbm_final = LGBMClassifier(**best_param)

# model_eval is defined elsewhere in the notebook; it receives the training
# split and the prepared test split and returns predictions plus a score frame.
y_pred_eval, df_model_eval = model_eval(
    model_lgbm_final,
    X_train_ml,
    y_train_ml,
    X_test,
    y_test,
)
# pickle.dump(model_lgbm_final, open('model/lgbm_airbnb.pkl', 'wb'))  # Saving for deployment

df_model_eval  # final model score

LightGBM'i seçmiş ve hiperparametreleri bulup tune etmiş.

https://github.com/brunodifranco/project-airbnb-classification/blob/main/airbnb.ipynb

#catboost

In [None]:
# CatBoost only accepts categorical features as strings or integers,
# so cast every categorical column to str before training.
df[cat_features] = df[cat_features].astype(str)

In [None]:
# Baseline CatBoost regressor trained on all features with a Huber loss
# (snapshot saving is left disabled)
model = CatBoostRegressor(
    loss_function='Huber:delta=1.6',
    iterations=2000,
    learning_rate=0.05,
    depth=10,
    l2_leaf_reg=15,
)

# Fit on the training split, tracking the validation split during training;
# progress is logged every 250 iterations and the learning curves are plotted.
model.fit(
    X_train,
    y_train,
    eval_set=(X_validation, y_validation),
    cat_features=cat_features,
    verbose=250,
    plot=True,
)

# Feature importance

Look at the feature importances and choose only the features that help the model

In [None]:
# Importance score of every feature, as reported by the fitted CatBoost model.
# (The original assigned this to `portances`, so the lines below failed with
# NameError on `importances`.)
importances = model.get_feature_importance()

# Rank the scores from most to least important
sorted_importances = sorted(importances, reverse=True)

# Cut-off = the score at the midpoint of the ranking, so roughly the top half
# of the features is kept (features tied with the cut-off are all kept)
threshold = sorted_importances[len(sorted_importances) // 2]

# Keep the column names whose importance reaches the cut-off, in column order
selected_features = [
    feature
    for feature, score in zip(X_train.columns, importances)
    if score >= threshold
]

In [None]:
# Rebuild the categorical feature list for the reduced frame: every selected
# column that is not one of the known numeric columns is treated as categorical.
all_columns = X_train[selected_features].columns.tolist()
num_columns = ['accommodates', 'bedrooms', 'beds', 'minimum_nights', 'maximum_nights', 'bathrooms', 'price']

cat_features_50 = [name for name in all_columns if name not in num_columns]

In [None]:
# Second model: same setup as the baseline but trained only on the features
# that passed the importance cut-off, with a higher learning rate and a
# tighter Huber delta.
model_2 = CatBoostRegressor(
    loss_function='Huber:delta=1.1',
    iterations=2000,
    learning_rate=0.08,
    depth=10,
    l2_leaf_reg=15,
    # save_snapshot=True
)

# Train and validate on the reduced feature set
model_2.fit(
    X_train[selected_features],
    y_train,
    eval_set=(X_validation[selected_features], y_validation),
    cat_features=cat_features_50,
    verbose=250,
    plot=True,
)


In [None]:
## Hyperparameter Tuning
# Candidate values the randomized search samples from
grid = {
    'learning_rate': [0.1, 0.13],
    'depth': [3, 7, 9],
    'l2_leaf_reg': [9, 13, 17],
    'iterations': [1500],
}

# Untuned estimator handed to the search; silent, using all available threads
model_3 = CatBoostRegressor(
    loss_function='Huber:delta=1.1',
    cat_features=cat_features_50,
    early_stopping_rounds=5,
    thread_count=-1,
    verbose=False,
)

# Try 5 sampled combinations from the grid with 5-fold cross-validation
randomized_search_result = model_3.randomized_search(
    grid,
    X=X_train[selected_features],
    y=y_train,
    n_iter=5,
    cv=5,
    plot=True,
)


In [None]:
# Pull the winning hyperparameter combination out of the search result and display it
best_params = randomized_search_result['params']
best_params

{'depth': 7, 'l2_leaf_reg': 9, 'iterations': 1500, 'learning_rate': 0.13}

In [None]:
# Final regressor rebuilt with the values found by the randomized search.
# NOTE(review): the search only tried iterations=1500, while 2000 is used
# here — presumably deliberate, confirm.
best_model = CatBoostRegressor(
    loss_function='Huber:delta=1.1',
    learning_rate=0.13,
    depth=7,
    l2_leaf_reg=9,
    iterations=2000,
    cat_features=cat_features_50,
    thread_count=-1,
)


# Train on the selected features and track the validation split, without logging
best_model.fit(
    X=X_train[selected_features],
    y=y_train,
    eval_set=(X_validation[selected_features], y_validation),
    verbose=False,
    plot=True,
)

https://github.com/theoberva/airbnb_steamlit/blob/main/catboost_model.ipynb
CatBoost modeli için link