In [None]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool

def catboost_drop_features(features):
    """Return `features` without the columns that are not fed to CatBoost.

    Columns from the exclusion list that are absent in `features` are skipped
    silently, so the helper can be applied to frames with or without the label.
    The input frame itself is left unchanged.
    """
    excluded_columns = [
        'price',        # label, not a feature
        'lat', 'long',  # noted as hurting performance
    ]
    return features.drop(columns=excluded_columns, errors='ignore')

# Strip the label and the other unused columns from df
catboost_features = catboost_drop_features(df)
print("Number of features for CatBoost: {}".format(catboost_features.shape[1]))
catboost_features.head()

In [None]:
# Feature names and the categorical columns handed to CatBoost
feature_names = list(catboost_features.columns)
categorical_features = [
    'status', 'add_attr', 'state', 'year', 'fireplace', 'parking', "subtype", 'sewer',
    'water', 'app', 'heating', 'cooling', 'materials', 'roof', 'foundation', 'interior',
]

# Positional indices (in column order) of the categorical columns that are present
categorical_indices = [
    position
    for position, column in enumerate(catboost_features.columns)
    if column in categorical_features
]
categorical_indices

In [None]:
# Prepare training and hold-out validation data
# Label for CatBoost: the `price` column cast to float32
catboost_label = df.price.astype(np.float32)

# Features and label stay pandas objects; the label is wrapped in a one-column
# DataFrame, which the scoring cells subtract reshaped predictions from
catboost_X = catboost_features
catboost_y = pd.DataFrame(catboost_label)

# Perform shuffled train/validation split (10% held out).
# The split is seeded explicitly through `random_state`, so it does not depend on
# the global NumPy RNG state (and therefore not on cell execution order).
# The global seeds are still set for any code that relies on them.
np.random.seed(42)
random.seed(10)
X_train, X_val, y_train, y_val = train_test_split(
    catboost_X, catboost_y, test_size=0.1, random_state=42)

# (Disabled experiment) Remove outlier examples from X_train and y_train
# Keep them in X_val and y_val for proper validation
#outlier_threshold = 0.4
#mask = (abs(y_train) <= outlier_threshold)
#X_train = X_train.loc[mask]
#y_train = y_train[mask]

print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}\n".format(y_train.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))

In [None]:
# CatBoost parameters (Fine-tuning), declared in one place
params = {
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'nan_mode': 'Min',          # Method to handle NaN (set NaN to either Min or Max)
    'random_seed': 42,

    'iterations': 1000,         # default 1000, use early stopping during training
    'learning_rate': 0.015,     # default 0.03

    'border_count': 254,        # default 254 (alias max_bin, suggested to keep at default for best quality)

    'max_depth': 6,             # default 6 (must be <= 16, 6 to 10 is recommended)
    'random_strength': 1,       # default 1 (used during splitting to deal with over fitting, try different values)
    'l2_leaf_reg': 5,           # default 3 (used for leaf value calculation, try different values)
    'bagging_temperature': 1,   # default 1 (higher value -> more aggressive bagging, try different values)
}

In [None]:
# Train a single CatBoost regressor, using the hold-out split (X_val, y_val) as the
# evaluation set for early stopping (this is a single hold-out, not k-fold CV)
val_pool = Pool(X_val, y_val, cat_features = categorical_indices)

# Re-seed the global NumPy / Python RNGs before building the model
# (the model's own seed is supplied separately through params['random_seed'])
np.random.seed(42)
random.seed(36)
model = CatBoostRegressor(**params)

# Training: evaluated against val_pool with early_stopping_rounds=50;
# use_best_model=True asks CatBoost to keep the best iteration on the eval set
model.fit(X_train, y_train,
          cat_features=categorical_indices,
          use_best_model=True, eval_set=val_pool, early_stopping_rounds=50, verbose=False)

# Evaluate model performance: mean absolute error between predictions and labels.
# NOTE(review): y_train / y_val appear to be one-column DataFrames, so `.mean()`
# likely prints a label-indexed Series rather than a bare number — confirm.
# NOTE(review): the validation score is computed on the same data that drove
# early stopping, so it may be optimistic.
print("Train score: {}".format(abs(model.predict(X_train).reshape(-1,1) - y_train).mean()))
print("Val score: {}".format(abs(model.predict(X_val).reshape(-1,1) - y_val).mean()))

In [None]:
# CatBoost feature importance: (name, importance) pairs, most important first
feature_importance = sorted(
    ((feature_names[pos], weight) for pos, weight in enumerate(model.get_feature_importance())),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the ten highest-ranked features
for feat, weight in feature_importance[:10]:
    print("{}: {}".format(feat, weight))

In [None]:
# Ensemble x20: train `bags` models that differ only in their CatBoost random seed.
# No eval set / early stopping is used here; every model runs the full 2500 iterations.
bags = 20
models = []
for i in range(bags):
    print("Start training model {}".format(i))
    # Per-model copy of the shared parameters. Leaving `params` itself untouched keeps
    # this cell idempotent and keeps the single-model training cell reproducible when
    # it is re-run after this one.
    bag_params = dict(params, iterations=2500, random_seed=i)
    np.random.seed(42)
    random.seed(36)
    model = CatBoostRegressor(**bag_params)
    model.fit(X_train, y_train, cat_features=categorical_indices, verbose=False)
    models.append(model)

# Sanity check (make sure scores on a small portion of the dataset are reasonable):
# mean absolute error of each individual ensemble member on the validation split.
# After this loop `model` refers to the last ensemble member.
for i, model in enumerate(models):
    print("model {}: {}".format(i, abs(model.predict(X_val).reshape(-1,1) - y_val).mean()))

In [None]:
# Per-row absolute validation error of the last ensemble member, truncated to int.
# `models[-1]` is referenced explicitly instead of the leaked loop variable `model`,
# whose value depends on which cell happened to run last.
result = pd.DataFrame(abs(models[-1].predict(X_val).reshape(-1,1) - y_val).astype(int))
result.head(10)

In [None]:
# Summary statistics of the absolute errors stored in `result`
result.describe()

In [None]:
"""
    Helper method that makes predictions on the test set and exports results to csv file
    'models' is a list of models for ensemble prediction (len=1 means using just a single model)
"""
def predict_and_export(models, features, file_name):
    # Construct DataFrame for prediction results
    catboost = pd.DataFrame()

    catboost['Index'] = features.index
    
    test_features = catboost_drop_features(features)
    
    pred = []
    for i, model in enumerate(models):
        print("Start model {}".format(i))
        pred.append(model.predict(test_features))
    
    # Take average across all models
    mean_pred = np.mean(pred, axis=0)
    
    catboost['price'] = [float(format(x, '.4f')) for x in mean_pred]    
    
    print("Length of submission DataFrame: {}".format(len(catboost)))
    print("Submission header:")
    print(catboost.head())
    catboost.to_csv(file_name, index=False)
    return catboost, pred  # Return the results so that we can analyze or sanity check it

In [None]:
# Export the averaged ensemble predictions for the hold-out split (X_val).
# NOTE(review): the destination is an absolute path on one machine; consider deriving
# it from a configurable data directory so the notebook can run elsewhere.
file_name = "/Users/charles/Desktop/DSAI/6_DataScience/project/DSP/20221/data/final_catboost_ensemble_x20-2500.csv"
# NOTE(review): the returned frame is bound to the name `catboost`, which reads like
# the package name; a more specific name may be clearer.
catboost, pred = predict_and_export(models, X_val, file_name)
catboost.head()