In [None]:
def split_area(dataset, threshold=70):
    """Split a dataset into small and big apartments by total area.

    Args:
        dataset: DataFrame with a numeric "area_total" column.
        threshold: Area cut-off. Defaults to 70, the value that used to be
            hard-coded here.

    Returns:
        Tuple ``(small, big)``: the rows with ``area_total < threshold`` and
        the rows with ``area_total >= threshold``. Rows whose ``area_total``
        is NaN satisfy neither comparison and so appear in neither part.
    """
    area = dataset["area_total"]
    return dataset[area < threshold], dataset[area >= threshold]

In [None]:
# Prepare the training data: work on a copy so dfTrain itself is not modified here.
train_data = dfTrain.copy()

# Partition by total area (small first, then big) and send each partition
# through process_data with identical arguments.
# NOTE(review): process_data is defined elsewhere; presumably keep_col lists
# columns to leave as-is and remove_col lists columns to drop -- confirm.
train_data_small, train_data_big = (
    process_data(
        subset,
        keep_col=['price'],
        remove_col=[
            'rooms', 'phones', 'new', 'elevator_without',
            'elevator_passenger', 'elevator_service', 'district',
        ],
    )
    for subset in split_area(train_data)
)

def fit_predict_lgbm(training_dataset):
    """Fit a LightGBM regressor on log-prices and score it on a 10% hold-out.

    The data is split 90/10 (fixed seed). Missing values are imputed and the
    features standardised using transformers fitted on the training split
    only. The model is trained on ``log(price)``; hold-out predictions are
    exponentiated back to the price scale before scoring. Two line/scatter
    plots are shown and a feature-importance bar chart is created.

    Args:
        training_dataset: DataFrame of features plus a "price" column.
            Assumed to be all-numeric (it is passed to StandardScaler and
            IterativeImputer) with strictly positive prices (np.log is
            applied) -- confirm against the caller.

    Returns:
        Tuple ``(lgbm_regressor, lgbm_prediction, lgbm_rmsle, y_test)``:
        the fitted model, hold-out predictions on the original price scale,
        the hold-out RMSLE rounded to 4 decimals, and the true hold-out
        prices (original scale).
    """
    # Hold out 10% of the rows; the fixed seed keeps the split reproducible.
    X_train, X_test = train_test_split(training_dataset, test_size=0.1, random_state=42)

    # Separate the target from the features.
    y_train = X_train.pop("price")
    y_test = X_test.pop("price")

    cols = X_train.columns

    # The model is trained on log(price). The validation target must be on the
    # same scale, otherwise the early-stopping metric compares log-scale
    # predictions with raw prices and is meaningless.
    y_train_log = np.log(y_train)
    y_test_log = np.log(y_test)

    # StandardScaler disregards NaNs when fitting, so it can be fitted before
    # the imputation step.
    scaler = preprocessing.StandardScaler().fit(X_train)
    imputor = IterativeImputer(
        estimator=BayesianRidge(),
        imputation_order='ascending',
        max_iter=100,
        tol=1e-5)

    X_train = imputor.fit_transform(X_train)
    X_train = scaler.transform(X_train)

    # Only *apply* the train-fitted imputer to the hold-out data. Re-fitting
    # it here would impute the hold-out rows with a different model than the
    # one the regressor was trained with.
    X_test = imputor.transform(X_test)
    X_test = scaler.transform(X_test)

    lgbm_regressor = lgb.LGBMRegressor(
        num_leaves=52,
        max_depth=75,
        random_state=42,
        metric='rmse',
        n_jobs=4,
        n_estimators=3640,
        colsample_bytree=0.48432530282059805,
        subsample=0.9272124012179532,
        learning_rate=0.05084923664278231
    )

    # Early stopping goes through the callback API; the `early_stopping_rounds`
    # keyword of fit() was removed in LightGBM 4.
    # NOTE(review): the hold-out split is used both for early stopping and for
    # the reported score, so the score is somewhat optimistic.
    lgbm_regressor.fit(
        X_train,
        y_train_log,
        eval_set=[(X_test, y_test_log)],
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    # Predictions come out on the log scale; map them back to prices.
    lgbm_prediction = lgbm_regressor.predict(X_test)
    lgbm_prediction = np.exp(lgbm_prediction)

    # sklearn's signature is (y_true, y_pred).
    lgbm_rmsle = round(np.sqrt(mean_squared_log_error(y_test, lgbm_prediction)), 4)
    print('Test RMSLE:', lgbm_rmsle)

    # red is real prices, green is predicted prices (first 500 hold-out rows)
    plt.figure(figsize=(50, 10))
    plt.plot(y_test.values[:500], color='red')
    plt.plot(lgbm_prediction[:500], color='green')
    plt.show()

    # correct predictions are on the diagonal
    plt.scatter(lgbm_prediction, y_test, s=2)
    plt.xlabel('LightGBM prediction')
    plt.ylabel('Ground Truth')
    plt.show()

    # With the default importance_type ('split'), feature_importances_ counts
    # how often each feature is used in a split -- it is not Gini importance.
    importances = lgbm_regressor.feature_importances_
    feature_importances = pd.Series(importances, index=cols)
    fig, ax = plt.subplots(1, 1, figsize=(14, 6))
    feature_importances.plot.bar(ax=ax)
    ax.set_title("Feature Importance (split count)")
    ax.set_ylabel("Importance")
    fig.tight_layout()

    return lgbm_regressor, lgbm_prediction, lgbm_rmsle, y_test


# Train and evaluate one model per area segment.
(lgbm_regressor_small, lgbm_prediction_small,
 lgbm_rmsle_small, y_test_small) = fit_predict_lgbm(train_data_small)
(lgbm_regressor_big, lgbm_prediction_big,
 lgbm_rmsle_big, y_test_big) = fit_predict_lgbm(train_data_big)

# Concatenate the predictions and the true values of both segments
# (small apartments first, then big) so one overall score can be computed.
lgbm_prediction = np.concatenate([lgbm_prediction_small, lgbm_prediction_big])
y_test = np.concatenate([y_test_small, y_test_big])

# Overall RMSLE over both segments, rounded to 4 decimals.
# NOTE: sklearn's signature is (y_true, y_pred); the squared log error is
# symmetric in its two arguments, so this order yields the same value.
lgbm_rmsle = round(np.sqrt(mean_squared_log_error(lgbm_prediction, y_test)), 4)
print('Test RMSLE:', lgbm_rmsle)

# First 500 pooled samples: red is real prices, green is predicted prices.
plt.figure(figsize=(50, 10))
plt.plot(y_test[:500].tolist(), color='red')
plt.plot(lgbm_prediction[:500], color='green')
plt.show()

# Prediction vs. truth; correct predictions are on the diagonal.
plt.scatter(x=lgbm_prediction, y=y_test, s=2)
plt.xlabel('LightGBM prediction')
plt.ylabel('Ground Truth')
plt.show()

# 0.154 
# 0.139 log
# 0.131 log + all_features + rmsle
# 0.131 log + all_features + rmse
# 0.132 log + all_features-(prison, airport)
# 0.132 log + all_features + keep_high_prices
# 0.129 -||- hyperparameter optimized
# 0.127 -||- hyperparameter optimized v2
# 0.111 0.03 test split
# 0.128 -||- 0.1 split, even more features
# 0.136 -||- only center

# 0.1290 all 
# 0.1258 remove=('rooms', 'phones', 'new', 'elevator_without', 'elevator_passenger', 'elevator_service')
# 0.1250 remove=('rooms', 'phones', 'new', 'elevator_without', 'elevator_passenger', 'elevator_service', 'district')

# 0.1256 added early stop
# 0.1252 without category type

# 0.1266 altered distance features+olympic
# 0.1261 reverted some alterations
# 0.1273

# 0.1252

