# PIPELINE

In [9]:
import numpy as np
import pandas as pd
import zipfile
import argparse
import logging.config
import typing
import pickle
import logging

# from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, Pool, cv
from tqdm import tqdm
from traceback import format_exc

from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.exceptions import NotFittedError
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.exceptions import NotFittedError

## Вспомогательные функции

### Настройки 

In [10]:
TARGET = 'per_square_meter_price'
# признаки (или набор признаков), для которых применяем smoothed target encoding
CATEGORICAL_STE_FEATURES = ['region', 'city', 'realty_type']

# признаки, для которых применяем one hot encoding
CATEGORICAL_OHE_FEATURES = []

# численные признаки 
NUM_FEATURES = ['lat', 'lng', 'osm_amenity_points_in_0.001',
       'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
       'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
       'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
       'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
       'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
       'osm_city_nearest_population',
       'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
       'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
       'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
       'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
       'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
       'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
       'osm_finance_points_in_0.01', 'osm_healthcare_points_in_0.005',
       'osm_healthcare_points_in_0.0075', 'osm_healthcare_points_in_0.01',
       'osm_historic_points_in_0.005', 'osm_historic_points_in_0.0075',
       'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.005',
       'osm_hotels_points_in_0.0075', 'osm_hotels_points_in_0.01',
       'osm_leisure_points_in_0.005', 'osm_leisure_points_in_0.0075',
       'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.001',
       'osm_offices_points_in_0.005', 'osm_offices_points_in_0.0075',
       'osm_offices_points_in_0.01', 'osm_shops_points_in_0.001',
       'osm_shops_points_in_0.005', 'osm_shops_points_in_0.0075',
       'osm_shops_points_in_0.01', 'osm_subway_closest_dist',
       'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.005',
       'osm_train_stop_points_in_0.0075', 'osm_train_stop_points_in_0.01',
       'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.005',
       'osm_transport_stop_points_in_0.0075',
       'osm_transport_stop_points_in_0.01',
       'reform_count_of_houses_1000', 'reform_count_of_houses_500',
       'reform_house_population_1000', 'reform_house_population_500',
       'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
       'reform_mean_year_building_1000', 'reform_mean_year_building_500','total_square']

MODEL_PARAMS = dict(
            n_estimators=2000,
            learning_rate=0.01,
            reg_alpha=1,
            num_leaves=40,
            min_child_samples=5,
            importance_type="gain",
            n_jobs=1,
            random_state=563,
        )

LOGGING_CONFIG = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "default": {"format": "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"},
    },
    "handlers": {
        "file_handler": {
            "level": "INFO",
            "formatter": "default",
            "class": "logging.FileHandler",
            "filename": 'train.log',
            "mode": "a",
        },
    },
    "loggers": {
        "": {"handlers": ["file_handler"], "level": "INFO", "propagate": False},
    },
}

### Настройки (catboost)

In [11]:
TARGET = 'per_square_meter_price'
# признаки (или набор признаков), для которых применяем smoothed target encoding
CATEGORICAL_STE_FEATURES = ['region', 'city', 'realty_type']

# признаки, для которых применяем one hot encoding
CATEGORICAL_OHE_FEATURES = []

# численные признаки
NUM_FEATURES = ['lat', 'lng', 'osm_amenity_points_in_0.001',
       'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
       'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
       'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
       'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
       'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
       'osm_city_nearest_population',
       'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
       'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
       'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
       'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
       'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
       'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
       'osm_finance_points_in_0.01', 'osm_healthcare_points_in_0.005',
       'osm_healthcare_points_in_0.0075', 'osm_healthcare_points_in_0.01',
       'osm_historic_points_in_0.005', 'osm_historic_points_in_0.0075',
       'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.005',
       'osm_hotels_points_in_0.0075', 'osm_hotels_points_in_0.01',
       'osm_leisure_points_in_0.005', 'osm_leisure_points_in_0.0075',
       'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.001',
       'osm_offices_points_in_0.005', 'osm_offices_points_in_0.0075',
       'osm_offices_points_in_0.01', 'osm_shops_points_in_0.001',
       'osm_shops_points_in_0.005', 'osm_shops_points_in_0.0075',
       'osm_shops_points_in_0.01', 'osm_subway_closest_dist',
       'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.005',
       'osm_train_stop_points_in_0.0075', 'osm_train_stop_points_in_0.01',
       'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.005',
       'osm_transport_stop_points_in_0.0075',
       'osm_transport_stop_points_in_0.01',
       'reform_count_of_houses_1000', 'reform_count_of_houses_500',
       'reform_house_population_1000', 'reform_house_population_500',
       'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
       'reform_mean_year_building_1000', 'reform_mean_year_building_500','total_square', 
       'poi_n_r0.001', 'poi_n_bus_stop_r0.001', 'poi_n_cafe_r0.001', 'poi_n_supermarket_r0.001',
       'poi_n_atm_r0.001', 'poi_n_bank_r0.001', 'poi_n_clothes_r0.001', 'poi_n_fast_food_r0.001',
       'poi_n_hairdresser_r0.001', 'poi_n_restaurant_r0.001', 'poi_n_tram_stop_r0.001']

MODEL_PARAMS = {
    'depth': 8, 
    'verbose': 50,
    'iterations': 1500,
    'loss_function': 'RMSE',
    'eval_metric': 'R2',
    'learning_rate': 0.15,
    'random_state': 5
}

LOGGING_CONFIG = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "default": {"format": "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"},
    },
    "handlers": {
        "file_handler": {
            "level": "INFO",
            "formatter": "default",
            "class": "logging.FileHandler",
            "filename": 'train.log',
            "mode": "a",
        },
    },
    "loggers": {
        "": {"handlers": ["file_handler"], "level": "INFO", "propagate": False},
    },
}

### Метрики

In [12]:
THRESHOLD = 0.15
NEGATIVE_WEIGHT = 1.1
EPS = 1e-8

def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """
    Реализация кастомной метрики для хакатона.

    :param y_true: float, реальная цена
    :param y_pred: float, предсказанная цена
    :return: float, значение метрики
    """
    deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)
    if np.abs(deviation) <= THRESHOLD:
        return 0
    elif deviation <= - 4 * THRESHOLD:
        return 9 * NEGATIVE_WEIGHT
    elif deviation < -THRESHOLD:
        return NEGATIVE_WEIGHT * ((deviation / THRESHOLD) + 1) ** 2
    elif deviation < 4 * THRESHOLD:
        return ((deviation / THRESHOLD) - 1) ** 2
    else:
        return 9


def deviation_metric(y_true: np.array, y_pred: np.array) -> float:
    return np.array([deviation_metric_one_sample(y_true[n], y_pred[n]) for n in range(len(y_true))]).mean()

def median_absolute_percentage_error(y_true: np.array, y_pred: np.array) -> float:
    return np.median(np.abs(y_pred-y_true)/y_true)

def metrics_stat(y_true: np.array, y_pred: np.array) -> typing.Dict[str,float]:
    mape = mean_absolute_percentage_error(y_true, y_pred)
    mdape = median_absolute_percentage_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    raif_metric = deviation_metric(y_true, y_pred)
    return {'mape':mape, 'mdape':mdape, 'rmse': rmse, 'r2': r2, 'raif_metric':raif_metric}

### Utils

In [13]:
from enum  import IntEnum

UNKNOWN_VALUE = 'missing'

class PriceTypeEnum(IntEnum):
    OFFER_PRICE = 0 # цена из объявления
    MANUAL_PRICE = 1 # цена, полученная путем ручной оценки

### Features

In [14]:
def prepare_categorical(df: pd.DataFrame) -> pd.DataFrame:
    """
    Заполняет пропущенные категориальные переменные
    :param df: dataframe, обучающая выборка
    :return: dataframe
    """
    df_new = df.copy()
    fillna_cols = ['region','city','street','realty_type']
    df_new[fillna_cols] = df_new[fillna_cols].fillna(UNKNOWN_VALUE)
    return df_new

### Сохранение и загрузка модели

In [38]:
def save_model(model, path: str):
    with open(path, "wb") as f:
        pickle.dump(model, f)
        

def load_model(path: str):
    with open(path, "rb") as f:
        model = pickle.load(f)
    return model

## Запуск

### Запуск обучения

In [30]:
path_to_train = '../data/df_train_with_poi.zip'
path_to_test = '../data/df_test_with_poi.zip'
path_to_save_model = '../models/test.pickle'

zip_file = zipfile.ZipFile(path_to_train)
train_df = pd.read_csv(zip_file.open(zip_file.namelist()[0]), low_memory=False)

train_df = prepare_categorical(train_df)

X_offer = train_df[train_df.price_type == PriceTypeEnum.OFFER_PRICE][NUM_FEATURES+CATEGORICAL_OHE_FEATURES+CATEGORICAL_STE_FEATURES]
y_offer = train_df[train_df.price_type == PriceTypeEnum.OFFER_PRICE][TARGET]

X_manual = train_df[train_df.price_type == PriceTypeEnum.MANUAL_PRICE][NUM_FEATURES+CATEGORICAL_OHE_FEATURES+CATEGORICAL_STE_FEATURES]
y_manual = train_df[train_df.price_type == PriceTypeEnum.MANUAL_PRICE][TARGET]

In [31]:
model = CatBoostRegressor(**MODEL_PARAMS)

In [32]:
model.fit(X_manual, y_manual, cat_features=CATEGORICAL_STE_FEATURES, verbose=100)

0:	learn: 0.0951359	total: 9.54ms	remaining: 14.3s
100:	learn: 0.8935422	total: 1.35s	remaining: 18.7s
200:	learn: 0.9499715	total: 2.73s	remaining: 17.6s
300:	learn: 0.9745466	total: 4.1s	remaining: 16.3s
400:	learn: 0.9848386	total: 5.51s	remaining: 15.1s
500:	learn: 0.9898728	total: 6.9s	remaining: 13.8s
600:	learn: 0.9928901	total: 9.57s	remaining: 14.3s
700:	learn: 0.9948977	total: 12.3s	remaining: 14.1s
800:	learn: 0.9961051	total: 15.7s	remaining: 13.7s
900:	learn: 0.9970410	total: 18.4s	remaining: 12.2s
1000:	learn: 0.9977223	total: 21.2s	remaining: 10.6s
1100:	learn: 0.9982029	total: 24s	remaining: 8.69s
1200:	learn: 0.9985638	total: 27.4s	remaining: 6.82s
1300:	learn: 0.9988123	total: 30.3s	remaining: 4.64s
1400:	learn: 0.9990027	total: 33.8s	remaining: 2.39s
1499:	learn: 0.9991592	total: 36.8s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7fcf1f8cf940>

In [49]:
# Save model
save_model(model, path_to_save_model)

predictions_offer = model.predict(X_offer)
metrics = metrics_stat(y_offer.values, predictions_offer/(1+model.corr_coef)) # для обучающей выборки с ценами из объявлений смотрим качество без коэффициента
print('Metrics stat for training data with offers prices:', metrics)

predictions_manual = model.predict(X_manual)
metrics = metrics_stat(y_manual.values, predictions_manual)
print('Metrics stat for training data with manual prices:', metrics)

[100935.37258084  26337.04030716  77222.87033636 ...  38837.78311407
  42758.67179338  40924.13013635]


In [46]:
# тесты
print(deviation_metric(np.array(y_manual.values),np.array(predictions_manual)) <= EPS)
print(np.abs(deviation_metric(np.array(y_manual.values),np.array(predictions_manual))-1) <= EPS)
print(np.abs(deviation_metric(np.array(y_manual.values),np.array(predictions_manual))-1*NEGATIVE_WEIGHT) <= EPS)
print(np.abs(deviation_metric(np.array(y_manual.values),np.array(predictions_manual))-9) <= EPS)
print(np.abs(deviation_metric(np.array(y_manual.values),np.array(predictions_manual))-9*NEGATIVE_WEIGHT) <= EPS)
print(np.abs(deviation_metric(np.array(y_manual.values),np.array(predictions_manual)) - 85/45) <= EPS)

True
True
True
True
True
True
True
True
True


### Запуск предикта

In [34]:
def parse_args():
    parser = argparse.ArgumentParser(
        description="""-""",
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument("--test_data", "-d", type=str, dest="d", required=True, help="Путь до отложенной выборки")
    parser.add_argument("--model_path", "-mp", type=str, dest="mp", required=True,
                        help="Пусть до сериализованной ML модели")
    parser.add_argument("--output", "-o", type=str, dest="o", required=True, help="Путь до выходного файла")

    return parser.parse_args()


In [43]:
try:
    path_to_pickled_model = '../models/test.pickle'
    path_to_test_data = '../data/df_test_with_poi.zip'

    test_df = pd.read_csv(path_to_test_data, compression='zip')

    test_df = prepare_categorical(test_df)


    model = load_model(path_to_pickled_model)

    test_df['per_square_meter_price'] = model.predict(test_df[NUM_FEATURES+CATEGORICAL_OHE_FEATURES+CATEGORICAL_STE_FEATURES])

    test_df[['id','per_square_meter_price']].to_csv(index=False)
except Exception as e:
    print('ERROR')
