In [2]:
import pandas as pd
import numpy as np

In [52]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [4]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

In [5]:
SEED = 42

In [6]:
TEST_SIZE = 0.25

In [7]:
train = pd.read_csv("train_final.csv")
test = pd.read_csv("test_final.csv")

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8219 entries, 0 to 8218
Data columns (total 35 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   target                           8219 non-null   float64
 1   fact_staff_value_lag_1           8219 non-null   float64
 2   fact_load_factor_lag_1           8219 non-null   float64
 3   num_available_couriers_lag_1     8219 non-null   float64
 4   fact_num_orders_lag_1            8219 non-null   float64
 5   fact_percent_lateness_lag_1      8219 non-null   float64
 6   city_nm                          8219 non-null   object 
 7   store_lifetime_in_days           8219 non-null   float64
 8   fact_staff_churn                 8219 non-null   float64
 9   flag_high_load_lag_1             8219 non-null   float64
 10  marketing_costs_lag_1            8219 non-null   float64
 11  fact_couriers_with_shifts_lag_1  8219 non-null   float64
 12  predicted_staff_valu

In [9]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2438 entries, 0 to 2437
Data columns (total 34 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   fact_staff_value_lag_1           2438 non-null   int64  
 1   fact_load_factor_lag_1           2438 non-null   float64
 2   num_available_couriers_lag_1     2438 non-null   int64  
 3   fact_num_orders_lag_1            2438 non-null   int64  
 4   fact_percent_lateness_lag_1      2438 non-null   float64
 5   city_nm                          2438 non-null   object 
 6   store_lifetime_in_days           2438 non-null   float64
 7   fact_staff_churn                 2438 non-null   float64
 8   flag_high_load_lag_1             2438 non-null   int64  
 9   marketing_costs_lag_1            2438 non-null   float64
 10  fact_couriers_with_shifts_lag_1  2438 non-null   float64
 11  predicted_staff_value            2438 non-null   int64  
 12  predicted_num_orders

In [10]:
test.describe()

Unnamed: 0,fact_staff_value_lag_1,fact_load_factor_lag_1,num_available_couriers_lag_1,fact_num_orders_lag_1,fact_percent_lateness_lag_1,store_lifetime_in_days,fact_staff_churn,flag_high_load_lag_1,marketing_costs_lag_1,fact_couriers_with_shifts_lag_1,...,staff_change_pct,orders_change_pct,tension_index,marketing_efficiency,is_critical_late,is_low_staff,load_factor_log,load_factor_sqrt,load_factor_squared,load_factor_inv
count,2438.0,2438.0,2438.0,2438.0,2438.0,2438.0,2438.0,2438.0,2438.0,2438.0,...,2438.0,2438.0,2438.0,2438.0,2438.0,2438.0,2438.0,2438.0,2438.0,2438.0
mean,9.172272,3.612143,10.942576,41.068909,81.997489,995.836751,1.949549,0.708778,53337340000.0,12.656686,...,-0.04771,7.380175,32.623981,0.7588287,0.998359,0.445857,1.819611,2.286032,30.216172,0.200299
std,3.607295,1.71742,4.19175,15.611045,17.983029,577.01379,1.87382,0.454419,1346720000000.0,5.138526,...,0.426791,1.815655,11.521428,15.46033,0.04048,0.497162,0.210103,0.293587,17.66864,0.047992
min,1.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.923077,-1.0,0.0,0.0,0.0,0.0,1.360977,1.702939,8.41,0.078425
25%,7.0,2.421053,8.0,30.0,72.222222,529.0,0.0,0.0,149622000.0,9.0,...,-0.166667,6.743952,26.25,2.135486e-07,1.0,0.0,1.667707,2.073644,18.49,0.165536
50%,8.0,3.166667,10.0,38.0,85.714286,921.0,2.0,1.0,431786100.0,12.0,...,-0.071429,7.333333,31.428571,7.179481e-07,1.0,0.0,1.808289,2.258318,26.01,0.19604
75%,11.0,4.333333,13.0,48.75,100.0,1454.75,4.0,1.0,1439490000.0,15.0,...,0.0,7.888889,37.5,2.050696e-06,1.0,1.0,1.951608,2.457641,36.4816,0.232504
max,36.0,15.6,39.0,210.0,100.0,2520.0,5.0,1.0,62838680000000.0,36.0,...,14.0,51.0,270.0,500.0,1.0,1.0,2.621039,3.570714,162.5625,0.344709


In [11]:
def prepare_city_col(df, city_col="city_nm"):
    df[city_col] = df[city_col].astype("category")

In [12]:
def log_columns(df, cols):
    dataframe = df.copy(deep=True)
    for col in cols:
        dataframe[col] = dataframe[col].apply(lambda x: np.log1p(x))
    return dataframe

In [13]:
log_cols = ["fact_load_factor_lag_1", "fact_staff_value_lag_1", "fact_num_orders_lag_1", "store_lifetime_in_days", "marketing_costs_lag_1", "predicted_staff_value", "predicted_num_orders", "predicted_load_factor"]

In [14]:
train_log = log_columns(train, log_cols)

In [15]:
def wape(y_true, y_pred):
    """
    WAPE = sum(|y_true - y_pred|) / sum(|y_true|)
    Метрика, по которой будет качество проверяться
    """
    return np.sum(np.abs(y_true - y_pred)) / np.sum(np.abs(y_true))

In [16]:
def add_dbscan_cluster(df, features, eps=0.5, min_samples=5, new_col='dbscan_cluster'):
    """Добавляет колонку с DBSCAN-кластерами"""
    df = df.copy()
    X = StandardScaler().fit_transform(df[features])
    clusters = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
    df[new_col] = clusters
    return df

In [17]:
cluster_cols = [
    'fact_staff_value_lag_1',
    'fact_load_factor_lag_1',
    'num_available_couriers_lag_1',
    'fact_num_orders_lag_1',
    'fact_percent_lateness_lag_1',
    'store_lifetime_in_days',
    'fact_staff_churn',
    'flag_high_load_lag_1',
    'marketing_costs_lag_1',
    'fact_couriers_with_shifts_lag_1',
    'predicted_staff_value',
    'predicted_num_orders',
    'predicted_load_factor',
    'staff_prediction_gap',
    'orders_prediction_gap',
    'past_productivity',
    'predicted_productivity',
    'month',
    'quarter'
]

In [18]:
df_with_clusters = add_dbscan_cluster(
    df=train_log,
    features=cluster_cols,
    eps=0.3,     
    min_samples=3,
    new_col='dbscan_smart'
)

In [19]:
df_with_clusters

Unnamed: 0,target,fact_staff_value_lag_1,fact_load_factor_lag_1,num_available_couriers_lag_1,fact_num_orders_lag_1,fact_percent_lateness_lag_1,city_nm,store_lifetime_in_days,fact_staff_churn,flag_high_load_lag_1,...,orders_change_pct,tension_index,marketing_efficiency,is_critical_late,is_low_staff,load_factor_log,load_factor_sqrt,load_factor_squared,load_factor_inv,dbscan_smart
0,1.0,0.000000,0.000000,0.0,0.000000,0.000000,пусто,0.000000,0.0,0.0,...,260.000000,260.000000,2.600000e+02,0,0,1.785070,2.227106,24.6016,0.201572,0
1,1.0,0.693147,0.422857,10.0,2.397895,80.000000,Ульяновск,6.790097,1.0,1.0,...,28.000000,29.000000,7.872782e-07,1,0,1.801710,2.249444,25.6036,0.197589,-1
2,4.0,2.197225,1.078810,13.0,3.526361,69.565217,Ульяновск,6.797940,1.0,1.0,...,7.484848,21.538462,6.484693e-07,1,1,1.768150,2.204541,23.6196,0.205719,-1
3,1.0,0.000000,0.000000,0.0,0.000000,0.000000,пусто,0.000000,0.0,0.0,...,220.000000,220.000000,2.200000e+02,0,0,1.969906,2.483948,38.0689,0.162048,1
4,1.0,0.693147,0.606136,10.0,2.397895,80.000000,Набережные Челны,6.426488,1.0,1.0,...,22.000000,23.000000,5.326712e-07,1,0,1.983756,2.503997,39.3129,0.159464,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8214,4.0,0.693147,0.361013,10.0,2.397895,80.000000,Пенза,7.272398,2.0,1.0,...,42.000000,43.000000,1.508574e-04,1,0,1.599388,1.987461,15.6025,0.253100,-1
8215,2.0,2.772589,1.072637,19.0,3.931826,53.846154,Пенза,7.277248,4.0,1.0,...,8.000000,23.684211,2.793103e-06,1,1,1.619388,2.012461,16.4025,0.246853,-1
8216,2.0,0.000000,0.000000,0.0,0.000000,0.000000,пусто,0.000000,0.0,0.0,...,390.000000,390.000000,3.900000e+02,0,0,1.887070,2.366432,31.3600,0.178540,6
8217,3.0,0.693147,0.462624,10.0,2.397895,80.000000,Самара,6.118097,2.0,1.0,...,39.000000,40.000000,4.661570e-06,1,0,1.856298,2.323790,29.1600,0.185151,-1


In [20]:
X, y = df_with_clusters.drop(columns=["target"]), df_with_clusters.target

In [21]:
prepare_city_col(X)

In [22]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

In [25]:
categorical_features = ["city_nm"]

In [133]:
params = {
    'iterations': 2500,         
    'learning_rate': 0.1,    
    'depth': 8,      
    'loss_function': 'MSE',     
    'verbose': 100,      
    'cat_features': categorical_features
}

In [134]:
train_data = lgb.Dataset(
    X_train, 
    label=y_train,
    categorical_feature=categorical_features
)

In [135]:
valid_data = lgb.Dataset(
    X_val,
    label=y_val,
    reference=train_data,
    categorical_feature=categorical_features
)

In [136]:
valid_sets = [train_data, valid_data]
valid_names = ['training', 'validation']

In [155]:
model = lgb.train(
    params,
    train_data,
    valid_sets=valid_sets, 
    valid_names=valid_names,
    num_boost_round=2500,
    callbacks=[
        lgb.log_evaluation(100),
        lgb.early_stopping(100)
    ]
)
print(params)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.845068
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.254580
[LightGBM] [Debug] init for col-wise cost 0.000964 seconds, init for row-wise cost 0.003567 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006267 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5138
[LightGBM] [Info] Number of data points in the train set: 6164, number of used features: 35
[LightGBM] [Info] Start training from score 2.380435
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Deb

In [156]:
y_val_pred = model.predict(X_val)

In [157]:
wape(y_val_pred, y_val)

np.float64(0.6379047188352212)

In [158]:
np.mean(y_val_pred)

np.float64(2.3742636846961007)

In [159]:
mean_absolute_error(y_val_pred, y_val)

1.5145540082267424

In [147]:
cat_model.feature_importances_

array([2.10398008, 1.12326171, 2.73109124, 2.15467941, 1.5301549 ,
       4.54297759, 1.99021144, 8.45447   , 4.64226663, 1.83732926,
       1.48279341, 3.07455172, 2.13393377, 4.97550927, 2.26741216,
       2.10741144, 1.24327261, 2.87646822, 2.61998189, 1.9383919 ,
       1.58503514, 5.41013521, 2.1930298 , 7.92830527, 1.88266372,
       2.07108244, 5.17571743, 2.1104165 , 0.35339619, 0.02609784,
       3.77746604, 3.87756233, 3.97539116, 3.26020704, 0.54334524])

In [148]:
importance = pd.DataFrame(cat_model.feature_importances_, index=X.columns, columns=["importance"])

In [149]:
importance.sort_values("importance")

Unnamed: 0,importance
is_low_staff,0.026098
is_critical_late,0.353396
dbscan_smart,0.543345
fact_load_factor_lag_1,1.123262
past_productivity,1.243273
fact_couriers_with_shifts_lag_1,1.482793
fact_percent_lateness_lag_1,1.530155
load_factor_gap,1.585035
marketing_costs_lag_1,1.837329
staff_change_pct,1.882664
