In [526]:
import numpy as np
from catboost import CatBoostRegressor
import pandas as pd
from sklearn.metrics import mean_absolute_error
from typing import Union
from tqdm import tqdm

In [540]:
class Subprg:
    """One CatBoost regressor restricted to the rows of a single ``obj_subprg`` value.

    The constructor receives the full train/test frames, keeps only the rows
    whose ``obj_subprg`` column equals ``subprg`` and builds an (unfitted)
    ``CatBoostRegressor``. The ``obj_subprg`` column stays in the stored frames
    and is removed only at the moment data is handed to the model.

    NOTE(review): ``fit`` uses the test rows as ``eval_set`` with
    ``use_best_model=True``, and ``predict`` / ``get_y_test`` expose those very
    same rows. Any metric computed from them is therefore measured on the data
    that selected the best iteration and is likely optimistic. A separate
    validation split would be needed for an unbiased estimate.
    """

    # Columns CatBoost must treat as categorical; all of them have to be
    # present in the feature frames passed to the constructor.
    cat_features = ['Статуспоэкспертизе', 'season', 'Кодзадачи', 'obj_key']

    # Hyperparameters used when the caller does not override them.
    default_model_params = dict(iterations=2000, random_seed=18, loss_function='RMSE', learning_rate=0.1,
                                max_depth=7,
                                early_stopping_rounds=200, eval_metric="MAE",
                                leaf_estimation_backtracking="AnyImprovement")

    def __init__(self, subprg: Union[str, int], x_train, x_test, y_train, y_test, **model_params):
        """Select the rows of ``subprg`` and create the regressor.

        Args:
            subprg: value of the ``obj_subprg`` column this model is responsible for.
            x_train, x_test: feature frames; must contain an ``obj_subprg`` column.
            y_train, y_test: target frames; must contain an ``obj_subprg`` column.
            **model_params: optional ``CatBoostRegressor`` keyword arguments that
                override the entries of ``default_model_params``.
        """
        self.x_train, self.x_test, self.y_train, self.y_test = self.get_by_sub(x_train, x_test, y_train, y_test, subprg)
        self.model = CatBoostRegressor(**{**self.default_model_params, **model_params})

    @staticmethod
    def get_by_sub(x_train: pd.DataFrame, x_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame,
                   subprg: Union[str, int]) -> tuple:
        """Return the four frames filtered to rows where ``obj_subprg == subprg``.

        The result order is ``(x_train, x_test, y_train, y_test)``. Each frame is
        filtered by its own ``obj_subprg`` column, so all four must carry it.
        """
        return tuple(frame[frame['obj_subprg'] == subprg] for frame in (x_train, x_test, y_train, y_test))

    @staticmethod
    def _without_group(frame: pd.DataFrame) -> pd.DataFrame:
        """Return ``frame`` without the ``obj_subprg`` column (constant inside one instance)."""
        return frame.drop(['obj_subprg'], axis=1)

    def fit(self):
        """Fit the regressor on the train rows, early-stopping on the test rows.

        See the class-level note: the evaluation set used here is the same
        data that ``predict`` scores afterwards.
        """
        self.model.fit(
            self._without_group(self.x_train),
            self._without_group(self.y_train),
            use_best_model=True,
            cat_features=self.cat_features,
            eval_set=(self._without_group(self.x_test), self._without_group(self.y_test)),
        )

    def predict(self) -> np.ndarray:
        """Return the model's predictions for the stored test rows."""
        return self.model.predict(self._without_group(self.x_test))

    @property
    def get_y_test(self) -> list:
        """Test targets as a flat list: the first column left after dropping ``obj_subprg``."""
        target = self._without_group(self.y_test).values
        # Column slice instead of a Python-level append loop; same elements, same order.
        return list(target[:, 0])



In [528]:
class ModelPredictor:
    """Train and score one ``Subprg`` model per distinct ``obj_subprg`` value.

    The input frame is copied and split by row position: the first
    ``train_frac`` share of the rows becomes the training part, the rest the
    test part. The row order of ``df`` therefore decides which rows are held
    out. NOTE(review): the caller is expected to pass the frame in the
    intended order — confirm that a positional split is what is wanted.
    """

    # Name of the regression target column.
    target_col = 'Время на выполнение'
    # Columns removed from the feature matrix (the target among them).
    # presumably 'ДатаОкончанияЗадачи' is excluded because, together with the
    # start date, it would reveal the target — TODO confirm.
    dropped_cols = ['obj_prg', 'Время на выполнение', 'ДатаОкончанияЗадачи', 'date_report']

    def __init__(self, df: pd.DataFrame, train_frac: float = 0.8):
        """Split ``df`` and build one unfitted ``Subprg`` per ``obj_subprg`` value.

        Args:
            df: source frame; must contain ``obj_subprg``, the target column
                and the columns listed in ``dropped_cols``.
            train_frac: share of leading rows used for training (default 0.8).

        Raises:
            ValueError: if ``train_frac`` is not strictly between 0 and 1.
        """
        if not 0 < train_frac < 1:
            raise ValueError(f"train_frac must be strictly between 0 and 1, got {train_frac!r}")
        self.df = df.copy()
        self.subprgs = self.df['obj_subprg'].unique()
        self.split_index = int(len(self.df) * train_frac)
        self.x_train, self.x_test, self.y_train, self.y_test = self._get_test_values()
        self.models = self._create_models()
        self.ans = {}

    # The annotation is a string so that defining this class does not require
    # ``Subprg`` to exist yet; it is only needed when the method runs.
    def _create_models(self) -> "dict[Union[str, int], Subprg]":
        """Return ``{obj_subprg value: Subprg}`` built from the shared split."""
        return {
            sub: Subprg(sub, self.x_train, self.x_test, self.y_train, self.y_test)
            for sub in self.subprgs
        }

    def _get_test_values(self):
        """Return ``(x_train, x_test, y_train, y_test)`` split at ``self.split_index``.

        Features are every column except ``dropped_cols``; the target frame
        holds the target plus ``obj_subprg`` so that ``Subprg`` can filter it.
        """
        features = self.df.drop(columns=self.dropped_cols)
        target = self.df[[self.target_col, 'obj_subprg']]
        cut = self.split_index
        return features.iloc[:cut], features.iloc[cut:], target.iloc[:cut], target.iloc[cut:]

    def fit(self):
        """Fit every per-group model, showing a progress bar."""
        for sub in tqdm(self.models):
            self.models[sub].fit()

    def predict(self) -> dict:
        """Predict the test rows of every group; the result is also stored in ``self.ans``."""
        self.ans = {sub: sub_model.predict() for sub, sub_model in self.models.items()}
        return self.ans

    def mae(self) -> dict:
        """Return ``{obj_subprg value: mean absolute error}`` on the test rows.

        Predictions are computed on demand when ``predict`` has not been
        called yet, so the result is never silently empty.
        """
        if not self.ans:
            self.predict()
        return {
            sub: mean_absolute_error(self.models[sub].get_y_test, predicted.tolist(), multioutput='raw_values')
            for sub, predicted in self.ans.items()
        }

    def MSE(self) -> None:
        """Print the mean absolute error of every group.

        The name is kept for backward compatibility; the metric has always
        been MAE (``mean_absolute_error``), not mean squared error.
        """
        for sub, score in self.mae().items():
            print(sub, score)

In [529]:
# Load the prepared dataset. A forward slash is used as the separator because
# it is accepted on Windows as well as on POSIX systems, whereas the previous
# backslash-separated path only resolved on Windows.
df = pd.read_csv("files/pred_final.csv")
df

Unnamed: 0,obj_prg,obj_subprg,obj_key,Кодзадачи,ПроцентЗавершенияЗадачи,ДатаНачалаЗадачи,ДатаОкончанияЗадачи,Статуспоэкспертизе,Экспертиза,date_report,season,Скорость,Кол-во рабочих,Генподрядчик,Генпроектировщик,Время на выполнение
0,0,0,022-0527,1,0,2022-01-14,2023-03-30,0,0,2023-01-17,1,0.257576,0.0,0.0,0.0,440
1,0,0,022-0354,1,0,2022-12-01,2023-07-27,0,0,2023-01-17,1,0.462121,0.0,0.0,0.0,238
2,0,1,022-0513,1,0,2023-01-23,2023-09-29,0,0,2023-01-17,1,0.416667,0.0,0.0,0.0,249
3,0,1,020-0684,1,0,2020-11-03,2022-02-01,0,0,2023-01-17,4,0.000000,0.0,2.0,1.0,455
4,0,1,019-0589,1,0,2020-11-03,2022-05-16,0,0,2023-01-17,4,0.000000,0.0,2.0,1.0,559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35196,1,2,019-0477,7.4,93,2020-04-28,2023-05-31,0,0,2023-05-29,2,0.704545,5.0,12.0,16.0,1128
35197,0,0,020-0710,8,0,2023-08-31,2024-04-10,0,0,2023-05-29,3,0.000000,156.0,5.0,8.0,223
35198,0,0,019-0675,8,0,2023-09-01,2024-04-11,0,0,2023-05-29,4,0.000000,139.0,22.0,33.0,223
35199,0,1,020-0712,8,0,2023-06-19,2024-01-26,0,0,2023-05-29,2,0.000000,53.0,5.0,8.0,221


In [530]:
# Order the rows by object, task code, completion percentage and report date.
# sort_values returns a new frame, so `df` itself keeps its original order.
test_df = df.sort_values(
    by=['obj_key', 'Кодзадачи', 'ПроцентЗавершенияЗадачи', 'date_report'],
)
test_df

Unnamed: 0,obj_prg,obj_subprg,obj_key,Кодзадачи,ПроцентЗавершенияЗадачи,ДатаНачалаЗадачи,ДатаОкончанияЗадачи,Статуспоэкспертизе,Экспертиза,date_report,season,Скорость,Кол-во рабочих,Генподрядчик,Генпроектировщик,Время на выполнение
8,0,1,017-0520,1,100,2018-11-01,2022-02-15,0,0,2023-01-17,4,0.757576,0.0,5.0,8.0,1202
1699,0,1,017-0520,1,100,2018-11-01,2022-02-15,0,0,2023-01-23,4,0.757576,0.0,5.0,8.0,1202
3432,0,1,017-0520,1,100,2018-11-01,2022-02-15,0,0,2023-02-06,4,0.757576,0.0,5.0,8.0,1202
5219,0,1,017-0520,1,100,2018-11-01,2022-02-15,0,0,2023-02-13,4,0.757576,0.0,5.0,8.0,1202
7030,0,1,017-0520,1,100,2018-11-01,2022-02-15,0,0,2023-02-20,4,0.757576,0.0,5.0,8.0,1202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33242,1,3,022-0631,7.4,0,2024-10-14,2024-11-11,0,0,2023-05-22,4,0.000000,0.0,2.0,1.0,28
35153,1,3,022-0631,7.4,0,2024-10-14,2024-11-11,0,0,2023-05-29,4,0.000000,0.0,2.0,1.0,28
1665,1,3,022-0631,8,0,2024-12-02,2025-07-11,0,0,2023-01-17,1,0.000000,0.0,2.0,0.0,221
3393,1,3,022-0631,8,0,2024-12-02,2025-07-11,0,0,2023-01-23,1,0.000000,0.0,2.0,0.0,221


In [531]:
# Replace every task code by an integer id, numbered in order of first
# appearance in the (sorted) frame.
codes = test_df['Кодзадачи'].unique()
codes_d = dict(zip(codes, range(len(codes))))
test_df['Кодзадачи'] = test_df['Кодзадачи'].map(codes_d)
test_df

Unnamed: 0,obj_prg,obj_subprg,obj_key,Кодзадачи,ПроцентЗавершенияЗадачи,ДатаНачалаЗадачи,ДатаОкончанияЗадачи,Статуспоэкспертизе,Экспертиза,date_report,season,Скорость,Кол-во рабочих,Генподрядчик,Генпроектировщик,Время на выполнение
8,0,1,017-0520,0,100,2018-11-01,2022-02-15,0,0,2023-01-17,4,0.757576,0.0,5.0,8.0,1202
1699,0,1,017-0520,0,100,2018-11-01,2022-02-15,0,0,2023-01-23,4,0.757576,0.0,5.0,8.0,1202
3432,0,1,017-0520,0,100,2018-11-01,2022-02-15,0,0,2023-02-06,4,0.757576,0.0,5.0,8.0,1202
5219,0,1,017-0520,0,100,2018-11-01,2022-02-15,0,0,2023-02-13,4,0.757576,0.0,5.0,8.0,1202
7030,0,1,017-0520,0,100,2018-11-01,2022-02-15,0,0,2023-02-20,4,0.757576,0.0,5.0,8.0,1202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33242,1,3,022-0631,26,0,2024-10-14,2024-11-11,0,0,2023-05-22,4,0.000000,0.0,2.0,1.0,28
35153,1,3,022-0631,26,0,2024-10-14,2024-11-11,0,0,2023-05-29,4,0.000000,0.0,2.0,1.0,28
1665,1,3,022-0631,27,0,2024-12-02,2025-07-11,0,0,2023-01-17,1,0.000000,0.0,2.0,0.0,221
3393,1,3,022-0631,27,0,2024-12-02,2025-07-11,0,0,2023-01-23,1,0.000000,0.0,2.0,0.0,221


In [532]:
# Turn the task start/end dates into integer day ordinals (datetime.toordinal).
# The dtype guard makes the cell safe to re-run: once a column already holds
# numbers, pd.to_datetime would read them as nanoseconds since the epoch and
# silently collapse every date to 1970-01-01.
for date_col in ('ДатаНачалаЗадачи', 'ДатаОкончанияЗадачи'):
    if not pd.api.types.is_numeric_dtype(test_df[date_col]):
        test_df[date_col] = pd.to_datetime(test_df[date_col]).apply(lambda ts: ts.toordinal())
test_df

Unnamed: 0,obj_prg,obj_subprg,obj_key,Кодзадачи,ПроцентЗавершенияЗадачи,ДатаНачалаЗадачи,ДатаОкончанияЗадачи,Статуспоэкспертизе,Экспертиза,date_report,season,Скорость,Кол-во рабочих,Генподрядчик,Генпроектировщик,Время на выполнение
8,0,1,017-0520,0,100,736999,738201,0,0,2023-01-17,4,0.757576,0.0,5.0,8.0,1202
1699,0,1,017-0520,0,100,736999,738201,0,0,2023-01-23,4,0.757576,0.0,5.0,8.0,1202
3432,0,1,017-0520,0,100,736999,738201,0,0,2023-02-06,4,0.757576,0.0,5.0,8.0,1202
5219,0,1,017-0520,0,100,736999,738201,0,0,2023-02-13,4,0.757576,0.0,5.0,8.0,1202
7030,0,1,017-0520,0,100,736999,738201,0,0,2023-02-20,4,0.757576,0.0,5.0,8.0,1202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33242,1,3,022-0631,26,0,739173,739201,0,0,2023-05-22,4,0.000000,0.0,2.0,1.0,28
35153,1,3,022-0631,26,0,739173,739201,0,0,2023-05-29,4,0.000000,0.0,2.0,1.0,28
1665,1,3,022-0631,27,0,739222,739443,0,0,2023-01-17,1,0.000000,0.0,2.0,0.0,221
3393,1,3,022-0631,27,0,739222,739443,0,0,2023-01-23,1,0.000000,0.0,2.0,0.0,221


In [533]:
# Notebook-level list of categorical column names.
# NOTE(review): in the cells shown here the models read `Subprg.cat_features`
# (a different list), not this variable — confirm whether it is still needed.
cat_features = [
    'obj_prg',
    'obj_subprg',
    'Статуспоэкспертизе',
    'season',
    'Кодзадачи',
]

In [534]:
# Peek at the regression target as a NumPy array.
test_df['Время на выполнение'].to_numpy()

array([1202, 1202, 1202, ...,  221,  221,  221], dtype=int64)

In [535]:
# Show the first row of the prepared frame as a plain Python list.
test_df.to_numpy()[0].tolist()

[0,
 1,
 '017-0520',
 0,
 100,
 736999,
 738201,
 0,
 0,
 '2023-01-17',
 4,
 0.7575757575757576,
 0.0,
 5.0,
 8.0,
 1202]

In [541]:
# Build one model per obj_subprg value from the prepared frame, then train them all.
model = ModelPredictor(df=test_df)
model.fit()

  0%|          | 0/4 [00:00<?, ?it/s]

0:	learn: 162.5919376	test: 125.0025877	best: 125.0025877 (0)	total: 28.6ms	remaining: 57.1s
1:	learn: 156.9839226	test: 120.4588691	best: 120.4588691 (1)	total: 57.1ms	remaining: 57s
2:	learn: 148.8495952	test: 112.4874675	best: 112.4874675 (2)	total: 97.8ms	remaining: 1m 5s
3:	learn: 142.9229362	test: 107.9693619	best: 107.9693619 (3)	total: 121ms	remaining: 1m
4:	learn: 136.9056620	test: 103.6057187	best: 103.6057187 (4)	total: 149ms	remaining: 59.3s
5:	learn: 130.2215608	test: 97.2438255	best: 97.2438255 (5)	total: 178ms	remaining: 59s
6:	learn: 124.5143571	test: 91.0756479	best: 91.0756479 (6)	total: 206ms	remaining: 58.6s
7:	learn: 118.8650024	test: 87.0626904	best: 87.0626904 (7)	total: 233ms	remaining: 57.9s
8:	learn: 114.8190665	test: 83.4765724	best: 83.4765724 (8)	total: 261ms	remaining: 57.8s
9:	learn: 110.2508627	test: 79.8389048	best: 79.8389048 (9)	total: 294ms	remaining: 58.5s
10:	learn: 105.7079842	test: 76.5539273	best: 76.5539273 (10)	total: 323ms	remaining: 58.4s
11

 25%|██▌       | 1/4 [00:09<00:29,  9.89s/it]

299:	learn: 26.5401875	test: 62.6083378	best: 62.2933579 (101)	total: 9.7s	remaining: 55s
300:	learn: 26.5066256	test: 62.6193024	best: 62.2933579 (101)	total: 9.74s	remaining: 55s
301:	learn: 26.4805842	test: 62.6016960	best: 62.2933579 (101)	total: 9.77s	remaining: 54.9s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 62.29335795
bestIteration = 101

Shrink model to first 102 iterations.
0:	learn: 186.5446326	test: 144.3438100	best: 144.3438100 (0)	total: 34.2ms	remaining: 1m 8s
1:	learn: 174.5767983	test: 134.2294715	best: 134.2294715 (1)	total: 78.6ms	remaining: 1m 18s
2:	learn: 164.8192690	test: 125.4580426	best: 125.4580426 (2)	total: 114ms	remaining: 1m 16s
3:	learn: 154.5638210	test: 116.4958466	best: 116.4958466 (3)	total: 161ms	remaining: 1m 20s
4:	learn: 146.1432159	test: 109.4033589	best: 109.4033589 (4)	total: 195ms	remaining: 1m 17s
5:	learn: 138.3592285	test: 103.8069863	best: 103.8069863 (5)	total: 228ms	remaining: 1m 15s
6:	learn: 131.0614164	test: 9

 50%|█████     | 2/4 [00:21<00:21, 10.87s/it]

307:	learn: 23.9873116	test: 60.1616857	best: 53.3981174 (107)	total: 11.4s	remaining: 1m 2s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 53.39811737
bestIteration = 107

Shrink model to first 108 iterations.
0:	learn: 206.3870650	test: 154.2016279	best: 154.2016279 (0)	total: 28.8ms	remaining: 57.6s
1:	learn: 191.5610163	test: 143.6116274	best: 143.6116274 (1)	total: 58.3ms	remaining: 58.2s
2:	learn: 178.1022374	test: 132.1471894	best: 132.1471894 (2)	total: 98.2ms	remaining: 1m 5s
3:	learn: 166.0706867	test: 123.4169245	best: 123.4169245 (3)	total: 128ms	remaining: 1m 3s
4:	learn: 154.6474840	test: 117.4408698	best: 117.4408698 (4)	total: 155ms	remaining: 1m 1s
5:	learn: 145.7714210	test: 111.2890044	best: 111.2890044 (5)	total: 185ms	remaining: 1m 1s
6:	learn: 137.2119988	test: 107.7156491	best: 107.7156491 (6)	total: 224ms	remaining: 1m 3s
7:	learn: 129.5919660	test: 107.7156491	best: 107.7156491 (6)	total: 252ms	remaining: 1m 2s
8:	learn: 122.5240753	test: 10

 75%|███████▌  | 3/4 [00:30<00:09,  9.97s/it]

257:	learn: 20.8127229	test: 63.5076339	best: 59.7291185 (60)	total: 8.7s	remaining: 58.7s
258:	learn: 20.7400647	test: 63.4610246	best: 59.7291185 (60)	total: 8.73s	remaining: 58.7s
259:	learn: 20.6724061	test: 63.2503825	best: 59.7291185 (60)	total: 8.76s	remaining: 58.7s
260:	learn: 20.6426390	test: 63.2149302	best: 59.7291185 (60)	total: 8.79s	remaining: 58.6s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 59.72911847
bestIteration = 60

Shrink model to first 61 iterations.
0:	learn: 293.5423356	test: 260.3329210	best: 260.3329210 (0)	total: 28.2ms	remaining: 56.4s
1:	learn: 269.6732737	test: 247.9994083	best: 247.9994083 (1)	total: 58.2ms	remaining: 58.2s
2:	learn: 248.7871469	test: 240.0519426	best: 240.0519426 (2)	total: 86.5ms	remaining: 57.6s
3:	learn: 229.8365639	test: 223.9564737	best: 223.9564737 (3)	total: 117ms	remaining: 58.4s
4:	learn: 212.8005306	test: 218.0281078	best: 218.0281078 (4)	total: 150ms	remaining: 59.7s
5:	learn: 196.8214008	test: 211.29

100%|██████████| 4/4 [00:50<00:00, 12.57s/it]

708:	learn: 9.2389892	test: 87.4101927	best: 86.5338254 (510)	total: 19.7s	remaining: 35.8s
709:	learn: 9.2254672	test: 87.4864731	best: 86.5338254 (510)	total: 19.7s	remaining: 35.8s
710:	learn: 9.2184470	test: 87.5059665	best: 86.5338254 (510)	total: 19.7s	remaining: 35.8s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 86.53382538
bestIteration = 510

Shrink model to first 511 iterations.





In [542]:
# Re-display the prepared frame after training (ModelPredictor works on its own copy of it).
test_df

Unnamed: 0,obj_prg,obj_subprg,obj_key,Кодзадачи,ПроцентЗавершенияЗадачи,ДатаНачалаЗадачи,ДатаОкончанияЗадачи,Статуспоэкспертизе,Экспертиза,date_report,season,Скорость,Кол-во рабочих,Генподрядчик,Генпроектировщик,Время на выполнение
8,0,1,017-0520,0,100,736999,738201,0,0,2023-01-17,4,0.757576,0.0,5.0,8.0,1202
1699,0,1,017-0520,0,100,736999,738201,0,0,2023-01-23,4,0.757576,0.0,5.0,8.0,1202
3432,0,1,017-0520,0,100,736999,738201,0,0,2023-02-06,4,0.757576,0.0,5.0,8.0,1202
5219,0,1,017-0520,0,100,736999,738201,0,0,2023-02-13,4,0.757576,0.0,5.0,8.0,1202
7030,0,1,017-0520,0,100,736999,738201,0,0,2023-02-20,4,0.757576,0.0,5.0,8.0,1202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33242,1,3,022-0631,26,0,739173,739201,0,0,2023-05-22,4,0.000000,0.0,2.0,1.0,28
35153,1,3,022-0631,26,0,739173,739201,0,0,2023-05-29,4,0.000000,0.0,2.0,1.0,28
1665,1,3,022-0631,27,0,739222,739443,0,0,2023-01-17,1,0.000000,0.0,2.0,0.0,221
3393,1,3,022-0631,27,0,739222,739443,0,0,2023-01-23,1,0.000000,0.0,2.0,0.0,221


In [543]:
# Predict the hold-out rows of every model; returns a dict keyed by obj_subprg
# value and also stores it in `model.ans`.
model.predict()

{1: array([306.66396858, 306.66396858, 306.66396858, ...,   5.0791048 ,
          5.0791048 , 137.43080822]),
 0: array([412.40963654, 412.40963654, 412.40963654, ...,   3.7908727 ,
         20.41547848, 147.16109411]),
 3: array([298.66352898, 298.66352898, 298.66352898, ..., 257.92882239,
        257.92882239, 257.92882239]),
 2: array([185.91395958, 185.91395958, 185.91395958, 185.91395958,
        143.36844813, 143.36844813, 143.36844813, 143.36844813,
        143.36844813, 143.36844813, 143.36844813, 143.36844813,
        143.36844813, 143.36844813, 304.07874784, 304.07874784,
        304.07874784, 304.07874784, 304.07874784, 304.07874784,
        304.07874784, 304.07874784, 304.07874784, 306.74121021,
        306.74121021, 306.74121021, 306.74121021, 306.74121021,
        306.74121021, 306.74121021, 306.74121021, 306.74121021,
        306.74121021, 244.3255348 , 244.3255348 , 244.3255348 ,
        244.3255348 , 244.3255348 , 244.3255348 , 244.3255348 ,
        244.3255348 , 244.3

In [550]:
# Inspect the training features kept by the model for obj_subprg == 0. The
# obj_subprg column is still present here; Subprg drops it only when calling CatBoost.
model.models[0].x_train

Unnamed: 0,obj_subprg,obj_key,Кодзадачи,ПроцентЗавершенияЗадачи,ДатаНачалаЗадачи,Статуспоэкспертизе,Экспертиза,season,Скорость,Кол-во рабочих,Генподрядчик,Генпроектировщик
49,0,017-0527,0,93,736999,0,0,4,0.704545,0.0,5.0,8.0
1741,0,017-0527,0,93,736999,0,0,4,0.704545,0.0,5.0,8.0
3473,0,017-0527,0,93,736999,0,0,4,0.704545,0.0,5.0,8.0
5260,0,017-0527,0,93,736999,0,0,4,0.704545,0.0,5.0,8.0
7073,0,017-0527,0,93,736999,0,0,4,0.704545,0.0,5.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...
33259,0,022-0170,26,0,738725,0,0,3,0.000000,489.0,2.0,1.0
35170,0,022-0170,26,0,738725,0,0,3,0.000000,465.0,2.0,1.0
1684,0,022-0170,27,0,738999,0,0,2,0.000000,271.0,2.0,1.0
3416,0,022-0170,27,0,739007,0,0,2,0.000000,271.0,2.0,1.0


In [551]:
# Despite the method name, this prints the mean absolute error
# (sklearn's mean_absolute_error) for each obj_subprg value.
model.MSE()

1 [62.29335895]
0 [53.39811837]
3 [59.72911947]
2 [86.53382638]
