In [None]:
import pandas as pd
from tqdm import tqdm
import warnings
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
import sklearn as sk

# Silence every warning category for the rest of the session. This also hides
# deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [None]:
# Load the long-format report export.
# A forward slash is used as the separator because it is accepted on Windows,
# Linux and macOS alike; a hard-coded backslash only reaches the "files"
# directory on Windows.
df = pd.read_csv("files/pred_final.csv")
df

In [None]:
# Distinct report dates; each of them becomes its own column in the wide table.
date_reports = df['date_report'].unique()

# Layout of the wide table: twelve descriptive fields, then one column per
# report date, then the target 'y' as the very last column.
columns = [
    'obj_key',
    'obj_subprg',
    'Кодзадачи',
    'ДатаНачалаЗадачи',
    'Статуспоэкспертизе',
    'Экспертиза',
    'season',
    'Скорость',
    'Кол-во рабочих',
    'Генподрядчик',
    'Генпроектировщик',
    'Площадь',
    *date_reports,
    'y',
]
columns

In [None]:
# Order the rows by object, task code, completion percentage and report date.
# The pivot cell further down takes head(1)/tail(1) of every (object, task)
# group, so this ordering decides which rows count as "first" and "last".
df = df.sort_values(['obj_key', 'Кодзадачи', 'ПроцентЗавершенияЗадачи', 'date_report'])

In [None]:
# Number of distinct report dates, i.e. how many per-date columns the wide table gets.
len(date_reports)

In [None]:
# Template mapping: every report date starts out as NaN (no value recorded yet).
d_dates = dict.fromkeys(date_reports, np.nan)
list(d_dates.values())

In [None]:
# Pivot the long report table into one row per (object, task code) pair:
# twelve descriptive fields, the completion percentage at every report date
# (NaN where the pair has no report on that date) and the target 'y'.
#
# Rows are gathered in a plain list and the frame is built once at the end.
# DataFrame.append no longer exists in pandas 2.x, and appending inside the
# loop re-copied the whole frame for every new row.
rows = []
keys = df['obj_key'].unique()
keys.sort()
for key in tqdm(keys):
    # Filter by object once instead of re-scanning df for every task code.
    key_rows = df[df['obj_key'] == key]
    for code in key_rows['Кодзадачи'].unique():
        objects = key_rows[key_rows['Кодзадачи'] == code]
        # First and last row of the pair in df's current row order.
        start = objects.head(1)
        last = objects.tail(1)
        row_data = [
            key,
            last['obj_subprg'].values[0],
            code,
            start['ДатаНачалаЗадачи'].values[0],
            last['Статуспоэкспертизе'].values[0],
            last['Экспертиза'].values[0],
            last['season'].values[0],
            last['Скорость'].values[0],
            start['Кол-во рабочих'].values[0],
            last['Генподрядчик'].values[0],
            last['Генпроектировщик'].values[0],
            last['Площадь'].values[0],
        ]
        # One slot per report date, in the same order as the per-date columns.
        d_dates = {report_date: np.nan for report_date in date_reports}
        for date in objects['date_report'].unique():
            # If a date occurs more than once for the pair, its first row wins.
            on_date = objects[objects['date_report'] == date]
            d_dates[date] = on_date.head(1)['ПроцентЗавершенияЗадачи'].values[0]
        row_data.extend(d_dates.values())
        # Target: the value of 'Время на выполнение' on the pair's last row.
        row_data.append(last['Время на выполнение'].values[0])
        rows.append(row_data)

# Column dtypes are inferred from the collected values here.
new_df = pd.DataFrame(rows, columns=columns)
new_df

In [None]:
def find_good_idx(vals: np.ndarray, start_index: int = 12) -> tuple[int, bool]:
    """Locate the last non-NaN value at or after ``start_index``.

    ``vals`` is treated as one row whose final element is the target and is
    therefore never inspected. The scan runs from ``start_index`` up to and
    including ``len(vals) - 2``, so the last column before the target is
    considered as well.

    Args:
        vals: Row values; every scanned element must be accepted by
            ``np.isnan``.
        start_index: Position at which the scan begins (12 by default).

    Returns:
        ``(index, True)`` with the position of the last non-NaN element in the
        scanned range, or ``(0, False)`` when every scanned element is NaN or
        the range is empty.
    """
    # None (rather than 0) marks "nothing found", so a genuine hit at
    # position 0 is not mistaken for a miss.
    found = None
    # Upper bound is len(vals) - 1: only the trailing target is skipped.
    for i in range(start_index, len(vals) - 1):
        if not np.isnan(vals[i]):
            found = i
    if found is None:
        return 0, False
    return found, True

In [None]:
# Fill the gaps in the per-date columns: every NaN takes the value of the last
# non-NaN per-date column of the same row.
for i, row in new_df.iterrows():
    if not row.isna().any():
        continue
    # Work on a private, writable copy of the row's values.
    vals = row.values.copy()
    # The search always starts at the first per-date column (position 12).
    # The row label `i` is not a column position and must not be used as the
    # starting point of the search.
    good_idx, ans = find_good_idx(vals, 12)
    if not ans:
        # No usable value in this row. Leave the NaNs in place instead of
        # copying the unrelated first column into the percentage columns.
        continue
    for j in range(12, len(vals) - 1):
        if np.isnan(vals[j]):
            vals[j] = vals[good_idx]
    new_df.loc[i] = vals

In [None]:
# Cast the whole-number columns, including every per-date column, to int.
for column in (
    'Генпроектировщик',
    'Генподрядчик',
    'Кол-во рабочих',
    'obj_subprg',
    'Статуспоэкспертизе',
    'Экспертиза',
    'season',
    'y',
    *date_reports,
):
    new_df[column] = new_df[column].astype(int)

# Represent the task start date as a proleptic Gregorian ordinal (an integer).
start_dates = pd.to_datetime(new_df['ДатаНачалаЗадачи'])
new_df['ДатаНачалаЗадачи'] = start_dates.apply(lambda stamp: stamp.toordinal())

new_df.dtypes

In [None]:
# Columns handled as categorical: they are one-hot encoded and dropped from
# the numeric frame in the cells that follow.
cat_features = [
    'Кодзадачи',
    'obj_subprg',
    'Статуспоэкспертизе',
    'Экспертиза',
    'season',
    'Генподрядчик',
    'Генпроектировщик',
]

In [None]:
# Persist the wide table without the index column.
# A forward slash keeps the path valid on Windows, Linux and macOS; a
# hard-coded backslash is only a directory separator on Windows.
new_df.to_csv("files/times.csv", index=False)

In [None]:
import pickle

# Fit a one-hot encoder on the categorical columns and transform them in one go.
encoder = sk.preprocessing.OneHotEncoder()
encoded_features = encoder.fit_transform(new_df[cat_features])

# Store the fitted encoder on disk as "encoder.pkl".
with open("encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)

# Dense one-hot block (default integer column labels) next to the remaining
# non-categorical columns; both sides get a fresh 0..n-1 index before they
# are joined column-wise.
encoded_df = pd.DataFrame(encoded_features.toarray()).reset_index(drop=True)
df_no_cat = new_df.drop(columns=cat_features).reset_index(drop=True)
new_data = pd.concat([encoded_df, df_no_cat], axis="columns")

new_data

In [None]:
# Keep a copy that still contains the categorical columns; the predictions
# are attached to it further down.
new_df3 = new_df.copy()
# From here on new_df itself no longer carries the categorical columns.
new_df.drop(columns=cat_features, inplace=True)
# Turn every column label into a string: the one-hot columns carry integer
# labels, the remaining columns carry string labels.
new_data.columns = list(map(str, new_data.columns))
new_df2 = new_data.copy()
new_df2

In [None]:
# Number of rows in the wide table.
len(new_df)

In [None]:
# Hold out 28% of the rows with a fixed seed. 'obj_key' and the target 'y' are
# excluded from the feature matrix.
x_train, x_test, y_train, y_test = train_test_split(
    new_df2.drop(columns=['obj_key', 'y']),
    new_df2['y'],
    test_size=0.28,
    random_state=228,
)

In [None]:
# CatBoost regressor that is both trained and evaluated with MAE.
model = CatBoostRegressor(
    iterations=15000,
    learning_rate=0.05,
    max_depth=5,
    loss_function='MAE',
    eval_metric="MAE",
    leaf_estimation_backtracking="AnyImprovement",
    subsample=0.7,
    bagging_temperature=0.3,
    langevin=True,
)

In [None]:
# Train on the training split, passing the held-out split as eval_set.
# NOTE(review): with use_best_model=True the same held-out rows also select the
# retained iteration, so the MAE computed on x_test afterwards is likely
# optimistic; consider a separate validation split.
model.fit(x_train, y_train, use_best_model=True, eval_set=(x_test, y_test))

In [None]:
# Feature importances of the fitted model, in prettified (tabular) form.
model.get_feature_importance(prettified=True)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the model on the held-out split (same unit as 'y').
mean_absolute_error(y_test, model.predict(x_test))

In [None]:
# Range of the target: smallest and largest distinct value of 'y'.
vals = new_df['y'].unique()
vals.sort()
print(vals.min(), vals.max())


In [None]:
# This length is evaluated but not shown, because it is not the last
# expression of the cell.
len(new_df)
# Write the fitted model to the file 'cat_model'.
model.save_model('cat_model')

In [None]:
# Number of predictions, which equals the number of rows in x_test.
len(model.predict(x_test))

In [None]:
# Attach each test-set prediction to the row it was computed for.
# train_test_split shuffles the rows, so x_test is not the first N rows of the
# table. Selecting by x_test's index labels keeps every prediction on its own
# row and removes the hard-coded test-set size. This relies on new_df3 and the
# feature frame sharing the same 0..n-1 row labels.
# The explicit copy avoids assigning into a view of the original frame.
new_df3 = new_df3.loc[x_test.index].copy()
new_df3['yy'] = model.predict(x_test)


In [None]:
# Negative predictions are raised to zero, then everything is rounded to the
# nearest whole number and stored as int.
non_negative = new_df3['yy'].apply(lambda pred: max(pred, 0))
new_df3['yy'] = non_negative.round().astype(int)
new_df3

In [None]:
import datetime

# Convert the ordinal day numbers in the start-date column back into
# datetime.date objects.
new_df3['ДатаНачалаЗадачи'] = new_df3['ДатаНачалаЗадачи'].apply(datetime.date.fromordinal)
new_df3

In [None]:
# Predicted end date: task start date plus the predicted value 'yy',
# interpreted as a number of days.
# NOTE(review): the start-date column holds datetime.date objects at this
# point, so the addition presumably runs element-wise on Python objects —
# confirm the dtype of the resulting 'kkk' column is what downstream code needs.
new_df3['kkk'] = new_df3['ДатаНачалаЗадачи'] + pd.to_timedelta(new_df3['yy'], unit='d')
new_df3

In [None]:
# Display the long-format source table once more, in the order set by the earlier sort.
df