In [1]:
# импорт библиотек

import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from phik import phik_matrix
import category_encoders as ce
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from datetime import timedelta
from catboost import CatBoostRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from prophet import Prophet
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re


pd.set_option('display.max_columns', 30)
warnings.filterwarnings("ignore")

In [2]:
# чтение файлов

sales_df_train = pd.read_csv('D:\\yandex\\sp_sales_task\\sales_df_train.csv')
pr_df = pd.read_csv('D:\\yandex\\sp_sales_task\\pr_df.csv')
st_df = pd.read_csv('D:\\yandex\\sp_sales_task\\st_df.csv')
sales_submission = pd.read_csv('D:\\yandex\\sp_sales_task\\sales_submission.csv')

In [3]:
# объединение таблиц

full_df = sales_df_train.merge(st_df, on='st_id')

In [4]:
# объединение таблиц

full_df = full_df.merge(pr_df, on='pr_sku_id')

In [5]:
# изменение типа date

full_df['date'] = pd.to_datetime(full_df['date'])

In [6]:
full_df['pr_sales_in_units'] = full_df['pr_sales_in_units'].astype('int64')

In [7]:
# добавляем день недели и месяц

full_df['day_of_week'] = full_df['date'].dt.dayofweek
full_df['month'] = full_df['date'].dt.month

In [8]:
# добавляем столбик с значением выходного дня (1 - день выходной, 0 - рабочий)
full_df['weekend'] = 0
full_df.loc[full_df['day_of_week'].isin([6, 7]), 'weekend'] = 1

In [9]:
# сортируем таблицу

full_df.sort_values(by=['st_id', 'pr_sku_id', 'date'], inplace=True)

In [10]:
# удаляем строки без продаж

full_df = full_df[full_df['pr_sales_in_rub'] >= 0]

In [11]:
# удаляем аномально большие значения

full_df = full_df[full_df['pr_sales_in_rub'] < 10000]

In [12]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 879062 entries, 882986 to 652221
Data columns (total 21 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   st_id                    879062 non-null  object        
 1   pr_sku_id                879062 non-null  object        
 2   date                     879062 non-null  datetime64[ns]
 3   pr_sales_type_id         879062 non-null  int64         
 4   pr_sales_in_units        879062 non-null  int64         
 5   pr_promo_sales_in_units  879062 non-null  float64       
 6   pr_sales_in_rub          879062 non-null  float64       
 7   pr_promo_sales_in_rub    879062 non-null  float64       
 8   st_city_id               879062 non-null  object        
 9   st_division_code         879062 non-null  object        
 10  st_type_format_id        879062 non-null  int64         
 11  st_type_loc_id           879062 non-null  int64         
 12  st_type_siz

# предобработка

In [13]:
# заполним нули в количестве продаж

# сначала узнаем цену одной единицы

full_df['one_unit_price'] = full_df.apply(
    lambda row: row['pr_sales_in_rub'] / row['pr_sales_in_units'] 
                if (row['pr_sales_in_rub'] != 0 and 
                    row['pr_sales_in_units'] != 0 and 
                    row['pr_promo_sales_in_rub'] == 0) 
                else None, axis=1)

# добавляем цену за одну единицу по промо 

full_df['one_unit_price'] = full_df.apply(
    lambda row: row['pr_promo_sales_in_rub'] / row['pr_promo_sales_in_units']
                 if (np.isnan(row['one_unit_price']) and
                     row['pr_promo_sales_in_rub'] != 0 and
                     row['pr_promo_sales_in_units'] != 0)
                 else row['one_unit_price'],
    axis=1
)

# заполним значения в цене за одну единицу предыдущими значениями

full_df['one_unit_price'] = full_df.groupby('pr_sku_id')['one_unit_price'].transform(lambda x: x.ffill())

In [14]:
# заполним пропуски в one_unit_price предыдущими соответствуцющими значениями

full_df['one_unit_price'] = full_df.groupby('pr_sku_id')['one_unit_price'].transform(lambda x: x.fillna(x.mean()))

In [15]:
# то, что не заполнилось - удаляем

full_df = full_df.dropna()

In [16]:
# заполняем нули в pr_sales_in_units 

full_df['pr_sales_in_units'] = full_df.apply(
    lambda row: row['pr_sales_in_rub'] / row['one_unit_price'] 
                if row['pr_sales_in_units'] == 0 
                else row['pr_sales_in_units'], axis=1)

In [17]:
# заполняем нули в pr_sales_in_rub

full_df['pr_sales_in_rub'] = full_df.apply(
    lambda row: row['pr_sales_in_units'] * row['one_unit_price'] 
                if row['pr_sales_in_rub'] == 0 
                else row['pr_sales_in_rub'], axis=1)

In [18]:
# удалим строки с полным отсутствием продаж

full_df = full_df[(full_df['pr_sales_in_rub'] != 0) & (full_df['pr_sales_in_units'] != 0)]

In [19]:
# теперь удалим столбцы pr_promo_sales_in_rub и pr_sales_in_rub

columns_to_drop = ['pr_sales_in_rub', 'pr_promo_sales_in_rub','pr_promo_sales_in_units', 'one_unit_price']

full_df = full_df.drop(columns=columns_to_drop,axis=1)

# Подготовим трейн и тест без лишних столбцов

In [20]:
test_columns = ['st_id','pr_sku_id','date','pr_sales_in_units','day_of_week','month','weekend']

In [21]:
df_without_columns = full_df[test_columns]

In [22]:
df_without_columns = df_without_columns[df_without_columns['date'] < '2022-11-16']

In [23]:
df_without_columns_train = df_without_columns[df_without_columns['date'] < '2022-10-30']

In [24]:
df_without_columns_test = df_without_columns[(df_without_columns['date'] >= '2022-10-30')&(df_without_columns['date'] < '2022-11-16')]

In [25]:
features_test_df_without_columns = df_without_columns_test.drop('pr_sales_in_units', axis=1)

In [26]:
target_test_df_without_columns = df_without_columns_test['pr_sales_in_units']

# Подготовим трейн и тест без удаления столбцов

In [27]:
df_whith_columns = full_df

In [28]:
df_whith_columns = df_whith_columns[df_whith_columns['date'] < '2022-11-16']

In [29]:
df_whith_columns_train = df_whith_columns[df_whith_columns['date'] < '2022-10-30']

In [30]:
df_whith_columns_test = df_whith_columns[(df_whith_columns['date'] >= '2022-10-30')&(df_whith_columns['date'] < '2022-11-15')]

In [31]:
features_test_df_whith_columns = df_whith_columns_test.drop('pr_sales_in_units', axis=1)

In [32]:
target_test_df_whith_columns = df_whith_columns_test['pr_sales_in_units']

In [33]:
features_test_df_whith_columns[features_test_df_whith_columns['st_city_id'] == 'c1f75cc0f7fe269dd0fd9bd5e24f9586']

Unnamed: 0,st_id,pr_sku_id,date,pr_sales_type_id,st_city_id,st_division_code,st_type_format_id,st_type_loc_id,st_type_size_id,st_is_active,pr_group_id,pr_cat_id,pr_subcat_id,pr_uom_id,day_of_week,month,weekend
168389,16a5cdae362b8d27a1d8f8c7b78b4330,00661699f543753ec7e911a64b9fd2f6,2022-10-30,1,c1f75cc0f7fe269dd0fd9bd5e24f9586,296bd0cc6e735f9d7488ebc8fbc19130,1,2,8,1,aab3238922bcc25a6f606eb525ffdc56,9701a1c165dd9420816bfec5edd6c2b1,8afe22eeb3f3f68de994a3c60388858c,1,6,10,1
168330,16a5cdae362b8d27a1d8f8c7b78b4330,00661699f543753ec7e911a64b9fd2f6,2022-10-31,1,c1f75cc0f7fe269dd0fd9bd5e24f9586,296bd0cc6e735f9d7488ebc8fbc19130,1,2,8,1,aab3238922bcc25a6f606eb525ffdc56,9701a1c165dd9420816bfec5edd6c2b1,8afe22eeb3f3f68de994a3c60388858c,1,0,10,0
168340,16a5cdae362b8d27a1d8f8c7b78b4330,00661699f543753ec7e911a64b9fd2f6,2022-11-01,1,c1f75cc0f7fe269dd0fd9bd5e24f9586,296bd0cc6e735f9d7488ebc8fbc19130,1,2,8,1,aab3238922bcc25a6f606eb525ffdc56,9701a1c165dd9420816bfec5edd6c2b1,8afe22eeb3f3f68de994a3c60388858c,1,1,11,0
168128,16a5cdae362b8d27a1d8f8c7b78b4330,00661699f543753ec7e911a64b9fd2f6,2022-11-02,1,c1f75cc0f7fe269dd0fd9bd5e24f9586,296bd0cc6e735f9d7488ebc8fbc19130,1,2,8,1,aab3238922bcc25a6f606eb525ffdc56,9701a1c165dd9420816bfec5edd6c2b1,8afe22eeb3f3f68de994a3c60388858c,1,2,11,0
168276,16a5cdae362b8d27a1d8f8c7b78b4330,00661699f543753ec7e911a64b9fd2f6,2022-11-03,1,c1f75cc0f7fe269dd0fd9bd5e24f9586,296bd0cc6e735f9d7488ebc8fbc19130,1,2,8,1,aab3238922bcc25a6f606eb525ffdc56,9701a1c165dd9420816bfec5edd6c2b1,8afe22eeb3f3f68de994a3c60388858c,1,3,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652096,16a5cdae362b8d27a1d8f8c7b78b4330,ff62e7bffaca5e0ac8029cd29298c8c3,2022-11-06,0,c1f75cc0f7fe269dd0fd9bd5e24f9586,296bd0cc6e735f9d7488ebc8fbc19130,1,2,8,1,c74d97b01eae257e44aa9d5bade97baf,1bc0249a6412ef49b07fe6f62e6dc8de,68f204f3838bfee4ada868b66e6a0814,17,6,11,1
652003,16a5cdae362b8d27a1d8f8c7b78b4330,ff62e7bffaca5e0ac8029cd29298c8c3,2022-11-08,1,c1f75cc0f7fe269dd0fd9bd5e24f9586,296bd0cc6e735f9d7488ebc8fbc19130,1,2,8,1,c74d97b01eae257e44aa9d5bade97baf,1bc0249a6412ef49b07fe6f62e6dc8de,68f204f3838bfee4ada868b66e6a0814,17,1,11,0
652036,16a5cdae362b8d27a1d8f8c7b78b4330,ff62e7bffaca5e0ac8029cd29298c8c3,2022-11-08,0,c1f75cc0f7fe269dd0fd9bd5e24f9586,296bd0cc6e735f9d7488ebc8fbc19130,1,2,8,1,c74d97b01eae257e44aa9d5bade97baf,1bc0249a6412ef49b07fe6f62e6dc8de,68f204f3838bfee4ada868b66e6a0814,17,1,11,0
652043,16a5cdae362b8d27a1d8f8c7b78b4330,ff62e7bffaca5e0ac8029cd29298c8c3,2022-11-10,0,c1f75cc0f7fe269dd0fd9bd5e24f9586,296bd0cc6e735f9d7488ebc8fbc19130,1,2,8,1,c74d97b01eae257e44aa9d5bade97baf,1bc0249a6412ef49b07fe6f62e6dc8de,68f204f3838bfee4ada868b66e6a0814,17,3,11,0


# Обучение модели

In [34]:
# Функция метрики WAPE

def wape(y_true, y_pred):
    return np.sum(np.abs(y_true - y_pred)) / np.sum(np.abs(y_true))

In [35]:
# Создание scorer для использования в cross_val_score

wape_scorer = make_scorer(wape, greater_is_better=False)

In [36]:
cat_features_1 = ['st_id', 'pr_sku_id', 'st_city_id', 'st_division_code', 'pr_group_id', 'pr_subcat_id','pr_cat_id']
cat_features_2 = ['st_id', 'pr_sku_id']

In [37]:
# Функция кросс-валидации временных рядов

def time_series_cross_validation(data, target_column, date_column, cat_features):
    
    # Задаем размеры обучающего, валидационного и тестового наборов
    train_size = timedelta(days=60)
    validation_size = timedelta(weeks=2)
    test_size = timedelta(weeks=2)
    
    # Задаем шаг сдвига
    step_size=timedelta(weeks=2)
    
    # Задаем начальную дату
    start_date = data[date_column].min()

    # Параметры для RandomizedSearchCV
    param_distributions = {
        'iterations': [500, 1000, 1500],
        'depth': [4, 6, 8, 10],
        'learning_rate': [0.001, 0.01, 0.1],
        'l2_leaf_reg': [1, 3, 5, 7, 9]
    }
    # переменные для хранения лучший значения
    best_model = None          # лучшая модель
    best_params = None         # лучшие параметры
    best_score = float('inf')  # лучшая метрика
    results = []

    while start_date + train_size + validation_size + test_size <= data[date_column].max():
        
        train_end = start_date + train_size
        validation_end = train_end + validation_size
        test_end = validation_end + test_size

        train = data[(data[date_column] >= start_date) & (data[date_column] < train_end)]
        validation = data[(data[date_column] >= train_end) & (data[date_column] < validation_end)]
        test = data[(data[date_column] >= validation_end) & (data[date_column] < test_end)]

        # Проверка на наличие достаточного количества данных
        if len(train) == 0 or len(validation) == 0 or len(test) == 0:
            print(f"Пропускаем из за отутствия данных. Train: {start_date} - {train_end}, Validation: {train_end} - {validation_end}")
            start_date += validation_size
            continue

        X_train, y_train = train.drop(columns=target_column), train[target_column]
        X_validation, y_validation = validation.drop(columns=target_column), validation[target_column]
        X_test, y_test = test.drop(columns=target_column), test[target_column]

        

        model = CatBoostRegressor(cat_features=cat_features, verbose=1)

        # Подбор гиперпараметров
        searcher = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, 
                                      scoring='neg_mean_absolute_error', n_iter=3,cv =3, verbose=1, random_state=42)
        searcher.fit(pd.concat([X_train, X_validation]), pd.concat([y_train, y_validation]))

        # Обучение с лучшими параметрами
        best_model = searcher.best_estimator_
        best_model.fit(pd.concat([X_train, X_validation]), pd.concat([y_train, y_validation]))

        # Предсказание на тестовом наборе
        predictions = best_model.predict(X_test)
        
        # Расчет WAPE
        error = wape(y_test, predictions)
        results.append(error)
        
        if -error < best_score:
            best_score = -searcher.best_score_
            best_model = searcher.best_estimator_
            best_params = searcher.best_params_

        print(f"Wape на интервале с {start_date} по {test_end}: {error:.4f}")

        start_date += step_size

    return results, best_model, best_params

# results, best_model, best_params = time_series_cross_validation(data, target_column, date_column)  - вызов функции

In [38]:
%%time
df_whith_columns_results, df_whith_columns_best_model, df_whith_columns_best_params = time_series_cross_validation(df_whith_columns_train, 'pr_sales_in_units', 'date',cat_features_1)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
0:	learn: 8.5118045	total: 225ms	remaining: 3m 44s
1:	learn: 8.4896391	total: 282ms	remaining: 2m 20s
2:	learn: 8.4679327	total: 342ms	remaining: 1m 53s
3:	learn: 8.4473208	total: 382ms	remaining: 1m 35s
4:	learn: 8.4270339	total: 435ms	remaining: 1m 26s
5:	learn: 8.4069066	total: 479ms	remaining: 1m 19s
6:	learn: 8.3864131	total: 518ms	remaining: 1m 13s
7:	learn: 8.3663187	total: 557ms	remaining: 1m 9s
8:	learn: 8.3448038	total: 604ms	remaining: 1m 6s
9:	learn: 8.3253436	total: 642ms	remaining: 1m 3s
10:	learn: 8.3067453	total: 684ms	remaining: 1m 1s
11:	learn: 8.2884974	total: 746ms	remaining: 1m 1s
12:	learn: 8.2680419	total: 796ms	remaining: 1m
13:	learn: 8.2504113	total: 834ms	remaining: 58.7s
14:	learn: 8.2324548	total: 872ms	remaining: 57.2s
15:	learn: 8.2128930	total: 921ms	remaining: 56.7s
16:	learn: 8.1953684	total: 967ms	remaining: 55.9s
17:	learn: 8.1763184	total: 1.02s	remaining: 55.9s
18:	learn: 8.1599726	total: 1

In [39]:
%%time
df_without_results, df_without_best_model, df_without_best_params = time_series_cross_validation(df_without_columns, 'pr_sales_in_units', 'date',cat_features_2)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
0:	learn: 8.5124363	total: 38.7ms	remaining: 38.7s
1:	learn: 8.4917236	total: 65.9ms	remaining: 32.9s
2:	learn: 8.4705375	total: 98.9ms	remaining: 32.9s
3:	learn: 8.4501214	total: 123ms	remaining: 30.7s
4:	learn: 8.4296663	total: 157ms	remaining: 31.3s
5:	learn: 8.4095696	total: 188ms	remaining: 31.2s
6:	learn: 8.3902665	total: 216ms	remaining: 30.6s
7:	learn: 8.3713408	total: 241ms	remaining: 29.9s
8:	learn: 8.3523534	total: 270ms	remaining: 29.7s
9:	learn: 8.3339648	total: 295ms	remaining: 29.2s
10:	learn: 8.3154801	total: 320ms	remaining: 28.7s
11:	learn: 8.2972292	total: 344ms	remaining: 28.4s
12:	learn: 8.2793326	total: 369ms	remaining: 28s
13:	learn: 8.2621406	total: 394ms	remaining: 27.7s
14:	learn: 8.2450787	total: 418ms	remaining: 27.5s
15:	learn: 8.2280751	total: 444ms	remaining: 27.3s
16:	learn: 8.2112873	total: 469ms	remaining: 27.1s
17:	learn: 8.1951998	total: 498ms	remaining: 27.1s
18:	learn: 8.1792205	total: 524m

In [40]:
with_column_predict = df_whith_columns_best_model.predict(features_test_df_whith_columns)

In [41]:
withot_column_predict = df_without_best_model.predict(features_test_df_without_columns)

In [42]:
with_column_column__wape = wape(target_test_df_whith_columns,with_column_predict)
with_column_column__wape

0.6341492524746696

In [43]:
withot_column_column__wape = wape(target_test_df_without_columns,withot_column_predict)
withot_column_column__wape

0.6665689424602034

In [44]:
df_whith_columns_best_model.save_model("catboost_model.cbm")

In [45]:
sales = pd.read_csv('D:\\yandex\\sp_sales_task\\forecasted_data.csv')

In [46]:
sales

Unnamed: 0,st_id,pr_sku_id,date,sales,forecast_day_1,forecast_day_2,forecast_day_3,forecast_day_4,forecast_day_5,forecast_day_6,forecast_day_7,forecast_day_8,forecast_day_9,forecast_day_10,forecast_day_11,forecast_day_12,forecast_day_13,forecast_day_14
0,c81e728d9d4c2f636f067f89cc14862c,c7b711619071c92bef604c7ad68380dd,2022-10-20,5.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,c81e728d9d4c2f636f067f89cc14862c,c7b711619071c92bef604c7ad68380dd,2023-02-02,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,c81e728d9d4c2f636f067f89cc14862c,c7b711619071c92bef604c7ad68380dd,2022-10-09,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,c81e728d9d4c2f636f067f89cc14862c,c7b711619071c92bef604c7ad68380dd,2023-06-22,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,c81e728d9d4c2f636f067f89cc14862c,c7b711619071c92bef604c7ad68380dd,2023-06-30,25.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
883010,084a8a9aa8cced9175bd07bc44998e75,c2718cfd2edcbadfe0162a4f4c91f3a0,2023-05-09,3.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
883011,084a8a9aa8cced9175bd07bc44998e75,c2718cfd2edcbadfe0162a4f4c91f3a0,2023-04-07,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
883012,084a8a9aa8cced9175bd07bc44998e75,be8d2843456cac871fc116ab25d02994,2023-06-11,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
883013,084a8a9aa8cced9175bd07bc44998e75,be8d2843456cac871fc116ab25d02994,2023-04-26,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [47]:
new_data = pd.read_csv('D:\\yandex\\sp_sales_task\\new_df.csv')

In [48]:
full_df['pr_sku_id'].nunique()

1972

In [49]:
new_data

Unnamed: 0,st_id,pr_sku_id,date,pr_sales_type_id,pr_sales_in_units,pr_promo_sales_in_units,pr_sales_in_rub,pr_promo_sales_in_rub,st_city_id,st_division_code,st_type_format_id,st_type_loc_id,st_type_size_id,st_is_active,pr_group_id,pr_cat_id,pr_subcat_id,pr_uom_id,day_of_week,month,weekend,one_unit_price
0,c81e728d9d4c2f636f067f89cc14862c,c7b711619071c92bef604c7ad68380dd,2022-10-20,1,5.0,5.0,825.0,825.0,908c9a564a86426585b29f5335b619bc,429a86ff6336c144ddaf6144f52ab52c,1,1,8,1,aab3238922bcc25a6f606eb525ffdc56,28fc2782ea7ef51c1104ccf7b9bea13d,d29b5ce9c2883f0b7e90f79071a2ca82,1,3,10,0,165.000000
1,42a0e188f5033bc65bf8d78622277c4e,c7b711619071c92bef604c7ad68380dd,2023-01-16,0,17.0,0.0,2388.0,0.0,b8b4b727d6f5d1b61fff7be687f7970f,da742b3cfca5388aaf9af28a726a3c30,1,1,12,1,aab3238922bcc25a6f606eb525ffdc56,28fc2782ea7ef51c1104ccf7b9bea13d,d29b5ce9c2883f0b7e90f79071a2ca82,1,0,1,0,140.470588
2,16a5cdae362b8d27a1d8f8c7b78b4330,c7b711619071c92bef604c7ad68380dd,2022-10-10,0,13.0,0.0,2128.0,0.0,c1f75cc0f7fe269dd0fd9bd5e24f9586,296bd0cc6e735f9d7488ebc8fbc19130,1,2,8,1,aab3238922bcc25a6f606eb525ffdc56,28fc2782ea7ef51c1104ccf7b9bea13d,d29b5ce9c2883f0b7e90f79071a2ca82,1,0,10,0,163.692308
3,f7e6c85504ce6e82442c770f7c8606f0,c7b711619071c92bef604c7ad68380dd,2023-07-14,1,1.0,1.0,233.0,233.0,3202111cf90e7c816a472aaceb72b0df,32586311f16876abf92901085bd87b99,1,1,12,1,aab3238922bcc25a6f606eb525ffdc56,28fc2782ea7ef51c1104ccf7b9bea13d,d29b5ce9c2883f0b7e90f79071a2ca82,1,4,7,0,233.000000
4,6364d3f0f495b6ab9dcf8d3b5c6e0b01,c7b711619071c92bef604c7ad68380dd,2022-10-30,1,2.0,2.0,394.0,394.0,b8b4b727d6f5d1b61fff7be687f7970f,da742b3cfca5388aaf9af28a726a3c30,1,1,12,1,aab3238922bcc25a6f606eb525ffdc56,28fc2782ea7ef51c1104ccf7b9bea13d,d29b5ce9c2883f0b7e90f79071a2ca82,1,6,10,1,197.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6239,084a8a9aa8cced9175bd07bc44998e75,0376a60d9a7ce7965beddc4815588697,2023-06-05,0,1.0,0.0,146.0,0.0,3202111cf90e7c816a472aaceb72b0df,32586311f16876abf92901085bd87b99,4,3,19,1,c74d97b01eae257e44aa9d5bade97baf,a368b0de8b91cfb3f91892fbf1ebd4b2,8feef08314d477a5b68ba18a6d35819d,17,0,6,0,146.000000
6240,084a8a9aa8cced9175bd07bc44998e75,88feeeb024d3f69da7322d76b7b53744,2023-03-30,1,1.0,1.0,103.0,103.0,3202111cf90e7c816a472aaceb72b0df,32586311f16876abf92901085bd87b99,4,3,19,1,c74d97b01eae257e44aa9d5bade97baf,a368b0de8b91cfb3f91892fbf1ebd4b2,8feef08314d477a5b68ba18a6d35819d,17,3,3,0,103.000000
6241,084a8a9aa8cced9175bd07bc44998e75,c2718cfd2edcbadfe0162a4f4c91f3a0,2023-04-28,0,1.0,0.0,187.0,0.0,3202111cf90e7c816a472aaceb72b0df,32586311f16876abf92901085bd87b99,4,3,19,1,c74d97b01eae257e44aa9d5bade97baf,a368b0de8b91cfb3f91892fbf1ebd4b2,8feef08314d477a5b68ba18a6d35819d,17,4,4,0,187.000000
6242,084a8a9aa8cced9175bd07bc44998e75,be8d2843456cac871fc116ab25d02994,2023-06-11,0,1.0,0.0,210.0,0.0,3202111cf90e7c816a472aaceb72b0df,32586311f16876abf92901085bd87b99,4,3,19,1,c74d97b01eae257e44aa9d5bade97baf,a368b0de8b91cfb3f91892fbf1ebd4b2,8feef08314d477a5b68ba18a6d35819d,17,6,6,1,210.000000


In [50]:
def main(today=date.today()):
    # Предсказание на 14 дней вперед и создание нового датасета
    new_data = []

    for _, row in full_df.iterrows():
        shop = row['st_id']
        product = row['pr_sku_id']

        # Здесь вызываем функцию forecast, передавая необходимые данные
        # Например, sales, информацию о продукте и магазине

        # Пример:
        # prediction = forecast(sales, product_info, shop_info)

        # Создаем запись с предсказанными значениями
        # Здесь нужно добавить предсказанные значения на 14 дней вперед в список prediction
        prediction = [0] * 14
        new_row = {
            'st_id': shop,
            'pr_sku_id': product,
            'date': row['date'],  # Дата из исходных данных
            'sales': row['pr_sales_in_units'],  # Фактические продажи из исходных данных
        }

        for i, day in enumerate(range(1, 15)):
            new_row[f'forecast_day_{i + 1}'] = prediction[i]

        new_data.append(new_row)

    # Создание нового датасета на основе new_data
    new_df = pd.DataFrame(new_data)

    # Сохранение нового датасета в CSV-файл
    new_df.to_csv('D:\\yandex\\sp_sales_task\\forecasted_data.csv', index=False)

NameError: name 'date' is not defined