In [2]:
import os

import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go

from tqdm import tqdm

In [3]:
from prophet import Prophet

In [4]:
#from prophet.plot import plot as fbplot

In [5]:
from oktmo_names import oktmo_names_decode as oktmo_names

In [6]:
from pylab import rcParams
rcParams['figure.figsize'] = 22, 7

In [7]:
#%pylab inline

In [8]:
PATH_DATA = os.path.join(Path.cwd(), 'data')
PATH_SUBM = os.path.join(Path.cwd(), 'submissions')

In [9]:
submission = pd.read_csv(os.path.join(PATH_DATA, 'sample_submission.csv'), 
                        parse_dates = ['date'])
submission.shape

(7735, 79)

In [15]:
submission.head()

Unnamed: 0,region,oktmo,okato,date,pasta,legumes,bread,flour,rice,groats,...,spice_value,tea_value,ai92,ai95,ai98,dt,ai92_value,ai95_value,ai98_value,dt_value
0,65,64000000000,64000000000,2021-06-30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,66,65000000000,65000000000,2021-06-30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,64,63000000000,63000000000,2021-06-30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,67,66000000000,66000000000,2021-06-30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,70,69000000000,69000000000,2021-06-30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
submission.tail()

Unnamed: 0,region,oktmo,okato,date,pasta,legumes,bread,flour,rice,groats,...,spice_value,tea_value,ai92,ai95,ai98,dt,ai92_value,ai95_value,ai98_value,dt_value
7730,43,33000000000,33000000000,2021-01-04,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7731,73,73000000000,73000000000,2021-01-04,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7732,74,75000000000,75000000000,2021-01-04,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7733,72,71000000000,71000000000,2021-01-04,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7734,75,76000000000,76000000000,2021-01-04,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
submission.date.min(), submission.date.max()

(Timestamp('2021-01-04 00:00:00'), Timestamp('2021-12-06 00:00:00'))

Read train data. Set type of all columns to float.

In [18]:
data = pd.read_csv(os.path.join(PATH_DATA, 'train.csv'),
                    sep = ';',
                    parse_dates=['date'],
                    infer_datetime_format=True,
                    decimal = ',',
                    thousands='\xa0',
                    engine='python',
                   )

items = data.columns.drop(['region', 'oktmo', 'okato', 'date'])
for el in items:
    data[el] = data[el].astype(float)
    
data.shape

(69785, 79)

Get aver over all oktmo (except Ingush in set)

In [19]:
def get_aver(inp_prod, inp_df, ignore_Ingush = True):
    
    if ignore_Ingush:
        use_cols = ['date', inp_prod, 'oktmo']
        #ignore_oktmo = oktmo_names[26000000000]
        ignore_oktmo = 26000000000
        return inp_df[use_cols].query('oktmo != @ignore_oktmo')[['date', inp_prod]].groupby('date').mean().sort_values(by='date')
    
    use_cols = ['date', inp_prod]        
    return inp_df[use_cols].groupby('date').mean().sort_values(by='date')

Calculate deviation from aver over all oktmo over all products

In [None]:
#dt = np.datetime64('2019-01-02')
#data.query('oktmo == 47000000000 and date == @dt')['bread']

In [27]:
oktmo = data.oktmo.unique()
#deviation_df = pd.DataFrame(columns = list(items), index = oktmo)

In [20]:
if os.path.exists(os.path.join(PATH_DATA, 'deviation_sum_nz.csv')):
    deviation_df = pd.read_csv(os.path.join(PATH_DATA, 'deviation_sum_nz.csv'),
#if os.path.exists(os.path.join(PATH_DATA, 'deviation_mult_nz.csv')):
    #deviation_df = pd.read_csv(os.path.join(PATH_DATA, 'deviation_mult_nz.csv'),
                              index_col = 0,
                              )

Make future dates from 01.04.2021 to 30.06.2021 

In [22]:
train = get_aver('fruit_value', data)
X = train.reset_index()[['date', 'fruit_value']]#.columns = ['ds', 'y']
X.columns=['ds', 'y']

In [23]:
model = Prophet(yearly_seasonality=True,daily_seasonality=True)
model.fit(X)

<prophet.forecaster.Prophet at 0x20b125bf708>

In [24]:
future = model.make_future_dataframe(periods=91)
future = future[821:]

Make models (without saving) and submissions

In [28]:
for itm in tqdm(items):
    train = get_aver(itm, data)
    X = train.reset_index()[['date', itm]]#.columns = ['ds', 'y']
    X.columns=['ds', 'y']
    
    model = Prophet(yearly_seasonality=True, daily_seasonality=True,
                    seasonality_mode='multiplicative',  # hz. future firecast more sharp
                    #changepoint_prior_scale=0.01,   # 0.1 - 0.15 looks adequately
                   )
    model.fit(X)
    
    forecast = model.predict(future)
    
    for dt in future.ds.values:
        for reg in oktmo:
            mult = deviation_df.loc[reg, itm]
            #value = forecast.query('ds == @dt')['yhat'] + mult
            value = forecast.loc[forecast.ds == dt, 'yhat'].values[0] + mult
            submission.loc[(submission.date == dt) & (submission.oktmo == reg), itm] = value

100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [15:46<00:00, 12.62s/it]


In [29]:
submission.head()

Unnamed: 0,region,oktmo,okato,date,pasta,legumes,bread,flour,rice,groats,...,spice_value,tea_value,ai92,ai95,ai98,dt,ai92_value,ai95_value,ai98_value,dt_value
0,65,64000000000,64000000000,2021-06-30,125.771711,120.960261,107.868709,46.528979,35.532168,38.593252,...,14123.198187,14370.858197,51.036595,50.699178,50.744782,56.381341,271514.3,209830.2,23700.290371,39492.074831
1,66,65000000000,65000000000,2021-06-30,79.691667,107.671082,79.190276,31.531367,86.199422,66.051402,...,113788.869396,109587.410809,45.055461,48.015626,47.806577,48.630819,1581581.0,1329051.0,1015.551481,390320.944234
2,64,63000000000,63000000000,2021-06-30,77.46595,99.551351,76.304977,31.487129,78.214855,68.826715,...,46593.745187,50020.648854,45.908386,49.043088,48.73714,47.379565,940324.1,444951.5,4249.800714,83427.69107
3,67,66000000000,66000000000,2021-06-30,76.997085,110.900545,79.107664,34.552517,93.409228,59.554521,...,23836.366157,22576.393481,46.040954,49.210342,44.536582,47.318759,390633.0,327127.5,2076.367112,155017.335771
4,70,69000000000,69000000000,2021-06-30,84.396398,111.861843,74.768127,29.625562,85.558034,65.000312,...,37726.686456,31714.555481,44.093133,46.110521,47.607987,49.84158,355770.2,339723.3,738.279008,26024.235323


In [30]:
submission.to_csv(os.path.join(PATH_SUBM, 'phrop_deviation_sum_nz.csv'))

# below this level - trash. do not used

In [None]:
train = get_aver('fruit_value')
train.shape

In [None]:
train.head()

In [None]:
train.fruit_value.plot()

In [None]:
X = train.reset_index()[['date', 'fruit_value']]#.columns = ['ds', 'y']
X.columns=['ds', 'y']
X.head()

In [None]:
model = Prophet(daily_seasonality=True)

In [None]:
model.fit(X)

c 01.04.2021 по 30.06.2021 

In [None]:
future = model.make_future_dataframe(periods=91)
print(future.shape)
future = future[821:]

In [None]:
future.head(), future.tail()

In [None]:
forecast = model.predict(future)

In [None]:
fig1 = model.plot(forecast, figsize = (22, 7))

In [None]:
fig2 = model.plot_components(forecast)

In [None]:
forecast['ds']

In [None]:
fig = px.line(x = forecast['ds'], y = forecast['yhat'])
fig.show()

In [None]:
fig = px.line(x = forecast['ds'], y = forecast['yhat'])
fig.show()

In [None]:
Xfull.fruit.mean(), Xtrue.fruit.mean()

In [None]:
Xfull = get_aver('fruit', ignore_Ingush = False)
Xtrue = get_aver('fruit', ignore_Ingush = True)

In [None]:
fig = px.line(y = Xfull.fruit, x = Xfull.index)
#fig.add_scatter(px.line(Xtrue))
fig.add_trace(go.Scatter(y = Xtrue.fruit, x = Xtrue.index))
fig.show()

In [None]:
Xfull.index
#Xtrue.index

In [None]:
fig = px.line(y = Xfull.fruit, x= Xfull.index)
fig.show()

In [None]:
fig = px.line(y = Xtrue.fruit, x= Xtrue.index)
fig.show()