### Импортируем библиотеки

In [47]:
import pickle
import numpy as np
import pandas as pd
from docx import Document

### Считываем исходную таблицу из файла

In [48]:
def parse_docx_table(filename: str):
    """Read every table of a .docx file into a list of DataFrames.

    Each table becomes one DataFrame with default integer row/column labels;
    no row is treated as a header. The grid is sized from the table's row and
    column counts and pre-filled with empty strings, so cells without text
    stay as ''.

    Args:
        filename: path to the .docx document.

    Returns:
        A list of DataFrames, one per table, in document order.
    """
    doc = Document(filename)

    def _table_to_frame(tbl):
        # Empty-string grid with the table's dimensions.
        n_rows, n_cols = len(tbl.rows), len(tbl.columns)
        grid = [[''] * n_cols for _ in range(n_rows)]
        # Copy over the text of every non-empty cell.
        for row_idx, row in enumerate(tbl.rows):
            for col_idx, cell in enumerate(row.cells):
                if cell.text:
                    grid[row_idx][col_idx] = cell.text
        return pd.DataFrame(grid)

    return [_table_to_frame(tbl) for tbl in doc.tables]

In [49]:
# Take the first table found in the document.
raw_table = parse_docx_table('DE_task_table.docx')[0]
# The first row of the Word table holds the column names, the rest is data.
# .copy() makes init_df an independent frame, so the column assignments in the
# following cells do not write into a slice of the raw table (the situation
# pandas reports as SettingWithCopyWarning). Keeping raw_table under its own
# name also makes this cell safe to re-run.
init_df = raw_table.iloc[1:].copy()
# Use the header row as column labels.
init_df.columns = raw_table.iloc[0]

In [50]:
# Convert both date columns from "dd.mm.yyyy" text to datetime.
# Values that fail to parse (the year-9999 placeholder) become NaT via errors='coerce'.
for date_col in ('start_date', 'finish_date'):
    init_df[date_col] = pd.to_datetime(init_df[date_col], format="%d.%m.%Y", errors='coerce')

In [51]:
# Rule 2: where finish_date is missing (NaT), fall back to end_da.
# end_da is still "dd.mm.yyyy" text, so parse it with the same explicit format
# instead of assigning raw strings into a datetime column: the implicit
# conversion guesses the format and can read day-first dates as month-first.
# Empty end_da values become NaT, so finish_date simply stays missing there.
end_da_parsed = pd.to_datetime(init_df['end_da'], format="%d.%m.%Y", errors='coerce')
init_df['finish_date'] = init_df['finish_date'].fillna(end_da_parsed)

  return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy)
  return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy)


In [52]:
init_df

Unnamed: 0,tab_num,start_date,finish_date,wday_type01,wday_type02,wday_type03,wday_type04,wday_type05,wplace_type,end_da
1,15123,2020-09-02,2020-10-31,0,0,0,0,0,0,31.10.2020
2,16234,2020-09-20,2020-10-30,0,0,1,1,0,2,
3,17345,2020-09-28,2020-10-25,1,0,0,0,0,2,
4,17345,2020-10-26,2020-12-31,1,1,1,1,1,1,
5,18456,2020-09-02,2020-09-30,2,2,2,2,2,3,30.09.2020
6,19567,2020-09-02,2020-12-31,3,3,3,3,3,4,


In [53]:
# словарь для каждого номера ряда исходной таблицы с датой старта и окончания режима
# Positional row number of init_df (0-based) -> start / finish date of that row.
start_date_dict = dict(enumerate(init_df['start_date'].to_list()))
finish_date_dict = dict(enumerate(init_df['finish_date'].to_list()))

In [54]:
# The calendar and its derived features are identical for every row of
# init_df, so build them once instead of once per loop iteration.
calendar = pd.DataFrame(
    {'ymd_date': pd.date_range(start='2020-09-01', end='2020-12-31', freq='D')}
)
# Extra features for rule 8.
calendar['weekday'] = calendar['ymd_date'].dt.dayofweek
iso_week = calendar['ymd_date'].dt.isocalendar().week
# Week number counted from the first week of the calendar (0-based).
calendar['week'] = iso_week - iso_week.min()
# Index of the two-week block the day falls into.
calendar['halfweek'] = calendar['week'] // 2

# One copy of the calendar per row of init_df, tagged with that row's
# tab_num and period (rules 1 and 2). The frames are collected in a list and
# concatenated once, rather than growing `dash` with concat inside the loop.
frames = [
    calendar.assign(
        tab_num=tab_num,
        start_date=start_date_dict[num],
        finish_date=finish_date_dict[num],
    )
    for num, tab_num in enumerate(init_df.tab_num.to_list())
]
# pd.concat raises on an empty list; fall back to an empty frame like the loop version did.
dash = pd.concat(frames) if frames else pd.DataFrame()
dash['to_be_at_office'] = np.nan

In [55]:
# tab_num values that occur in more than one row of init_df.
# .unique() lists each number once even if it occurs three or more times,
# so the de-duplication step below does not process the same number twice.
repeated_tab_num = init_df.loc[init_df['tab_num'].duplicated(), 'tab_num'].unique().tolist()
repeated_tab_num

['17345']

In [56]:
def process_duplicated_tab_num(init_df: pd.DataFrame, repeated_tab_nums: list):
    """Collapse the repeated calendars of the given tab_nums into one row per day.

    For every tab_num in ``repeated_tab_nums`` the rows of that tab_num are
    replaced by a left merge of
      * all its (ymd_date, weekday, tab_num) combinations, with
      * only those of its rows whose ymd_date lies inside [start_date, finish_date],
    followed by ``drop_duplicates``. The merge has no explicit ``on``; it joins
    on the columns shared by both sides, which are exactly the three key columns.
    Days that fall into no period keep their key columns and get missing values
    elsewhere. Rows of other tab_nums are passed through unchanged.

    Args:
        init_df: the per-day frame to process (it is copied, not modified).
            It must contain ymd_date, weekday, tab_num, start_date and
            finish_date columns.
        repeated_tab_nums: tab_num values to collapse.

    Returns:
        A new DataFrame. The shape is printed after each processed tab_num.
    """
    key_cols = ['ymd_date', 'weekday', 'tab_num']
    dash_merged = init_df.copy()
    for tab_num in repeated_tab_nums:
        belongs = dash_merged['tab_num'] == tab_num
        in_period = (
            (dash_merged['ymd_date'] >= dash_merged['start_date'])
            & (dash_merged['ymd_date'] <= dash_merged['finish_date'])
        )
        # Rows of every other employee stay as they are.
        untouched = dash_merged[~belongs]
        # Every calendar day of this employee (repeated once per source row).
        all_days = dash_merged.loc[belongs, key_cols]
        # Only the days covered by the period of the row they came from.
        active_days = dash_merged.loc[belongs & in_period]
        collapsed = pd.merge(all_days, active_days, how='left').drop_duplicates()
        dash_merged = pd.concat([untouched, collapsed])
        print(dash_merged.shape)
    return dash_merged

In [57]:
# Collapse the per-day rows of the tab_nums that occur more than once in init_df.
dash_merged = process_duplicated_tab_num(dash, repeated_tab_num)

(610, 8)


In [58]:
dash.shape, dash_merged.shape, 122*5

((732, 8), (610, 8), 610)

In [59]:
# Sanity check: after collapsing duplicates there must be exactly one row per
# (employee, calendar day). The number of days is derived from the data
# instead of the hard-coded 122, so the check survives a change of date range.
expected_rows = dash['ymd_date'].nunique() * dash['tab_num'].nunique()
assert dash_merged.shape[0] == expected_rows, (dash_merged.shape[0], expected_rows)

In [60]:
dash_merged.sample(5)

Unnamed: 0,ymd_date,weekday,week,halfweek,tab_num,start_date,finish_date,to_be_at_office
22,2020-09-23,2,3,1,18456,2020-09-02,2020-09-30,
49,2020-10-20,1,7,3,19567,2020-09-02,2020-12-31,
83,2020-11-23,0,12,6,15123,2020-09-02,2020-10-31,
23,2020-09-24,3,3,1,16234,2020-09-20,2020-10-30,
60,2020-10-31,5,8,4,16234,2020-09-20,2020-10-30,


In [15]:
# Persist both frames to disk with the highest available pickle protocol.
for pickle_path, frame in (('dash_merged.pickle', dash_merged), ('init_df.pickle', init_df)):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(frame, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Обработка "вручную"

In [61]:
# Positional row number of init_df (0-based) -> start / finish date of that row.
start_date_dict = dict(enumerate(init_df['start_date'].to_list()))
finish_date_dict = dict(enumerate(init_df['finish_date'].to_list()))

In [62]:
init_df.tab_num.unique()[0]

'15123'

In [63]:
# Office work, wplace_type = 0: every Monday-Friday day inside the period of
# row 0 is marked as an office day.
tab_num_index = 0
is_employee = dash_merged['tab_num'] == init_df.tab_num.to_list()[tab_num_index]
# Both period bounds are inclusive.
in_period = dash_merged['ymd_date'].between(
    start_date_dict[tab_num_index].to_datetime64(),
    finish_date_dict[tab_num_index].to_datetime64(),
)
# weekday 5 and 6 are Saturday and Sunday.
is_workday = ~dash_merged['weekday'].isin([5, 6])
dash_merged.loc[is_employee & in_period & is_workday, 'to_be_at_office'] = 1

15123


In [64]:
# смотрим что получилось
dash_merged[dash_merged['tab_num'] == '15123']

Unnamed: 0,ymd_date,weekday,week,halfweek,tab_num,start_date,finish_date,to_be_at_office
0,2020-09-01,1,0,0,15123,2020-09-02,2020-10-31,
1,2020-09-02,2,0,0,15123,2020-09-02,2020-10-31,1.0
2,2020-09-03,3,0,0,15123,2020-09-02,2020-10-31,1.0
3,2020-09-04,4,0,0,15123,2020-09-02,2020-10-31,1.0
4,2020-09-05,5,0,0,15123,2020-09-02,2020-10-31,
...,...,...,...,...,...,...,...,...
117,2020-12-27,6,16,8,15123,2020-09-02,2020-10-31,
118,2020-12-28,0,17,8,15123,2020-09-02,2020-10-31,
119,2020-12-29,1,17,8,15123,2020-09-02,2020-10-31,
120,2020-12-30,2,17,8,15123,2020-09-02,2020-10-31,


In [65]:
# Remote work, wplace_type = 1
print(init_df.tab_num.to_list()[3])

17345


In [66]:
# Row 3: every Monday-Friday day inside its period gets to_be_at_office = 0.
tab_num_index = 3
is_employee = dash_merged['tab_num'] == init_df.tab_num.to_list()[tab_num_index]
# Both period bounds are inclusive.
in_period = dash_merged['ymd_date'].between(
    start_date_dict[tab_num_index].to_datetime64(),
    finish_date_dict[tab_num_index].to_datetime64(),
)
# weekday 5 and 6 are Saturday and Sunday.
is_workday = ~dash_merged['weekday'].isin([5, 6])
dash_merged.loc[is_employee & in_period & is_workday, 'to_be_at_office'] = 0

In [67]:
dash_merged[dash_merged['tab_num'] == '17345']

Unnamed: 0,ymd_date,weekday,week,halfweek,tab_num,start_date,finish_date,to_be_at_office
0,2020-09-01,1,,,17345,NaT,NaT,
1,2020-09-02,2,,,17345,NaT,NaT,
2,2020-09-03,3,,,17345,NaT,NaT,
3,2020-09-04,4,,,17345,NaT,NaT,
4,2020-09-05,5,,,17345,NaT,NaT,
...,...,...,...,...,...,...,...,...
117,2020-12-27,6,16,8,17345,2020-10-26,2020-12-31,
118,2020-12-28,0,17,8,17345,2020-10-26,2020-12-31,0.0
119,2020-12-29,1,17,8,17345,2020-10-26,2020-12-31,0.0
120,2020-12-30,2,17,8,17345,2020-10-26,2020-12-31,0.0


In [68]:
# init_df[['wday_type01', 'wday_type02', 'wday_type03', 'wday_type04', 'wday_type05']][2:3].where('wday_type01 > 0') #.map({1 : 0, 1: 0})

In [None]:
# Mixed work, wplace_type = 2
print(init_df.tab_num.to_list()[2])

In [69]:
# Row 2: Mondays inside its period get to_be_at_office = 0.
tab_num_index = 2
is_employee = dash_merged['tab_num'] == init_df.tab_num.to_list()[tab_num_index]
# Both period bounds are inclusive.
in_period = dash_merged['ymd_date'].between(
    start_date_dict[tab_num_index].to_datetime64(),
    finish_date_dict[tab_num_index].to_datetime64(),
)
# weekday == 0 (Monday) already rules out Saturday (5) and Sunday (6).
is_monday = dash_merged['weekday'] == 0
dash_merged.loc[is_employee & in_period & is_monday, 'to_be_at_office'] = 0

In [70]:
# Row 2: Tuesday-Friday days inside its period get to_be_at_office = 1.
tab_num_index = 2
is_employee = dash_merged['tab_num'] == init_df.tab_num.to_list()[tab_num_index]
# The period bounds are inclusive, as in every other manual cell. The strict
# < / > comparison used here before would have skipped the first and the last
# day of the period whenever they fall on Tuesday-Friday.
in_period = dash_merged['ymd_date'].between(
    start_date_dict[tab_num_index].to_datetime64(),
    finish_date_dict[tab_num_index].to_datetime64(),
)
# weekdays 1..4 are Tuesday..Friday, which already excludes the weekend.
is_tue_to_fri = dash_merged['weekday'].isin([1, 2, 3, 4])
dash_merged.loc[is_employee & in_period & is_tue_to_fri, 'to_be_at_office'] = 1

In [70]:
# Mixed work, wplace_type = 2
print(init_df.tab_num.to_list()[1])

In [71]:
# Row 1: Monday, Tuesday and Friday inside its period get to_be_at_office = 1.
tab_num_index = 1
is_employee = dash_merged['tab_num'] == init_df.tab_num.to_list()[tab_num_index]
# Both period bounds are inclusive.
in_period = dash_merged['ymd_date'].between(
    start_date_dict[tab_num_index].to_datetime64(),
    finish_date_dict[tab_num_index].to_datetime64(),
)
# weekdays 0, 1, 4 never include Saturday (5) or Sunday (6).
is_selected_day = dash_merged['weekday'].isin([0, 1, 4])
dash_merged.loc[is_employee & in_period & is_selected_day, 'to_be_at_office'] = 1

In [72]:
# Row 1: Wednesday and Thursday inside its period get to_be_at_office = 0.
tab_num_index = 1
is_employee = dash_merged['tab_num'] == init_df.tab_num.to_list()[tab_num_index]
# Both period bounds are inclusive.
in_period = dash_merged['ymd_date'].between(
    start_date_dict[tab_num_index].to_datetime64(),
    finish_date_dict[tab_num_index].to_datetime64(),
)
# weekdays 2, 3 never include Saturday (5) or Sunday (6).
is_selected_day = dash_merged['weekday'].isin([2, 3])
dash_merged.loc[is_employee & in_period & is_selected_day, 'to_be_at_office'] = 0

In [73]:
dash_merged[dash_merged['tab_num'] == init_df.tab_num.to_list()[1]]

Unnamed: 0,ymd_date,weekday,week,halfweek,tab_num,start_date,finish_date,to_be_at_office
0,2020-09-01,1,0,0,16234,2020-09-20,2020-10-30,
1,2020-09-02,2,0,0,16234,2020-09-20,2020-10-30,
2,2020-09-03,3,0,0,16234,2020-09-20,2020-10-30,
3,2020-09-04,4,0,0,16234,2020-09-20,2020-10-30,
4,2020-09-05,5,0,0,16234,2020-09-20,2020-10-30,
...,...,...,...,...,...,...,...,...
117,2020-12-27,6,16,8,16234,2020-09-20,2020-10-30,
118,2020-12-28,0,17,8,16234,2020-09-20,2020-10-30,
119,2020-12-29,1,17,8,16234,2020-09-20,2020-10-30,
120,2020-12-30,2,17,8,16234,2020-09-20,2020-10-30,


In [None]:
# Mixed work, wplace_type = 3: alternating weeks
print(init_df.tab_num.to_list()[4])

In [74]:
# Row 4: Monday-Friday days of even-numbered weeks inside its period get
# to_be_at_office = 0.
tab_num_index = 4
is_employee = dash_merged['tab_num'] == init_df.tab_num.to_list()[tab_num_index]
# Both period bounds are inclusive.
in_period = dash_merged['ymd_date'].between(
    start_date_dict[tab_num_index].to_datetime64(),
    finish_date_dict[tab_num_index].to_datetime64(),
)
# Weekdays 0..4 only, i.e. not Saturday (5) and not Sunday (6).
is_workday = dash_merged['weekday'] < 5
is_even_week = dash_merged['week'] % 2 == 0
dash_merged.loc[is_employee & in_period & is_workday & is_even_week, 'to_be_at_office'] = 0

In [75]:
# Row 4: Monday-Friday days of odd-numbered weeks inside its period get
# to_be_at_office = 1.
tab_num_index = 4
is_employee = dash_merged['tab_num'] == init_df.tab_num.to_list()[tab_num_index]
# Both period bounds are inclusive.
in_period = dash_merged['ymd_date'].between(
    start_date_dict[tab_num_index].to_datetime64(),
    finish_date_dict[tab_num_index].to_datetime64(),
)
# Weekdays 0..4 only, i.e. not Saturday (5) and not Sunday (6).
is_workday = dash_merged['weekday'] < 5
is_odd_week = dash_merged['week'] % 2 == 1
dash_merged.loc[is_employee & in_period & is_workday & is_odd_week, 'to_be_at_office'] = 1

In [76]:
dash_merged[dash_merged['tab_num'] == init_df.tab_num.to_list()[4]]

Unnamed: 0,ymd_date,weekday,week,halfweek,tab_num,start_date,finish_date,to_be_at_office
0,2020-09-01,1,0,0,18456,2020-09-02,2020-09-30,
1,2020-09-02,2,0,0,18456,2020-09-02,2020-09-30,0.0
2,2020-09-03,3,0,0,18456,2020-09-02,2020-09-30,0.0
3,2020-09-04,4,0,0,18456,2020-09-02,2020-09-30,0.0
4,2020-09-05,5,0,0,18456,2020-09-02,2020-09-30,
...,...,...,...,...,...,...,...,...
117,2020-12-27,6,16,8,18456,2020-09-02,2020-09-30,
118,2020-12-28,0,17,8,18456,2020-09-02,2020-09-30,
119,2020-12-29,1,17,8,18456,2020-09-02,2020-09-30,
120,2020-12-30,2,17,8,18456,2020-09-02,2020-09-30,


In [None]:
# Mixed work, wplace_type = 4: one week in every two weeks
# (the original comment said wplace_type = 3, but the init_df row at
# position 5 has wplace_type 4)
print(init_df.tab_num.to_list()[5])

In [78]:
# Row 5: Monday-Friday days inside its period whose two-week block index
# (halfweek) is even get to_be_at_office = 0.
tab_num_index = 5
is_employee = dash_merged['tab_num'] == init_df.tab_num.to_list()[tab_num_index]
# Both period bounds are inclusive.
in_period = dash_merged['ymd_date'].between(
    start_date_dict[tab_num_index].to_datetime64(),
    finish_date_dict[tab_num_index].to_datetime64(),
)
# Weekdays 0..4 only, i.e. not Saturday (5) and not Sunday (6).
is_workday = dash_merged['weekday'] < 5
is_even_block = dash_merged['halfweek'] % 2 == 0
dash_merged.loc[is_employee & in_period & is_workday & is_even_block, 'to_be_at_office'] = 0

In [79]:
# Row 5: Monday-Friday days inside its period whose two-week block index
# (halfweek) is odd get to_be_at_office = 1.
tab_num_index = 5
is_employee = dash_merged['tab_num'] == init_df.tab_num.to_list()[tab_num_index]
# Both period bounds are inclusive.
in_period = dash_merged['ymd_date'].between(
    start_date_dict[tab_num_index].to_datetime64(),
    finish_date_dict[tab_num_index].to_datetime64(),
)
# Weekdays 0..4 only, i.e. not Saturday (5) and not Sunday (6).
is_workday = dash_merged['weekday'] < 5
is_odd_block = dash_merged['halfweek'] % 2 == 1
dash_merged.loc[is_employee & in_period & is_workday & is_odd_block, 'to_be_at_office'] = 1

In [80]:
dash_merged[dash_merged['tab_num'] == init_df.tab_num.to_list()[5]]

Unnamed: 0,ymd_date,weekday,week,halfweek,tab_num,start_date,finish_date,to_be_at_office
0,2020-09-01,1,0,0,19567,2020-09-02,2020-12-31,
1,2020-09-02,2,0,0,19567,2020-09-02,2020-12-31,0.0
2,2020-09-03,3,0,0,19567,2020-09-02,2020-12-31,0.0
3,2020-09-04,4,0,0,19567,2020-09-02,2020-12-31,0.0
4,2020-09-05,5,0,0,19567,2020-09-02,2020-12-31,
...,...,...,...,...,...,...,...,...
117,2020-12-27,6,16,8,19567,2020-09-02,2020-12-31,
118,2020-12-28,0,17,8,19567,2020-09-02,2020-12-31,0.0
119,2020-12-29,1,17,8,19567,2020-09-02,2020-12-31,0.0
120,2020-12-30,2,17,8,19567,2020-09-02,2020-12-31,0.0


In [81]:
# TO-DO поменять формат даты

### Результат

In [14]:
# Keep only the output columns, ordered by employee and then by date,
# with a fresh 0..n-1 index.
result = (
    dash_merged.loc[:, ['tab_num', 'ymd_date', 'to_be_at_office']]
    .sort_values(by=['tab_num', 'ymd_date'])
    .reset_index(drop=True)
)

In [17]:
# Render the dates as "dd.mm.yyyy" text.
# dayfirst=True has no effect while ymd_date is still datetime, but it makes
# the cell safe to re-run: once the column holds "dd.mm.yyyy" strings, a plain
# pd.to_datetime would read e.g. "01.09.2020" month-first (as 9 January).
result['ymd_date'] = pd.to_datetime(result['ymd_date'], dayfirst=True).dt.strftime("%d.%m.%Y")

In [18]:
result

Unnamed: 0,tab_num,ymd_date,to_be_at_office
0,15123,01.09.2020,
1,15123,02.09.2020,1.0
2,15123,03.09.2020,1.0
3,15123,04.09.2020,1.0
4,15123,05.09.2020,
...,...,...,...
605,19567,27.12.2020,
606,19567,28.12.2020,
607,19567,29.12.2020,
608,19567,30.12.2020,


### Автоматическая обработка

In [1]:
import pickle
import pandas as pd

In [2]:
# Load the two frames saved by the first part of the notebook.
# NOTE: pickle.load executes arbitrary code from the file, so only load
# pickles produced by this notebook itself.
with open("dash_merged.pickle", "rb") as dash_file, open("init_df.pickle", "rb") as init_file:
    dash_merged_from_file = pickle.load(dash_file)
    init_df_from_file = pickle.load(init_file)

In [4]:
init_df_from_file.shape, dash_merged_from_file.shape

((6, 10), (610, 8))

In [5]:
from rule_parser import RuleParser


In [6]:
rp = RuleParser(data=dash_merged_from_file, init_table=init_df_from_file)

In [7]:
rp.data

Unnamed: 0,ymd_date,weekday,week,halfweek,tab_num,start_date,finish_date,to_be_at_office
0,2020-09-01,1,0,0,15123,2020-09-02,2020-10-31,
1,2020-09-02,2,0,0,15123,2020-09-02,2020-10-31,
2,2020-09-03,3,0,0,15123,2020-09-02,2020-10-31,
3,2020-09-04,4,0,0,15123,2020-09-02,2020-10-31,
4,2020-09-05,5,0,0,15123,2020-09-02,2020-10-31,
...,...,...,...,...,...,...,...,...
117,2020-12-27,6,16,8,17345,2020-10-26,2020-12-31,
118,2020-12-28,0,17,8,17345,2020-10-26,2020-12-31,
119,2020-12-29,1,17,8,17345,2020-10-26,2020-12-31,
120,2020-12-30,2,17,8,17345,2020-10-26,2020-12-31,


In [None]:
# Run the parser for row 0 of the source table; presumably this fills
# to_be_at_office in rp.data — verify against rule_parser.RuleParser.
# NOTE(review): `transfrom` looks like a misspelling of `transform`; the name
# must match the method defined in rule_parser, so confirm before renaming.
rp.transfrom(tab_num_index=0)
# rp.transfrom(tab_num_index=3)
# rp.transfrom(tab_num_index=4)


In [None]:
# This section starts from a fresh kernel in which only the unpickled frames
# exist, so the employee is looked up in init_df_from_file; the name init_df
# is not defined here and would raise NameError on Restart & Run All.
rp.data[rp.data['tab_num'] == init_df_from_file.tab_num.to_list()[4]]