# Исследование и обогащение данных

In [1]:
import os
import pandas as pd
import json
import time
from tqdm import tqdm
import numpy as np
import gurobipy

from collections import defaultdict

## Анализ уникальных операций для всех печей

In [31]:
path = 'data/train/'
fnames = sorted(list(filter(lambda x: x.split('.')[1] == 'json',os.listdir(path))))

## Количество печей по дням

In [22]:
ovens_qty = defaultdict()

for fn in fnames:
    full_fn = os.path.join(path, fn)
    with open(full_fn, 'r') as f:
        j = json.load(f)
        ovens_qty[fn] = len(j['ovens'])

In [24]:
ovens_qty.values()

dict_values([422, 467, 336, 296, 240, 271, 380, 413, 328, 290, 463, 348, 224, 490, 468, 215, 296, 209, 491, 310, 396, 230, 338, 266, 227, 378, 298, 255, 382, 402, 423, 478, 495, 214, 294, 204, 365, 436, 329, 424, 233, 378, 407, 495, 345, 393, 327, 229, 262, 339, 455, 369, 470, 240, 223, 307, 287, 383, 405, 318, 211, 266, 264, 258, 300, 221, 441, 336, 227, 227, 304, 470, 422, 313, 500, 488, 465, 456, 387, 498, 463, 301, 469, 354, 325, 246, 202, 278, 411, 482, 257, 260, 298, 379, 288, 492, 396, 461, 431, 265])

In [39]:
pd.Series(ovens_qty.values()).mean()

347.68

## Количество серий по дням

In [25]:
series_qty = defaultdict()

for fn in fnames:
    full_fn = os.path.join(path, fn)
    with open(full_fn, 'r') as f:
        j = json.load(f)
        series_qty[fn] = len(j['series'])

In [27]:
series_qty.values()

dict_values([2594, 1697, 1998, 2375, 2873, 2556, 1997, 2250, 1770, 1603, 2401, 2149, 2729, 2063, 1594, 1759, 1603, 1842, 2994, 1680, 2083, 2530, 1869, 1679, 2073, 1796, 1972, 1506, 2061, 2387, 2017, 1716, 1735, 1915, 2299, 2873, 1998, 2573, 2745, 2351, 1580, 1770, 2138, 2137, 1761, 2698, 2710, 2717, 2661, 1901, 1951, 1944, 1934, 1677, 2127, 1972, 2836, 1919, 1963, 2556, 2744, 2759, 2263, 2497, 1661, 1553, 2063, 2149, 1816, 1860, 1901, 2477, 2723, 2656, 1751, 1581, 2877, 1947, 2319, 2837, 1589, 2964, 1574, 2516, 1888, 1789, 1631, 1989, 2111, 2995, 1686, 2810, 1997, 1617, 1959, 2794, 2474, 2340, 1577, 2861])

In [38]:
pd.Series(series_qty.values()).mean()

2152.52

## Соотношение количества серий к количеству печей по дням

In [33]:
series_ratio = defaultdict()

for fn in ovens_qty:
    series_ratio[fn] = round((series_qty[fn] / ovens_qty[fn]), 2)

In [34]:
series_ratio.values()

dict_values([6.15, 3.63, 5.95, 8.02, 11.97, 9.43, 5.26, 5.45, 5.4, 5.53, 5.19, 6.18, 12.18, 4.21, 3.41, 8.18, 5.42, 8.81, 6.1, 5.42, 5.26, 11.0, 5.53, 6.31, 9.13, 4.75, 6.62, 5.91, 5.4, 5.94, 4.77, 3.59, 3.51, 8.95, 7.82, 14.08, 5.47, 5.9, 8.34, 5.54, 6.78, 4.68, 5.25, 4.32, 5.1, 6.87, 8.29, 11.86, 10.16, 5.61, 4.29, 5.27, 4.11, 6.99, 9.54, 6.42, 9.88, 5.01, 4.85, 8.04, 13.0, 10.37, 8.57, 9.68, 5.54, 7.03, 4.68, 6.4, 8.0, 8.19, 6.25, 5.27, 6.45, 8.49, 3.5, 3.24, 6.19, 4.27, 5.99, 5.7, 3.43, 9.85, 3.36, 7.11, 5.81, 7.27, 8.07, 7.15, 5.14, 6.21, 6.56, 10.81, 6.7, 4.27, 6.8, 5.68, 6.25, 5.08, 3.66, 10.8])

In [40]:
pd.Series(series_ratio.values()).mean()

6.658499999999999

## Уникальные значения по операциям на печах

In [41]:
df_op = pd.DataFrame()

In [42]:
for fn in fnames:
    full_fn = os.path.join(path,fn)
    with open(full_fn, 'r') as f:
        j = json.load(f)
    df_op = pd.concat([df_op, pd.json_normalize(j['ovens']).\
        assign(day=fn.split('.')[0]).\
        explode(column = 'operations').\
        explode(column = 'working_temps')])
df_op

Unnamed: 0,start_temp,working_temps,operations,day
0,1220,960,prokat,day-0
0,1220,1030,prokat,day-0
0,1220,1190,prokat,day-0
0,1220,1100,prokat,day-0
0,1220,1230,prokat,day-0
...,...,...,...,...
264,1190,1120,otzhig,day-99
264,1190,1000,otzhig,day-99
264,1190,1040,otzhig,day-99
264,1190,1210,otzhig,day-99


In [43]:
df['operations'].unique()

array(['prokat', 'kovka', 'otzhig'], dtype=object)

## Уникальные значения по операциям в сериях

In [58]:
series_set = set()

In [59]:
for fn in fnames:
    full_fn = os.path.join(path,fn)
    with open(full_fn, 'r') as f:
        j = json.load(f)
        for series in j['series']:
            for op in series['operations']:
                series_set.add(op['name'])


In [60]:
series_set

{'kovka', 'nagrev', 'otzhig', 'prokat'}

## Проверка гипотезы

**Гипотеза:** каждая из запланированных серий начинается с операции ***nagrev*** 

In [43]:
exception_list = []

for fn in tqdm(fnames):
    time.sleep(1)
    full_fn = os.path.join(path,fn)
    with open(full_fn, 'r') as f:
        j = json.load(f)
        for idx, ser in enumerate(j['series']):
            if ser['operations'][0]['name'] != 'nagrev':
                exception_list.append((fn, idx))

if exception_list:
    print(exception_list)
else:
    print('No exception')

100%|███████████████████████████████████████████████████████████| 100/100 [01:41<00:00,  1.02s/it]

No exception





## Обогащение данных

### Ovens enrichment test

In [91]:
test_dict = {
    "ovens": [
        {
            "start_temp": 1220,
            "working_temps": [
                960,
                1030,
                1190,
                1100,
                1230,
                1070,
                1110,
                1080,
                1240,
                1220
            ],
            "operations": [
                "prokat",
                "kovka"
            ]
        },
        {
            "start_temp": 990,
            "working_temps": [
                980,
                1190,
                1030,
                1100,
                990
            ],
            "operations": [
                "otzhig",
                "kovka"
            ]
        },
        {
            "start_temp": 1220,
            "working_temps": [
                970,
                1060,
                1220
            ],
            "operations": [
                "otzhig"
            ]
        }
    ]
}

In [92]:
for oven in test_dict['ovens']:
    oven['operations'].append('nagrev')

### Operation enrichment test

In [93]:
test_list = [
                {
                    "name": "nagrev",
                    "timing": 296
                },
                {
                    "name": "kovka",
                    "timing": 10
                },
                {
                    "name": "kovka",
                    "timing": 10
                },
                {
                    "name": "prokat",
                    "timing": 10
                }
            ]

In [94]:
enriched_list = []

In [55]:
for i in range(1, len(test_list)):
    enriched_list.append(test_list[i - 1])
    if (test_list[i]['name'] == 'kovka' or test_list[i]['name'] == 'prokat') and test_list[i - 1]['name'] != 'nagrev':
        enriched_list.append(
            {
                "name": "nagrev",
                "timing": 120
            }
        )
    if i == len(test_list) - 1:
        enriched_list.append(test_list[i])

In [56]:
enriched_list

[{'name': 'nagrev', 'timing': 296},
 {'name': 'kovka', 'timing': 10},
 {'name': 'nagrev', 'timing': 120},
 {'name': 'kovka', 'timing': 10},
 {'name': 'nagrev', 'timing': 120},
 {'name': 'prokat', 'timing': 10}]

### Ovens and Operation enrichment

In [89]:
enriched_path = 'data/enriched_train/'

for fn in tqdm(fnames):
    full_fn = os.path.join(path,fn)
    with open(full_fn, 'r') as f:
        j = json.load(f)
        
        for idx, oven in enumerate(j['ovens']):
            j['ovens'][idx]['operations'].append('nagrev')
        
        for idx, series in enumerate(j['series']):
            enriched_list = []
            for k in range(1, len(series['operations'])):
                enriched_list.append(series['operations'][k - 1])
                if (
                    series['operations'][k]['name'] == 'kovka' or series['operations'][k]['name'] == 'prokat'
                ) and series['operations'][k - 1]['name'] != 'nagrev':
                    enriched_list.append(
                        {
                            "name": "nagrev",
                            "timing": 120
                        }
                    )
                if k == len(series['operations']) - 1:
                    enriched_list.append(series['operations'][k])
                    
            j['series'][idx]['operations'] = enriched_list
        
        with open(enriched_path + 'enriched_' + fn, 'w') as json_file:
            json.dump(j, json_file, indent=4) 

100%|███████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 21.22it/s]
