- Здесь на основе собранных данных собираем базу данных сегментов/категорий.
- После этого происходит обновление/дополнение предыдущей версии сегментов/категорий.
- Тут можно запускать сразу весь ноутбук.

In [1]:
from datetime import datetime
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
from pathlib import Path
import requests
from requests.exceptions import ConnectTimeout
from datetime import datetime
from pytz import timezone
tqdm.pandas()
requests.packages.urllib3.disable_warnings()

In [2]:
start = datetime.strftime(datetime.now(timezone('Europe/Moscow')), '%Y-%b-%d %H:%M:%S')

In [3]:
def _save_df_to_zip(df_: pd.DataFrame, archive_name: str = 'archive', folder: str='data', replace: bool=False) -> None:
    # Путь к файлу
    file_path = Path(folder).joinpath(archive_name + '.zip')
    Path(folder).mkdir(exist_ok=True)
    # Проверяем, существует ли файл
    if file_path.exists() and not replace:
        # Получаем время создания файла
        time = datetime.fromtimestamp(file_path.lstat().st_atime).strftime('%Y-%m-%d %H-%M')

        # Создаем новое имя файла с добавлением времени Unix
        new_file_name = file_path.stem + " " + str(time) + file_path.suffix

        # Создаем новый путь для переименованного файла
        new_file_path = file_path.with_name(new_file_name)
        # Переименовываем файл
        file_path.rename(new_file_path)

# to csv
    compression_opts = dict(method='zip', archive_name=f'{archive_name}.csv')
    df_.to_csv(f'{folder}/{archive_name}.zip', index=False, compression=compression_opts, encoding='utf-8')

In [4]:
loc_df = pd.read_csv('data/located_list.zip')
ret_df = pd.read_csv('data/retailers.zip')

In [5]:
segments_list = list()
url = 'https://search.edadeal.io/api/v4/search'
with tqdm(total=ret_df.shape[0], desc='Зашрузка:') as pbar:
    for index, item in ret_df.iterrows():
        
        res = loc_df[loc_df['slug'] == item.sity].min()
    
        headers = {
        'x-locality-geoid': str(res.geoId),
        'x-position-latitude': f'{res.lat:.5f}',
        'x-position-longitude': f'{res.lng:.5f}'
        }
        
        params = {
            'groupBy': ['meta'],
             'noContent': ['true'],
             'page': ['0'],
             'retailerUuid': [item.data_uuid]
        }
        
        for _ in range(10):
            try:
                respons = requests.get(url, headers=headers, params=params, verify=False).json()
            except ConnectTimeout:
                continue
            else:
                break
            
        segments_list.append((item.sity, item.data_uuid, respons))
        pbar.update(1)


Зашрузка::   0%|          | 0/5354 [00:00<?, ?it/s]

In [6]:
%%time
df = pd.DataFrame(segments_list, columns=['sity', 'market', 'tmp'])
df = (df['tmp']
      .apply(lambda x: x.get('entities').get('segments'))
      .explode()
      .dropna()
      .apply(lambda x: pd.Series(x))
      .drop('count',axis=1)
      .drop_duplicates(ignore_index=True)
      )
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1049 entries, 0 to 1048
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   uuid        1049 non-null   object
 1   slug        1049 non-null   object
 2   level       1049 non-null   int64 
 3   name        1049 non-null   object
 4   parentUuid  1028 non-null   object
dtypes: int64(1), object(4)
memory usage: 41.1+ KB
CPU times: total: 1min 13s
Wall time: 1min 13s


In [7]:
segment_old = pd.read_csv('data/segments_id.zip')
print('Старая таблица:', segment_old.shape)
df = pd.concat((segment_old, df), axis=0, ignore_index=True).drop_duplicates(keep='last', ignore_index=True)
_save_df_to_zip(df, 'segments_id')

Старая таблица: (1067, 5)


In [8]:
lev_00 = df.copy()
lev_00.shape

(1082, 5)

In [9]:
lev_03 = lev_00[lev_00['level'] == 3].reset_index(drop=True).copy().drop('level', axis=1)
lev_02 = lev_00[lev_00['level'] == 2].reset_index(drop=True).copy().drop('level', axis=1)
lev_01 = lev_00[lev_00['level'] == 1].reset_index(drop=True).copy().drop('level', axis=1)

In [10]:
lev_1 = lev_03.merge(
                        right=lev_02,
                        how='left',
                        left_on='parentUuid',
                        right_on='uuid',
                        suffixes=(' level 03', ' level 02')
                    )
lev_2 = lev_1.merge(
                        right=lev_01,
                        how='left',
                        left_on='parentUuid level 02',
                        right_on='uuid'
                    ).drop(['parentUuid level 03', 'parentUuid level 02', 'parentUuid'], axis=1)
lev_df = lev_2[[
                    'slug',
                    'name',
                    'uuid',
                    'slug level 02',
                    'name level 02',
                    'uuid level 02',
                    'slug level 03',
                    'name level 03',
                    'uuid level 03',
                ]]

In [11]:
_save_df_to_zip(lev_df, 'segments')
# lev_df.to_excel('data/segments.xlsx', index=False)

In [12]:
end = datetime.strftime(datetime.now(timezone('Europe/Moscow')), '%Y-%b-%d %H:%M:%S')
print(f'Начало: {start}')
print(f'Конец: {end}')

Начало: 2023-Oct-03 10:11:21
Конец: 2023-Oct-03 10:23:04
