In [179]:
from pathlib import Path
import pandas as pd
import pyarrow as pa
import numpy as np
import re
from tqdm.notebook import tqdm
from datetime import datetime

In [177]:
def _save_df_to_zip(df_: pd.DataFrame, archive_name: str = 'archive', folder: str='data', replace: bool=False) -> None:
    # Путь к файлу
    file_path = Path(folder).joinpath(archive_name + '.zip')
    Path(folder).mkdir(exist_ok=True)
    # Проверяем, существует ли файл
    if file_path.exists() and not replace:
        # Получаем время создания файла
        time = datetime.fromtimestamp(file_path.lstat().st_atime).strftime('%Y-%m-%d %H-%M')

        # Создаем новое имя файла с добавлением времени Unix
        new_file_name = file_path.stem + " " + str(time) + file_path.suffix

        # Создаем новый путь для переименованного файла
        new_file_path = file_path.with_name(new_file_name)
        # Переименовываем файл
        file_path.rename(new_file_path)

# to csv
    compression_opts = dict(method='zip', archive_name=f'{archive_name}.csv')
    df_.to_csv(f'{folder}/{archive_name}.zip', index=False, compression=compression_opts, encoding='utf-8')

In [33]:
# Collect the archives named "result <4 digits>.zip" from the data folder.
p = Path('data')
if p.is_dir():
    # fullmatch + escaped dot: accept exactly "result NNNN.zip", not names that
    # merely start that way. Sorted because iterdir() yields entries in
    # arbitrary order, which would make the concatenated row order vary.
    dataframes_list = sorted(
        entry for entry in p.iterdir()
        if re.fullmatch(r'result \d{4}\.zip', entry.name)
    )
else:
    # Always bind the name so later cells do not fail with a NameError.
    dataframes_list = []

In [161]:
# Explicit pandas nullable dtypes passed to read_csv for the non-date columns.
# Each entry pairs a column name with the dtype class instantiated for it.
columns_types = {
    column_name: dtype_class()
    for column_name, dtype_class in (
        ('sity', pd.StringDtype),
        ('market', pd.StringDtype),
        ('discounts', pd.UInt32Dtype),
        ('data_uuid', pd.StringDtype),
        ('href', pd.StringDtype),
        ('su1', pd.StringDtype),
        ('su2', pd.StringDtype),
        ('su3', pd.StringDtype),
        ('title', pd.StringDtype),
        ('sku_id', pd.StringDtype),
        ('price', pd.UInt32Dtype),
        ('price_from', pd.UInt32Dtype),
        ('price_to', pd.UInt32Dtype),
        ('discountPercent', pd.UInt32Dtype),
        ('quantity', pd.Float32Dtype),
        ('quantityUnit', pd.StringDtype),
    )
}

In [174]:
# Read every selected archive with the shared dtype mapping, parsing the three
# date columns, and stack the pieces row-wise under a new 0..n-1 index.
df = pd.concat(
    [
        pd.read_csv(archive, parse_dates=['date', 'dateStart', 'dateEnd'], dtype=columns_types)
        for archive in tqdm(dataframes_list, desc='Load:')
    ],
    axis=0,
    ignore_index=True,
)

Load::   0%|          | 0/34 [00:00<?, ?it/s]

In [175]:
# Print the column dtypes and memory footprint of the combined frame.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16482493 entries, 0 to 16482492
Data columns (total 19 columns):
 #   Column           Dtype         
---  ------           -----         
 0   date             datetime64[ns]
 1   sity             string        
 2   market           string        
 3   discounts        UInt32        
 4   data_uuid        string        
 5   href             string        
 6   su1              string        
 7   su2              string        
 8   su3              string        
 9   title            string        
 10  sku_id           string        
 11  dateStart        datetime64[ns]
 12  dateEnd          datetime64[ns]
 13  price            UInt32        
 14  price_from       UInt32        
 15  price_to         UInt32        
 16  discountPercent  UInt32        
 17  quantity         Float32       
 18  quantityUnit     string        
dtypes: Float32(1), UInt32(5), datetime64[ns](3), string(10)
memory usage: 2.1 GB


In [180]:
# Persist the combined frame as the "result" archive, using the helper's
# default folder and default replace setting.
_save_df_to_zip(df, archive_name='result')