In [1]:
import os, glob, math, json, warnings
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 200)

In [3]:

pip install pyarrow fastparquet

Collecting pyarrow
  Downloading pyarrow-17.0.0-cp38-cp38-win_amd64.whl.metadata (3.4 kB)
Collecting fastparquet
  Downloading fastparquet-2024.2.0-cp38-cp38-win_amd64.whl.metadata (4.2 kB)
Collecting pandas>=1.5.0 (from fastparquet)
  Downloading pandas-2.0.3-cp38-cp38-win_amd64.whl.metadata (18 kB)
Collecting numpy>=1.16.6 (from pyarrow)
  Downloading numpy-1.24.4-cp38-cp38-win_amd64.whl.metadata (5.6 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.11.0-cp38-cp38-win_amd64.whl.metadata (681 bytes)
Collecting tzdata>=2022.1 (from pandas>=1.5.0->fastparquet)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pyarrow-17.0.0-cp38-cp38-win_amd64.whl (25.2 MB)
   ---------------------------------------- 25.2/25.2 MB 31.2 MB/s eta 0:00:00
Downloading fastparquet-2024.2.0-cp38-cp38-win_amd64.whl (671 kB)
   --------------------------------------- 671.0/671.0 kB 21.3 MB/s eta 0:00:00
Downloading cramjam-2.11.0-cp38-cp38-win_amd64.whl (1.7 MB)

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scipy 1.6.2 requires numpy<1.23.0,>=1.16.5, but you have numpy 1.24.4 which is incompatible.

[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
C:\Users\buzan\OneDrive\Рабочий стол\Проект\big-data-tech-in-fin-manage\data\train_data\train_data\train_data_0.pq

In [2]:
DATA_DIR = r"../data/train_data"  # путь из твоей папки jupiter/
FILES = sorted(glob.glob(os.path.join(DATA_DIR, "*.pq")))
assert FILES, f"Не нашли .pq в {DATA_DIR}"

def read_parquet_safe(path, columns=None):
    try:
        return pd.read_parquet(path, columns=columns, engine="fastparquet")
    except Exception:
        return pd.read_parquet(path, columns=columns, engine="pyarrow")


In [3]:
# берём первый файл как «эталон» схемы
df0 = read_parquet_safe(FILES[0])
schema = pd.DataFrame({
    "column": df0.columns,
    "dtype": [df0[c].dtype for c in df0.columns]
})
print("Колонок:", len(schema))
display(schema.head(20))


Колонок: 61


Unnamed: 0,column,dtype
0,id,int64
1,rn,int64
2,pre_since_opened,int64
3,pre_since_confirmed,int64
4,pre_pterm,int64
5,pre_fterm,int64
6,pre_till_pclose,int64
7,pre_till_fclose,int64
8,pre_loans_credit_limit,int64
9,pre_loans_next_pay_summ,int64


In [7]:
# Сэмплим N строк равномерно из всех файлов (по доле). Можно поставить None — тогда читаем всё.
TARGET_SAMPLE = 2_000_000   # ≈ 2 млн строк для быстрой разведки
USE_SAMPLE = True           # False — читать всё (осторожно!)

# предварительно оценим общий размер по файлам (грубая оценка)
sizes = []
for fp in FILES:
    try:
        sizes.append(read_parquet_safe(fp).shape[0])
    except Exception:
        sizes.append(0)
total_rows_est = int(np.sum(sizes))
print(f"Оценка общего числа строк: {total_rows_est:,}")

if not USE_SAMPLE or TARGET_SAMPLE is None or TARGET_SAMPLE >= total_rows_est:
    sample_perc = 1.0
else:
    sample_perc = TARGET_SAMPLE / max(total_rows_est, 1)
print(f"Доля выборки: {sample_perc:.4f}")


Оценка общего числа строк: 26,162,717
Доля выборки: 0.0764


In [23]:
rng = np.random.default_rng(42)
parts = []
for fp, nrows in zip(FILES, sizes):
    if nrows == 0:
        continue
    frac = 1.0 if not USE_SAMPLE else min(1.0, sample_perc * total_rows_est / nrows)  # ровнее распределит
    frac = min(frac, 1.0)
    df_part = read_parquet_safe(fp)
    if frac < 0.9999:
        take = max(1, int(round(nrows * frac)))
        # быстрая равномерная выборка без replace
        idx = rng.choice(nrows, size=take, replace=False)
        df_part = df_part.iloc[idx]
    parts.append(df_part)

df = pd.concat(parts, axis=0, ignore_index=True)
print("Выборка для EDA:", df.shape)
display(df.head())


Выборка для EDA: (23974724, 61)


Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,pre_loans_outstanding,pre_loans_total_overdue,pre_loans_max_overdue_sum,pre_loans_credit_cost_rate,pre_loans5,pre_loans530,pre_loans3060,pre_loans6090,pre_loans90,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,pre_util,pre_over2limit,pre_maxover2limit,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,enc_paym_0,enc_paym_1,enc_paym_2,enc_paym_3,enc_paym_4,enc_paym_5,enc_paym_6,enc_paym_7,enc_paym_8,enc_paym_9,enc_paym_10,enc_paym_11,enc_paym_12,enc_paym_13,enc_paym_14,enc_paym_15,enc_paym_16,enc_paym_17,enc_paym_18,enc_paym_19,enc_paym_20,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,0,1,18,9,2,3,16,10,11,3,3,0,2,11,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0,0,3,3,3,3,3,3,3,3,3,4,3,3,3,3,3,3,3,3,4,3,3,3,4,1,3,4,1,0,0
1,0,2,18,9,14,14,12,12,0,3,3,0,2,11,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,4,1,3,4,1,0,0
2,0,3,18,9,4,8,1,11,11,0,5,0,2,8,6,16,5,4,8,1,1,1,1,1,15,2,17,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,4,1,2,3,1,1,1
3,0,4,4,1,9,12,16,7,12,2,3,0,2,4,6,16,5,4,8,0,1,1,1,1,16,2,17,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,3,3,3,3,3,3,3,3,4,3,3,3,4,1,3,1,1,0,0
4,0,5,5,12,15,2,11,12,10,2,3,0,2,4,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0,0,0,0,0,0,0,3,3,3,3,4,3,3,3,3,3,3,3,3,4,3,3,3,4,1,3,4,1,0,0
