In [1]:
import numpy as np
import polars as pl
import os
from collections import defaultdict
from tqdm import tqdm
import zipfile
import gc

In [2]:
# Glob patterns for the four embedding shards (emb1..emb4) of the
# "ozon-embeddings" Kaggle input dataset; each matches every parquet file
# in its directory.
folder1, folder2, folder3, folder4 = (
    "/kaggle/input/ozon-embeddings/emb1/*.parquet",
    "/kaggle/input/ozon-embeddings/emb2/*.parquet",
    "/kaggle/input/ozon-embeddings/emb3/*.parquet",
    "/kaggle/input/ozon-embeddings/emb4/*.parquet",
)

In [3]:
# Open every shard as a lazy scan; the rows are only materialised by the
# `.collect()` call made later for each chunk.
embs1, embs2, embs3, embs4 = (
    pl.scan_parquet(pattern)
    for pattern in (folder1, folder2, folder3, folder4)
)

# Single lazy frame covering all four shards, concatenated in shard order.
embs = pl.concat([embs1, embs2, embs3, embs4])

In [4]:
# Directory holding the item-feature chunks that the embeddings get joined to.
chunks_dir_path = "/kaggle/input/item-feats-files-including-the-feats-themselves/ultra_mega_chunks"

# Every entry of the directory, then only the parquet files as full paths.
# The sort is lexicographic on the path string, which fixes the processing
# order (and therefore the numbering of the output files).
filepaths = os.listdir(chunks_dir_path)
chunks_paths = sorted(
    os.path.join(chunks_dir_path, entry)
    for entry in filepaths
    if entry.endswith(".parquet")
)

In [None]:
def _average_embeddings(index, vectors_list, tolerance=1e-3):
    """Collapse every embedding vector found for one item into a single vector.

    Args:
        index: Item identifier; only used in the warning message.
        vectors_list: Non-empty list of equal-length embedding vectors.
        tolerance: L2 distance from the mean above which a warning is printed.

    Returns:
        The lone vector, unchanged, when there is exactly one. Otherwise the
        element-wise mean, accumulated in float64 and returned as a list of
        float32-rounded values.
    """
    if len(vectors_list) == 1:
        return vectors_list[0]

    stacked = np.asarray(vectors_list, dtype=np.float64)
    mean_vector = stacked.mean(axis=0)

    # Largest L2 distance of any duplicate from the mean: duplicates are
    # expected to be (near-)identical, so a large value is worth reporting.
    max_diff = np.linalg.norm(stacked - mean_vector, axis=1).max()
    if max_diff > tolerance:
        print(f"WARNING Item {index}: max diff from mean = {max_diff:.6f}")

    return mean_vector.astype(np.float32).tolist()


# For every feature chunk: pull the text embeddings of its items, reduce
# duplicates to one vector per item, append that vector to the chunk's
# `full_embed` vector and write the result as `mega_chunk_<n>.parquet`.
for chunk_n, chunk_path in enumerate(tqdm(chunks_paths)):
    chunk = pl.read_parquet(chunk_path)
    chunk_ids = chunk['item_id'].to_list()

    # Filter before sorting so that only this chunk's rows are sorted,
    # instead of asking for a sort of the whole embedding set each iteration.
    raw_embeddings = (
        embs
        .filter(pl.col('index').is_in(chunk_ids))
        .sort('index')
        .collect()
    )

    # The same item may occur in more than one embedding row; gather all of
    # its vectors under its id.
    grouped_embeddings = defaultdict(list)
    for row in raw_embeddings.iter_rows(named=True):
        grouped_embeddings[row['index']].append(row['embedding'])

    if not grouped_embeddings:
        # An empty record list would give a DataFrame without any columns and
        # the join on 'index' below would fail, so skip the chunk explicitly.
        print(f"WARNING Chunk {chunk_n}: no embeddings matched, chunk was skipped")
        del chunk, chunk_ids, raw_embeddings, grouped_embeddings
        gc.collect()
        continue

    text_embeds_data = [
        {'index': index, 'embedding': _average_embeddings(index, vectors_list)}
        for index, vectors_list in grouped_embeddings.items()
    ]
    text_embeds = pl.DataFrame(text_embeds_data)

    # Inner join: items of the chunk that have no embedding are dropped.
    result = (
        chunk.lazy()
        .join(
            text_embeds.lazy(),
            left_on='item_id',
            right_on='index',
            how='inner'
        )
        .select([
            pl.col('item_id').cast(pl.Int32),
            pl.concat_list([
                pl.col('full_embed').cast(pl.List(pl.Float32)),
                pl.col('embedding').cast(pl.List(pl.Float32))
            ]).alias('concatenated_vector')
        ])
        .collect()
    )

    result.write_parquet(f"mega_chunk_{chunk_n}.parquet")
    print(f"Chunk {chunk_n} was saved")

    # Release everything chunk-sized (including the chunk itself) before the
    # next file is read, so two chunks are never held in memory at once.
    del chunk, chunk_ids, raw_embeddings, grouped_embeddings, text_embeds_data, text_embeds, result
    gc.collect()

  1%|          | 1/100 [00:53<1:27:45, 53.19s/it]

Chunk 0 was saved


  2%|▏         | 2/100 [02:05<1:45:19, 64.48s/it]

Chunk 1 was saved


  3%|▎         | 3/100 [02:50<1:29:33, 55.39s/it]

Chunk 2 was saved


  4%|▍         | 4/100 [03:36<1:22:43, 51.71s/it]

Chunk 3 was saved


  5%|▌         | 5/100 [04:23<1:19:08, 49.98s/it]

Chunk 4 was saved


  6%|▌         | 6/100 [05:10<1:16:39, 48.93s/it]

Chunk 5 was saved


  7%|▋         | 7/100 [05:53<1:13:05, 47.16s/it]

Chunk 6 was saved


  8%|▊         | 8/100 [06:36<1:10:10, 45.76s/it]

Chunk 7 was saved


  9%|▉         | 9/100 [07:19<1:08:20, 45.06s/it]

Chunk 8 was saved


 10%|█         | 10/100 [08:06<1:08:19, 45.55s/it]

Chunk 9 was saved


 11%|█         | 11/100 [08:50<1:06:50, 45.06s/it]

Chunk 10 was saved


 12%|█▏        | 12/100 [09:34<1:05:28, 44.64s/it]

Chunk 11 was saved


 13%|█▎        | 13/100 [10:16<1:03:48, 44.00s/it]

Chunk 12 was saved


 14%|█▍        | 14/100 [11:00<1:03:11, 44.09s/it]

Chunk 13 was saved


 15%|█▌        | 15/100 [11:46<1:03:08, 44.57s/it]

Chunk 14 was saved


 16%|█▌        | 16/100 [12:44<1:07:51, 48.47s/it]

Chunk 15 was saved


 17%|█▋        | 17/100 [13:28<1:05:31, 47.36s/it]

Chunk 16 was saved


 18%|█▊        | 18/100 [14:15<1:04:19, 47.07s/it]

Chunk 17 was saved


 19%|█▉        | 19/100 [15:00<1:02:38, 46.40s/it]

Chunk 18 was saved


 20%|██        | 20/100 [15:43<1:00:47, 45.59s/it]

Chunk 19 was saved


 21%|██        | 21/100 [16:26<59:00, 44.82s/it]  

Chunk 20 was saved


 22%|██▏       | 22/100 [17:10<57:52, 44.52s/it]

Chunk 21 was saved


 23%|██▎       | 23/100 [17:54<57:02, 44.44s/it]

Chunk 22 was saved


 24%|██▍       | 24/100 [18:40<56:43, 44.79s/it]

Chunk 23 was saved


 25%|██▌       | 25/100 [19:24<55:36, 44.49s/it]

Chunk 24 was saved


 26%|██▌       | 26/100 [20:10<55:39, 45.12s/it]

Chunk 25 was saved


 27%|██▋       | 27/100 [20:56<55:09, 45.33s/it]

Chunk 26 was saved


 28%|██▊       | 28/100 [21:41<54:00, 45.01s/it]

Chunk 27 was saved


 29%|██▉       | 29/100 [22:39<58:02, 49.05s/it]

Chunk 28 was saved


 30%|███       | 30/100 [23:24<55:56, 47.95s/it]

Chunk 29 was saved


 31%|███       | 31/100 [24:10<54:20, 47.25s/it]

Chunk 30 was saved


 32%|███▏      | 32/100 [24:54<52:17, 46.14s/it]

Chunk 31 was saved


 33%|███▎      | 33/100 [25:38<51:02, 45.70s/it]

Chunk 32 was saved


 34%|███▍      | 34/100 [26:22<49:38, 45.12s/it]

Chunk 33 was saved


 35%|███▌      | 35/100 [27:06<48:23, 44.68s/it]

Chunk 34 was saved


 36%|███▌      | 36/100 [27:50<47:33, 44.58s/it]

Chunk 35 was saved


 37%|███▋      | 37/100 [28:34<46:45, 44.53s/it]

Chunk 36 was saved


 38%|███▊      | 38/100 [29:20<46:18, 44.82s/it]

Chunk 37 was saved


 39%|███▉      | 39/100 [30:03<45:08, 44.40s/it]

Chunk 38 was saved


 40%|████      | 40/100 [30:47<44:17, 44.28s/it]

Chunk 39 was saved


 41%|████      | 41/100 [31:36<44:47, 45.55s/it]

Chunk 40 was saved


 42%|████▏     | 42/100 [32:22<44:17, 45.81s/it]

Chunk 41 was saved


 43%|████▎     | 43/100 [33:33<50:28, 53.14s/it]

Chunk 42 was saved


 44%|████▍     | 44/100 [34:17<47:05, 50.46s/it]

Chunk 43 was saved


 45%|████▌     | 45/100 [35:01<44:33, 48.61s/it]

Chunk 44 was saved


 46%|████▌     | 46/100 [35:48<43:21, 48.18s/it]

Chunk 45 was saved


 47%|████▋     | 47/100 [36:32<41:19, 46.78s/it]

Chunk 46 was saved


 48%|████▊     | 48/100 [37:15<39:41, 45.81s/it]

Chunk 47 was saved


 49%|████▉     | 49/100 [37:59<38:31, 45.32s/it]

Chunk 48 was saved


 50%|█████     | 50/100 [38:43<37:19, 44.78s/it]

Chunk 49 was saved


 51%|█████     | 51/100 [39:30<37:10, 45.53s/it]

Chunk 50 was saved


 52%|█████▏    | 52/100 [40:14<36:00, 45.01s/it]

Chunk 51 was saved


 53%|█████▎    | 53/100 [40:58<34:57, 44.63s/it]

Chunk 52 was saved


 54%|█████▍    | 54/100 [41:41<33:49, 44.12s/it]

Chunk 53 was saved


 55%|█████▌    | 55/100 [42:26<33:22, 44.50s/it]

Chunk 54 was saved


 56%|█████▌    | 56/100 [43:10<32:27, 44.27s/it]

Chunk 55 was saved


 57%|█████▋    | 57/100 [44:19<37:02, 51.69s/it]

Chunk 56 was saved


 58%|█████▊    | 58/100 [45:04<34:45, 49.64s/it]

Chunk 57 was saved


 59%|█████▉    | 59/100 [45:48<32:51, 48.08s/it]

Chunk 58 was saved


 60%|██████    | 60/100 [46:34<31:41, 47.55s/it]

Chunk 59 was saved


 61%|██████    | 61/100 [47:19<30:21, 46.70s/it]

Chunk 60 was saved


 62%|██████▏   | 62/100 [48:03<29:07, 45.99s/it]

Chunk 61 was saved


 63%|██████▎   | 63/100 [48:48<28:04, 45.53s/it]

Chunk 62 was saved


 64%|██████▍   | 64/100 [49:32<27:08, 45.23s/it]

Chunk 63 was saved


 65%|██████▌   | 65/100 [50:18<26:21, 45.20s/it]

Chunk 64 was saved


 66%|██████▌   | 66/100 [51:04<25:52, 45.67s/it]

Chunk 65 was saved


 67%|██████▋   | 67/100 [51:49<25:00, 45.48s/it]

Chunk 66 was saved


 68%|██████▊   | 68/100 [52:34<24:09, 45.28s/it]

Chunk 67 was saved


 69%|██████▉   | 69/100 [53:22<23:46, 46.01s/it]

Chunk 68 was saved


 70%|███████   | 70/100 [54:30<26:18, 52.62s/it]

Chunk 69 was saved


 71%|███████   | 71/100 [55:14<24:13, 50.11s/it]

Chunk 70 was saved


 72%|███████▏  | 72/100 [55:59<22:34, 48.38s/it]

Chunk 71 was saved


 73%|███████▎  | 73/100 [56:43<21:15, 47.23s/it]

Chunk 72 was saved


 74%|███████▍  | 74/100 [57:27<20:03, 46.29s/it]

Chunk 73 was saved


 75%|███████▌  | 75/100 [58:14<19:20, 46.42s/it]

Chunk 74 was saved


 76%|███████▌  | 76/100 [58:58<18:19, 45.79s/it]

Chunk 75 was saved


 77%|███████▋  | 77/100 [59:42<17:22, 45.32s/it]

Chunk 76 was saved


 78%|███████▊  | 78/100 [1:00:27<16:28, 44.94s/it]

Chunk 77 was saved


 79%|███████▉  | 79/100 [1:01:12<15:47, 45.10s/it]

Chunk 78 was saved


 80%|████████  | 80/100 [1:01:58<15:07, 45.37s/it]

Chunk 79 was saved


 81%|████████  | 81/100 [1:02:42<14:16, 45.06s/it]

Chunk 80 was saved


 82%|████████▏ | 82/100 [1:03:28<13:34, 45.27s/it]

Chunk 81 was saved


 83%|████████▎ | 83/100 [1:04:13<12:48, 45.21s/it]

Chunk 82 was saved


 84%|████████▍ | 84/100 [1:05:21<13:53, 52.12s/it]

Chunk 83 was saved


 85%|████████▌ | 85/100 [1:06:06<12:27, 49.84s/it]

Chunk 84 was saved


 86%|████████▌ | 86/100 [1:06:54<11:28, 49.18s/it]

Chunk 85 was saved


 87%|████████▋ | 87/100 [1:07:49<11:04, 51.12s/it]

Chunk 86 was saved


 88%|████████▊ | 88/100 [1:08:38<10:03, 50.31s/it]

Chunk 87 was saved


 89%|████████▉ | 89/100 [1:09:22<08:52, 48.38s/it]

Chunk 88 was saved


 90%|█████████ | 90/100 [1:10:05<07:49, 46.96s/it]

Chunk 89 was saved


 91%|█████████ | 91/100 [1:10:49<06:54, 46.06s/it]

Chunk 90 was saved


 92%|█████████▏| 92/100 [1:11:33<06:02, 45.37s/it]

Chunk 91 was saved


 93%|█████████▎| 93/100 [1:12:17<05:15, 45.10s/it]

Chunk 92 was saved


 94%|█████████▍| 94/100 [1:13:04<04:33, 45.66s/it]

Chunk 93 was saved


 95%|█████████▌| 95/100 [1:13:50<03:48, 45.72s/it]

Chunk 94 was saved


 96%|█████████▌| 96/100 [1:14:35<03:02, 45.51s/it]

Chunk 95 was saved


 97%|█████████▋| 97/100 [1:15:44<02:37, 52.38s/it]

Chunk 96 was saved


 98%|█████████▊| 98/100 [1:16:28<01:40, 50.01s/it]

Chunk 97 was saved


 99%|█████████▉| 99/100 [1:17:22<00:51, 51.29s/it]

Chunk 98 was saved


100%|██████████| 100/100 [1:18:07<00:00, 46.88s/it]

Chunk 99 was saved





In [6]:
# Directory that is scanned for parquet files, and the archive they are
# moved into (the archive path is relative to the current working directory).
source_dir = '/kaggle/working/'
zip_path = 'item_mega_embs.zip'

# Mode 'a' appends to an existing archive or creates a new one.
with zipfile.ZipFile(zip_path, mode='a') as zipf:
    for file in os.listdir(source_dir):
        # Anything that is not a parquet file is left where it is.
        if not file.endswith('.parquet'):
            continue

        file_path = os.path.join(source_dir, file)

        # Store the file under its bare name, then delete the original right
        # away so the data is not kept on disk twice.
        zipf.write(file_path, arcname=file)
        os.remove(file_path)
        print(f"Добавлен и удален: {file}")

print("Все файлы архивированы.")

Добавлен и удален: mega_chunk_63.parquet
Добавлен и удален: mega_chunk_86.parquet
Добавлен и удален: mega_chunk_83.parquet
Добавлен и удален: mega_chunk_2.parquet
Добавлен и удален: mega_chunk_85.parquet
Добавлен и удален: mega_chunk_91.parquet
Добавлен и удален: mega_chunk_99.parquet
Добавлен и удален: mega_chunk_53.parquet
Добавлен и удален: mega_chunk_62.parquet
Добавлен и удален: mega_chunk_24.parquet
Добавлен и удален: mega_chunk_97.parquet
Добавлен и удален: mega_chunk_61.parquet
Добавлен и удален: mega_chunk_20.parquet
Добавлен и удален: mega_chunk_75.parquet
Добавлен и удален: mega_chunk_40.parquet
Добавлен и удален: mega_chunk_34.parquet
Добавлен и удален: mega_chunk_27.parquet
Добавлен и удален: mega_chunk_30.parquet
Добавлен и удален: mega_chunk_32.parquet
Добавлен и удален: mega_chunk_6.parquet
Добавлен и удален: mega_chunk_59.parquet
Добавлен и удален: mega_chunk_71.parquet
Добавлен и удален: mega_chunk_26.parquet
Добавлен и удален: mega_chunk_16.parquet
Добавлен и удален: