In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import polars as pl

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file found anywhere below the
# Kaggle input mount, then echo each path on its own line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sports-and-outdoors-raw-amazon2023/Sports_and_Outdoors.jsonl


In [6]:
import polars as pl
import os
import io
import gc

# --- CONFIGURATION ---
# Raw JSONL input (one JSON record per line) on the read-only Kaggle input mount.
INPUT_FILE = '/kaggle/input/sports-and-outdoors-raw-amazon2023/Sports_and_Outdoors.jsonl'
# Folder (relative to the working directory) that receives the intermediate Parquet parts.
TEMP_FOLDER = 'temp_parquet_parts'

# Columns identifying the two sides of an interaction.
USER_COL = 'user_id'
ITEM_COL = 'parent_asin'
# Minimum interaction count used by the core-filtering step.
MIN_INTERACTIONS = 5

# Derived from MIN_INTERACTIONS so the file name can never disagree with the
# threshold actually applied (it previously said "10core" with a threshold of 5).
OUTPUT_FILE = f'sports_reviews_{MIN_INTERACTIONS}core.jsonl'

# Number of input lines buffered per Parquet part.
# 1 Million rows is safe for 30GB RAM
CHUNK_SIZE = 1_000_000
# ---------------------

def convert_to_parquet_batches(input_path, output_folder, chunk_size=None):
    """
    Stream a JSONL file line by line and persist it as numbered Parquet parts.

    Lines are buffered in memory until `chunk_size` of them have accumulated;
    each full buffer (and the final partial one) is handed to `_save_batch`,
    which writes `part_<n>.parquet` into `output_folder`. Parsing in bounded
    batches avoids the memory spike of parsing the whole JSON file at once.

    Args:
        input_path: Path of the UTF-8 encoded JSONL file to read.
        output_folder: Directory for the Parquet parts; created if missing.
            Existing `part_*.parquet` files in it are deleted first so that
            leftovers from an earlier run (e.g. with a different chunk size or
            a larger input) cannot be mixed into the new result.
        chunk_size: Lines per part. Defaults to the module-level CHUNK_SIZE.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
        OSError: If the input cannot be opened or the folder cannot be written.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    print(f"Step 1: Converting {input_path} to Parquet in batches...")

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(output_folder, exist_ok=True)

    chunk_buffer = []
    batch_num = 0

    with open(input_path, 'r', encoding='utf-8') as f:
        # Only clear old parts once the input has opened successfully, so a
        # wrong input path does not destroy a previous conversion.
        stale_parts = [
            name for name in os.listdir(output_folder)
            if name.startswith("part_") and name.endswith(".parquet")
        ]
        for name in stale_parts:
            os.remove(os.path.join(output_folder, name))
        if stale_parts:
            print(f"   Removed {len(stale_parts)} stale part file(s) from {output_folder}")

        for i, line in enumerate(f):
            chunk_buffer.append(line)

            # When we hit the chunk size, save and clear memory
            if len(chunk_buffer) >= chunk_size:
                _save_batch(chunk_buffer, output_folder, batch_num)
                batch_num += 1
                chunk_buffer = []  # Rebind so the old list can be reclaimed
                gc.collect()       # Force memory cleanup before the next batch
                print(f"   Processed {i+1} lines...")

        # Save the final remainder (fewer than chunk_size lines)
        if chunk_buffer:
            _save_batch(chunk_buffer, output_folder, batch_num)

    print("Conversion complete.")

def _save_batch(lines, folder, batch_num):
    """Parse one buffered batch of JSONL lines and write it as `part_<batch_num>.parquet`.

    Args:
        lines: List of raw text lines, each holding one JSON record.
        folder: Directory the Parquet part is written into.
        batch_num: Index used in the output file name.
    """
    # Glue the lines back into a single document and expose it as an
    # in-memory byte stream for the NDJSON reader.
    payload = io.BytesIO("".join(lines).encode('utf-8'))

    # The schema is inferred per batch (infer_schema_length=None), so different
    # parts may end up with different schemas for irregular columns.
    frame = pl.read_ndjson(payload, infer_schema_length=None, ignore_errors=True)

    # Force both ID columns to strings so int/str mixes cannot occur across parts.
    frame = frame.with_columns(
        pl.col(USER_COL).cast(pl.String),
        pl.col(ITEM_COL).cast(pl.String),
    )

    frame.write_parquet(os.path.join(folder, f"part_{batch_num}.parquet"))

    # Drop the local references right away so the batch memory can be released.
    del frame, payload

def main():
    """Entry point of this cell: run only the chunked JSONL -> Parquet conversion."""
    convert_to_parquet_batches(input_path=INPUT_FILE, output_folder=TEMP_FOLDER)


if __name__ == "__main__":
    main()

Step 1: Converting /kaggle/input/sports-and-outdoors-raw-amazon2023/Sports_and_Outdoors.jsonl to Parquet in batches...
   Processed 1000000 lines...
   Processed 2000000 lines...
   Processed 3000000 lines...
   Processed 4000000 lines...
   Processed 5000000 lines...
   Processed 6000000 lines...
   Processed 7000000 lines...
   Processed 8000000 lines...
   Processed 9000000 lines...
   Processed 10000000 lines...
   Processed 11000000 lines...
   Processed 12000000 lines...
   Processed 13000000 lines...
   Processed 14000000 lines...
   Processed 15000000 lines...
   Processed 16000000 lines...
   Processed 17000000 lines...
   Processed 18000000 lines...
   Processed 19000000 lines...
Conversion complete.


In [7]:

# --- CONFIGURATION ---
# The folder where the Parquet part files already are.
TEMP_FOLDER = 'temp_parquet_parts'

# Columns identifying the two sides of an interaction.
USER_COL = 'user_id'
ITEM_COL = 'parent_asin'

# Minimum number of rows a user and an item must each have to be kept.
MIN_INTERACTIONS = 5
# Derived from MIN_INTERACTIONS so the file name always matches the threshold.
OUTPUT_FILE = f'sports_reviews_{MIN_INTERACTIONS}core.jsonl'
# ---------------------

def get_safe_columns(folder_path, exclude_cols=None):
    """
    Return the column names of the first Parquet part, minus excluded columns.

    Only the schema of the first file (in sorted name order) is inspected; the
    other parts are assumed to share its column names.

    Args:
        folder_path: Directory containing the `*.parquet` part files.
        exclude_cols: Column names to drop. Defaults to
            ['images', 'videos', 'style'], the complex columns known to cause
            schema conflicts when the parts are loaded together.

    Returns:
        List of column names (in file order) that are safe to load.

    Raises:
        FileNotFoundError: If the folder is missing or holds no parquet files.
    """
    if exclude_cols is None:
        # 'images' caused the original load crash; 'videos' often causes
        # similar issues.
        exclude_cols = ['images', 'videos', 'style']
    else:
        exclude_cols = list(exclude_cols)

    # Find the first file
    files = sorted(
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.endswith('.parquet')
    )
    if not files:
        raise FileNotFoundError("No parquet files found in temp folder.")

    # Read only the file metadata. This replaces `scan_parquet(...).schema`,
    # the call echoed by the warning in this cell's captured output.
    schema = pl.read_parquet_schema(files[0])
    all_cols = list(schema.keys())

    excluded = set(exclude_cols)
    safe_cols = [c for c in all_cols if c not in excluded]

    print(f"Total columns detected: {len(all_cols)}")
    print(f"Excluding problematic columns: {exclude_cols}")
    print(f"Keeping {len(safe_cols)} columns.")

    return safe_cols

def _filter_k_core(df, user_col, item_col, min_interactions):
    """Repeatedly drop rows of rare users/items until the frame stops shrinking.

    Each pass keeps only rows whose user appears in at least `min_interactions`
    rows, then only rows whose item does. Removing items can push users back
    below the threshold (and vice versa), so the passes repeat until a full
    iteration removes nothing.

    Args:
        df: Polars DataFrame containing `user_col` and `item_col`.
        user_col: Name of the user identifier column.
        item_col: Name of the item identifier column.
        min_interactions: Minimum row count required per user and per item.

    Returns:
        The filtered DataFrame.
    """
    iteration = 0
    while True:
        iteration += 1
        start_len = len(df)

        df = df.filter(pl.len().over(user_col) >= min_interactions)
        df = df.filter(pl.len().over(item_col) >= min_interactions)

        end_len = len(df)
        print(f"Iteration {iteration}: {start_len} -> {end_len} rows")

        # No row was removed in this pass, so a fixed point has been reached.
        if start_len == end_len:
            print("Convergence reached.")
            return df


def main():
    """Load the Parquet parts, apply the k-core filter and save the result as JSONL.

    Reads TEMP_FOLDER, keeps only the columns reported by `get_safe_columns`,
    filters to the MIN_INTERACTIONS-core over USER_COL / ITEM_COL, prints
    summary statistics and writes OUTPUT_FILE. On a load failure the error is
    printed and the function returns without writing anything.
    """
    print("Step 2: Resuming Load (with Schema Safety)...")

    # 1. Determine which columns are safe to load
    try:
        safe_cols = get_safe_columns(TEMP_FOLDER)
    except Exception as e:
        print(f"Error inspecting files: {e}")
        return

    # 2. Load only the safe columns
    print("Loading Parquet files into RAM...")
    try:
        # We explicitly pass 'columns=safe_cols' to avoid the SchemaError
        df = pl.read_parquet(os.path.join(TEMP_FOLDER, "*.parquet"), columns=safe_cols)
    except Exception as e:
        print(f"Still failing to load? Error: {e}")
        return

    print(f"Successfully loaded dataset: {len(df)} rows")

    # 3. Iterative k-core filter (k = MIN_INTERACTIONS)
    print(f"Starting {MIN_INTERACTIONS}-core processing...")
    df = _filter_k_core(df, USER_COL, ITEM_COL, MIN_INTERACTIONS)

    # 4. Final Stats & Save
    n_users = df[USER_COL].n_unique()
    n_items = df[ITEM_COL].n_unique()

    print("\nFinal Stats:")
    print(f"Rows: {len(df)}")
    print(f"Users: {n_users}")
    print(f"Items: {n_items}")

    print(f"Saving result to {OUTPUT_FILE}...")
    df.write_ndjson(OUTPUT_FILE)
    print("Done!")

if __name__ == "__main__":
    main()

Step 2: Resuming Load (with Schema Safety)...
Total columns detected: 10
Excluding problematic columns: ['images', 'videos', 'style']
Keeping 9 columns.
Loading Parquet files into RAM...


  schema = pl.scan_parquet(files[0]).schema


Successfully loaded dataset: 19595170 rows
Starting 5-core processing...
Iteration 1: 19595170 -> 4439848 rows
Iteration 2: 4439848 -> 3724066 rows
Iteration 3: 3724066 -> 3639255 rows
Iteration 4: 3639255 -> 3627556 rows
Iteration 5: 3627556 -> 3625829 rows
Iteration 6: 3625829 -> 3625541 rows
Iteration 7: 3625541 -> 3625501 rows
Iteration 8: 3625501 -> 3625477 rows
Iteration 9: 3625477 -> 3625460 rows
Iteration 10: 3625460 -> 3625448 rows
Iteration 11: 3625448 -> 3625444 rows
Iteration 12: 3625444 -> 3625444 rows
Convergence reached.

Final Stats:
Rows: 3625444
Users: 425812
Items: 161362
Saving result to sports_reviews_5core.jsonl...
Done!
