# Convert NBA CSV to Parquet

**Steps:**
1. Add "meeper" dataset (Add Data -> search "meeper")
2. Run the cell below (~10 min)
3. Download the parquet file (`aggregated_nba_data.parquet`) from the Output tab
4. Upload as new Kaggle dataset

**No GPU needed**

In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import gc

# Stream a large CSV into a single Parquet file chunk by chunk, then verify the
# row count and print the season / game-year range of the result.
#
# NOTE(review): pandas infers compression from the file extension, and
# '.gzip' is not one of the suffixes it recognises ('.gz' is). If this file is
# really gzip-compressed, pass compression='gzip' to read_csv -- confirm
# against the dataset.
csv_path = '/kaggle/input/meeper/aggregated_nba_data.csv/aggregated_nba_data.csv.gzip'
parquet_path = 'aggregated_nba_data.parquet'

print("="*60)
print("CHUNKED CSV TO PARQUET CONVERSION (PyArrow)")
print("="*60)
print("Loading in chunks to avoid memory crashes...\n")

# Read and write in chunks using PyArrow (no data loss!)
chunk_size = 500000  # 500K rows at a time
total_rows = 0
writer = None
schema = None  # Arrow schema of the first chunk; every later chunk is coerced to it

try:
    for i, chunk in enumerate(pd.read_csv(csv_path, chunksize=chunk_size, low_memory=False)):
        total_rows += len(chunk)
        print(f"Chunk {i+1}: {total_rows:,} rows processed...")

        # Convert to a PyArrow table. For the first chunk the schema is
        # inferred (schema=None); afterwards the pinned schema is enforced so
        # that per-chunk dtype differences (e.g. int64 vs float64 when NaNs
        # show up only in some chunks) do not make write_table reject the
        # table for not matching the file's schema.
        table = pa.Table.from_pandas(chunk, schema=schema, preserve_index=False)

        if writer is None:
            # First chunk: create writer with schema
            schema = table.schema
            writer = pq.ParquetWriter(parquet_path, schema, compression='snappy')

        # Write chunk to parquet
        writer.write_table(table)

        # Drop references before the next chunk is parsed to keep peak memory low
        del chunk, table
        gc.collect()
finally:
    # Always close the writer -- even when a chunk fails -- so the file handle
    # is released and the parquet footer is written for the chunks that
    # succeeded.
    if writer is not None:
        writer.close()

if writer is None:
    # No chunk was produced, so no parquet file was created by this run;
    # fail loudly here rather than with an unrelated error during verification.
    raise RuntimeError(f"No data read from {csv_path}; no parquet file was written")

print(f"\nTotal rows written: {total_rows:,}")

# Verify the parquet file using the row count stored in its metadata
pf = pq.ParquetFile(parquet_path)
actual_rows = pf.metadata.num_rows
print(f"Parquet rows (verified): {actual_rows:,}")

if actual_rows != total_rows:
    print(f"WARNING: Row count mismatch! Expected {total_rows:,}, got {actual_rows:,}")
else:
    print("Row count verified OK!")

size_mb = os.path.getsize(parquet_path) / 1024**2
print(f"\nParquet file: {size_mb:.1f} MB")

# Show year range (only the two needed columns are loaded back)
sample = pd.read_parquet(parquet_path, columns=['season', 'game_year'], engine='pyarrow')
print(f"Season range: {sample['season'].min()} - {sample['season'].max()}")
print(f"Game year range: {sample['game_year'].min()} - {sample['game_year'].max()}")

print("\n" + "="*60)
print("DONE! Download from Output tab")
print("="*60)