# Convert NBA CSV to Parquet

**Steps:**
1. Add "meeper" dataset (Add Data -> search "meeper")
2. Run the cell below (~10 min)
3. Download `aggregated_nba_data.parquet` from the Output tab
4. Upload as new Kaggle dataset

**No GPU needed**

In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import gc

# Input CSV inside the attached Kaggle dataset, and the Parquet file to create
# (relative path, so it lands in the current working directory).
# NOTE(review): pandas infers compression from the file extension and ".gzip"
# is not one of the extensions it recognises, so this file is parsed as plain
# text -- confirm it really is uncompressed despite its name.
csv_path = '/kaggle/input/meeper/aggregated_nba_data.csv/aggregated_nba_data.csv.gzip'
parquet_path = 'aggregated_nba_data.parquet'

banner = "="*60
print(banner)
print("CHUNKED CSV TO PARQUET CONVERSION (PyArrow)")
print(banner)
print("Loading in chunks to avoid memory crashes...\n")

# Pass 1: sniff the first 1000 rows to learn which columns pandas reads as
# text. Those columns are pinned to `str` for the full read so that every
# chunk parses them the same way; numeric columns are left to pandas' own
# per-chunk inference.
print("Step 1: Reading first chunk to establish schema...")
preview = pd.read_csv(csv_path, nrows=1000, low_memory=False)

dtype_mapping = {
    name: str
    for name, inferred in preview.dtypes.items()
    if inferred == 'object'
}

column_count = len(preview.columns)
print(f"Found {column_count} columns")

# The preview is only needed for its dtypes; release it before the big read.
del preview
gc.collect()

# Pass 2: stream the CSV in fixed-size chunks and append each chunk to one
# Parquet file. The Arrow schema of the first chunk becomes the schema of the
# file; every later chunk is cast to it before being written.
print("\nStep 2: Converting chunks to Parquet...")
chunk_size = 500000  # 500K rows at a time
total_rows = 0
writer = None
schema = None

# The writer is created lazily (its schema is only known after the first
# chunk), so a `with` block does not fit. try/finally guarantees it is closed
# even when a chunk fails; the exception still propagates and fails the cell.
try:
    for i, chunk in enumerate(pd.read_csv(csv_path, chunksize=chunk_size, low_memory=False, dtype=dtype_mapping)):
        total_rows += len(chunk)
        print(f"Chunk {i+1}: {total_rows:,} rows processed...")

        # Force every object column to plain Python strings so Arrow infers
        # the same string type for it in every chunk.
        # NOTE(review): astype(str) also renders missing values as the literal
        # text "nan", so nulls in text columns are stored as real strings.
        # Left as-is because readers of the output may rely on it -- confirm
        # whether nulls should be preserved instead.
        for col in chunk.select_dtypes(include=['object']).columns:
            chunk[col] = chunk[col].astype(str)

        # Convert to PyArrow table (the pandas index is not stored).
        table = pa.Table.from_pandas(chunk, preserve_index=False)

        if writer is None:
            # First chunk: its inferred schema defines the whole file.
            schema = table.schema
            writer = pq.ParquetWriter(parquet_path, schema, compression='snappy')
            print(f"Schema established with {len(schema)} columns")
        else:
            # Later chunks may have been inferred differently (e.g. a numeric
            # column parsed with another dtype); cast them to the file schema.
            try:
                table = table.cast(schema)
            except Exception as e:
                # Best-effort recovery: rebuild the table column by column.
                # Report the original error so the cause is not lost; if the
                # rebuild fails too, that error propagates.
                print(f"Warning: Schema cast issue in chunk {i+1}, attempting column-by-column fix...")
                print(f"  Cast error: {e}")
                arrays = []
                for field in schema:
                    if field.name in chunk.columns:
                        col_data = chunk[field.name]
                        # Force convert to the expected type.
                        if pa.types.is_string(field.type):
                            col_data = col_data.astype(str)
                        arrays.append(pa.array(col_data, type=field.type, from_pandas=True))
                    else:
                        # Column absent from this chunk - fill with nulls.
                        arrays.append(pa.nulls(len(chunk), type=field.type))
                table = pa.Table.from_arrays(arrays, schema=schema)

        # Append this chunk to the Parquet file.
        writer.write_table(table)

        # Drop the references before the next chunk is read to keep peak
        # memory down.
        del chunk, table
        gc.collect()
finally:
    # Close writer to finalize the file and release the handle.
    if writer is not None:
        writer.close()

# Report the number of rows counted while converting.
print(f"\nTotal rows written: {total_rows:,}")

# Verify the parquet file: compare the row count recorded in its metadata
# with the number of rows counted during conversion.
pf = pq.ParquetFile(parquet_path)
actual_rows = pf.metadata.num_rows
print(f"Parquet rows (verified): {actual_rows:,}")

if actual_rows != total_rows:
    print(f"WARNING: Row count mismatch! Expected {total_rows:,}, got {actual_rows:,}")
else:
    print("Row count verified OK!")

size_mb = os.path.getsize(parquet_path) / 1024**2
print(f"\nParquet file: {size_mb:.1f} MB")

# Show year range. This is a spot check only: it looks at the first batch of
# at most 100,000 rows, not the whole file.
print("\nVerifying year range...")
if 'season' not in pf.schema_arrow.names:
    # Do not fail the cell after a finished conversion just because the
    # optional spot check has nothing to look at.
    print("Skipped: no 'season' column in the Parquet file")
else:
    # Reuse the already-open file and read only the column that is reported.
    # The None default keeps a file without any rows from raising
    # StopIteration here.
    sample = next(pf.iter_batches(columns=['season'], batch_size=100000), None)
    if sample is None:
        print("Skipped: the Parquet file contains no rows")
    else:
        sample_df = sample.to_pandas()
        print(f"Sample season range: {sample_df['season'].min()} - {sample_df['season'].max()}")

print("\n" + "="*60)
print("DONE! Download from Output tab")
print("="*60)