# Convert NBA Dataset to Parquet

This notebook converts the gzipped CSV to Parquet format for 10x faster loading.

## Steps:
1. Add the "meeper" dataset to this notebook (Add Data -> search "meeper" -> Add)
2. Run the cell below (takes ~10 min)
3. Download parquet from Output tab
4. Upload as new Kaggle dataset

**No GPU needed** - just CPU

In [None]:
import pandas as pd
import os

# First two bytes of every gzip stream (RFC 1952 ID1/ID2).
GZIP_MAGIC = b"\x1f\x8b"


def detect_compression(path):
    """Return the ``compression`` argument ``pd.read_csv`` should use for *path*.

    pandas only infers compression from a fixed set of file extensions
    ('.gz', '.bz2', '.zip', '.xz', ...). A file named ``*.gzip`` is not in
    that set, so with the default ``compression='infer'`` it would be read
    as plain text and fail to parse. Sniffing the magic bytes makes the
    decision independent of the file name.

    Args:
        path: Path to an existing, readable file.

    Returns:
        ``'gzip'`` when the file starts with the gzip magic number,
        otherwise ``'infer'`` (pandas' default behaviour).
    """
    with open(path, "rb") as fh:
        return "gzip" if fh.read(2) == GZIP_MAGIC else "infer"


csv_path = '/kaggle/input/meeper/aggregated_nba_data.csv/aggregated_nba_data.csv.gzip'

print("="*60)
print("CONVERTING CSV TO PARQUET")
print("="*60)

# Check file exists
if not os.path.exists(csv_path):
    print("ERROR: Dataset not found!")
    print("Add 'meeper' dataset: Add Data -> search 'meeper' -> Add")
else:
    csv_size = os.path.getsize(csv_path) / 1024**2
    print(f"Input: {csv_path}")
    print(f"Size: {csv_size:.1f} MB (compressed)")

    print("\nStep 1/3: Loading CSV (this takes 5-10 minutes)...")
    # The '.gzip' suffix is not recognised by compression='infer', so the
    # codec is chosen from the file contents instead of the file name.
    df = pd.read_csv(
        csv_path,
        compression=detect_compression(csv_path),
        low_memory=False,
    )
    print(f"✅ Loaded {len(df):,} rows, {len(df.columns)} columns")

    # Show memory usage (deep=True also counts the contents of object columns)
    mem_mb = df.memory_usage(deep=True).sum() / 1024**2
    print(f"   Memory usage: {mem_mb:.1f} MB")

    # Show year range
    if 'season_end_year' in df.columns:
        print(f"   Year range: {df['season_end_year'].min()}-{df['season_end_year'].max()}")

    print("\nStep 2/3: Saving as Parquet (snappy compression)...")
    # Relative path: the file is written to the current working directory.
    parquet_path = 'aggregated_nba_data.parquet'
    df.to_parquet(parquet_path, compression='snappy', index=False)

    parquet_size = os.path.getsize(parquet_path) / 1024**2
    print(f"✅ Saved: {parquet_path}")
    print(f"   Size: {parquet_size:.1f} MB")
    print(f"   Compression ratio: {csv_size/parquet_size:.1f}x")

    print("\nStep 3/3: Download and upload")
    print("   1. Look at right sidebar -> Output tab")
    print("   2. Find 'aggregated_nba_data.parquet'")
    print("   3. Click download icon")
    print("   4. Go to kaggle.com/datasets/new")
    print("   5. Upload the parquet file")
    print("   6. Name it 'nba-aggregated-parquet'")

    print("\n" + "="*60)
    print("DONE! Parquet file ready for download")
    print("="*60)