# HPI-FHFA Basic Tutorial\n\nThis notebook demonstrates basic usage of the HPI-FHFA library for calculating house price indices using the FHFA methodology.\n\n## Overview\n\nThe HPI-FHFA library implements the Federal Housing Finance Agency's repeat-sales methodology for calculating house price indices at tract and city levels.

In [None]:
# Import required libraries
import tempfile
from datetime import date, timedelta
from pathlib import Path

import numpy as np
import polars as pl

from hpi_fhfa.config.settings import HPIConfig
from hpi_fhfa.processing.pipeline import HPIPipeline
from hpi_fhfa.validation import HPIValidator

## Step 1: Create Sample Data\n\nLet's create some sample transaction and geographic data for demonstration purposes.

In [None]:
# Create sample transaction data
#
# Location attributes (tract, CBSA, distance to CBD) are drawn once per
# property and then looked up for every sale of that property, so that a
# property never "moves" between its repeat sales. The CBSA is a function of
# the tract, so a tract never spans two CBSAs.
np.random.seed(42)

n_transactions = 5000
n_properties = 1500
n_tracts = 20

# Sample properties with unequal weights so that some of them sell repeatedly
property_weights = np.random.exponential(2, n_properties)
property_weights = property_weights / property_weights.sum()
property_index = np.random.choice(n_properties, size=n_transactions, p=property_weights)
property_ids = np.array([f'P{i:06d}' for i in range(n_properties)])[property_index]

# Fixed per-property location attributes
tract_labels = np.array([f'T{i:03d}' for i in range(n_tracts)])
cbsa_labels = np.array(['CBSA001', 'CBSA002'])
tract_of_property = np.random.randint(0, n_tracts, n_properties)
cbsa_of_tract = np.arange(n_tracts) % len(cbsa_labels)  # alternate so both CBSAs are populated
distance_of_property = np.random.uniform(1, 30, n_properties)

tract_index = tract_of_property[property_index]

# Generate dates over 6 years (2018-2023)
start_date = date(2018, 1, 1)
end_date = date(2023, 12, 31)
date_range = (end_date - start_date).days
# randint's upper bound is exclusive; +1 makes end_date itself reachable
day_offsets = np.random.randint(0, date_range + 1, n_transactions)
dates = [start_date + timedelta(days=int(d)) for d in day_offsets]

# Generate prices with appreciation trend
base_price = 350000
annual_appreciation = 0.05  # 5% annual appreciation
years_from_start = day_offsets / 365.25
price_trend = base_price * (1 + annual_appreciation) ** years_from_start
price_noise = np.random.lognormal(0, 0.25, n_transactions)
prices = np.maximum(75000, price_trend * price_noise)  # floor at $75k

# Create transaction DataFrame
transactions = pl.DataFrame({
    'property_id': property_ids,
    'transaction_date': dates,
    'transaction_price': prices,
    'census_tract': tract_labels[tract_index],
    'cbsa_code': cbsa_labels[cbsa_of_tract[tract_index]],
    'distance_to_cbd': distance_of_property[property_index]
})

print(f'Generated {len(transactions):,} transactions')
print(f'Date range: {transactions["transaction_date"].min()} to {transactions["transaction_date"].max()}')
print(f'Price range: ${transactions["transaction_price"].min():,.0f} to ${transactions["transaction_price"].max():,.0f}')
transactions.head()

In [None]:
# Create geographic data
tract_ids = [f'T{i:03d}' for i in range(20)]
cbsa_codes = ['CBSA001', 'CBSA002']
n_geo_tracts = len(tract_ids)

# Take each tract's CBSA from `transactions` (the most frequent code recorded
# for that tract; ties broken alphabetically) instead of drawing it at random,
# so the tract -> CBSA membership here agrees with the transaction table.
tract_cbsa = (
    transactions
    .group_by('census_tract')
    .agg(pl.col('cbsa_code').mode().sort().first())
    .rename({'census_tract': 'tract_id'})
)

geographic = (
    pl.DataFrame({
        'tract_id': tract_ids,
        'centroid_lat': np.random.uniform(33.5, 34.5, n_geo_tracts),
        'centroid_lon': np.random.uniform(-118.5, -117.5, n_geo_tracts),
        'housing_units': np.random.randint(1000, 4000, n_geo_tracts),
        'housing_value': np.random.uniform(800_000_000, 2_500_000_000, n_geo_tracts),
        'college_share': np.random.beta(3, 2, n_geo_tracts),
        'nonwhite_share': np.random.beta(2, 3, n_geo_tracts)
    })
    .join(tract_cbsa, on='tract_id', how='left')
    # A tract with no transactions has no recorded CBSA; fall back to the first code
    .with_columns(pl.col('cbsa_code').fill_null(cbsa_codes[0]))
    .select([
        'tract_id', 'cbsa_code', 'centroid_lat', 'centroid_lon',
        'housing_units', 'housing_value', 'college_share', 'nonwhite_share'
    ])
    # Zero-padded IDs sort in creation order; keeps row order independent of the join
    .sort('tract_id')
)

print(f'Generated {len(geographic)} census tracts')
geographic.head()

## Step 2: Configure the Pipeline\n\nSave the sample data to disk, then set up the HPI calculation pipeline with parameters that point at those files.

In [None]:
# Save data to temporary files
# (`tempfile` is imported in the imports cell at the top of the notebook.)
# mkdtemp() never deletes the directory it creates: remove `temp_dir` yourself
# (e.g. shutil.rmtree(temp_dir)) once you are done with the tutorial outputs.
temp_dir = Path(tempfile.mkdtemp(prefix='hpi_fhfa_tutorial_'))
print(f'Using temporary directory: {temp_dir}')

# Save data files
txn_path = temp_dir / 'transactions.parquet'
geo_path = temp_dir / 'geographic.parquet'

transactions.write_parquet(txn_path)
geographic.write_parquet(geo_path)

print('Data files saved successfully')

In [None]:
# Configure the HPI pipeline
# Settings are collected in one mapping and unpacked into HPIConfig.
pipeline_settings = {
    'transaction_data_path': txn_path,
    'geographic_data_path': geo_path,
    'output_path': temp_dir / 'output',
    'start_year': 2019,
    'end_year': 2023,
    'weight_schemes': ['sample', 'value', 'unit'],
    'n_jobs': 2,
    'validate_data': True,
    'use_lazy_evaluation': False,  # For easier debugging
}
config = HPIConfig(**pipeline_settings)

# Echo the settings back as read from the config object
print('Pipeline configuration:')
config_summary = (
    ('Years', f'{config.start_year}-{config.end_year}'),
    ('Weight schemes', f'{config.weight_schemes}'),
    ('Parallel jobs', f'{config.n_jobs}'),
    ('Validation enabled', f'{config.validate_data}'),
)
for label, text in config_summary:
    print(f'  {label}: {text}')

## Step 3: Run the Pipeline\n\nExecute the HPI calculation pipeline.

In [None]:
# Create and run the pipeline
pipeline = HPIPipeline(config)

print('Running HPI pipeline...')
results = pipeline.run()

print('Pipeline completed successfully!')
print(f'Processing time: {results.metadata["processing_time"]:.2f} seconds')

# Remaining run statistics are integer counts printed with thousands separators
count_labels = (
    ('Transactions processed', 'n_transactions'),
    ('Repeat sales found', 'n_repeat_sales'),
    ('Filtered sales', 'n_filtered_sales'),
)
for label, key in count_labels:
    print(f'{label}: {results.metadata[key]:,}')

## Step 4: Analyze Results\n\nExplore the calculated house price indices.

In [None]:
# Examine tract-level indices
tract_df = results.tract_indices

print(f'Tract-level indices: {len(tract_df):,} records')
if tract_df.is_empty():
    print('No tract indices generated')
else:
    # Coverage: number of distinct tracts and years present in the output
    for label, column in (('Tracts', 'tract_id'), ('Years', 'year')):
        print(f'{label} covered: {tract_df[column].n_unique()}')

    # Show sample data
    print('\nSample tract indices:')
    display(tract_df.head(10))

In [None]:
# Examine city-level indices
print('City-level indices by weight scheme:')

for scheme, city_df in results.city_indices.items():
    print(f'\n{scheme.title()} weights:')

    # Nothing to summarise for a scheme that produced no rows
    if city_df.is_empty():
        print('  No data generated')
        continue

    print(f'  Records: {len(city_df)}')
    print(f'  CBSAs covered: {city_df["cbsa_code"].n_unique()}')
    print(f'  Years covered: {city_df["year"].n_unique()}')

    # Average appreciation over the rows that actually carry a rate
    rated_rows = city_df.filter(pl.col('appreciation_rate').is_not_null())
    if not rated_rows.is_empty():
        mean_rate = rated_rows['appreciation_rate'].mean()
        print(f'  Average appreciation: {mean_rate:.2f}%')

    # Show sample data
    display(city_df.head())

## Step 5: Validate Results\n\nRun validation checks on the calculated indices.

In [None]:
# Validate the results
validator = HPIValidator(tolerance=0.001)  # 0.1% tolerance

validation_results = validator.validate_all(
    results.tract_indices,
    results.city_indices
)

n_tests = len(validation_results)
print('Validation completed')
print(f'Total validation tests: {n_tests}')

# Show validation summary
passed = sum(1 for r in validation_results if r.passed)
failed = n_tests - passed

print(f'Passed: {passed}')
print(f'Failed: {failed}')
# Guard the ratio: an empty result list would otherwise raise ZeroDivisionError
if n_tests:
    print(f'Success rate: {passed/n_tests*100:.1f}%')
else:
    print('Success rate: n/a (no validation tests were run)')

In [None]:
# Show detailed validation report
report_title = 'Detailed Validation Report:'
divider = '=' * 50  # 50-character rule under the title

print(report_title)
print(divider)
# Fetch the summary only after the header is printed, then emit it
summary_text = validator.get_summary_report()
print(summary_text)

## Step 6: Visualize Results (Optional)\n\nCreate basic visualizations of the house price indices.

In [None]:
# Optional: Create visualizations if matplotlib is available
#
# Only the import is guarded: matplotlib is an optional dependency, and a
# narrower try block keeps an ImportError raised elsewhere from being
# misreported as "matplotlib not available".
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None
    print('Matplotlib not available - skipping visualizations')

if plt is not None:
    if tract_df.is_empty():
        # Say so explicitly instead of finishing the cell silently
        print('No tract indices available for charting')
    else:
        # Plot tract-level appreciation over time
        tract_appreciation = tract_df.filter(
            pl.col('appreciation_rate').is_not_null()
        )

        if tract_appreciation.is_empty():
            print('No appreciation data available for charting')
        else:
            # Calculate average appreciation by year
            yearly_appreciation = (
                tract_appreciation
                .group_by('year')
                .agg(pl.col('appreciation_rate').mean().alias('avg_appreciation'))
                .sort('year')
            )

            years = yearly_appreciation['year'].to_list()
            appreciations = yearly_appreciation['avg_appreciation'].to_list()

            # Explicit figure/axes objects rather than the implicit pyplot state
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.plot(years, appreciations, marker='o', linewidth=2, markersize=8)
            ax.set_title('Average Tract-Level House Price Appreciation', fontsize=14, fontweight='bold')
            ax.set_xlabel('Year', fontsize=12)
            ax.set_ylabel('Appreciation Rate (%)', fontsize=12)
            ax.grid(True, alpha=0.3)
            fig.tight_layout()
            plt.show()

            print('Tract-level appreciation chart created')

## Conclusion\n\nThis tutorial demonstrated the basic usage of the HPI-FHFA library:\n\n1. **Data Preparation**: Created sample transaction and geographic data\n2. **Pipeline Configuration**: Set up the HPI calculation parameters\n3. **Execution**: Ran the complete HPI calculation pipeline\n4. **Analysis**: Examined tract- and city-level indices\n5. **Validation**: Verified result quality and accuracy\n6. **Visualization** (optional): Plotted average tract-level appreciation by year\n\nFor more advanced usage, see the other example notebooks and documentation.\n\n### Next Steps:\n\n- Try different weight schemes\n- Experiment with different time periods\n- Use your own real estate transaction data\n- Explore performance optimization options\n- Compare results with reference implementations