In [17]:
import pandas as pd
import numpy as np 
import os

# Load the census CSV into `df` and print its column names.
#
# Failure policy: for the two anticipated problems (file missing, file empty)
# a short hint is printed and the exception is then re-raised, so the notebook
# stops at this cell. `df` is only ever bound to a DataFrame; there is no
# placeholder value for later cells to trip over with an unrelated error.
# Any other exception propagates untouched with its full traceback.
file_path = 'census_long.csv'

print(f"Attempting to read: {file_path}")

try:
    # NOTE(review): column dtypes are left to pandas' inference. If `zcta`
    # holds ZIP-style codes, integer inference would drop leading zeros --
    # consider dtype={'zcta': str}, but confirm how later cells join/filter
    # on this column before changing it.
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print("\n--- ERROR: File Not Found ---")
    print(f"Could not find the file: '{file_path}'")
    print("Please make sure the file is in the same directory as your notebook.")
    print(f"Your notebook is currently running in: {os.getcwd()}")
    raise  # stop here instead of failing later on a missing/invalid `df`
except pd.errors.EmptyDataError:
    print("\n--- ERROR: File is Empty ---")
    print(f"The file '{file_path}' was found, but it contains no data.")
    raise  # stop here instead of failing later on a missing/invalid `df`

column_names = df.columns.tolist()

print("\n--- Column Names ---")
print(column_names)

# Same listing, one column per line, numbered from 1.
print("\n--- As an itemized list ---")
for i, col_name in enumerate(column_names, start=1):
    print(f"{i}: {col_name}")

Attempting to read: census_long.csv

--- Column Names ---
['zcta', 'date', 'household_income', 'household_income_prior', 'household_income_delta', 'pct_below_pov_line', 'pct_below_pov_line_prior', 'pct_below_pov_line_delta', 'total_pop', 'total_pop_delta', 'total_pop_prior', 'pct_wfh', 'pct_wfh_delta', 'pct_wfh_prior', 'pct_0_bed_rentals', 'pct_1_bed_rentals', 'pct_2_bed_rentals', 'pct_3_bed_rentals', 'pct_4_bed_rentals', 'pct_5_bed_rentals', 'pct_0_bed_stock', 'pct_1_bed_stock', 'pct_2_bed_stock', 'pct_3_bed_stock', 'pct_4_bed_stock', 'pct_5_bed_stock', 'pct_foreign', 'pct_foreign_delta', 'pct_foreign_prior', 'pct_white', 'pct_white_delta', 'pct_white_prior', 'pct_black', 'pct_black_delta', 'pct_black_prior', 'pct_asian', 'pct_asian_delta', 'pct_asian_prior', 'pct_hispanic', 'pct_hispanic_delta', 'pct_hispanic_prior', 'total_housing_units', 'total_housing_units_delta', 'total_housing_units_prior', 'pct_with_bachelor', 'pct_with_bachelor_prior', 'pct_with_bachelor_delta', 'pct_under_18

In [18]:
# Reduce the frame in two steps: first drop a fixed list of columns by name,
# then drop every remaining column whose name ends in '_delta' or '_prior'.
named_drops = [
    'total_housing_units',
    'pct_under_18', 'pct_under_5',
    'renter_occupied_homes',
    'm_pct_25_29', 'm_pct_30_34',
    'f_pct_25_29', 'f_pct_30_34',
    'average_home_age',
    'pct_over_65', 'pct_vacancy',
    'pct_owner_occupied', 'pct_mortgage',
    'pct_sfh', 'total_rental_homes',
]
without_named = df.drop(columns=named_drops)

# str.endswith accepts a tuple, so both suffixes are matched in one pass.
suffix_drops = [
    label for label in without_named.columns
    if label.endswith(('_delta', '_prior'))
]
df = without_named.drop(columns=suffix_drops)

# Print the surviving columns, numbered from 1.
column_names = df.columns.tolist()
print("\n--- As an itemized list ---")
for position, label in enumerate(column_names, start=1):
    print(f"{position}: {label}")


--- As an itemized list ---
1: zcta
2: date
3: household_income
4: pct_below_pov_line
5: total_pop
6: pct_wfh
7: pct_0_bed_rentals
8: pct_1_bed_rentals
9: pct_2_bed_rentals
10: pct_3_bed_rentals
11: pct_4_bed_rentals
12: pct_5_bed_rentals
13: pct_0_bed_stock
14: pct_1_bed_stock
15: pct_2_bed_stock
16: pct_3_bed_stock
17: pct_4_bed_stock
18: pct_5_bed_stock
19: pct_foreign
20: pct_white
21: pct_black
22: pct_asian
23: pct_hispanic
24: pct_with_bachelor


In [19]:
# Per-column share of missing cells, largest first, printed as a percentage,
# followed by the number of rows recorded for each `date` value in date order.
print("\n--- Missing Values by Col ---")

missing_fraction = df.isna().mean().sort_values(ascending=False)
print(missing_fraction * 100)

rows_per_date = df['date'].value_counts().sort_index()
print(rows_per_date)



--- Missing Values by Col ---
household_income      6.902916
pct_1_bed_rentals     6.174511
pct_0_bed_rentals     6.174511
pct_2_bed_rentals     6.174511
pct_5_bed_rentals     6.174511
pct_4_bed_rentals     6.174511
pct_3_bed_rentals     6.174511
pct_wfh               1.994376
pct_below_pov_line    1.906738
pct_4_bed_stock       1.751119
pct_2_bed_stock       1.751119
pct_5_bed_stock       1.751119
pct_0_bed_stock       1.751119
pct_1_bed_stock       1.751119
pct_3_bed_stock       1.751119
pct_foreign           1.589767
pct_with_bachelor     1.331222
pct_white             1.199629
pct_asian             1.199629
pct_black             1.199629
pct_hispanic          1.199629
total_pop             0.000000
date                  0.000000
zcta                  0.000000
dtype: float64
date
201301    33120
201401    33120
201501    33120
201601    33120
201701    33120
201801    33120
201901    33120
202001    33120
202101    33774
202201    33774
202301    33772
Name: count, dtype: int64


In [20]:
# Percentage of rows that contain at least one missing value in any column.
row_has_gap = df.isna().any(axis=1)  # boolean per row: True if any cell is NaN
missing_any = row_has_gap.mean() * 100
print(f"{missing_any:.2f}% of rows have at least one missing value.")


9.45% of rows have at least one missing value.


In [21]:
# Keep only fully populated rows (a row is removed if any of its cells is
# missing), then report how many rows are left.
df = df.dropna(axis=0, how='any')
print(f"Remaining rows: {df.shape[0]}")



Remaining rows: 331660
