In [1]:
import pandas as pd
# import mysql.connector as mycon
import kagglehub
from kagglehub import KaggleDatasetAdapter

# This pulls in the wildfire data and reads it

In [2]:
# Only these columns are read from the wildfire CSV.
use_cols = [
    'OBJECTID', 'FIRE_NAME', 'LATITUDE', 'LONGITUDE', 'DISCOVERY_DATE',
    'NWCG_CAUSE_CLASSIFICATION', 'NWCG_GENERAL_CAUSE', 'FIRE_SIZE',
    'FIRE_SIZE_CLASS', 'OWNER_DESCR', 'STATE', 'FIPS_NAME',
]

# Keyword arguments handed to the loader as `pandas_kwargs`.
wildfire_read_options = {"usecols": use_cols, "compression": "zip"}

# Download latest version and load data.csv through the PANDAS adapter.
wildfire_df = kagglehub.load_dataset(
    handle = "behroozsohrabi/us-wildfire-records-6th-edition",
    path = "data.csv",
    adapter = KaggleDatasetAdapter.PANDAS,
    pandas_kwargs = wildfire_read_options,
)



  result = read_function(


In [3]:
# Parse DISCOVERY_DATE with the month/day/year format so records can be filtered by year.
wildfire_df['DISCOVERY_DATE'] = pd.to_datetime(wildfire_df['DISCOVERY_DATE'], format = '%m/%d/%Y')

# Remove records prior to year 2000. The comparison is inclusive (>=) so that
# fires discovered during 2000 itself are kept; a strict `>` would also discard
# that whole year, which contradicts the stated cutoff.
wildfire_df = wildfire_df[wildfire_df['DISCOVERY_DATE'].dt.year >= 2000]

In [4]:
# Give a few wildfire columns clearer names (old name -> new name).
wildfire_column_names = {
    "NWCG_CAUSE_CLASSIFICATION": "CAUSE_CLASSIFICATION",
    "NWCG_GENERAL_CAUSE": "SPECIFIC_CAUSE",
    "OWNER_DESCR": "RESPONSIBLE_ENTITY",
    "FIPS_NAME": "COUNTY",
}

wildfire_df = wildfire_df.rename(columns = wildfire_column_names)

# This pulls in housing data

In [5]:
# Read the bottom-tier and top-tier housing tables from the local data folder.
bottom_tier_housing, top_tier_housing = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('data/bottom_housing.parquet', 'data/top_housing.parquet')
)

## Remove Columns we don't use

In [6]:
# Columns that are not needed downstream; the same list is reused for the rent data.
remove_cols = ['RegionID', 'RegionType', 'StateName', 'Metro', 'SizeRank']

# Drop them from both housing tiers, modifying each frame in place.
for tier_df in (bottom_tier_housing, top_tier_housing):
    tier_df.drop(columns = remove_cols, inplace = True)

## Remove rows with states we aren't using in a way that modifies the original dataframe in-place

In [7]:
# States kept for the analysis; later cells reuse this list.
states = ['CA', 'TX', 'GA', 'FL', 'AZ']

# Remove the rows for every other state, modifying each frame in place.
#
# The rows are dropped by index label rather than being blanked out and passed
# through dropna(): a blanket dropna() also deletes every row in a kept state
# that has any missing value (for example a city missing a single month), and
# that loss is silent. Rows with missing prices are therefore retained here;
# drop them explicitly after reshaping if they are not wanted.
for tier_df in (bottom_tier_housing, top_tier_housing):
    other_state_rows = tier_df.index[~tier_df['State'].isin(states)]
    tier_df.drop(index = other_state_rows, inplace = True)


## Combine the data frames and pivot long the dates and prices

In [8]:
# Stack the two tiers row-wise into one frame with a fresh 0..n-1 index.
# NOTE(review): no column records which tier (bottom/top) a row came from -- confirm that is intended.
tier_frames = [bottom_tier_housing, top_tier_housing]
housing_df = pd.concat(tier_frames, axis = 0, ignore_index = True)

In [9]:
# Reshape wide -> long: every column other than the identifiers becomes a
# (Date, Price) row, then the Date labels are parsed into datetimes.
housing_id_columns = ['RegionName', 'State', 'CountyName']
housing_df = pd.melt(housing_df, id_vars = housing_id_columns, var_name = 'Date', value_name = 'Price')
housing_df['Date'] = pd.to_datetime(housing_df['Date'])

## Add price_id for each record

In [10]:
# Number the records 1..n and place that id in the first column.
price_ids = range(1, len(housing_df) + 1)
housing_df.insert(loc = 0, column = 'PRICE_ID', value = price_ids)

## Map states to FIPS code

In [11]:
# State abbreviation -> state FIPS code.
# NOTE(review): the name `map` shadows Python's builtin; it is kept because the
# rent-data cell further down reads the same name.
map = {
    'CA': 6,
    'AZ': 4,
    'TX': 48,
    'FL': 12,
    'GA': 13,
}

# Look up each row's state abbreviation to get its STATE_ID.
housing_df['STATE_ID'] = housing_df['State'].map(map)
# housing_df.drop(columns = 'State', inplace = True)

# Rent Data (load and remove unused columns/state values)

In [12]:
# Read the "Observed Rent Index by City" CSV from the local data/ directory.
rent_index = pd.read_csv("data/Observed Rent Index by City.csv")

In [13]:
# Columns and state values to remove are the same as for the housing sales data
# (`remove_cols` and `states` are defined in the housing cells above).
rent_index.drop(columns = remove_cols, inplace = True)

# Remove rows for states outside `states`, in place. Dropping by index label
# (instead of blanking the rows and calling dropna()) keeps rows in the selected
# states that merely have missing values; a blanket dropna() would silently
# delete those as well.
other_state_rows = rent_index.index[~rent_index['State'].isin(states)]
rent_index.drop(index = other_state_rows, inplace = True)

## Pivot long the date and price data

In [14]:
# Reshape wide -> long: every column other than the identifiers becomes a
# (Date, Price) row.
rent_index = rent_index.melt(id_vars = ['RegionName', 'State', 'CountyName'], var_name = 'Date', value_name = 'Price')

# Parse the rent table's own Date labels. Parsing housing_df['Date'] here would
# align the housing table's dates to the rent rows by index, giving the rent
# records dates that belong to a different table.
rent_index['Date'] = pd.to_datetime(rent_index['Date'])

## Add rent_id for each record

In [15]:
# Number the rent records 1..n and place that id in the first column.
rent_ids = range(1, len(rent_index) + 1)
rent_index.insert(loc = 0, column = 'RENT_ID', value = rent_ids)

## Map state codes

In [16]:
# Look up each row's state abbreviation in the state -> FIPS dictionary.
# `map` here is that dictionary (defined in the housing section), not the builtin.
rent_index['STATE_ID'] = rent_index['State'].map(map)
# rent_index.drop(columns = 'State', inplace = True)

# Load Population Data into dictionary

In [17]:
# One population-estimates DataFrame per state, keyed by state abbreviation.
state_population_dict = {}

for state in states:
    population_csv = f"data/{state} City population estimates.csv"
    state_population_dict[state] = pd.read_csv(population_csv)

In [18]:
# Population-table columns that are dropped in the clean-up loop below.
# NOTE(review): `columns_to_drop` is reassigned further down for the place-code
# tables, so these cells must be run in top-to-bottom order.
columns_to_drop = [
    'SUMLEV',
    'COUSUB',
    'CONCIT',
    'PRIMGEO_FLAG',
    'FUNCSTAT',
    'STNAME',
    'ESTIMATESBASE2010',
    'POPESTIMATE2010',
    'STATE',
    'COUNTY',
    'PLACE',
]

## Iterate through df's and create PLACES_ID column and drop unnecessary columns

In [None]:
# Clean each state's population table and store the result back in the dictionary.
for key, population_df in state_population_dict.items():
    # Combine the STATE, COUNTY and PLACE codes into a single "-"-joined identifier.
    places_id = population_df[['STATE', 'COUNTY', 'PLACE']].astype('str').agg('-'.join, axis = 1)

    # Drop the columns we won't be using (this includes the three code columns
    # that were just combined), then put PLACES_ID in the first position.
    cleaned_df = population_df.drop(columns = columns_to_drop)
    cleaned_df.insert(0, 'PLACES_ID', places_id)

    # Remove records with no 2010 census count. Coercing to numeric matches both
    # the number 0 and the string '0'; entries that are not numeric become NaN
    # and are kept, as a plain comparison against '0' would keep them.
    census_2010 = pd.to_numeric(cleaned_df['CENSUS2010POP'], errors = 'coerce')

    # Write the filtered frame back under the same key. Rebinding only a local
    # name would discard the filter and leave the dictionary entry unfiltered.
    # Replacing the value of an existing key is safe while iterating.
    state_population_dict[key] = cleaned_df[census_2010 != 0]

# Create location table

## Bring in the census data with place information

In [49]:
# Census 2020 place-by-county reference files (pipe-delimited), in the order
# CA, TX, GA, FL, AZ.
place_by_county_urls = [
    'https://www2.census.gov/geo/docs/reference/codes2020/place_by_cou/st06_ca_place_by_county2020.txt',
    'https://www2.census.gov/geo/docs/reference/codes2020/place_by_cou/st48_tx_place_by_county2020.txt',
    'https://www2.census.gov/geo/docs/reference/codes2020/place_by_cou/st13_ga_place_by_county2020.txt',
    'https://www2.census.gov/geo/docs/reference/codes2020/place_by_cou/st12_fl_place_by_county2020.txt',
    'https://www2.census.gov/geo/docs/reference/codes2020/place_by_cou/st04_az_place_by_county2020.txt',
]

# Read each file into its own DataFrame, in the same order as the URLs above.
ca_codes, tx_codes, ga_codes, fl_codes, az_codes = [
    pd.read_csv(url, delimiter = '|') for url in place_by_county_urls
]

In [55]:
# Preview the first rows of the California place-code table.
ca_codes.head()

Unnamed: 0,STATE,STATEFP,COUNTYFP,COUNTYNAME,PLACEFP,PLACENAME
0,CA,6,1,Alameda County,562,Alameda city
1,CA,6,1,Alameda County,674,Albany city
2,CA,6,1,Alameda County,2980,Ashland CDP
3,CA,6,1,Alameda County,6000,Berkeley city
4,CA,6,1,Alameda County,11964,Castro Valley CDP


## Drop columns

In [53]:
# Place-code columns that are not needed.
# NOTE(review): this reuses the name `columns_to_drop` from the population
# section, replacing that earlier list -- run the cells in top-to-bottom order.
columns_to_drop = [
    'PLACENS',
    'TYPE',
    'CLASSFP',
    'FUNCSTAT',
]

In [54]:
# Drop the unused columns from every state's place-code table, in place.
for place_codes in (ca_codes, tx_codes, ga_codes, fl_codes, az_codes):
    place_codes.drop(columns = columns_to_drop, inplace = True)