## Links

Data link: https://www.census.gov/construction/bps/msamonthly.html

## Imports

In [1]:
import pandas as pd
import numpy as np
import glob
import os

## Data read-in

In [2]:
# List the Excel workbooks in the working directory. glob returns matches in
# arbitrary (filesystem-dependent) order, so sort for a reproducible listing.
xls_files = sorted(glob.glob('*.xls'))

# NOTE: the workbooks are not bulk-loaded from this list. Each year is read
# explicitly in the cells below because the header row offset differs between
# files (header=7 for the 2023 preliminary file, header=5 for 2019-2022), and
# the files are Excel workbooks, so pd.read_csv would not parse them.

In [3]:
# 2023 preliminary annual workbook, 'MSA Units' sheet; column labels are taken
# from 0-indexed row 7 of the sheet.
df_2023 = (
    pd.read_excel('msaannual_2023prelim.xls', sheet_name='MSA Units', header=7)
    .dropna(thresh=4)    # keep only rows with at least 4 non-missing cells
    .assign(Year=2023)   # tag every row with its reporting year
)

In [4]:
# 2022 annual workbook, 'MSA Units' sheet; column labels are taken from
# 0-indexed row 5 of the sheet.
df_2022 = (
    pd.read_excel('msaannual_202299.xls', sheet_name='MSA Units', header=5)
    .dropna(thresh=4)    # keep only rows with at least 4 non-missing cells
    .assign(Year=2022)   # tag every row with its reporting year
)

In [5]:
# 2021 annual workbook, 'MSA Units' sheet; column labels are taken from
# 0-indexed row 5 of the sheet.
df_2021 = (
    pd.read_excel('msaannual_202199.xls', sheet_name='MSA Units', header=5)
    .dropna(thresh=4)    # keep only rows with at least 4 non-missing cells
    .assign(Year=2021)   # tag every row with its reporting year
)

In [6]:
# 2020 annual workbook, 'MSA Units' sheet; column labels are taken from
# 0-indexed row 5 of the sheet.
df_2020 = (
    pd.read_excel('msaannual_202099.xls', sheet_name='MSA Units', header=5)
    .dropna(thresh=4)    # keep only rows with at least 4 non-missing cells
    .assign(Year=2020)   # tag every row with its reporting year
)

In [7]:
# 2019 annual workbook, 'MSA Units' sheet; column labels are taken from
# 0-indexed row 5 of the sheet.
df_2019 = (
    pd.read_excel('msaannual_201999.xls', sheet_name='MSA Units', header=5)
    .dropna(thresh=4)    # keep only rows with at least 4 non-missing cells
    .assign(Year=2019)   # tag every row with its reporting year
)

In [8]:
# Sanity check: print the row count of each yearly frame (newest year first)
df_list = [df_2023, df_2022, df_2021, df_2020, df_2019]

for row_count in map(len, df_list):
    print(row_count)

384
384
384
384
384


In [27]:
# Third-largest 2023 metro by 'Total': sort descending, then take position 2
ranked_2023 = df_2023.sort_values(by='Total', ascending=False)
ranked_2023.iloc[2]

CSA                                                                                   429.0
CBSA                                                                                38060.0
Name                                      Phoenix-Mesa-Chandler, AZ                     ...
Total                                                                               45637.0
1 Unit                                                                              24810.0
2 Units                                                                              1746.0
3 and 4 Units                                                                         149.0
5 Units or More                                                                     18932.0
Num of Structures With 5 Units or More                                                537.0
Year                                                                                   2023
Name: 271, dtype: object

In [44]:
# Show the exact 'Name' value(s) of 2023 rows whose name mentions Miami
mentions_miami = df_2023['Name'].str.contains('Miami')
df_2023.loc[mentions_miami, 'Name']

226    Miami-Fort Lauderdale-Pompano Beach, FL       ...
Name: Name, dtype: object

In [32]:
# For every yearly frame, record the five metros with the largest 'Total'.
# There is one list per rank; each entry is "<metro name> | Year: <year>".
top_names, second_names, third_names, fourth_names, fifth_names = [], [], [], [], []
names_by_rank = (top_names, second_names, third_names, fourth_names, fifth_names)

for yearly_frame in df_list:
    ranked = yearly_frame.sort_values(by='Total', ascending=False)

    # Each frame was tagged with a single Year at load time, so the first row is representative
    year_label = str(ranked['Year'].iloc[0])
    ranked_names = ranked['Name']

    # Position 0 feeds top_names, position 1 feeds second_names, and so on
    for position, bucket in enumerate(names_by_rank):
        bucket.append(ranked_names.iloc[position].strip() + ' | Year: ' + year_label)

# Print the rank lists in order, with a divider line between consecutive sections
report_sections = (
    ("Top Names:\n", top_names),
    ("Second Names:\n", second_names),
    ("Third Names:\n", third_names),
    ("Fourth Names:\n", fourth_names),
    ("Fifth Names:\n", fifth_names),
)
for section_index, (heading, bucket) in enumerate(report_sections):
    if section_index > 0:
        print('----------')
    print(heading, bucket)

Top Names:
 ['Houston-The Woodlands-Sugar Land, TX | Year: 2023', 'Dallas-Fort Worth-Arlington, TX | Year: 2022', 'Dallas-Fort Worth-Arlington, TX | Year: 2021', 'Houston-The Woodlands-Sugar Land, TX | Year: 2020', 'Houston-The Woodlands-Sugar Land, TX | Year: 2019']
----------
Second Names:
 ['Dallas-Fort Worth-Arlington, TX | Year: 2023', 'Houston-The Woodlands-Sugar Land, TX | Year: 2022', 'Houston-The Woodlands-Sugar Land, TX | Year: 2021', 'Dallas-Fort Worth-Arlington, TX | Year: 2020', 'Dallas-Fort Worth-Arlington, TX | Year: 2019']
----------
Third Names:
 ['Phoenix-Mesa-Chandler, AZ | Year: 2023', 'New York-Newark-Jersey City, NY-NJ-PA | Year: 2022', 'New York-Newark-Jersey City, NY-NJ-PA | Year: 2021', 'New York-Newark-Jersey City, NY-NJ-PA | Year: 2020', 'New York-Newark-Jersey City, NY-NJ-PA | Year: 2019']
----------
Fourth Names:
 ['New York-Newark-Jersey City, NY-NJ-PA | Year: 2023', 'Atlanta-Sandy Springs-Alpharetta, GA | Year: 2022', 'Austin-Round Rock-Georgetown, TX | Y

## Miami Metro area permit comparison

In [46]:
# Compare the Miami metro's permit total against every other metro, year by year.
df_list = [df_2023, df_2022, df_2021, df_2020, df_2019]

# Single source of truth for the metro under comparison (used for lookup and labels)
metro_name = 'Miami-Fort Lauderdale-Pompano Beach, FL'

for df in df_list:
    # Each frame was tagged with a single Year at load time, so the first row is representative
    year = str(df['Year'].iloc[0])

    # regex=False matches the name literally rather than as a regular expression;
    # na=False makes rows with a missing Name count as "no match" instead of
    # producing a NaN mask that cannot be used for row selection.
    metro_totals = df.loc[df['Name'].str.contains(metro_name, regex=False, na=False), 'Total']
    if metro_totals.empty:
        # Fail with a message naming the metro and year instead of a bare IndexError
        raise ValueError(f"No metro matching '{metro_name}' found in the {year} data")
    specified_value = metro_totals.iloc[0]

    # Strict comparisons: metros tied with the reference value (including the
    # reference metro itself) are counted in neither bucket.
    higher_count = (df['Total'] > specified_value).sum()
    lower_count = (df['Total'] < specified_value).sum()

    print(f"Year: {year}")
    print(f"Permits for '{metro_name}': {specified_value}")
    print(f"Number of metros with a higher value: {higher_count}")
    print(f"Number of metros with a lower value: {lower_count}\n")

Year: 2023
Permits for 'Miami-Fort Lauderdale-Pompano Beach, FL': 21329.0
Number of metros with a higher value: 12
Number of metros with a lower value: 371

Year: 2022
Permits for 'Miami-Fort Lauderdale-Pompano Beach, FL': 20021.0
Number of metros with a higher value: 18
Number of metros with a lower value: 365

Year: 2021
Permits for 'Miami-Fort Lauderdale-Pompano Beach, FL': 25313.0
Number of metros with a higher value: 15
Number of metros with a lower value: 368

Year: 2020
Permits for 'Miami-Fort Lauderdale-Pompano Beach, FL': 21758.0
Number of metros with a higher value: 13
Number of metros with a lower value: 370

Year: 2019
Permits for 'Miami-Fort Lauderdale-Pompano Beach, FL': 20688.0
Number of metros with a higher value: 14
Number of metros with a lower value: 369



## Plotly Visual

In [48]:
# Stack the yearly frames into one long table; the per-frame 'Year' column
# tells the rows apart.
#  - ignore_index=True: each yearly frame brings its own row labels, so a plain
#    concat would repeat labels across years; this gives a single unique index.
#  - 'Name' values carry trailing padding from the source sheets (visible in the
#    displayed rows above), so strip them; otherwise the same metro could be
#    treated as different categories if the padding differs between years.
combined_df = (
    pd.concat(df_list, ignore_index=True)
    .assign(Name=lambda frame: frame['Name'].str.strip())
)

In [50]:
import plotly.express as px

# One marker per (metro, year) row: year on x, permitted-unit total on y,
# coloured by metro name, with the name also shown on hover.
scatter_options = dict(
    x='Year',
    y='Total',
    color='Name',
    hover_name='Name',
    title='Total by Year for Each Metro Area',
)
layout_options = dict(
    xaxis_title='Year',
    yaxis_title='Total',
    legend_title='Metro Area',
)

fig = px.scatter(combined_df, **scatter_options)
fig.update_layout(**layout_options)

# Save a standalone HTML copy, then render inline
fig.write_html('index.html')
fig.show()

## Image snagger

In [51]:
# Build the public URL for this project: the site's base URL followed by the
# name of the current working directory.
base_name = 'https://trd-digital.github.io/trd-news-interactive-maps/'

cwd = os.getcwd()

# os.path.basename understands the platform's path separator, so this also
# works where paths are not '/'-delimited (splitting on '/' would return the
# whole path there).
folder_name = os.path.basename(cwd)

final_name = base_name + folder_name
print(final_name)

https://trd-digital.github.io/trd-news-interactive-maps/Census_Bureau_Permit_Data
