In [17]:
# !pip install squarify
# !pip install bubbly

In [18]:
# Dependencies and Setup
import os
import shutil
from zipfile import ZipFile

import matplotlib.pyplot as plt
import matplotlib

import pandas as pd
import requests
#import seaborn as sns
import scipy.stats as st
from scipy.stats import linregress
from pathlib import Path


In [19]:
# Notebook configuration
# verbose = True  # Only set this to true for debugging
verbose = False

In [20]:
# The BLS files have all the county employment / wage info for our selected years and
# counties.

# Do not need anymore, keeping in case we missed something.
# nys_industycounty_path = "housing_data/NYS_Industry_by_County.csv"

# Study parameters

# Years 2012 - 2022
years = [2012, 2013, 2014, 2015, 2016, 2017, 2018,  # Before pandemic lockdown
         2019, 2020, 2021, 2022]  # During and after pandemic lockdown

# All NYC Boroughs and geographically adjacent counties
counties = [
    # NYC Boroughs
    "Bronx County, New York",
    "Kings County, New York",  # Brooklyn
    "New York County, New York",  # Manhattan
    "Queens County, New York",
    "Richmond County, New York",  # Staten Island

    # Counties adjacent to NYC Boroughs
    "Bergen County, New Jersey",
    "Nassau County, New York",
    "Hudson County, New Jersey",
    "Westchester County, New York",
    "Rockland County, New York",
    "Fairfield County, Connecticut",
]

In [21]:
# Download and/or extract BLS datasets from BLS site or file system (if pre-downloaded)
bls_csv_dir = "housing_data"
bls_csv_files = {}
for year in years:
    zip_fn = f"{year}_qtrly_by_area.zip"
    zip_url = f"https://data.bls.gov/cew/data/files/{year}/csv/{zip_fn}"
    zip_path = f"{bls_csv_dir}/{zip_fn}"

    # Download the zip archive with the data for each year if it's not in the file system.
    if not os.path.isfile(zip_path):
        head_response = requests.head(zip_url)
        file_size = float(head_response.headers['Content-Length'])
        if file_size >= 2**30:
            file_size, units = file_size / 2**30, 'GiB'
        elif file_size >= 2**20:
            file_size, units = file_size / 2**20, 'MiB'
        elif file_size >= 2**10:
            file_size, units = file_size / 2**10, 'kiB'
        else:
            units = 'B'
        file_size = f"{file_size:03.2f} {units}"
        
        print(f"Downloading {zip_url!r} to {zip_path!r} for year {year}.")
        print(f"Expected size: {file_size}")
        get_response = requests.get(zip_url)
        with open(zip_path, 'wb') as zip_file:
            zip_file.write(get_response.content)

    # Find the csv for each county within the archive
    # print(f"Listing files in {zip_path!r} zip archive.")
    with ZipFile(zip_path) as zip_archive:
        files = zip_archive.namelist()
        for file in files:
            for county in counties:
                if file.endswith(f'{county}.csv'):
                    # print(f"{file=!r}")
                    csv_key = (year, county)
                    csv_dir, csv_fn = os.path.split(file)
                    csv_path = os.path.join(bls_csv_dir, csv_fn)
                    # print(f"{csv_fn=!r}")
                    bls_csv_files[csv_key] = csv_path
                    zip_archive.extract(file, bls_csv_dir)
                    shutil.move(f"{bls_csv_dir}/{file}", csv_path)
                    shutil.rmtree(f"{bls_csv_dir}/{csv_dir}")
if verbose:
    for key, path in bls_csv_files.items():
        year, county = key
        print(f"BLS employment / wage info csv for {year=} and {county=!r}: {path=!r}")
print(f"Found {len(bls_csv_files)} BLS employment / wage info csv files (expected {len(counties) * len(years)}).")

Found 121 BLS employment / wage info csv files (expected 121).


In [14]:
#Make sure that files are unzipped from Finder (stores files). Remove the ".zip" from the file name. Then it should recongize and read the file.
os.listdir(bls_csv_dir)

['2012.q1-q4 36005 Bronx County, New York.csv',
 '2015.q1-q4 36047 Kings County, New York.csv',
 '2022.q1-q4 36087 Rockland County, New York.csv',
 '2020.q1-q4 36047 Kings County, New York.csv',
 '2016.q1-q4 36085 Richmond County, New York.csv',
 '2019.q1-q4 36119 Westchester County, New York.csv',
 '2017.q1-q4 36085 Richmond County, New York.csv',
 '2014.q1-q4 36119 Westchester County, New York.csv',
 '2016.q1-q4 36047 Kings County, New York.csv',
 '2018.q1-q4 36061 New York County, New York.csv',
 '2022.q1-q4 36005 Bronx County, New York.csv',
 '2020.q1-q4 34003 Bergen County, New Jersey.csv',
 '.DS_Store',
 '2021.q1-q4 36061 New York County, New York.csv',
 '2017.q1-q4 34017 Hudson County, New Jersey.csv',
 '2019.q1-q4 34003 Bergen County, New Jersey.csv',
 '2017.q1-q4 36005 Bronx County, New York.csv',
 '2013.q1-q4 36081 Queens County, New York.csv',
 '2012.q1-q4 36081 Queens County, New York.csv',
 '2020.q1-q4 36061 New York County, New York.csv',
 '2013.q1-q4 36047 Kings County, 

In [22]:
printed_example = False
dfs = []  # Intentionallt plural
for key, path in bls_csv_files.items():
    year, county = key
    df = pd.read_csv(Path(path))  # Intentionally singular
    df["Year"] = year
    df["County"] = county
    # if not printed_example:
    #     print(df.head())
    #     printed_example = True
    dfs.append(df)

In [26]:
merged_df = pd.concat(dfs)[["Year", "County", "qtr","agglvl_code", "agglvl_title","area_fips","area_title",
                            "avg_wkly_wage","industry_code","industry_title","lq_avg_wkly_wage","lq_month1_emplvl",
                            "lq_month2_emplvl","lq_month3_emplvl","lq_qtrly_contributions","lq_qtrly_estabs_count",
                            "lq_total_qtrly_wages","month1_emplvl","month2_emplvl", "month3_emplvl","oty_avg_wkly_wage_chg",
                            "oty_avg_wkly_wage_pct","oty_avg_wkly_wage_pct_chg","oty_month1_emplvl_chg","oty_month1_emplvl_pct",
                            "oty_month1_emplvl_pct_chg","oty_month2_emplvl_chg","oty_month2_emplvl_pct","oty_month2_emplvl_pct_chg",
                            "oty_month3_emplvl_chg","oty_month3_emplvl_pct","oty_month3_emplvl_pct_chg","oty_qtrly_contributions_chg",
                            "oty_qtrly_contributions_pct","oty_qtrly_contributions_pct_chg","oty_qtrly_estabs_count_chg", "oty_qtrly_estabs_count_pct_chg",
                            "oty_total_qtrly_wages_chg","oty_total_qtrly_wages_pct","oty_total_qtrly_wages_pct_chg",
                            "own_code","own_title","qtr","qtrly_contributions","qtrly_estabs_count","size_code","size_title",
                            "total_qtrly_wages","year",
                           ]]  # Select which olumns to keep
merged_df = merged_df.rename(columns={  # Rename columns for readability and brevity
    "County":"County",
    "Year":"Year",
    "agglvl_code":"Aggreg_Code",
    "agglvl_title":"Aggreg_Title",
    "area_fips":"Area_Fips",
    "area_title":"Area_Title",
    "avg_wkly_wage":"Avg_Wk_Wage",
    "industry_code":"Industry_Code",
    "industry_title":"Industry_Title",
    "lq_avg_wkly_wage":"Loc_Avg_Wk_Employ",
    "lq_month1_emplvl":"Loc_Mon1_Employ",
    "lq_month2_emplvl":"Loc_Mon2_Employ",
    "lq_month3_emplvl":"Loc_Mon3_Employ",
    "lq_qtrly_contributions":"Loc_Quart_Cont",
    "lq_qtrly_estabs_count":"Loc_Quart_Est_Ct",
    "lq_total_qtrly_wages":"Loc_Tot_Quart_Wages",
    "month1_emplvl":"Mon1_Employ",
    "month2_emplvl":"Mon2_Employ",
    "month3_emplvl":"Mon3_Employ",
    "oty_avg_wkly_wage_chg":"Endyr_Avg_Wk_Wage",
    "oty_avg_wkly_wage_pct_chg":"Endyr_Avg_Wk_WagePct",
    "oty_avg_wkly_wage_pct_chg":"Endyr_Avg_Wk_WagePctChg",
    "oty_month1_emplvl_chg":"Endyr_Mon1_Employ_Chg",
    "oty_month1_emplvl_pct":"Endyr_Mon1_Employ_Pct",
    "oty_month1_emplvl_pct_chg" :"Endyr_Mon1_Employ_PctChg",
    "oty_month2_emplvl_chg":"Endyr_Mon2_Employ_Chg",
    "oty_month2_emplvl_pct":"Endyr_Mon2_Employ_Pct",
    "oty_month2_emplvl_pct_chg":"Endyr_Mon2_Employ_PctChg",
    "oty_month3_emplvl_chg":"Endyr_Mon3_Employ_Chg",
    "oty_month3_emplvl_pct":"Endyr_Mon3_Employ_Pct",
    "oty_month3_emplvl_pct_chg":"Endyr_Mon3_Employ_PctChg",
    "oty_qtrly_contributions_chg":"Endyr_Quart_Cont_Chg",
    "oty_qtrly_contributions_pct":"Endyr_Quart_Con_Pct",
    "oty_qtrly_contributions_pct_chg":"Endyr_Quart_Cont_PctChg",
    "oty_qtrly_estabs_count_chg":"Endyr_Est_Ct_Chg",
    "oty_qtrly_estabs_count_pct_chg":"Endyr_Est_Ct_PctChg",
    "oty_total_qtrly_wages_chg":"Endyr_Total_Quart_Wages_Chg",
    "oty_total_qtrly_wages_pct":"Endyr_Total_Quart_Wages_Pct",
    "own_code":"Own_Code",
    "own_title":"Own_Title",
    "qtr":"Quarter",
    "qtrly_contributions":"Quarter_Cont",
    "qtrly_estabs_count":"Quart_Est_Ct",
    "size_code":"Size_Code",
    "size_title":"Size_Title",
    "total_qtrly_wages":"Total_Quart_Wages",
    "year":"year"
})
clean_df = merged_df[merged_df["Quarter"].isin([1,2,3,4])]  # Example filter; replace as needed.
clean_df.head(5)

ValueError: cannot reindex on an axis with duplicate labels

In [36]:
merged_df['tmp'] = merged_df["Quarter"].values[:,1]
merged_df.drop(columns=['Quarter'], inplace=True)
merged_df.rename(columns={'tmp':'Quarter'})
merged_df["Quarter"]

KeyError: 'Quarter'

In [None]:
clean_df.head(5)

In [None]:
clean_df["Quarter"].unique()

In [None]:
print(sorted(map(str, merged_df.columns)))

In [None]:
#Columns

#County: The name of the county.
# Year: The year the data pertains to.
#agglvl_code: Aggregation level code, indicating the level of data aggregation (e.g., state, county, industry).
#agglvl_title: Title or description of the aggregation level.
#area_fips: Federal Information Processing Standards (FIPS) code for the area.
#area_title: Name or title of the area.
#avg_wkly_wage: Average weekly wage.
#disclosure_code: Code indicating if data is withheld or disclosed for privacy reasons.
#industry_code: Code representing the industry.
#industry_title: Title or name of the industry.
#lq_avg_wkly_wage: Location quotient of the average weekly wage, comparing local industry wages to national industry wages.
#lq_disclosure_code: Location quotient disclosure code.
#lq_month1_emplvl: Employment level for the first month in the location quotient context.
#lq_month2_emplvl: Employment level for the second month in the location quotient context.
#lq_month3_emplvl: Employment level for the third month in the location quotient context.
#lq_qtrly_contributions: Quarterly contributions in the location quotient context.
#lq_qtrly_estabs_count: Quarterly establishment count in the location quotient context.
#lq_taxable_qtrly_wages: Taxable quarterly wages in the location quotient context.
#month1_emplvl: Employment level for the first month.
#month2_emplvl: Employment level for the second month.
#month3_emplvl: Employment level for the third month.
#oty_avg_wkly_wage_chg: Over-the-year change in average weekly wage.
#oty_avg_wkly_wage_pct: Over-the-year percentage of the average weekly wage.
#oty_avg_wkly_wage_pct_chg: Over-the-year percentage change in average weekly wage.
#oty_month1_emplvl_chg: Over-the-year change in employment level for the first month.
#oty_month1_emplvl_pct: Over-the-year percentage of employment level for the first month.
#oty_month1_emplvl_pct_chg: Over-the-year percentage change in employment level for the first month.
#oty_month2_emplvl_chg: Over-the-year change in employment level for the second month.
#oty_month2_emplvl_pct: Over-the-year percentage of employment level for the second month.
#oty_month2_emplvl_pct_chg: Over-the-year percentage change in employment level for the second month.
#oty_month3_emplvl_chg: Over-the-year change in employment level for the third month.
#oty_month3_emplvl_pct: Over-the-year percentage of employment level for the third month.
#oty_month3_emplvl_pct_chg: Over-the-year percentage change in employment level for the third month.
#oty_qtrly_contributions_chg: Over-the-year change in quarterly contributions.
#oty_qtrly_contributions_pct: Over-the-year percentage of quarterly contributions.
#oty_qtrly_contributions_pct_chg: Over-the-year percentage change in quarterly contributions.
#oty_qtrly_estabs_count_chg: Over-the-year change in quarterly establishment count.
#oty_qtrly_estabs_count_pct_chg: Over-the-year percentage change in quarterly establishment count.
#oty_taxable_qtrly_wages_chg: Over-the-year change in taxable quarterly wages.
#oty_taxable_qtrly_wages_chg.1: Duplicate or additional field for over-the-year change in taxable quarterly wages.
#oty_taxable_qtrly_wages_pct_chg: Over-the-year percentage change in taxable quarterly wages.
#oty_total_qtrly_wages_chg: Over-the-year change in total quarterly wages.
#oty_total_qtrly_wages_pct: Over-the-year percentage of total quarterly wages.
#oty_total_qtrly_wages_pct_chg: Over-the-year percentage change in total quarterly wages.
#own_code: Ownership code, indicating the type of ownership (e.g., private, government).
#own_title: Title or description of the ownership type.
#qtr: Quarter of the year.
#qtrly_contributions: Quarterly contributions.
#qtrly_estabs_count: Quarterly establishment count.
#size_code: Size code, indicating the size category of establishments.
#size_title: Title or description of the size category.
#taxable_qtrly_wages: Taxable quarterly wages.
#total_qtrly_wages: Total quarterly wages.
#year: Year the data pertains to (appears to be a duplicate of column 2)

In [None]:
#Renamed Columns

    #"County":"County",
    #"Year":"Year",
    #"agglvl_code":"Aggreg_Code",
    #"agglvl_title":"Aggreg_Title",
    #"area_fips":"Area_Fips",
    #"area_title":"Area_Title",
    #"avg_wkly_wage":"Avg_Wk_Wage",
    #"industry_code":"Industry_Code",
    #"industry_title":"Industry_Title",
    #"lq_avg_wkly_wage":"Loc_Avg_Wk_Employ",
    #"lq_month1_emplvl":"Loc_Mon1_Employ",
    #"lq_month3_emplvl":"Loc_Mon3_Employ",
    #"lq_qtrly_contributions":"Loc_Quart_Cont",
    #"lq_qtrly_estabs_count":"Loc_Quart_Est_Ct",
    #"lq_total_qtrly_wages":"Loc_Tot_Quart_Wages",
    #"month1_emplvl":"Mon1_Employ",
    #"month2_emplvl":"Mon2_Employ",
    #"month3_emplvl":"Mon3_Employ",
    #"oty_avg_wkly_wage_chg":"Endyr_Avg_Wk_Wage",
    #"oty_avg_wkly_wage_pct_chg":"Endyr_Avg_Wk_WagePct",
    #"oty_avg_wkly_wage_pct_chg":"Endyr_Avg_Wk_WagePctChg",
    #"oty_month1_emplvl_chg":"Endyr_Mon1_Employ_Chg",
    #"oty_month1_emplvl_pct":"Endyr_Mon1_Employ_Pct",
    #"oty_month1_emplvl_pct_chg" :"Endyr_Mon1_Employ_PctChg",
    #"oty_month2_emplvl_chg":"Endyr_Mon2_Employ_Chg",
    #"oty_month2_emplvl_pct":"Endyr_Mon2_Employ_Pct",
    #"oty_month2_emplvl_pct_chg":"Endyr_Mon2_Employ_PctChg",
    #"oty_month3_emplvl_chg":"Endyr_Mon3_Employ_Chg",
    #"oty_month3_emplvl_pct":"Endyr_Mon3_Employ_Pct",
    #"oty_month3_emplvl_pct_chg":"Endyr_Mon3_Employ_PctChg",
    #"oty_qtrly_contributions_chg":"Endyr_Quart_Cont_Chg",
    #"oty_qtrly_contributions_pct":"Endyr_Quart_Con_Pct",
    #"oty_qtrly_contributions_pct_chg":"Endyr_Quart_Cont_PctChg",
    #"oty_qtrly_estabs_count_chg":"Endyr_Est_Ct_Chg",
    #"oty_qtrly_estabs_count_pct_chg":"Endyr_Est_Ct_PctChg",
    #"oty_total_qtrly_wages_chg":"Endyr_Total_Quart_Wages_Chg",
    #"oty_total_qtrly_wages_pct":"Endyr_Total_Quart_Wages_Pct",
    #"own_code":"Own_Code",
    #"own_title":"Own_Title",
    #"qtr":"Quarter",
    #"qtrly_contributions":"Quarter_Cont",
    #"qtrly_estabs_count":"Quart_Est_Ct",
    #"size_code":"Size_Code",
    #"size_title":"Size_Title",
    #"total_qtrly_wages":"Total_Quart_Wages",
    #"year":"year"

In [None]:
# Read the employment data and the nys industry results
#first_quarteremployment = pd.read_csv(first_quarteremployment_path)
#nys_industrycounty = pd.read_csv(nys_industrycounty_path)

# Combine the data into a single DataFrame
#first_quarteremployment_nys_industrycounty = pd.merge(first_quarteremployment, nys_industrycounty,on=["County"])

# Display the data table for preview
#first_quarteremployment_nys_industrycounty.head()

In [None]:
pwd = os.getcwd()  # Get current working directory
files = os.listdir()  # Get files in current working directory
print(f"{pwd=!r}")
for file in files:
    print(f"{file=!r}")

In [None]:
clean_df['Year'] = clean_df['Year'].astype(str)
plt.figure(figsize=(8,5))
g = sns.FacetGrid(merged_df, hue='Year', size=10, hue_order=['2012',
                                                              '2013',
                                                              '2014',
                                                              '2015',
                                                              '2016',
                                                              '2017',
                                                              '2018',
                                                              '2019',
                                                              '2020',
                                                              '2021',
                                                              '2022'], palette="Paired")
g.map(sns.kdeplot, "Total_Quart_Wages", shade=True)
g.set_xticklabels(rotation=45)
g.add_legend()
plt.show()

In [None]:
#Variables that will be considered in the data analysis
    #"County":"County",
    #"Year":"Year",
    #"agglvl_code":"Aggreg_Code",
    #"agglvl_title":"Aggreg_Title",
    #"area_fips":"Area_Fips",
    #"area_title":"Area_Title",
    #"avg_wkly_wage":"Avg_Wk_Wage",
    #"industry_code":"Industry_Code",
    #"industry_title":"Industry_Title",
    #"lq_qtrly_contributions":"Loc_Quart_Cont",
    #"lq_qtrly_estabs_count":"Loc_Quart_Est_Ct",
    #"lq_total_qtrly_wages":"Loc_Tot_Quart_Wages",
    #"oty_avg_wkly_wage_chg":"Endyr_Avg_Wk_Wage",
    #"oty_avg_wkly_wage_pct_chg":"Endyr_Avg_Wk_WagePct",
    #"oty_avg_wkly_wage_pct_chg":"Endyr_Avg_Wk_WagePctChg"
    #"oty_qtrly_contributions_chg":"Endyr_Quart_Cont_Chg",
    #"oty_qtrly_contributions_pct":"Endyr_Quart_Con_Pct",
    #"oty_qtrly_contributions_pct_chg":"Endyr_Quart_Cont_PctChg",
    #"oty_qtrly_estabs_count_chg":"Endyr_Est_Ct_Chg",
    #"oty_qtrly_estabs_count_pct_chg":"Endyr_Est_Ct_PctChg",
    #"oty_total_qtrly_wages_chg":"Endyr_Total_Quart_Wages_Chg",
    #"oty_total_qtrly_wages_pct":"Endyr_Total_Quart_Wages_Pct",

In [None]:
#Groupby County and Average Weekly Wage
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the average wk wage for each county
avg_wk_wage_county_mean = clean_df.groupby('County')['Avg_Wk_Wage ($)'].mean()
avg_wk_wage_county_mean

avg_wk_wage_county_median = clean_df.groupby('County')['Avg_Wk_Wage ($)'].median()
avg_wk_wage_county_median

avg_wk_wage_county_variance = clean_df.groupby('County')['Avg_Wk_Wage ($)'].var()
avg_wk_wage_county_variance

avg_wk_wage_county_standard_deviation = clean_df.groupby('County')['Avg_Wk_Wage ($)'].std()
avg_wk_wage_countystandard_deviation

avg_wk_wage_county_sem = clean_df.groupby('County')['Avg_Wk_Wage ($)'].sem()
avg_wk_wage_county_sem

avg_wk_wage_county_stats_table = pd.DataFrame({"Mean":avg_wk_wage_county_mean, "Median":avg_wk_wage_county_median,
                                                 "Variance":vg_wk_wage_county_variance, "Standard Deviation":avg_wk_wage_county_standard_deviation,
                                                 "Sem":avg_wk_wage_county_sem})
avg_wk_wage_county_summary_stats_table

print(avg_wk_wage_countysummary_stats_table)
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen:
# mean, median, variance, standard deviation, and SEM of the tumor volume.
# Assemble the resulting series into a single summary DataFrame.

In [None]:
#Info on the dataset
clean_df.info()

In [None]:
#Check the shape of the dataset
clean_df.shape

In [None]:
print(f"There are {} rows and {} columns in our data".format(df.shape[0],df.shape[1])))

In [None]:
#Check for missing values in the dataset ##Data cleaning
clean_df.isnull().sum(axis = 0) #alt code clean_df.isnull().values.any()

In [None]:
#Drop missing values ##Data cleaning
clean_df.isnull().dropna()    #for column in clean_df.columns:
                                  #if clean_df[column].isnull().any():
                                            #print('{0} has {1} null values.format(colum, df[columb].isnull().sum()))

In [None]:
#Summary Statistics of the Numerical dataset 
clean_df.describe().T

In [None]:
#Check how many years
clean_df['Year'].value_counts()

In [None]:
#Check how many counties
cleandf['County'].value_counts()

In [None]:
#Check how many quarters
clean_df['Quarter'].value_counts()

In [None]:
#Check how many area fips: Federal info  Processing Standards code for the area
clean_df['Area_Fips'].value_counts()

In [None]:
#Check how many average weekly wages #Avg_Wk_Wage
clean_df['Avg_Wk_Wage'].value_counts()

In [None]:
#"Check how many Industry_Codes
clean_df['Industry_Codes'].value_counts()

In [None]:
#Check how many Industry_Titles
clean_df['Industry_Titles'].value_counts()

In [None]:
#Check how many over the year average weekly wage change #Endyr_Avg_Wk_Wage
clean_df['Endyr_Avg_Wk_Wage'].value_counts()

In [None]:
#Check how many over the year average weekly wage change percentage #Endyr_Avg_Wk_WagePct
clean_df['Endyr_Avg_Wk_WagePct'].value_counts()

In [None]:
#Check how many over the year average weekly wage change percentage Endyr_Avg_Wk_WagePctChg
clean_df['Endyr_Avg_Wk_WagePctChg'].value_counts()

In [None]:
#Histogram of Average and Weekly Wage

plt.hist(clean_df['Avg_Wk_Wage'], bins=50, alpha=  0.5, color='r', label='Avg_Wk_Wage')
plt.hist(clean_df['Endyr_Avg_Wk_Wage'],     bins=50, alpha = 0.5, color='b', label='Endyr_Avg_Wk_Wage')
plt.xlabel(clean_'Average Weekly Wage ($/year)')
plt.title('Distribution of Average Weekly Wage')

plt.axvline(clean_df['Avg_Wk_Wage'].quantile(.75), color='r')
plt.axvline(clean_df['Endyr_Avg_Wk_Wage'].quantile(.75), color='b')

plt.legend()
plt.show()

In [None]:
# Industry Titles #Careers
 
Industry_Catergory_df = clean_df['Industry_Title'].value_counts()[:10]
plt.figure(figsize=(10,10))
res=sns.barplot(x=Industry_Catergory_df, y=Industry_Catergory_df.index)
res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 16, color='black')
plt.xlabel('Value Counts of Industry Category',fontsize = 16, color='black')
plt.ylabel('Top 10 Industry Category Names',fontsize = 16, color='black')
plt.title('Industry Categories in NYC and Surrounding Counties',fontsize = 16, color='black')
plt.show()

In [None]:
# Average Weekly Wages
 
Avg_Wk_Wage_df = clean_df['Avg_Wk_Wage'].value_counts()[:10]
plt.figure(figsize=(10,10))
res=sns.barplot(x=Avg_Wk_Wage_df, y=Avg_Wk_Wage_df.index)
res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 16, color='black')
plt.xlabel('Value Counts of Average Weekly Wage',fontsize = 16, color='black')
plt.ylabel('Top 10 Average Weekly Wage',fontsize = 16, color='black')
plt.title('Average Weekly Wage in NYC and Surrounding Counties',fontsize = 16, color='black')
plt.show()

In [None]:
# End of year Average Weekly Wages
 
Endyr_Avg_Wk_Wage_df = df['Endyr_Avg_Wk_Wage'].value_counts()[:10]
plt.figure(figsize=(10,10))
res=sns.barplot(x=Endyr_Avg_Wk_Wage_df, y=Endyr_Avg_Wk_Wage_df.index)
res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 16, color='black')
plt.xlabel('Value Counts of Average Weekly Wage',fontsize = 16, color='black')
plt.ylabel('Top 10 Average Weekly Wage',fontsize = 16, color='black')
plt.title('Average Weekly Wage in NYC and Surrounding Counties',fontsize = 16, color='black')
plt.show()

In [None]:
#ScatterPlot Industry Titles (Careers) per Year
clean_df['Year'] = pd.to_datetime(df['Year'], format='%m/%d/%Y')

df.set_index('Year', inplace=True)

clean_df_monthly = df.resample('M').size()

plt.figure(figsize=(12, 6))
plt.plot(clean_df_monthly.index, df_monthly.values, marker='o', linestyle='-', color='b')
plt.title('Number of Careers Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Careers')
plt.grid(True)
plt.show()

In [None]:
#Pie chart  #"oty_avg_wkly_wage_pct_chg":"Endyr_Avg_Wk_WagePct"
endyr_awk_wagepct_cts = df['Endyr_Avg_Wk_WagePct'].value_counts()

# Plotting
plt.figure(figsize=(10, 10))
plt.pie(endyr_awk_wageptchg_cts, labels=endyr_awk_wageptchg_cts.index, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of End of the Year Average Weekly Wage (% Change)')
plt.show()

In [None]:
#Pie chart #"oty_avg_wkly_wage_pct_chg":"Endyr_Avg_Wk_WagePctChg"
endyr_awk_wagepctchg_cts = df['Endyr_Avg_Wk_WagePctChg'].value_counts()

# Plotting
plt.figure(figsize=(10, 10))
plt.pie(endyr_awk_wagepctchg_cts, labels=endyr_awk_wagepctchg_cts.index, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of End of the Year Average Weekly Wage (% Change)')
plt.show()

In [None]:
#Pie chart #"oty_qtrly_contributions_chg":"Endyr_Quart_Cont_Chg"
endyr_quart_contchg_cts = df['Endyr_Quart_Cont_Chg'].value_counts()

# Plotting
plt.figure(figsize=(10, 10))
plt.pie(endyr_quart_contchg_cts, labels=endyr_quart_contchg_cts.index, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of End of the Quarter Contribution Change')
plt.show()

In [None]:
#Pie chart  #"oty_qtrly_contributions_pct":"Endyr_Quart_Con_Pct",
endyr_quart_contpct_cts = df['Endyr_Quart_Cont_Pct'].value_counts()

# Plotting
plt.figure(figsize=(10, 10))
plt.pie(endyr_quart_contpct_cts, labels=endyr_quart_contpct_cts.index, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of End of the Quarter Contribution Percent (%)')
plt.show()

In [None]:
#Pie chart  #"oty_qtrly_contributions_pct_chg":"Endyr_Quart_Cont_PctChg",
agency_counts = df['Endyr_Quart_Cont_PctChg'].value_counts()

# Plotting
plt.figure(figsize=(10, 10))
plt.pie(agency_counts, labels=agency_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of End of the Quarter Contribution Percent Change (% Change)')
plt.show()

In [None]:
#Attempting to make a bubble chart
clean_df['start'] = round(clean_df['Avg_Wk_Wage'])
clean_df['start'] = clean_df['start'].astype('int')

clean_df['end'] = round(clean_df['Endyr_Avg_Wk_Wage'])
clean_df['end'] = clean_df['end'].astype('int')


fig = bubbleplot(df, x_column='start', y_column='end', bubble_column='Agency', size_column='# Of Positions', x_title='Avg_Wk_Wage', color_column='Agency', 
                 y_title='Endyr_Avg_Wk_Wage', title='Career per Average Weekly Wage', x_logscale=False, scale_bubble=3, height=650)

py.iplot(fig)

In [None]:
#Histogram Industry Title #https://www.kaggle.com/code/dnzcihan/analysing-new-york-technology-data-innovation
clean_df['Industry_Title'] = clean_df['Industry_Title']


groups = df.groupby(['Industry_Title']).size()
plt.figure(figsize=(10, 10))
groups.plot.barh()

In [None]:
#Histogram County
clean_df['County'] = clean_df['County']


groups = df.groupby(['County']).size()
plt.figure(figsize=(10, 10))
groups.plot.barh()

In [None]:
#Histogram Year
clean_df['Year'] = clean_df['Year']


groups = df.groupby(['Year']).size()
plt.figure(figsize=(10, 10))
groups.plot.barh()

In [None]:
#Histogram Quarter
clean_df['Quarter'] = clean_df['Quarter']


groups = df.groupby(['Quarter']).size()
plt.figure(figsize=(10, 10))
groups.plot.barh()

In [16]:
#d = {
    #'potato': 'tomato',
    #'pi': 3.14159,
    #'#name': 'Carlos',
#}
#d2 = dict(
    #potato='tomato',
    3pi=3.14159,
    #name='Carlos',
#)
#class A:
    #def __init__(self, , **kwargs):
        #for param, arg in kwargs.items():
            #setattr(self, param, arg)
    #def __str__(self):
        #return f"str version: {self.__class__.__name__}" \
            #f"({",".join(f"{param}={arg}" for param, arg in sorted(self.__dict__.items()))})"
    #def __repr__(self):
        #return f"repr version: {self.__class__.__name__}" \
            #f"({','.join(f'{param}={arg!r}' for param, arg in sorted(self.__dict__.items()))})"

#a = A(potato='tomato', pi=3.1415926, name='Carlos', none=None)
#print(f"default: {a}")
#print(f"with_bang_s: {a!s}")
#print(f"with_bang_r: {a!r}")

default: str version: A(name=Carlos,none=None,pi=3.1415926,potato=tomato)
with_bang_s: str version: A(name=Carlos,none=None,pi=3.1415926,potato=tomato)
with_bang_r: repr version: A(name='Carlos',none=None,pi=3.1415926,potato='tomato')


In [27]:
# `a.potato` is quivalent to `getattr(a, 'potato')`


'tomato'

In [20]:
#repr('a')

"'a'"

In [21]:
#A(name='Carlos',none=None,pi=3.1415926,potato='tomato')

repr version: A(name='Carlos',none=None,pi=3.1415926,potato='tomato')

In [22]:
#def mysum(*values):
    #s = 0
    #for value in values:
        #s += value
    #return s

In [59]:
#len(counties) *len(years)

121

In [3]:
#Reference NYC Housing Data

#Kaggle Datasets
#https://www.kaggle.com/code/shaqiavelli/nyc-geospatial-analysis
#https://www.kaggle.com/code/ashokmevada/house-price

#NYC Planning #????
#https://www.nyc.gov/site/planning/data-maps/open-data/dwn-housing-database.page#housingdevelopmentproject

#NYC.GOV Annual Housing Sales 2012-2022
#https://www.nyc.gov/site/finance/property/property-annualized-sales-update.page

#Reference Bureau of Labor Statistics (bls) API_Key Multiple Series and csv files

##bls.gov: https://www.bls.gov/developers/api_signature_v2.htm (API_Key)
##https://www.bls.gov/cew/downloadable-data-files.htm (csv files)
##https://www.bls.gov/cew/additional-resources/open-data/sample-code.htm (sample python code)

In [None]:
# Study data files
#nasdaqdatalink_apidata_path = "data/Mouse_metadata.csv"
#study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
#nasdaqdatalink_apidata = pd.read_csv(mouse_metadata_path)
#study_results = pd.read_csv(study_results_path)

# Combine the data into a single DataFrame
#mouse_study_results = pd.merge(mouse_metadata, study_results,on=["Mouse ID"])

# Display the data table for preview
#mouse_study_results.head()