In [39]:
# Includes the necessary imports
import pandas as pd
import seaborn as sns
import numpy as np

In [82]:
# Try reading in the data

# Helpful function to read in the data
def readDataByYear(year):
    yearPath = '/Users/zacharyzhu/Desktop/Employment Data/' + str(year) + ' Employment By County.xlsx'
    df = pd.read_excel(yearPath)
    df = df.rename(columns = {"Labor Force Data by County, 2019 Annual Averages":"LAUS Code", 
                        "Unnamed: 1": "State FIPS Code", "Unnamed: 2": "County FIPS Code", "Unnamed: 3":
                        "County", "Unnamed: 4": "Year", "Unnamed: 6": "Labor Force",
                        "Unnamed: 7": "Employed", "Unnamed: 8": "Unemployed", "Unnamed: 9": "Unemployment Rate (%)"})
    # Drop blank columns/nonsense rows
    df = df.drop(columns = {'Unnamed: 5'})
    df = df.drop([0, 1, 2, 3, 4, 3224, 3225, 3226])
    # Split up State/County
    df['State'] = df['County'].str[-2:]
    df['County'] = df['County'].str[:-4]
    # Converting data types into integers
    df['State FIPS Code'] = df['State FIPS Code'].astype(int)
    df['County FIPS Code'] = df['County FIPS Code'].astype(int)
    df['Year'] = df['Year'].astype(int)
    df['Labor Force'] = df['Labor Force'].astype(int)
    df['Employed'] = df['Employed'].astype(int)
    df['Unemployed'] = df['Unemployed'].astype(int)
    df['Unemployment Rate (%)'].astype(int)
    # Accounting for DC
    #df = df['County'].replace({'District of Colu': 'District of Columbia'})
    #df = df['State'].replace({'ia': 'N/A'})

    # Putting data in desired order
    df = df[['LAUS Code', 'State FIPS Code', 'County FIPS Code', 'County', 'State', 'Year', 'Labor Force',
            'Employed', 'Unemployed', 'Unemployment Rate (%)']]
    df['County'].replace({'District of Colu': 'District of Columbia'}, inplace = True)
    df['State'].replace({'ia': 'N/A'}, inplace = True)
    return df
readDataByYear(2019)


Unnamed: 0,LAUS Code,State FIPS Code,County FIPS Code,County,State,Year,Labor Force,Employed,Unemployed,Unemployment Rate (%)
5,CN0100100000000,1,1,Autauga County,AL,2019,26172,25458,714,2.7
6,CN0100300000000,1,3,Baldwin County,AL,2019,97328,94675,2653,2.7
7,CN0100500000000,1,5,Barbour County,AL,2019,8537,8213,324,3.8
8,CN0100700000000,1,7,Bibb County,AL,2019,8685,8419,266,3.1
9,CN0100900000000,1,9,Blount County,AL,2019,25331,24655,676,2.7
10,CN0101100000000,1,11,Bullock County,AL,2019,4818,4643,175,3.6
11,CN0101300000000,1,13,Butler County,AL,2019,9263,8925,338,3.6
12,CN0101500000000,1,15,Calhoun County,AL,2019,46209,44574,1635,3.5
13,CN0101700000000,1,17,Chambers County,AL,2019,15748,15286,462,2.9
14,CN0101900000000,1,19,Cherokee County,AL,2019,11604,11266,338,2.9


In [83]:
# Different functions to parse the data

# Let's say we just wanted to keep California counties
df = readDataByYear(2019)
CACounties = df.loc[df['State'] == 'CA']
#print(CACounties)

# Get a specific row's values
print(df.iloc[0])

# Get a specific column's row's value
print(df['County'].iloc[0])

# Let's say we wanted all counties where the County FIPS Code was divisble by 7 or 9
divBySevNine = df.loc[(df['County FIPS Code'] % 7 == 0) | (df['County FIPS Code'] % 9 == 0)]
#print(divBySevNine)

LAUS Code                CN0100100000000
State FIPS Code                        1
County FIPS Code                       1
County                    Autauga County
State                                 AL
Year                                2019
Labor Force                        26172
Employed                           25458
Unemployed                           714
Unemployment Rate (%)                2.7
Name: 5, dtype: object
Autauga County


In [84]:
# Built-in functions

# Getting mean values of grouping by state
df.count() # Number of non-NaN values in each column
df.sum() # Sum of each column
df.mean() # Mean value of each column
df.describe() # Summary of stats for each column--count, mean, STD, min, median, max

Unnamed: 0,State FIPS Code,County FIPS Code,Year,Labor Force,Employed,Unemployed
count,3219.0,3219.0,3219.0,3219.0,3219.0,3219.0
mean,31.295744,102.95247,2019.0,51002.36,49115.43,1886.927307
std,16.277202,106.696455,0.0,167690.7,161372.4,6471.804399
min,1.0,1.0,2019.0,223.0,212.0,4.0
25%,19.0,35.0,2019.0,4939.5,4698.0,200.0
50%,30.0,79.0,2019.0,11442.0,10999.0,479.0
75%,46.0,133.0,2019.0,31290.5,30027.0,1231.0
max,72.0,840.0,2019.0,5121584.0,4894296.0,227288.0


In [87]:
stateGrouping = df.groupby(['State']) # Creating a grouping by state
stateGrouping.get_group('WI') # Gives us a sub-table based off of the grouping for WI (the state)
stateGrouping.size() # How many counties exist for each state? N/A represents Washington DC
stateGrouping.mean()

Unnamed: 0_level_0,State FIPS Code,County FIPS Code,Year,Labor Force,Employed,Unemployed
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AK,2.0,149.655172,2019.0,11992.344828,11256.137931,736.206897
AL,1.0,67.0,2019.0,33458.985075,32455.029851,1003.955224
AR,5.0,75.0,2019.0,18168.653333,17523.906667,644.746667
AZ,4.0,13.866667,2019.0,236750.6,225633.6,11117.0
CA,6.0,58.0,2019.0,334682.37931,321161.706897,13520.672414
CO,8.0,62.234375,2019.0,49199.46875,47845.28125,1354.1875
CT,9.0,8.0,2019.0,239191.375,230270.875,8920.5
DE,10.0,3.0,2019.0,162421.666667,156281.666667,6140.0
FL,12.0,67.910448,2019.0,154280.029851,149493.567164,4786.462687
GA,13.0,161.490566,2019.0,32140.427673,31039.72956,1100.698113
