# Step 1: Imports + setup

In [2]:
import pandas as pd
import json
from pathlib import Path

# Step 2: Load BLS data (CSV-like pipe-delimited file)

In [7]:
bls_path = Path("../data/raw/bls/pr.data.0.Current")
bls_df = pd.read_csv(
    bls_path,
    sep="\t",
    dtype = {"series_id":str,"period": str}
)

bls_df.head()

Unnamed: 0,series_id,year,period,value,footnote_codes
0,PRS30006011,1995,Q01,2.6,
1,PRS30006011,1995,Q02,2.1,
2,PRS30006011,1995,Q03,0.9,
3,PRS30006011,1995,Q04,0.1,
4,PRS30006011,1995,Q05,1.4,


# Step 3: Basic cleaning

In [8]:
# Strip whitespaces from column names
bls_df.columns = bls_df.columns.str.strip()

# Strip whitespace from string columns
bls_df["year"] = bls_df["year"].astype(int)
bls_df["value"] = bls_df["value"].astype(int)

bls_df.info()

<class 'pandas.DataFrame'>
RangeIndex: 37521 entries, 0 to 37520
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   series_id       37521 non-null  str  
 1   year            37521 non-null  int64
 2   period          37521 non-null  str  
 3   value           37521 non-null  int64
 4   footnote_codes  97 non-null     str  
dtypes: int64(2), str(3)
memory usage: 1.4 MB


# Step 4: Load population JSON

In [10]:
api_dir = Path("../data/raw/api")
population_file = sorted(api_dir.glob("population_*.json"))[-1]

with open(population_file, "r") as f:
    population_json =json.load(f)

population_df = pd.DataFrame(population_json["data"])
population_df.head()

Unnamed: 0,Nation ID,Nation,Year,Population
0,01000US,United States,2013,316128839.0
1,01000US,United States,2014,318857056.0
2,01000US,United States,2015,321418821.0
3,01000US,United States,2016,323127515.0
4,01000US,United States,2017,325719178.0


# Step 5: Clean population dataframe

In [12]:
population_df.columns = population_df.columns.str.strip()

population_df["Year"] = population_df["Year"].astype(int)
population_df["Population"] = pd.to_numeric(
    population_df["Population"], errors="coerce"
)

population_df.info()

<class 'pandas.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Nation ID   10 non-null     str    
 1   Nation      10 non-null     str    
 2   Year        10 non-null     int64  
 3   Population  10 non-null     float64
dtypes: float64(1), int64(1), str(2)
memory usage: 452.0 bytes


# Step 6: Filter population data to required years

In [14]:
# Filter population data for years 2013 - 2018 inclusive
pop_2013_2018 = population_df[
    (population_df["Year"]>=2013) &
    (population_df["Year"]<=2018)
]

pop_2013_2018

Unnamed: 0,Nation ID,Nation,Year,Population
0,01000US,United States,2013,316128839.0
1,01000US,United States,2014,318857056.0
2,01000US,United States,2015,321418821.0
3,01000US,United States,2016,323127515.0
4,01000US,United States,2017,325719178.0
5,01000US,United States,2018,327167439.0


# Step 7: Compute mean and standard deviation

In [17]:
population_mean = pop_2013_2018["Population"].mean()
population_std = pop_2013_2018["Population"].std()
print(f"Mean US Population (2013 - 2018): {population_mean:,.0f}")
print(f"Std deviation of US population (2013 - 2018): {population_std:,.0f}")

Mean US Population (2013 - 2018): 322,069,808
Std deviation of US population (2013 - 2018): 4,158,441
