**Rearc assignment**

Part 0 - pip installs

In [2]:

import boto3
import pandas as pd
import requests
import json
import time
from io import StringIO
from bs4 import BeautifulSoup
from botocore.exceptions import NoCredentialsError

Part 1 - Using the data to print sample

In [3]:
# === Configuration ===
bucket_name = "rearc-assignment-vipul"
s3 = boto3.client("s3")
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/117.0 Safari/537.36"}
bls_base_url = "https://download.bls.gov/pub/time.series/pr/"
population_api_url = "https://datausa.io/api/data?drilldowns=Nation&measures=Population"
population_key = "datausa/population_1.json"

In [4]:
# === Part 1: Sync BLS Files to S3 ===
def get_remote_bls_files():
    print("Fetching BLS index page...")
    response = requests.get(bls_base_url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    return [link.get("href") for link in soup.find_all("a") if link.get("href").startswith("pr.")]

def get_existing_s3_files():
    s3_keys = []
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket_name, Prefix="bls/"):
        for obj in page.get("Contents", []):
            s3_keys.append(obj["Key"].replace("bls/", ""))
    return s3_keys

def sync_bls_to_s3():
    remote_files = get_remote_bls_files()
    existing_files = get_existing_s3_files()

    for filename in remote_files:
        if filename not in existing_files:
            print(f"Uploading: {filename}")
            try:
                file_response = requests.get(bls_base_url + filename, headers=headers)
                s3.put_object(Bucket=bucket_name, Key=f"bls/{filename}", Body=file_response.content)
                print(f"Uploaded: {filename}")
            except Exception as e:
                print(f"Error uploading {filename}: {e}")
        else:
            print(f"Skipping already-uploaded file: {filename}")

Part 2 - Using the data to push it over the S3 bucket in a JSON file.

In [5]:
def fetch_population_data():
    print("Fetching population data from API...")
    response = requests.get(population_api_url)
    if response.status_code != 200:
        raise Exception("Failed to fetch population data")
    s3.put_object(
        Bucket=bucket_name,
        Key=population_key,
        Body=json.dumps(response.json()),
        ContentType='application/json'
    )
    print(f"Population data uploaded to S3: {population_key}")

Part 3 - Copying the pr.data.0.Current data: Tried using the data file from the server, but google result showed, the needs to be pulled from the server for working with google colab. So changed the interpreter to Visual studio to see the API call.

In [None]:
def perform_analysis():
    print("Starting Part 3: Data Analysis...")

    # Load the population JSON file from S3
    try:
        pop_response = s3.get_object(Bucket=bucket_name, Key=population_key)
        pop_json = json.loads(pop_response['Body'].read())
        pop_df = pd.DataFrame(pop_json['data'])
    except Exception as e:
        print("Failed to read population data from S3:", e)
        return

    # Filter for years 2013 to 2018 and compute stats
    pop_df["Year"] = pop_df["Year"].astype(int)
    pop_range = pop_df[(pop_df["Year"] >= 2013) & (pop_df["Year"] <= 2018)]
    mean_val = pop_range["Population"].astype(int).mean()
    std_val = pop_range["Population"].astype(int).std()

    print(f"Mean population (2013-2018): {int(mean_val)}")
    print(f"Standard deviation: {int(std_val)}")

    # Load the BLS data from S3
    bls_key = "datausa/pr.data.0.Current"
    try:
        bls_obj = s3.get_object(Bucket=bucket_name, Key=bls_key)
        raw_txt = bls_obj['Body'].read().decode("utf-8")
        bls_df = pd.read_csv(StringIO(raw_txt), sep="\t")
    except Exception as e:
        print("Failed to read BLS data from S3:", e)
        return

    # Clean and process BLS data
    bls_df = bls_df.dropna()
    bls_df.columns = [col.strip().lower() for col in bls_df.columns]

    bls_df['value'] = pd.to_numeric(bls_df['value'], errors='coerce')
    bls_df = bls_df.dropna(subset=['value'])

    grouped = bls_df.groupby(['series_id', 'year'])['value'].sum().reset_index()
    best_years = grouped.loc[grouped.groupby('series_id')['value'].idxmax()]

    print("\nTop year by series ID:")
    print(best_years.head())

    # Join with population data for a specific filter
    match_df = bls_df[(bls_df['series_id'] == 'PRS30006032') & (bls_df['period'].str.strip() == 'Q01')]
    match_df = match_df[['series_id', 'year', 'period', 'value']]

    pop_df.rename(columns={'Year': 'year'}, inplace=True)
    pop_df['year'] = pop_df['year'].astype(int)

    final_report = pd.merge(match_df, pop_df[['year', 'Population']], on='year', how='left')

    print("\nReport for series_id='PRS30006032' and period='Q01':")
    print(final_report)

    # Optionally write back to S3 or local file if needed


In [12]:
# === Entrypoint ===
if __name__ == "__main__":
    try:
        sync_bls_to_s3()
        fetch_population_data()
        perform_analysis()
        print("\n All parts completed successfully.")
    except NoCredentialsError:
        print("AWS credentials not found. Make sure your environment is configured.")
    except Exception as e:
        print(f"Unexpected error: {e}")

Fetching BLS index page...
Fetching population data from API...
Population data uploaded to S3: datausa/population_1.json
Performing analytics...
Mean Population (2013-2018): 317437383.0
Std Population (2013-2018): 4257089.5415293295

Series PRS30006032 - Q01 vs Population:
Empty DataFrame
Columns: [series_id, year, period, value, Population]
Index: []

Best Year per Series ID:
             series_id  year    value
27   PRS30006011        2022   20.500
57   PRS30006012        2022   17.100
63   PRS30006013        1998  705.895
105  PRS30006021        2010   17.700
135  PRS30006022        2010   12.400

Best Year per Series ID:
             series_id  year    value
27   PRS30006011        2022   20.500
57   PRS30006012        2022   17.100
63   PRS30006013        1998  705.895
105  PRS30006021        2010   17.700
135  PRS30006022        2010   12.400

 All parts completed successfully.
