# **PROJECT 1 : GROUP 6(MA YINCHU)**
# JAPAN 

## **DELIVERABLES**


In [None]:
%pip install wbdata
%pip install pandas-datareader

import numpy as np
import pandas as pd
import wbdata
from pandas_datareader import wb
import matplotlib.pyplot as plt

## 1. [A] Population Statistics 

In this section, we define our population function. It uses helper functions to retrieve population data from the World Bank API for a specific country, year, age group, and sex. Since the World Bank reports population in 5-year age bins, our function handles custom age ranges by determining how many years of each 5-year bin overlap with the requested range and counting that fraction of the bin. It handles the 80+ age group separately, and it sums the male and female counts when the query asks for people of both sexes. As an example, we count the people aged 12-30 living in Japan in 2004.

In [12]:
import requests
import re

WB_BASE = "https://api.worldbank.org/v2"


class PopulationDataError(ValueError):
    """Raised when a population request is invalid or its data is missing."""


def _wb_get_value(place, indicator, year):
    """
    Fetch a single indicator value from the World Bank API.

    Requests `indicator` for `place` restricted to `year` and returns the
    first row's value as a float. Returns None when the response has no
    data rows or when that row's value is null. HTTP error statuses are
    raised by `raise_for_status()`.
    """
    country_code = place.strip().upper()
    indicator_code = indicator.strip()
    year_number = int(year)

    endpoint = f"{WB_BASE}/country/{country_code}/indicator/{indicator_code}"
    query = {"format": "json", "date": str(year_number), "per_page": 20000}

    response = requests.get(endpoint, params=query, timeout=30)
    response.raise_for_status()
    payload = response.json()

    # The code expects a list whose second element holds the data rows.
    has_rows = isinstance(payload, list) and len(payload) >= 2 and payload[1]
    if not has_rows:
        return None

    value = payload[1][0].get("value", None)
    if value is None:
        return None
    return float(value)


def _age_group_indicator(low, high, sex):
    suffix = "MA" if sex == "males" else "FE"
    return f"SP.POP.{low:02d}{high:02d}.{suffix}"


def _age_80_plus_indicator(sex):
    return "SP.POP.80UP.MA" if sex == "males" else "SP.POP.80UP.FE"


def population(year, sex, age_range, place):
    """
    Return the population count needed to answer:

    In [year] how many [people/males/females] aged [low] to [high]
    were living in [the world/region/country]?

    The data comes in 5-year bins (0-4, ..., 75-79) plus one open-ended
    80+ bin. A 5-year bin that lies only partly inside the requested range
    contributes in proportion to the number of single years of age it
    shares with the range (ages are treated as evenly spread within a
    bin). The 80+ bin cannot be split, so any range that reaches age 80
    or beyond includes the whole 80+ bin.

    Parameters
    ----------
    year : int
        Year to query.
    sex : str
        'people', 'males', or 'females' (case and surrounding whitespace
        are ignored). 'people' is the sum of males and females.
    age_range : tuple
        (low, high) inclusive ages as ints, with 0 <= low <= high and
        low <= 80.
    place : str
        World Bank code such as 'JPN' or 'WLD' (case and surrounding
        whitespace are ignored).

    Returns
    -------
    int
        The estimated head count, rounded to the nearest integer.

    Raises
    ------
    PopulationDataError
        If an argument is invalid or a required data point is missing.
        Errors raised by the underlying data lookup propagate unchanged.
    """
    # bool is a subclass of int, but True/False is never a meaningful year.
    if isinstance(year, bool) or not isinstance(year, int):
        raise PopulationDataError("year must be an int")

    if not isinstance(age_range, tuple) or len(age_range) != 2:
        raise PopulationDataError("age_range must be (low, high)")

    low, high = age_range
    if not (isinstance(low, int) and isinstance(high, int)):
        raise PopulationDataError("age_range values must be ints")

    if low < 0 or high < 0 or high < low:
        raise PopulationDataError("age_range must satisfy 0 <= low <= high")

    # Reject non-strings explicitly so callers get the documented error
    # type instead of an AttributeError from .strip().
    if not isinstance(sex, str):
        raise PopulationDataError("sex must be 'people', 'males', or 'females'")
    sex = sex.strip().lower()
    if sex not in ("people", "males", "females"):
        raise PopulationDataError("sex must be 'people', 'males', or 'females'")

    if not isinstance(place, str):
        raise PopulationDataError("place must be a World Bank code like 'JPN' or 'WLD'")
    place = place.strip().upper()
    if not place:
        raise PopulationDataError("place must be a World Bank code like 'JPN' or 'WLD'")

    # Checked up front, before any data is requested.
    if low > 80:
        raise PopulationDataError("low > 80 not supported (80+ is a single bin)")

    includes_80_plus = high >= 80
    # Ages 80 and above are covered by the 80+ bin, so the 5-year bins
    # only need to be scanned up to age 79.
    high_effective = min(high, 79)

    def fetch(indicator):
        """Return the indicator value, raising if the data point is missing."""
        value = _wb_get_value(place, indicator, year)
        if value is None:
            raise PopulationDataError(f"missing data: {place}, {year}, {indicator}")
        return value

    def one_sex_total(one_sex):
        """Sum the (possibly fractional) bin counts for a single sex."""
        total = 0.0

        if includes_80_plus:
            total += fetch(_age_80_plus_indicator(one_sex))

        for start in range(0, 80, 5):
            end = start + 4

            overlap_low = max(low, start)
            overlap_high = min(high_effective, end)
            if overlap_low > overlap_high:
                continue  # this bin lies entirely outside the requested range

            years_covered = overlap_high - overlap_low + 1
            bin_value = fetch(_age_group_indicator(start, end, one_sex))
            total += bin_value * (years_covered / 5.0)

        return total

    if sex == "people":
        total_value = one_sex_total("males") + one_sex_total("females")
    else:
        total_value = one_sex_total(sex)

    return int(round(total_value))


# Lower-case place names accepted in queries, mapped to World Bank codes.
PLACE_MAP = dict(
    japan="JPN",
    world="WLD",
)


def population_from_query(query):
    """
    Takes a query of the form:

    In [year] how many [people/males/females] aged [low] to [high]
    were living in [the world/region/country]?

    and returns the population count.

    The five bracketed fields are read in order: year, sex, low, high,
    place. The place is matched case-insensitively against PLACE_MAP,
    ignoring a leading "the " (so "[the world]" works as the template
    suggests); anything not in PLACE_MAP is upper-cased and passed on
    as a World Bank code.

    Raises ValueError if the query does not contain exactly five
    bracketed fields or if year/low/high are not integers.
    """
    parts = re.findall(r"\[(.*?)\]", query, flags=re.DOTALL)

    if len(parts) != 5:
        raise ValueError(
            "Query must have 5 bracketed fields: [year], [sex], [low], [high], [place]"
        )

    year_text, sex_text, low_text, high_text, place_text = (p.strip() for p in parts)

    year = int(year_text)
    sex = sex_text.lower()
    low = int(low_text)
    high = int(high_text)

    # Collapse internal whitespace so e.g. "the   world" still matches.
    place_key = re.sub(r"\s+", " ", place_text.lower())

    # The documented template writes "the world"; drop the article so the
    # lookup finds "world" instead of producing the bogus code "THE WORLD".
    article = "the "
    if place_key.startswith(article):
        place_key = place_key[len(article):]

    place = PLACE_MAP.get(place_key, place_key.upper())

    return population(year, sex, (low, high), place)
def ask(query):
    """Print the population count that answers the bracketed `query`."""
    answer = population_from_query(query)
    print(answer)

In [13]:
# Example query: people aged 12 to 30 living in Japan in 2004.
ask("In [2004] how many [people] aged [12] to [30] were living in [Japan]?")

28319362


## 2.[A] Unit Tests

This section defines unit tests that verify that the population functions defined above behave as expected.

In [14]:
def run_tests():
    """
    Run sanity checks on `population` and `population_from_query`.

    Covers plausible magnitudes, males + females == people, monotonicity
    when the age range grows, place-code normalisation, agreement between
    the query parser and a direct call, rejection of invalid arguments,
    and inclusion of the 80+ bin. Prints "All tests passed!" on success
    and raises AssertionError on the first failing check.
    """
    jp_2004 = population(2004, "people", (0, 79), "JPN")
    assert jp_2004 > 50_000_000, "Japan 2004 (0-79) should be > 50M"

    wld_2004 = population(2004, "people", (0, 79), "WLD")
    assert wld_2004 > 5_000_000_000, "World 2004 (0-79) should be > 5B"

    year = 2004
    place = "JPN"
    ages = (12, 30)
    m = population(year, "males", ages, place)
    f = population(year, "females", ages, place)
    # Fetched once and reused as the reference value in the checks below.
    p = population(year, "people", ages, place)
    assert abs((m + f) - p) <= 5, "people should equal males+females (rounding tolerance)"

    small = population(2004, "people", (12, 20), "JPN")
    assert p >= small, "Expanding age range should not decrease population"

    messy = population(2004, "people", (12, 30), "  jpn  ")
    assert p == messy, "Place code should be case/whitespace insensitive"

    q = "In [2004] how many [people] aged [12] to [30] were living in [Japan]?"
    parsed = population_from_query(q)
    assert parsed == p, "Query parser should match direct function call"

    # The failure is raised in the `else` branch so that it cannot be
    # swallowed by the handler that is waiting for the expected error
    # (AssertionError is itself an Exception).
    try:
        population(2004, "men", (12, 30), "JPN")
    except PopulationDataError:
        pass
    else:
        raise AssertionError("Expected error for invalid sex")

    try:
        population(2004, "people", (30, 12), "JPN")
    except PopulationDataError:
        pass
    else:
        raise AssertionError("Expected error for reversed age_range")

    a = population(2004, "people", (80, 80), "JPN")   # 80+ bin
    b = population(2004, "people", (0, 80), "JPN")    # includes 80+
    assert b >= a, "(0,80) should be >= (80,80)"

    print("All tests passed!")

# 3.[A] Population DataFrames

In [None]:
## creating age bins: sixteen 5-year bins (0-4 ... 75-79) plus the open-ended 80+ bin

age_ranges = [
    {'code': f"{lower:02d}{lower + 4:02d}", 'label': f"{lower} - {lower + 4}"}
    for lower in np.arange(0, 80, 5)
]
age_ranges += [{'code': '80UP', 'label': '80+'}]

print(age_ranges)

In [None]:
## separating by sex: indicator code -> readable column name, one entry per age bin

male_variables = dict(
    (f"SP.POP.{age_bin['code']}.MA", f"Males {age_bin['label']}")
    for age_bin in age_ranges
)
female_variables = dict(
    (f"SP.POP.{age_bin['code']}.FE", f"Females {age_bin['label']}")
    for age_bin in age_ranges
)

# Male entries first, then female entries.
variables = {**male_variables, **female_variables}

print(variables)

In [None]:
## dataframe function

def population_df(countries, start, end):
    """
    Download the age-by-sex population indicators as a DataFrame.

    Requests every indicator in `variables` for `countries` between
    `start` and `end` via `wb.download`, renames the indicator-code
    columns to their readable labels, and returns the frame sorted by
    its index.
    """
    raw = wb.download(indicator=variables, country=countries, start=start, end=end)
    labelled = raw.rename(columns=variables)
    return labelled.sort_index()

# Example: download the USA age-by-sex table for 2000-2020 and preview it.
pop = population_df(['USA'], start = 2000, end = 2020)
pop.head()

# 4.[B]Population Pyramids

In [None]:
## population pyramid function

def _age_sort_key(label):
    s = str(label)
    num = ''
    for character in s:
        if character.isdigit():
            num += character
        elif num:
            break
    return int(num) if num else 9999

def population_pyramid(df, country = None, year = None, title = None, ax = None):
    """
    Draw a population pyramid (males to the left, females to the right).

    Parameters
    ----------
    df : DataFrame
        Table whose columns are named "Males <age label>" and
        "Females <age label>". When `country` and `year` are both given,
        the row is looked up with ``df.loc[(country, str(year))]``.
    country, year : optional
        Select the row to plot. If either is None, the first row of `df`
        is used.
    title : str, optional
        Plot title. Defaults to "<country> Population Pyramid", or
        "Population Pyramid" when no country is given.
    ax : matplotlib Axes, optional
        Axes to draw on; a new 7x6 figure is created when omitted.

    Returns
    -------
    The Axes that was drawn on.

    Raises
    ------
    ValueError
        If `df` has no "Males ..." or no "Females ..." columns.
    """
    if country is not None and year is not None:
        row = df.loc[(country, str(year))]
    else:
        row = df.iloc[0]

    male_cols = [c for c in row.index if str(c).startswith('Males ')]
    female_cols = [c for c in row.index if str(c).startswith('Females ')]

    if len(male_cols) == 0 or len(female_cols) == 0:
        raise ValueError('No columns starting with Males and Females')

    def age_label(column_name, prefix):
        return str(column_name)[len(prefix):]

    male_map = {age_label(c, 'Males '): c for c in male_cols}
    female_map = {age_label(c, 'Females '): c for c in female_cols}

    # Only age groups present for both sexes are plotted, youngest first.
    ages = sorted(set(male_map.keys()) & set(female_map.keys()), key = _age_sort_key)

    scale = 1e6  # plot in millions
    males = np.array([row[male_map[a]] for a in ages], dtype=float) / scale
    females = np.array([row[female_map[a]] for a in ages], dtype=float) / scale

    if ax is None:
        fig, ax = plt.subplots(figsize = (7, 6))

    y = np.arange(len(ages))
    # Male bars are negated so that they extend to the left of zero.
    ax.barh(y, -males, label = "Males")
    ax.barh(y, females, label = "Females")

    ax.set_yticks(y)
    ax.set_yticklabels(ages)

    # Symmetric x-limits sized by the largest bar of EITHER sex. The limit
    # is taken from the un-negated magnitudes so that male bars longer than
    # the female bars are not clipped; missing values are ignored, and 1 is
    # used when there is nothing positive to plot.
    magnitudes = np.concatenate([males, females])
    magnitudes = magnitudes[np.isfinite(magnitudes)]
    max_val = magnitudes.max() if magnitudes.size and magnitudes.max() > 0 else 1
    ax.set_xlim(-max_val * 1.1, max_val * 1.1)

    # Show magnitudes on both sides; ':g' keeps fractional ticks (e.g. 0.5)
    # instead of truncating them to 0.
    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{abs(x):g}"))

    ax.axvline(0, linewidth=1)

    ax.set_xlabel("Population (millions)")
    ax.set_ylabel("Age group")

    if title is None:
        if country is not None:
            title = f"{country} Population Pyramid"
        else:
            title = "Population Pyramid"
    ax.set_title(title)

    ax.legend()
    return ax

# 7. [C] Other Visualization Tools

**Birth Rate vs. Under-5 Mortality Rate**

In [19]:
# World Bank indicator codes mapped to the column labels used in the JPN frame.
variable_labels = {"SP.POP.TOTL":"JPN Population", 
                   "SP.POP.3539.FE": "Population ages 35-39, female", 
                   "SP.POP.3034.FE": "Population ages 30-34, female", 
                   "SP.POP.2529.FE": "Population ages 25-29, female",
                   "SP.DYN.TFRT.IN": "Fertility rate", 
                   "SP.POP.3539.MA": "Population ages 35-39, male", 
                   "SP.POP.3034.MA": "Population ages 30-34, male", 
                   "SP.POP.2529.MA": "Population ages 25-29, male", 
                   "SP.POP.2024.MA": "Population ages 20-24, male", 
                   "SP.POP.GROW": "Population growth (annual %)", 
                   "SH.DYN.MORT": "Mortality rate, under-5 (per 1,000 live births)",
                   "SP.DYN.CBRT.IN": "Birth rate, crude (per 1,000 people)"}       
# Download all of the indicators above for Japan.
JPN = wbdata.get_dataframe(variable_labels, country="JPN")
# Convert the index to integers (presumably year labels -- confirm against wbdata's output).
JPN.index = JPN.index.astype(int)

NameError: name 'wbdata' is not defined

In [20]:
# Keep only the crude birth rate and under-5 mortality rate columns, ordered by index.
birth_mortality = JPN[['Birth rate, crude (per 1,000 people)', 'Mortality rate, under-5 (per 1,000 live births)']].sort_index()
birth_mortality

NameError: name 'JPN' is not defined

In [21]:
# Plot both series over time.
# NOTE(review): .iplot is presumably provided by cufflinks/plotly, which is not
# imported in the visible cells -- confirm it is set up before this cell runs.
# NOTE(review): yTitle says 'Population vs. Birth Rate', but the plotted columns
# are the birth rate and the under-5 mortality rate -- confirm the intended label.
birth_mortality.iplot(title="Birth Rate vs. Mortality Rate Over Time (JAPAN)",xTitle='Time',yTitle='Population vs. Birth Rate')

NameError: name 'birth_mortality' is not defined