In [1]:
import pandas as pd

df = pd.read_csv("/content/interpolated_education_dataset.csv")

# target countries
target_countries = ["Bangladesh", "United States", "Niger", "South Korea", "Norway"]

# filtering and sorting
df_filtered = df[df["Country"].isin(target_countries)].sort_values(by=["Country", "Year"]).reset_index(drop=True)


In [2]:
# ADF TEST
from statsmodels.tsa.stattools import adfuller
import pandas as pd
import numpy as np

fertility_series = df_filtered["Fertility_Rate"]
result = adfuller(fertility_series)
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
	print('\t%s: %.3f' % (key, value))

ADF Statistic: -1.773415
p-value: 0.393738
Critical Values:
	1%: -3.461
	5%: -2.875
	10%: -2.574


In [3]:
# Needs differencing, so need to do ARIMAX(1,1,1)
from statsmodels.tsa.statespace.sarimax import SARIMAX

exog_vars = ["GDP_Per_Capita", "Homicide_Rate", "Avg_Yrs_Education", "Infant_Mortality", "Life_Expectancy"]
models = {}
forecasts = []

# forecasting
future_years = list(range(df_filtered["Year"].max() + 1, df_filtered["Year"].max() + 11))

# ARIMAX for each country
for country in target_countries:
    cdf = df_filtered[df_filtered["Country"] == country].dropna(subset=exog_vars)

    if len(cdf) > 10:
        model = SARIMAX(
            cdf["Fertility_Rate"],
            order=(1,1,1),
            exog=cdf[exog_vars],
            enforce_stationarity=False,
            enforce_invertibility=False
        )
        results = model.fit(disp=False)
        models[country] = results

        # using the last row's exog values for forecasting
        future_exog = pd.DataFrame([cdf[exog_vars].iloc[-1].values] * 10, columns=exog_vars)
        forecast = results.get_forecast(steps=10, exog=future_exog)

        forecast_df = pd.DataFrame({
            "Country": country,
            "Year": future_years,
            "ARIMAX_Forecast": forecast.predicted_mean
        })
        forecasts.append(forecast_df)

# combining all forecast data
arimax_forecasts = pd.concat(forecasts, ignore_index=True)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


In [6]:
# UN projection dataset
un_data = pd.read_csv("/content/fertility-rate-with-projections.csv")

# clean and rename the columns for ease
un_data = un_data.rename(columns={
    "Entity": "Country",
    "Fertility rate - Sex: all - Age: all - Variant: estimates": "UN_Estimates",
    "Fertility rate - Sex: all - Age: all - Variant: medium": "UN_Projections"
})

# filtering relevant countries and years
future_years = list(range(2024, 2034))
un_projection_df = un_data[
    (un_data["Country"].isin(target_countries)) &
    (un_data["Year"].isin(future_years))
][["Country", "Year", "UN_Projections"]].dropna()

In [12]:
# merge ARIMAX and UN projections
comparison_df = pd.merge(arimax_forecasts, un_projection_df, on=["Country", "Year"], how="inner")

# Calculate forecast difference
comparison_df["Difference"] = comparison_df["ARIMAX_Forecast"] - comparison_df["UN_Projections"]

# errors
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# overall metrics
y_true = comparison_df["UN_Projections"]
y_pred = comparison_df["ARIMAX_Forecast"]

mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")


MAE: 0.2722
RMSE: 0.4579
