In [9]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [15]:
folder_path = "dataset/source/world_bank/extracted"

# Columns that identify one series (one row of the wide World Bank table).
id_columns = ["Country Name", "Country Code", "Indicator Name"]
# Observed years used to train the model, and the years to forecast.
history_years = list(map(str, range(1960, 2023)))
future_years = list(map(str, range(2023, 2031)))
relevant_columns = id_columns + history_years

# Per-file results are collected here and concatenated once at the end;
# growing a DataFrame with pd.concat inside the loop is quadratic.
per_file_predictions = []

# Sort the listing so the output row order does not depend on the filesystem.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(folder_path, filename)

    # header=2: the column names are on the third line pandas counts
    # (blank lines are skipped by default).
    data = pd.read_csv(file_path, header=2)

    # Skip CSVs that do not have the expected layout instead of failing
    # with a KeyError further down.
    missing_columns = [col for col in relevant_columns if col not in data.columns]
    if missing_columns:
        print(f"Skipping {filename}: missing expected columns, e.g. {missing_columns[:3]}")
        continue

    # Keep only the "Population ages ..., total" indicators.
    # astype(str) makes the test safe when the column contains NaN.
    indicator_name = data["Indicator Name"].astype(str)
    is_target = indicator_name.str.startswith("Population ages") & indicator_name.str.endswith("total")
    data = data.loc[is_target, relevant_columns].copy()

    # LinearRegression rejects NaN, so coerce the year columns to numbers and
    # drop every series with a gap. Sorting by the id columns reproduces the
    # row order the former melt/pivot_table round trip produced.
    data[history_years] = data[history_years].apply(pd.to_numeric, errors="coerce")
    data = data.dropna(subset=history_years).sort_values(id_columns).reset_index(drop=True)
    if data.empty:
        print(f"Skipping {filename}: no complete 'Population ages ... total' series")
        continue

    # One training sample per series: the first 62 years (1960-2021) are the
    # features and the last observed year (2022) is the target.
    # Plain arrays are used on purpose: the forecast windows below cover
    # different years than the training window, and scikit-learn checks
    # DataFrame column names at predict time.
    history = data[history_years].to_numpy(dtype=float)
    X = history[:, :-1]
    y = history[:, -1]

    model = LinearRegression()
    model.fit(X, y)

    # Recursive multi-step forecast. The model maps 62 consecutive years to
    # the following year, so the window is slid forward one year per step and
    # each prediction becomes the newest input of the next step. Predicting
    # from the unchanged training matrix would give the same value (the
    # fitted 2022) for every future year.
    # NOTE(review): errors accumulate with each recursive step; treat the
    # later years as rough extrapolations.
    window = history[:, 1:]  # 1961-2022 -> input for 2023
    predictions_df = data[id_columns].copy()
    for year in future_years:
        next_values = model.predict(window)
        predictions_df[year] = next_values
        window = np.column_stack([window[:, 1:], next_values])

    # Long format: one row per (series, year). The melted column names are
    # already the bare year strings, so no further parsing is needed.
    future_data = pd.melt(predictions_df, id_vars=id_columns, var_name="Year", value_name="Population")
    per_file_predictions.append(future_data)

# pd.concat raises on an empty list, so fall back to an empty, correctly
# labelled frame when no file produced predictions.
if per_file_predictions:
    all_predictions = pd.concat(per_file_predictions, ignore_index=True)
else:
    all_predictions = pd.DataFrame(columns=id_columns + ["Year", "Population"])

# Save predictions to a CSV file
all_predictions.to_csv("population_predictions.csv", index=False)