
<div style="width: 100%; max-width: 100%; background-color: #fefefe; border: 1px solid #333; border-radius: 10px; padding: 20px; font-family: Arial, sans-serif; box-sizing: border-box;">
  <h3 style="color: #2c3e50; text-align: center;">Data Processing</h3>
  
  <p style="color: #34495e; line-height: 1.6;">
    In the midterm project, electricity and economic data were processed. In the final project, population data was incorporated, and data cleaning was performed again. The results were saved in “data_imputation_full.csv”.
    
  </p>
  
</div>

In [1]:
import pandas as pd
import requests
from io import StringIO
import numpy as np

# 1. Load data
# Despite its name, url_imputation is a relative local path, not a URL.
# The first two CSV columns are used as the two-level row index.
url_imputation = 'data_imputation.csv'
data_imputation = pd.read_csv(filepath_or_buffer=url_imputation, index_col=[0, 1])

# Remote CSV sources, keyed by dataset name.
data_urls = dict(
    world_population='https://github.com/xiaothua/dataset/raw/main/world_population.csv',
)

def load_data_from_url(url, timeout=30):
    """Download a CSV document over HTTP(S) and parse it into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the CSV file to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed straight to
        ``requests.get``. Without it a stalled connection would block the
        notebook indefinitely. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content, using ``pd.read_csv`` defaults.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail loudly on 4xx/5xx instead of parsing an error page
    # read_csv expects a path or file-like object, so wrap the decoded body.
    return pd.read_csv(StringIO(response.text))

# Fetch the world-population table from its remote CSV source.
data_population = load_data_from_url(url=data_urls['world_population'])

In [2]:
# Drop rows that have no country name (only that column is checked for NaN),
# then keep just the columns listed below.
relevant_columns = ['Country/Territory', '2022 Population', '2020 Population', '2015 Population',
                    '2010 Population', '2000 Population', '1990 Population', '1980 Population',
                    'Area (km²)', 'World Population Percentage']
# .copy() yields an independent frame, so the column assignment further down
# writes to it directly instead of triggering pandas' SettingWithCopyWarning.
data_population_filtered = (
    data_population
    .dropna(subset=['Country/Territory'])[relevant_columns]
    .copy()
)

# Rename countries, presumably so they match the country labels used in
# data_imputation (membership is tested against its index in later cells).
# NOTE(review): 'DR Congo' maps to the code "COD" rather than a name, and the
# US Virgin Islands map to a different territory — confirm both are intended.
country_name_corrections = {
    "Brunei": "Brunei Darussalam",
    'Turkey': "Türkiye",
    'DR Congo': "COD",
    'Cape Verde': "Cabo Verde",
    'United States Virgin Islands': "United States Minor Outlying Islands"
}

data_population_filtered['Country/Territory'] = data_population_filtered['Country/Territory'].replace(country_name_corrections)

# Row labels (second index level) for the population attributes to be added
# to data_imputation.
new_indices = ['Population: Population', 
               'Population: Percentage', 
               'Population: Area (km²)',
               'Population: Growth Rate', 
               'Population: Density']

# Extend the existing MultiIndex so every first-level entry also has one row
# per population attribute. Only labels that are not already present are
# appended: re-running this cell would otherwise add the same labels again and
# create duplicate index entries, which breaks the scalar .loc look-ups in
# the cells that follow.
existing_index = data_imputation.index
attribute_labels = existing_index.levels[1].tolist()
attribute_labels += [label for label in new_indices if label not in attribute_labels]
new_index = pd.MultiIndex.from_product(
    [existing_index.levels[0], attribute_labels],
    names=existing_index.names,
)

# Rows that did not exist before are created and filled with NaN.
data_imputation = data_imputation.reindex(new_index)

In [3]:
# Copy each country's area from data_population_filtered into its
# 'Population: Area (km²)' row; the same value is written to every column.
area_row_label = 'Population: Area (km²)'
for country_name in data_population_filtered['Country/Territory'].unique():
    if country_name not in data_imputation.index.levels[0]:
        continue  # country is not part of data_imputation
    is_country = data_population_filtered['Country/Territory'] == country_name
    # The first matching row is used if a name appears more than once.
    country_area = data_population_filtered.loc[is_country, 'Area (km²)'].values[0]
    data_imputation.loc[(country_name, area_row_label)] = country_area

# Estimate a population figure for every year by linear interpolation between
# the snapshot years available in data_population_filtered.
years = [1980, 1990, 2000, 2010, 2015, 2020, 2022]
snapshot_columns = [f'{snapshot} Population' for snapshot in years]
population_row_label = 'Population: Population'
target_years = range(1992, 2022 + 1)  # Interpolating from 1992 to 2022

for country_name in data_population_filtered['Country/Territory'].unique():
    if country_name not in data_imputation.index.levels[0]:
        continue  # country is not part of data_imputation
    is_country = data_population_filtered['Country/Territory'] == country_name
    snapshot_values = data_population_filtered.loc[is_country, snapshot_columns].values.flatten()
    yearly_estimates = np.interp(target_years, years, snapshot_values)
    for target_year, estimate in zip(target_years, yearly_estimates):
        column = str(target_year)
        # Only write years that actually exist as columns in data_imputation.
        if column in data_imputation.columns:
            data_imputation.loc[(country_name, population_row_label), column] = estimate

# For every year column, express each country's population as a percentage of
# the sum over all 'Population: Population' rows in data_imputation.
population_row_label = 'Population: Population'
percentage_row_label = 'Population: Percentage'
for year_column in data_imputation.columns:
    if not year_column.isdigit():
        continue  # skip any column whose name is not a year
    yearly_total = data_imputation.loc[(slice(None), population_row_label), year_column].sum()
    for country_name in data_population_filtered['Country/Territory'].unique():
        if country_name not in data_imputation.index.levels[0]:
            continue  # country is not part of data_imputation
        country_count = data_imputation.loc[(country_name, population_row_label), year_column]
        # Leave the cell untouched when there is no positive total to divide by.
        if yearly_total > 0:
            share = (country_count / yearly_total) * 100
            data_imputation.loc[(country_name, percentage_row_label), year_column] = share

# Population density = population / area, per country and year column.
for country_name in data_population_filtered['Country/Territory'].unique():
    if country_name not in data_imputation.index.levels[0]:
        continue  # country is not part of data_imputation
    # The area row holds one value per column; the first is taken
    # (assuming area is constant across all years).
    land_area = data_imputation.loc[(country_name, 'Population: Area (km²)'), :].iloc[0]
    if pd.isna(land_area) or not land_area > 0:
        continue  # no usable area, so density stays as it is
    for year_column in data_imputation.columns:
        if not year_column.isdigit():
            continue  # skip any column whose name is not a year
        headcount = data_imputation.loc[(country_name, 'Population: Population'), year_column]
        if pd.notna(headcount):
            data_imputation.loc[(country_name, 'Population: Density'), year_column] = headcount / land_area

# Year-over-year population growth (in %) per country.
population_row_label = 'Population: Population'
growth_row_label = 'Population: Growth Rate'
for country_name in data_population_filtered['Country/Territory'].unique():
    if country_name not in data_imputation.index.levels[0]:
        continue  # country is not part of data_imputation
    # 1993..2022: each year is compared with the year before it.
    for year_number in range(1993, 2023):
        this_column, prior_column = str(year_number), str(year_number - 1)
        if this_column not in data_imputation.columns or prior_column not in data_imputation.columns:
            continue  # both years must exist as columns
        count_now = data_imputation.loc[(country_name, population_row_label), this_column]
        count_before = data_imputation.loc[(country_name, population_row_label), prior_column]
        if pd.isna(count_now) or pd.isna(count_before) or not count_before > 0:
            continue  # cannot form a ratio without two values and a positive base
        data_imputation.loc[(country_name, growth_row_label), this_column] = (
            (count_now - count_before) / count_before
        ) * 100
    # 1992 has no predecessor year in this range, so it reuses the 1993 value.
    if '1992' in data_imputation.columns and '1993' in data_imputation.columns:
        first_rate = data_imputation.loc[(country_name, growth_row_label), '1993']
        data_imputation.loc[(country_name, growth_row_label), '1992'] = first_rate

# Report every row of data_imputation that still holds at least one NaN.
row_has_nan = data_imputation.isna().any(axis=1)
nan_values = data_imputation[row_has_nan]
if nan_values.empty:
    print('No NaN values found in data_imputation.')
else:
    print('Detailed NaN information:')
    print(nan_values)

No NaN values found in data_imputation.


In [4]:
# Write the extended table to disk as CSV.
data_imputation.to_csv(path_or_buf='data_imputation_full.csv')

In [5]:
# Preview the first 40 rows of the extended table.
data_imputation.head(n=40)

Unnamed: 0_level_0,Unnamed: 1_level_0,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Albania,Economics: GDP,652175000.0,1185315000.0,1880952000.0,2392765000.0,3199641000.0,2258514000.0,2545965000.0,3212122000.0,3480355000.0,3922101000.0,...,12319830000.0,12776220000.0,13228150000.0,11386850000.0,11861200000.0,13019690000.0,15156430000.0,15400240000.0,14887630000.0,14887630000.0
Albania,Economics: GDP Growth,-7.187111,9.559412,8.302867,13.32233,9.099999,-10.91998,8.829424,12.8908,6.946217,8.293313,...,1.417243,1.002018,1.774449,2.218726,3.314981,3.802227,4.01936,2.11342,-3.955398,-3.955398
Albania,Economics: GDP Per Capita,200.8522,367.2792,586.4163,750.6044,1009.977,717.38,813.7894,1033.243,1126.683,1281.66,...,4247.63,4413.062,4578.633,3952.803,4124.055,4531.019,5287.664,5395.66,5246.292,5246.292
Albania,Economics: GDP Per Capita Growth,-6.622551,10.22995,8.969762,14.0245,9.78018,-10.3611,9.516484,13.60807,7.630022,9.314397,...,1.584873,1.187234,1.985388,2.516827,3.480293,3.897741,4.276326,2.549359,-3.398708,-3.398708
Albania,Economics: PPP,5917192000.0,6636506000.0,7341011000.0,8493446000.0,9436013000.0,8555908000.0,9451419000.0,10792620000.0,11926130000.0,13157360000.0,...,30530370000.0,30603910000.0,32529190000.0,33585840000.0,34739970000.0,36696820000.0,38647380000.0,39995710000.0,38138320000.0,38138320000.0
Albania,Economics: PPP Growth,-5.071988,12.15634,10.61559,15.69859,11.09759,-9.327085,10.46658,14.19052,10.50259,10.32377,...,2.949927,0.2408745,6.290968,3.248311,3.436355,5.632831,5.31535,3.488801,-4.643986,-4.643986
Albania,Economics: PPP Per Capita,1822.335,2056.373,2288.676,2664.373,2978.508,2717.644,3021.042,3471.662,3860.805,4299.546,...,10526.26,10570.96,11259.27,11658.91,12078.84,12770.96,13483.01,14012.98,13439.67,13439.67
Albania,Economics: PPP Per Capita Growth,-4.494563,12.84277,11.29673,16.41548,11.79022,-8.758213,11.16398,14.91605,11.20913,11.364,...,3.120091,0.4246955,6.511268,3.549414,3.601861,5.730029,5.575517,3.930612,-4.091288,-4.091288
Albania,Electricity: Distribution Losses,1.411,1.655,1.843,2.283,2.887,2.913,3.189,2.213,2.073,1.889,...,1.1193,1.95349,1.11907,1.25,1.9859,1.276047,1.1557,1.08183,0.93263,0.9990721
Albania,Electricity: Distribution Losses Growth,10.40689,17.2927,11.35952,23.87412,26.45642,0.9005888,9.474768,-30.60521,-6.326254,-8.876025,...,3.997101,74.52783,-42.71432,11.69989,58.872,-35.74465,-9.431236,-6.391797,-13.79145,7.124168


In [6]:
# Show the labels of the second index level (the attribute names).
data_imputation.index.levels[1]

Index(['Economics: GDP', 'Economics: GDP Growth', 'Economics: GDP Per Capita',
       'Economics: GDP Per Capita Growth', 'Economics: PPP',
       'Economics: PPP Growth', 'Economics: PPP Per Capita',
       'Economics: PPP Per Capita Growth', 'Electricity: Distribution Losses',
       'Electricity: Distribution Losses Growth',
       'Electricity: Distribution Losses Per Capita', 'Electricity: Exports',
       'Electricity: Exports Growth', 'Electricity: Exports Per Capita',
       'Electricity: Imports', 'Electricity: Imports Growth',
       'Electricity: Imports Per Capita', 'Electricity: Installed Capacity',
       'Electricity: Installed Capacity Growth',
       'Electricity: Installed Capacity Per Capita',
       'Electricity: Net Consumption', 'Electricity: Net Consumption Growth',
       'Electricity: Net Consumption Per Capita',
       'Electricity: Net Generation', 'Electricity: Net Generation Growth',
       'Electricity: Net Generation Per Capita', 'Electricity: Net Imports