## 1. Load Raw Data

In [5]:
import re
from pathlib import Path

import numpy as np
import pandas as pd

# Read the raw GDP export, ignoring the first four lines of the file
# (presumably a metadata preamble ahead of the header row -- confirm against the CSV).
df_gdp = pd.read_csv(
    filepath_or_buffer="../../data/raw/GDP-countries.csv",
    skiprows=4,
)

# Heading first, then a rich preview of the leading rows.
print("Preview of raw GDP dataset:")
display(df_gdp.head())

Preview of raw GDP dataset:


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,Unnamed: 69
0,Aruba,ABW,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,2983635000.0,3092429000.0,3276184000.0,3395799000.0,2481857000.0,2929447000.0,3279344000.0,3648573000.0,,
1,Africa Eastern and Southern,AFE,GDP (current US$),NY.GDP.MKTP.CD,24209930000.0,24963260000.0,27078020000.0,31774830000.0,30284920000.0,33812190000.0,...,828961200000.0,973025100000.0,1012291000000.0,1009747000000.0,933407200000.0,1085605000000.0,1191639000000.0,1133818000000.0,1205974000000.0,
2,Afghanistan,AFG,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,18116570000.0,18753460000.0,18053220000.0,18799440000.0,19955930000.0,14260000000.0,14497240000.0,17152230000.0,,
3,Africa Western and Central,AFW,GDP (current US$),NY.GDP.MKTP.CD,11905110000.0,12708030000.0,13630920000.0,14469260000.0,15803940000.0,16921240000.0,...,700028200000.0,694051300000.0,777840400000.0,833288900000.0,797295200000.0,858114500000.0,893639900000.0,814728500000.0,670025700000.0,
4,Angola,AGO,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,52761620000.0,73690150000.0,79450690000.0,70897960000.0,48501560000.0,66505130000.0,104399700000.0,84875160000.0,80396940000.0,


## 2. Select Relevant Year Columns

In [6]:
# Column labels for the years 2000 through 2022 inclusive, as strings.
year_cols = list(map(str, range(2000, 2023)))

# Keep the two identifier columns followed by the selected year columns.
gdp_filtered = df_gdp[["Country Name", "Country Code", *year_cols]]

print("Filtered GDP dataset with selected year columns:")
display(gdp_filtered.head())

Filtered GDP dataset with selected year columns:


Unnamed: 0,Country Name,Country Code,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Aruba,ABW,1873453000.0,1896457000.0,1961844000.0,2044112000.0,2254831000.0,2360017000.0,2469783000.0,2677641000.0,...,2727850000.0,2790850000.0,2962907000.0,2983635000.0,3092429000.0,3276184000.0,3395799000.0,2481857000.0,2929447000.0,3279344000.0
1,Africa Eastern and Southern,AFE,287201700000.0,260992200000.0,267815000000.0,355716400000.0,442696200000.0,516661100000.0,580240800000.0,665598700000.0,...,962441300000.0,978743800000.0,898308900000.0,828961200000.0,973025100000.0,1012291000000.0,1009747000000.0,933407200000.0,1085605000000.0,1191639000000.0
2,Afghanistan,AFG,3521418000.0,2813572000.0,3825701000.0,4520947000.0,5224897000.0,6203257000.0,6971758000.0,9747886000.0,...,20146420000.0,20497130000.0,19134220000.0,18116570000.0,18753460000.0,18053220000.0,18799440000.0,19955930000.0,14260000000.0,14497240000.0
3,Africa Western and Central,AFW,142140100000.0,150058500000.0,179390100000.0,207755000000.0,258566700000.0,317096500000.0,402724200000.0,471537800000.0,...,844202600000.0,903933700000.0,778022100000.0,700028200000.0,694051300000.0,777840400000.0,833288900000.0,797295200000.0,858114500000.0,893639900000.0
4,Angola,AGO,9129595000.0,8936079000.0,15285590000.0,17812700000.0,23552060000.0,36970900000.0,52381030000.0,65266420000.0,...,132339100000.0,135966800000.0,90496420000.0,52761620000.0,73690150000.0,79450690000.0,70897960000.0,48501560000.0,66505130000.0,104399700000.0


## 3. Analyze Row-wise Missing Values

In [7]:
# Number of null cells in each row. The count runs over every column of
# gdp_filtered, i.e. the two identifier columns as well as the year columns.
row_wise_missing = gdp_filtered.isna().sum(axis="columns")

print("Row-wise analysis of missing values (count of missing years per country):")
print(row_wise_missing.value_counts().sort_index())

# Rows with at least one null cell, and the matching per-row counts.
countries_with_missing = gdp_filtered.loc[row_wise_missing.gt(0)]
missing_counts = row_wise_missing.loc[row_wise_missing.gt(0)]

# Two-column summary: country name next to its null count (original row labels kept).
result = countries_with_missing[["Country Name"]].assign(
    **{"Missing Values Count": missing_counts}
)

print("\nCountries with missing values and their respective counts:")
# Renumber the rows from zero for display only; `result` itself is unchanged.
display(result.reset_index(drop=True))


Row-wise analysis of missing values (count of missing years per country):
0     247
1       2
2       5
4       1
6       1
8       2
9       1
11      1
15      1
19      1
23      4
Name: count, dtype: int64

Countries with missing values and their respective counts:


Unnamed: 0,Country Name,Missing Values Count
0,American Samoa,2
1,Channel Islands,1
2,Cuba,2
3,Cayman Islands,6
4,Eritrea,11
5,Gibraltar,23
6,Guam,2
7,Not classified,23
8,St. Martin (French part),19
9,Northern Mariana Islands,2


## 4. Drop Rows with Too Many Missing Values

In [8]:
# Retain only the rows whose null count is at most 10; rows above that are dropped.
gdp_filtered_cleaned = gdp_filtered.loc[row_wise_missing.le(10)]

print("DataFrame after dropping rows with more than 10 missing values:")
display(gdp_filtered_cleaned.head(10))

# Null counts per row, recomputed on the reduced frame.
cleaned_row_wise_missing = gdp_filtered_cleaned.isna().sum(axis="columns")

print("\nRow-wise analysis of missing values in the cleaned DataFrame:")
print(cleaned_row_wise_missing.value_counts().sort_index())


DataFrame after dropping rows with more than 10 missing values:


Unnamed: 0,Country Name,Country Code,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Aruba,ABW,1873453000.0,1896457000.0,1961844000.0,2044112000.0,2254831000.0,2360017000.0,2469783000.0,2677641000.0,...,2727850000.0,2790850000.0,2962907000.0,2983635000.0,3092429000.0,3276184000.0,3395799000.0,2481857000.0,2929447000.0,3279344000.0
1,Africa Eastern and Southern,AFE,287201700000.0,260992200000.0,267815000000.0,355716400000.0,442696200000.0,516661100000.0,580240800000.0,665598700000.0,...,962441300000.0,978743800000.0,898308900000.0,828961200000.0,973025100000.0,1012291000000.0,1009747000000.0,933407200000.0,1085605000000.0,1191639000000.0
2,Afghanistan,AFG,3521418000.0,2813572000.0,3825701000.0,4520947000.0,5224897000.0,6203257000.0,6971758000.0,9747886000.0,...,20146420000.0,20497130000.0,19134220000.0,18116570000.0,18753460000.0,18053220000.0,18799440000.0,19955930000.0,14260000000.0,14497240000.0
3,Africa Western and Central,AFW,142140100000.0,150058500000.0,179390100000.0,207755000000.0,258566700000.0,317096500000.0,402724200000.0,471537800000.0,...,844202600000.0,903933700000.0,778022100000.0,700028200000.0,694051300000.0,777840400000.0,833288900000.0,797295200000.0,858114500000.0,893639900000.0
4,Angola,AGO,9129595000.0,8936079000.0,15285590000.0,17812700000.0,23552060000.0,36970900000.0,52381030000.0,65266420000.0,...,132339100000.0,135966800000.0,90496420000.0,52761620000.0,73690150000.0,79450690000.0,70897960000.0,48501560000.0,66505130000.0,104399700000.0
5,Albania,ALB,3584570000.0,4059064000.0,4515003000.0,5801712000.0,7406646000.0,8256658000.0,9150528000.0,11116940000.0,...,12796990000.0,13296320000.0,11470170000.0,11988670000.0,13258270000.0,15379510000.0,15585110000.0,15241460000.0,18032010000.0,19017240000.0
6,Andorra,AND,1432606000.0,1548266000.0,1764280000.0,2366942000.0,2900245000.0,3161084000.0,3459338000.0,3957625000.0,...,3193513000.0,3271686000.0,2789881000.0,2896610000.0,3000162000.0,3218420000.0,3155149000.0,2891001000.0,3324648000.0,3380613000.0
7,Arab World,ARB,763411600000.0,744728500000.0,746649300000.0,830076500000.0,987316000000.0,1210080000000.0,1439471000000.0,1674641000000.0,...,2874276000000.0,2926205000000.0,2568631000000.0,2524103000000.0,2598475000000.0,2907330000000.0,2949355000000.0,2596423000000.0,3077907000000.0,3738584000000.0
8,United Arab Emirates,ARE,104337400000.0,103311600000.0,109816200000.0,124346400000.0,147824400000.0,180617500000.0,222116500000.0,257916100000.0,...,400218500000.0,414105400000.0,370275500000.0,369255300000.0,390516800000.0,427049400000.0,417989700000.0,349473000000.0,415178800000.0,502731900000.0
9,Argentina,ARG,284203800000.0,268696800000.0,97724000000.0,127587000000.0,164657900000.0,198737100000.0,232557300000.0,287530500000.0,...,552025100000.0,526319700000.0,594749300000.0,557532300000.0,643628400000.0,524819900000.0,447754700000.0,385740500000.0,486564100000.0,632790100000.0



Row-wise analysis of missing values in the cleaned DataFrame:
0    247
1      2
2      5
4      1
6      1
8      2
9      1
Name: count, dtype: int64


## 5. Interpolate Missing Year Values

In [9]:
# Work on an independent copy so gdp_filtered_cleaned stays untouched.
gdp_filled = gdp_filtered_cleaned.copy()

# Everything from the third column onward is treated as a year column
# (the first two positions hold the identifier columns).
year_columns = gdp_filled.columns[2:]

# Interpolate along each row, i.e. across the years of one country.
# limit_direction="both" lets gaps at either end of a row be filled as well;
# NOTE(review): with the linear method those edge gaps are presumably filled
# with the nearest valid value rather than extrapolated -- confirm.
gdp_filled[year_columns] = gdp_filled[year_columns].interpolate(
    method="linear",
    axis="columns",
    limit_direction="both",
)

print("DataFrame after linear interpolation of missing year values:")
display(gdp_filled.head())


DataFrame after linear interpolation of missing year values:


Unnamed: 0,Country Name,Country Code,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Aruba,ABW,1873453000.0,1896457000.0,1961844000.0,2044112000.0,2254831000.0,2360017000.0,2469783000.0,2677641000.0,...,2727850000.0,2790850000.0,2962907000.0,2983635000.0,3092429000.0,3276184000.0,3395799000.0,2481857000.0,2929447000.0,3279344000.0
1,Africa Eastern and Southern,AFE,287201700000.0,260992200000.0,267815000000.0,355716400000.0,442696200000.0,516661100000.0,580240800000.0,665598700000.0,...,962441300000.0,978743800000.0,898308900000.0,828961200000.0,973025100000.0,1012291000000.0,1009747000000.0,933407200000.0,1085605000000.0,1191639000000.0
2,Afghanistan,AFG,3521418000.0,2813572000.0,3825701000.0,4520947000.0,5224897000.0,6203257000.0,6971758000.0,9747886000.0,...,20146420000.0,20497130000.0,19134220000.0,18116570000.0,18753460000.0,18053220000.0,18799440000.0,19955930000.0,14260000000.0,14497240000.0
3,Africa Western and Central,AFW,142140100000.0,150058500000.0,179390100000.0,207755000000.0,258566700000.0,317096500000.0,402724200000.0,471537800000.0,...,844202600000.0,903933700000.0,778022100000.0,700028200000.0,694051300000.0,777840400000.0,833288900000.0,797295200000.0,858114500000.0,893639900000.0
4,Angola,AGO,9129595000.0,8936079000.0,15285590000.0,17812700000.0,23552060000.0,36970900000.0,52381030000.0,65266420000.0,...,132339100000.0,135966800000.0,90496420000.0,52761620000.0,73690150000.0,79450690000.0,70897960000.0,48501560000.0,66505130000.0,104399700000.0


## 6. Scale GDP Values to Billions

In [11]:
# Convert the year columns to billions (divide by 1e9) and round to 4 decimals.
#
# The conversion overwrites gdp_filled in place, so executing this cell a second
# time would divide the already-scaled values by 1e9 again. A marker stored in
# DataFrame.attrs records that the conversion has been applied and makes the
# cell safe to re-run. The marker is only ever set on gdp_filled, so the fresh
# copy created by the interpolation cell does not carry it and is scaled once.
if gdp_filled.attrs.get("gdp_unit") != "billions":
    gdp_filled[year_columns] = (gdp_filled[year_columns] / 1e9).round(4)
    gdp_filled.attrs["gdp_unit"] = "billions"

print("DataFrame after scaling GDP values to billions:")
display(gdp_filled.head())

DataFrame after scaling GDP values to billions:


Unnamed: 0,Country Name,Country Code,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Aruba,ABW,1.8735,1.8965,1.9618,2.0441,2.2548,2.36,2.4698,2.6776,...,2.7278,2.7908,2.9629,2.9836,3.0924,3.2762,3.3958,2.4819,2.9294,3.2793
1,Africa Eastern and Southern,AFE,287.2017,260.9922,267.815,355.7164,442.6962,516.6611,580.2408,665.5987,...,962.4413,978.7438,898.3089,828.9612,973.0251,1012.2912,1009.7465,933.4072,1085.6049,1191.6386
2,Afghanistan,AFG,3.5214,2.8136,3.8257,4.5209,5.2249,6.2033,6.9718,9.7479,...,20.1464,20.4971,19.1342,18.1166,18.7535,18.0532,18.7994,19.9559,14.26,14.4972
3,Africa Western and Central,AFW,142.1401,150.0585,179.3901,207.755,258.5667,317.0965,402.7242,471.5378,...,844.2026,903.9337,778.0221,700.0282,694.0513,777.8404,833.2889,797.2952,858.1145,893.6399
4,Angola,AGO,9.1296,8.9361,15.2856,17.8127,23.5521,36.9709,52.381,65.2664,...,132.3391,135.9668,90.4964,52.7616,73.6902,79.4507,70.898,48.5016,66.5051,104.3997


## 7. Verify Missing Values After Imputation

In [12]:
# Null cells per row after interpolation and scaling; the distribution printed
# below shows how many rows have each count.
final_row_wise_missing = gdp_filled.isna().sum(axis="columns")

print("Row-wise analysis of missing values in the final DataFrame:")
print(final_row_wise_missing.value_counts().sort_index())


Row-wise analysis of missing values in the final DataFrame:
0    259
Name: count, dtype: int64


## 8. Final Preview and Save Cleaned Data

In [13]:
print("Final preview of cleaned GDP dataset:")
display(gdp_filled.head())

output_file_path = "../../data/processed/cleaned-GDP-countries.csv"

# to_csv does not create missing folders; make sure the target directory exists
# so the save also works when the processed-data folder has not been created yet.
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame's row labels out of the file.
gdp_filled.to_csv(output_file_path, index=False)

print(f"Cleaned data saved to {output_file_path}")

Final preview of cleaned GDP dataset:


Unnamed: 0,Country Name,Country Code,2000,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Aruba,ABW,1.8735,1.8965,1.9618,2.0441,2.2548,2.36,2.4698,2.6776,...,2.7278,2.7908,2.9629,2.9836,3.0924,3.2762,3.3958,2.4819,2.9294,3.2793
1,Africa Eastern and Southern,AFE,287.2017,260.9922,267.815,355.7164,442.6962,516.6611,580.2408,665.5987,...,962.4413,978.7438,898.3089,828.9612,973.0251,1012.2912,1009.7465,933.4072,1085.6049,1191.6386
2,Afghanistan,AFG,3.5214,2.8136,3.8257,4.5209,5.2249,6.2033,6.9718,9.7479,...,20.1464,20.4971,19.1342,18.1166,18.7535,18.0532,18.7994,19.9559,14.26,14.4972
3,Africa Western and Central,AFW,142.1401,150.0585,179.3901,207.755,258.5667,317.0965,402.7242,471.5378,...,844.2026,903.9337,778.0221,700.0282,694.0513,777.8404,833.2889,797.2952,858.1145,893.6399
4,Angola,AGO,9.1296,8.9361,15.2856,17.8127,23.5521,36.9709,52.381,65.2664,...,132.3391,135.9668,90.4964,52.7616,73.6902,79.4507,70.898,48.5016,66.5051,104.3997


Cleaned data saved to ../../data/processed/cleaned-GDP-countries.csv
