In [29]:
# Import my Variables

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [36]:
# Step 1: Load and preprocess the data
data = pd.read_csv("Resources/World_Bank.csv")
data_df = data
data_df.head(25)

Unnamed: 0,Country Name,Country Code,Year,ATM Usage,GDP_x,GDP_y,GDP,High Tech Export ($M),High Tech Import,Internet Subscription Per 100,...,IP Income($M),Labour Force (M),Med_High Tech Manufacturing,Mobile Sub Per 100,Personal Remittance ($M),Poverty Count,Stock Traded $B,Tech Cooperation Grant($M),Unemployment Rate,Med High Tech Export
0,Algeria,DZA,2010,5.33,1.69,1.69,1.69,5.38,59.43,2.51,...,10.16,10.55,9.66,91.42,150.34,6.25,114.82,191.31,10.16,0.46
1,American Samoa,ASM,2010,49.64,1.25,1.25,1.25,1521.29,22.73,0.0,...,511.83,4.8,12.28,163.6,730.23,0.77,45.53,21.76,8.45,0.0
2,Antigua and Barbuda,ATG,2010,69.03,-9.09,-9.09,-9.09,3037.85,28.34,8.31,...,1.89,8.13,20.05,196.01,20.66,0.53,68.07,2.72,7.0,0.0
3,Argentina,ARG,2010,13.86,9.84,9.84,9.84,1686.22,33.65,9.8,...,1531.73,18.32,26.18,138.89,628.54,0.7,0.45,71.88,8.65,45.02
4,Armenia,ARM,2010,30.24,2.82,2.82,2.82,4.73,17.12,3.18,...,1021.78,1.46,4.51,131.19,1439.81,1.0,22.99,40.8,9.89,24.8
5,Australia,AUS,2010,154.32,0.63,0.63,0.63,4554.4,25.79,25.02,...,3012.86,11.47,27.82,102.18,1334.65,0.3,90.62,16.68,5.56,19.94
6,Austria,AUT,2010,112.34,1.59,1.59,1.59,15801.88,35.94,24.52,...,2092.66,4.29,43.78,146.37,3101.68,0.5,12.82,30.65,5.3,59.97
7,Azerbaijan,AZE,2010,24.67,3.81,3.81,3.81,7.2,70.2,5.15,...,4.83,4.14,7.96,98.52,1254.64,0.37,17.05,44.61,5.74,17.23
8,Bahamas,The,2010,70.22,-5.55,-5.55,-5.55,0.0,46.33,8.58,...,19.68,0.2,27.77,97.49,1948.62,1.0,1.72,150.0,8.18,52.33
9,Bahrain,BHR,2010,79.02,1.39,1.39,1.39,0.96,10.1,12.76,...,57.43,0.66,18.02,129.12,1770.21,1.5,2.03,165.48,1.14,1.99


In [31]:
print(data_df.columns)

Index(['Country Name', 'Country Code', 'Year', 'ATM Usage', 'GDP_x', 'GDP_y',
       'GDP', 'High Tech Export ($M)', 'High Tech Import',
       'Internet Subscription Per 100', 'Internet Usage Per 100',
       'IP Income($M)', 'Labour Force (M)', 'Med_High Tech Manufacturing',
       'Mobile Sub Per 100', 'Personal Remittance ($M)', 'Poverty Count',
       'Stock Traded $B', 'Tech Cooperation Grant($M)', 'Unemployment Rate',
       'Med High Tech Export'],
      dtype='object')


In [32]:
#Perform label encoding for the 'Country_Name' column
encoder = LabelEncoder()
data_df['Country Code'] = encoder.fit_transform(data_df['Country Name'])

In [33]:
grouped_data = data.groupby('Country Code')

In [35]:

# Step 5: Define a function to train and predict for each country
def train_and_predict_country(country_data):
    country_name = encoder.inverse_transform([country_data['Country Code'].iloc[0]])[0]
    print(f"Predictions for {country_name}:")

    # Step 6: Split the data into features and target variable for this country
    
    # Define the columns to exclude from selected_columns
    columns_to_exclude = ['Country Name', 'Country Code', 'Year', 'Poverty Count']
    selected_columns = [col for col in country_data.columns if col not in columns_to_exclude]

    # Now, selected_columns contains the columns you want to use for features (excluding the excluded ones)
    X = country_data[selected_columns].values
    y = country_data['Poverty Count'].values

    # Split the data into training and test sets for this country
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Choose a machine learning model for this country (Linear Regression)
    model = LinearRegression()

    # Train the model for this country
    model.fit(X_train, y_train)

    # Evaluate the model's performance for this country
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print(f"Training R^2 Score: {train_score:.4f}")
    print(f"Test R^2 Score: {test_score:.4f}")

    # Step 12: Make predictions for the next 5 years for this country
    last_year_data = country_data.iloc[-1][selected_columns]
    future_years = np.arange(2023, 2028)
    future_data_list = []

    for year in future_years:
        future_data_dict = last_year_data.to_dict()
        future_data_dict['Year'] = year
        future_data_list.append(future_data_dict)

    future_data = pd.DataFrame(future_data_list)

    # Step 13: Make predictions using the model
    predictions = model.predict(future_data[selected_columns].values)

    # Collect the predictions for the next 5 years in a list
    predictions_list = []
    for year, prediction in zip(future_years, predictions):
        predictions_list.append((year, prediction))

    # Return the country name and the list of predictions for this country
    return country_name, predictions_list

# Step 13: Iterate over each country and make predictions
for country_code, country_data in grouped_data:
    country_name, predictions = train_and_predict_country(country_data)
    print(f"Predictions for {country_name}:")
    for year, prediction in predictions:
        print(f"Year {year}: {prediction:.2f}")
    print("\n")


Predictions for Albania:
Training R^2 Score: 1.0000
Test R^2 Score: nan
Predictions for Albania:
Year 2023: 0.10
Year 2024: 0.10
Year 2025: 0.10
Year 2026: 0.10
Year 2027: 0.10


Predictions for Algeria:
Training R^2 Score: 1.0000
Test R^2 Score: -5.4659
Predictions for Algeria:
Year 2023: 1.22
Year 2024: 1.22
Year 2025: 1.22
Year 2026: 1.22
Year 2027: 1.22


Predictions for American Samoa:
Training R^2 Score: 1.0000
Test R^2 Score: -28.3526
Predictions for American Samoa:
Year 2023: 0.48
Year 2024: 0.48
Year 2025: 0.48
Year 2026: 0.48
Year 2027: 0.48


Predictions for Andorra:
Training R^2 Score: 1.0000
Test R^2 Score: nan
Predictions for Andorra:
Year 2023: 0.25
Year 2024: 0.25
Year 2025: 0.25
Year 2026: 0.25
Year 2027: 0.25


Predictions for Antigua and Barbuda:
Training R^2 Score: 1.0000
Test R^2 Score: -41.0691
Predictions for Antigua and Barbuda:
Year 2023: 0.55
Year 2024: 0.55
Year 2025: 0.55
Year 2026: 0.55
Year 2027: 0.55


Predictions for Arab World:
Training R^2 Score: 1.000

Training R^2 Score: 1.0000
Test R^2 Score: -84.8027
Predictions for Hong Kong SAR:
Year 2023: 0.38
Year 2024: 0.38
Year 2025: 0.38
Year 2026: 0.38
Year 2027: 0.38


Predictions for Hungary:
Training R^2 Score: 1.0000
Test R^2 Score: -0.1160
Predictions for Hungary:
Year 2023: 0.40
Year 2024: 0.40
Year 2025: 0.40
Year 2026: 0.40
Year 2027: 0.40


Predictions for IBRD only:
Training R^2 Score: 1.0000
Test R^2 Score: -3.5549
Predictions for IBRD only:
Year 2023: 1.08
Year 2024: 1.08
Year 2025: 1.08
Year 2026: 1.08
Year 2027: 1.08


Predictions for IDA & IBRD total:
Training R^2 Score: 1.0000
Test R^2 Score: -4.1498
Predictions for IDA & IBRD total:
Year 2023: 1.76
Year 2024: 1.76
Year 2025: 1.76
Year 2026: 1.76
Year 2027: 1.76


Predictions for IDA blend:
Training R^2 Score: 1.0000
Test R^2 Score: 0.3349
Predictions for IDA blend:
Year 2023: 3.12
Year 2024: 3.12
Year 2025: 3.12
Year 2026: 3.12
Year 2027: 3.12


Predictions for IDA only:
Training R^2 Score: 1.0000
Test R^2 Score: -3.3612
P

Training R^2 Score: 1.0000
Test R^2 Score: 0.2794
Predictions for Sri Lanka:
Year 2023: 2.26
Year 2024: 2.26
Year 2025: 2.26
Year 2026: 2.26
Year 2027: 2.26


Predictions for St. Kitts and Nevis:
Training R^2 Score: 1.0000
Test R^2 Score: -20.6066
Predictions for St. Kitts and Nevis:
Year 2023: 1.95
Year 2024: 1.95
Year 2025: 1.95
Year 2026: 1.95
Year 2027: 1.95


Predictions for St. Lucia:
Training R^2 Score: 1.0000
Test R^2 Score: -61.2555
Predictions for St. Lucia:
Year 2023: 3.68
Year 2024: 3.68
Year 2025: 3.68
Year 2026: 3.68
Year 2027: 3.68


Predictions for St. Martin (French part):
Training R^2 Score: 1.0000
Test R^2 Score: -5.9430
Predictions for St. Martin (French part):
Year 2023: 1.08
Year 2024: 1.08
Year 2025: 1.08
Year 2026: 1.08
Year 2027: 1.08


Predictions for St. Vincent and the Grenadines:
Training R^2 Score: 1.0000
Test R^2 Score: -7422.3200
Predictions for St. Vincent and the Grenadines:
Year 2023: 0.37
Year 2024: 0.37
Year 2025: 0.37
Year 2026: 0.37
Year 2027: 0.3

##### Step 6: Split the data into features and target variable for this country
selected_columns = ['Tech Cooperation Grant($M)', 'High Tech Import', 'Med High Tech Export', 'IP Income($M)', 'Unemployment Rate']
X = country_data[selected_columns].values
y = country_data['Poverty Count'].values

#Split the data into training and test sets for this country
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
#Split the data into training and test sets for this country
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)



Shape of X_train: (8, 5)
Shape of X_test: (3, 5)


In [16]:
#Choose a machine learning model for this country (Linear Regression)
model = LinearRegression()

In [17]:
#Train the model for this country
model.fit(X_train, y_train)

LinearRegression()

In [18]:
 # Evaluate the model's performance for this country
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Training R^2 Score: {train_score:.4f}")
print(f"Test R^2 Score: {test_score:.4f}")

Training R^2 Score: 0.9581
Test R^2 Score: -24.6576


In [19]:
# Step 12: Make predictions for the next 5 years for this country
# Step 12: Make predictions for the next 5 years for this country
last_year_data = country_data.iloc[-1][selected_columns]

future_years = np.arange(2023, 2028)
future_data_list = []

for year in future_years:
    future_data_dict = last_year_data.to_dict()
    future_data_dict['Year'] = year
    future_data_list.append(future_data_dict)

future_data = pd.DataFrame(future_data_list)

# Step 13: Make predictions using the model
predictions = model.predict(future_data[selected_columns].values)

# Print the predictions for the next 5 years
print("Predicted Poverty Count for the next 5 years:")
for year, prediction in zip(future_years, predictions):
    print(f"Year {year}: {prediction:.2f}")



Predicted Poverty Count for the next 5 years:
Year 2023: 1.85
Year 2024: 1.85
Year 2025: 1.85
Year 2026: 1.85
Year 2027: 1.85
