In [67]:
# Import my Variables

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [68]:
# Step 1: Load and preprocess the data
data = pd.read_csv("Resources/World_Bank.csv")
data_df = data
data_df.head()

Unnamed: 0,Country Name,Country Code,Year,ATM Usage,GDP,High Tech Export ($M),High Tech Import,Internet Subscription Per 100,Internet Usage Per 100,IP Income($M),Labour Force (M),Med_High Tech Manufacturing,Mobile Sub Per 100,Personal Remittance ($M),Population Living In Slums,Poverty Count,Stock Traded $B,Tech Cooperation Grant($M),Unemployment Rate,Med High Tech Export
0,Argentina,ARG,2010,13.86,9.84,1686.22,33.65,9.8,45.0,1531.73,18.32,26.18,138.89,628.54,16.66,15.05,0.45,71.88,8.65,45.02
1,Bangladesh,BGD,2010,1.29,4.37,37.56,19.5,0.28,3.7,21.92,55.76,9.14,45.77,10520.65,55.09,31.5,2.28,196.45,5.0,2.14
2,Belarus,BLR,2010,33.38,8.03,407.45,24.16,17.13,31.8,75.2,5.02,39.98,106.18,503.8,9.92,5.2,0.02,32.34,6.1,39.17
3,Colombia,COL,2010,28.67,3.32,451.12,30.78,5.9,36.5,470.86,21.26,24.95,99.24,4124.86,14.45,7.11,8.54,273.77,11.32,35.97
4,Costa Rica,CRI,2010,41.49,4.01,2203.39,16.11,8.57,36.5,86.39,2.1,19.28,67.68,513.09,8.12,21.96,0.1,30.03,7.71,58.94


In [69]:
print(data_df.columns)

Index(['Country Name', 'Country Code', 'Year', 'ATM Usage', 'GDP',
       'High Tech Export ($M)', 'High Tech Import',
       'Internet Subscription Per 100', 'Internet Usage Per 100',
       'IP Income($M)', 'Labour Force (M)', 'Med_High Tech Manufacturing',
       'Mobile Sub Per 100', 'Personal Remittance ($M)',
       'Population Living In Slums', 'Poverty Count', 'Stock Traded $B',
       'Tech Cooperation Grant($M)', 'Unemployment Rate',
       'Med High Tech Export'],
      dtype='object')


In [70]:
#Perform label encoding for the 'Country_Name' column
encoder = LabelEncoder()
data_df['Country Code'] = encoder.fit_transform(data_df['Country Name'])

In [71]:
grouped_data = data.groupby('Country Code')

In [74]:
# Step 5: Define a function to train and predict for each country
def train_and_predict_country(country_data):
    country_name = encoder.inverse_transform([country_data['Country Code'].iloc[0]])[0]
    print(f"Predictions for {country_name}:")

    # Step 6: Split the data into features and target variable for this country
    X = country_data.drop(columns=['Poverty Count', 'Year', 'Country Name', 'Country Code'])  # Features
    y = country_data['Poverty Count']  # Target variable

    #Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Step 8: Split the data into training and test sets for this country
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Step 9: Choose a machine learning model for this country
    model = SVR(kernel='rbf', C=10.0) 
    
    # Step 10: Train the model for this country
    model.fit(X_train, y_train)

    # Step 11: Evaluate the model's performance for this country
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    print(f"Training R^2 Score: {train_score:.4f}")
    print(f"Test R^2 Score: {test_score:.4f}")

    # Step 12: Make predictions for the next 5 years for this country
    last_year_data = country_data.iloc[-1].drop(['Poverty Count', 'Year', 'Country Name', 'Country Code'])  # Use data from the last available year
    future_years = np.arange(2024, 2029)  # Next 5 years

    future_data = pd.DataFrame(columns=X.columns)
    for year in future_years:
        future_data = future_data.append(last_year_data, ignore_index=True)

    predictions = model.predict(future_data)

    # Print the predictions for this country
    print("Predicted Poverty Count for the next 5 years:")
    for year, prediction in zip(future_years, predictions):
        print(f"Year {year}: {prediction:.2f}")

    print("\n")

# Step 13: Iterate over each country and make predictions
for country_code, country_data in grouped_data:
    train_and_predict_country(country_data)



Predictions for Argentina:
Training R^2 Score: 0.6396
Test R^2 Score: 0.6507
Predicted Poverty Count for the next 5 years:
Year 2024: 30.86
Year 2025: 30.86
Year 2026: 30.86
Year 2027: 30.86
Year 2028: 30.86


Predictions for Bangladesh:
Training R^2 Score: -0.1299
Test R^2 Score: -0.4971
Predicted Poverty Count for the next 5 years:
Year 2024: 5.17
Year 2025: 5.17
Year 2026: 5.17
Year 2027: 5.17
Year 2028: 5.17


Predictions for Belarus:
Training R^2 Score: 0.8946
Test R^2 Score: -254.5286
Predicted Poverty Count for the next 5 years:
Year 2024: 4.90
Year 2025: 4.90
Year 2026: 4.90
Year 2027: 4.90
Year 2028: 4.90


Predictions for Colombia:
Training R^2 Score: -0.0486
Test R^2 Score: -0.3841
Predicted Poverty Count for the next 5 years:
Year 2024: 8.60
Year 2025: 8.60
Year 2026: 8.60
Year 2027: 8.60
Year 2028: 8.60


Predictions for Costa Rica:
Training R^2 Score: -0.2025
Test R^2 Score: -1.9271
Predicted Poverty Count for the next 5 years:
Year 2024: 22.91
Year 2025: 22.91
Year 2026: