##  Predict GDP per capita

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [23]:
# Import data set
file = '../data/WEOOct2020all.xlsx'
df = pd.read_excel(file)

### Data Preparation

In [24]:
# 'Gross domestic product per capita, constant prices' = NGDPRPPPPC => Dollars
# 'Population' = LP
# 'Gross national savings' = NGSD_NGDP(% of GDP) => Percent

data_df = df[df['WEO Subject Code'].isin(['NGDPRPPPPC', 'LP', 'NGSD_NGDP'])]

years = [int(year) for year in range(1980, 2020)]
data_df_pivot = data_df.pivot_table(index=['Country', 'ISO'], columns='WEO Subject Code', values=years).reset_index()


In [25]:
import pycountry_convert as pc

def get_continent(country):

    if country == 'UVK':
        return 'Europe'
    elif country == 'TLS' or country == 'WBG':
        return 'Asia'
    
    country_code = pc.country_alpha3_to_country_alpha2(country)
    continent_code = pc.country_alpha2_to_continent_code(country_code)
    continent = pc.convert_continent_code_to_continent_name(continent_code)

    return continent

In [26]:
# Add continent column to the dataframe and remove unwanted columns
data_df_pivot['Continent'] = data_df_pivot['ISO'].apply(get_continent)


In [29]:
# Extract the relevant columns for features and target

feature_columns = [(year, subject) for year in range(1980, 2020) for subject in ['LP', 'NGDPRPPPPC', 'NGSD_NGDP']]
target_column = (2019, 'NGDPRPPPPC')  # Target variable is 'NGDPRPPPPC' for 2019


In [30]:
# Separate features and target
X = data_df_pivot[feature_columns].values
y = data_df_pivot[target_column].values

In [32]:
# Impute missing values with the mean (you can use other strategies if needed)
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)
y = imputer.fit_transform(y.reshape(-1, 1)).flatten()

In [33]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [34]:
# Train a Linear Regression model (you can choose other regression models if desired)
model = LinearRegression()
model.fit(X_train, y_train)

In [35]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [36]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


In [37]:
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

Mean Squared Error: 0.00
R-squared: 1.00


### Fields that were used during training

In [38]:
# Get the list of field (feature) names used during training
feature_names = data_df_pivot[feature_columns].columns.tolist()

# Print the feature names
print(feature_names)

[(1980, 'LP'), (1980, 'NGDPRPPPPC'), (1980, 'NGSD_NGDP'), (1981, 'LP'), (1981, 'NGDPRPPPPC'), (1981, 'NGSD_NGDP'), (1982, 'LP'), (1982, 'NGDPRPPPPC'), (1982, 'NGSD_NGDP'), (1983, 'LP'), (1983, 'NGDPRPPPPC'), (1983, 'NGSD_NGDP'), (1984, 'LP'), (1984, 'NGDPRPPPPC'), (1984, 'NGSD_NGDP'), (1985, 'LP'), (1985, 'NGDPRPPPPC'), (1985, 'NGSD_NGDP'), (1986, 'LP'), (1986, 'NGDPRPPPPC'), (1986, 'NGSD_NGDP'), (1987, 'LP'), (1987, 'NGDPRPPPPC'), (1987, 'NGSD_NGDP'), (1988, 'LP'), (1988, 'NGDPRPPPPC'), (1988, 'NGSD_NGDP'), (1989, 'LP'), (1989, 'NGDPRPPPPC'), (1989, 'NGSD_NGDP'), (1990, 'LP'), (1990, 'NGDPRPPPPC'), (1990, 'NGSD_NGDP'), (1991, 'LP'), (1991, 'NGDPRPPPPC'), (1991, 'NGSD_NGDP'), (1992, 'LP'), (1992, 'NGDPRPPPPC'), (1992, 'NGSD_NGDP'), (1993, 'LP'), (1993, 'NGDPRPPPPC'), (1993, 'NGSD_NGDP'), (1994, 'LP'), (1994, 'NGDPRPPPPC'), (1994, 'NGSD_NGDP'), (1995, 'LP'), (1995, 'NGDPRPPPPC'), (1995, 'NGSD_NGDP'), (1996, 'LP'), (1996, 'NGDPRPPPPC'), (1996, 'NGSD_NGDP'), (1997, 'LP'), (1997, 'NGDPRPPP

### Top 5 fields/features that contribute the most to the predictions.

In [41]:
from sklearn.linear_model import LinearRegression

In [42]:

# Get the coefficients of the linear model
coefficients = model.coef_

# Get the absolute values of the coefficients
abs_coefficients = abs(coefficients)

# Get the indices of the top five features
top_five_indices = abs_coefficients.argsort()[-5:][::-1]

# Get the top five feature names
top_five_features = [feature_names[i] for i in top_five_indices]

In [43]:
# Print the top five features
print("Top five features:")
print(top_five_features)

Top five features:
[(2019, 'NGDPRPPPPC'), (1983, 'LP'), (1982, 'LP'), (2008, 'LP'), (1984, 'LP')]


#### Train another predictor that uses those top 5 features

In [45]:
# Extract the top five features from the original dataset
X_top_five = data_df_pivot[top_five_features].values

# Impute missing values with the mean (you can use other strategies if needed)
imputer = SimpleImputer(strategy='mean')
X_top_five = imputer.fit_transform(X_top_five)

# Train a new Linear Regression model using the top five features
model_top_five = LinearRegression()
model_top_five.fit(X_top_five, y)

### Save the predictor in a file

In [47]:
import joblib

In [49]:
# Save the trained model to a file
model_file = 'linear_regression_model3.joblib'
joblib.dump(model_top_five, model_file)

['linear_regression_model3.joblib']