##  Predict GDP per capita

In [27]:
import pandas as pd

In [28]:
# Import data set
file = '../data/WEOOct2020all.xlsx'
df = pd.read_excel(file)

In [35]:
# Select all rows where WEO Sublect Code is NGDPDPC
df_GDPPC = df[df['WEO Subject Code'] == 'NGDPDPC']

In [36]:
# Extract the years as features and 2025 GDP per capita as the target variable
X = df_GDPPC.iloc[:, 10:54] # 1980-2024
y = df_GDPPC[2025]

In [40]:
# Check for missing values in the target variable (y)
if y.isnull().any():
    # If there are missing values, drop the corresponding rows from both X and y
    missing_rows = y.isnull()
    X = X[~missing_rows]
    y = y.dropna()

In [41]:
from sklearn.impute import SimpleImputer

# Impute missing values in features (X) if needed
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

In [42]:
from sklearn.model_selection import train_test_split

# Split the imputed data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)


In [43]:
from sklearn.linear_model import LinearRegression

# Create the Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

######################################################################
Show prediction error (MSE) on the training and the testing data sets.
######################################################################

In [46]:
# Make predictions on the training and testing data
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [48]:
from sklearn.metrics import mean_squared_error

# Calculate MSE for the training and testing data
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

In [49]:
print(f"Mean Squared Error (MSE) on the training data: {mse_train:.2f}")
print(f"Mean Squared Error (MSE) on the testing data: {mse_test:.2f}")

Mean Squared Error (MSE) on the training data: 3375.17
Mean Squared Error (MSE) on the testing data: 181470.65


######################################################################
Name the fields that were used during training.
######################################################################

In [51]:
fields = df_GDPPC.iloc[:, 10:54].columns
print(fields)

Index([1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992,
       1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
       2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024],
      dtype='object')


######################################################################
Top 5 fields/features that contribute the most to the predictions
######################################################################

In [53]:
# Get the coefficients (weights) from the model
coefficients = model.coef_

# Calculate the absolute coefficients to measure the feature importance
absolute_coefficients = abs(coefficients)

In [55]:
# Get the indices of the top 5 features
top_5_indices = absolute_coefficients.argsort()[-5:][::-1]

# Get the corresponding feature names from the original dataframe (assuming X_train is a DataFrame)
top_5_features = df_GDPPC.columns[top_5_indices]

In [56]:
print("Top 5 features contributing the most to predictions:")
for feature in top_5_features:
    print(feature)

Top 5 features contributing the most to predictions:
2014
2013
2012
2011
2009


######################################################################
Train another predictor that uses those top 5 features
######################################################################

In [61]:
# Select the top 5 features based on their column indices
X_train_top5 = X_train[:, top_5_indices]

In [62]:
# Create the Linear Regression model for the new predictor
model_top5 = LinearRegression()

# Train the model on the training data with the top 5 features
model_top5.fit(X_train_top5, y_train)

######################################################################
Save the predictor in a file.
######################################################################

In [65]:
import joblib

In [68]:
# Save the model to a file using joblib
filename = 'linear_regression_model.joblib'
joblib.dump(model_top5, filename)


['linear_regression_model.joblib']

In [69]:
print(f"The trained model has been saved to '{filename}'.")

The trained model has been saved to 'linear_regression_model.joblib'.
