In [1]:
# standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Load the three raw CSV exports (expense, revenue, public debt) from the
# notebook's working directory, in that order.
expense, revenue, debt = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Detailed_Expense_Breakdown.csv',
        'Detailed_Revenue_Breakdown.csv',
        'Historical Public Debt Database.csv',
    )
)


In [3]:
# Normalise headers on all three tables: lower-case, spaces -> underscores,
# so later cells can address columns such as 'country_name' uniformly.
for raw_table in (expense, revenue, debt):
    raw_table.columns = raw_table.columns.str.lower().str.replace(' ', '_')


In [4]:
# Country labels used to restrict the three tables to Europe.
# The spellings are matched literally against 'country_name' (via .isin), so
# they must be kept exactly as they appear in the source files
# (e.g. 'Slovak Republic', 'Montenegro, Rep. of').
# Fixes over the previous list:
#   * removed 'São Tomé and Príncipe' and 'Seychelles' (African states);
#   * removed the repeated 'Switzerland', 'Sweden', 'Spain', 'Slovenia' entries.
# NOTE(review): 'Euro area' is an aggregate rather than a single country; it is
# kept here to preserve the existing selection - confirm whether it should stay.
european_countries = [
    'Albania', 'Austria', 'Belgium', 'Bosnia and Herzegovina', 'Bulgaria',
    'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 'Euro area',
    'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Italy',
    'Kosovo', 'Latvia', 'Lithuania', 'Luxembourg', 'Montenegro, Rep. of',
    'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania', 'Russia',
    'San Marino', 'Serbia', 'Slovak Republic', 'Slovenia', 'Spain', 'Sweden',
    'Switzerland', 'Turkey', 'Ukraine', 'United Kingdom',
]

# Keep only the rows whose 'country_name' appears in `european_countries`.
# Each table is filtered independently with a boolean mask.
is_european_expense = expense['country_name'].isin(european_countries)
expense_europe = expense.loc[is_european_expense]

is_european_revenue = revenue['country_name'].isin(european_countries)
revenue_europe = revenue.loc[is_european_revenue]

is_european_debt = debt['country_name'].isin(european_countries)
debt_europe = debt.loc[is_european_debt]


In [5]:
# Keep only the rows whose 'attribute' column equals 'Value'; every other
# attribute row is discarded from all three tables.
expense_europe = expense_europe.loc[expense_europe['attribute'].eq('Value')]
revenue_europe = revenue_europe.loc[revenue_europe['attribute'].eq('Value')]
debt_europe = debt_europe.loc[debt_europe['attribute'].eq('Value')]

In [6]:
# Metadata columns of the expense table that are removed before modelling
expense_columns_to_drop = [
    'country_code',
    'unit_name',
    'unit_code',
    'indicator_code',
    'classification_name',
    'global_dsd_time_series_code',
    'classification_code',
    'sector_code',
    'attribute',
    'sector_name',
]

# axis=1 -> the labels are column names (a missing label raises KeyError)
expense_europe = expense_europe.drop(expense_columns_to_drop, axis=1)

# Preview the first two rows of the trimmed table
expense_europe.head(2)


Unnamed: 0,country_name,1972,1973,1974,1975,1976,1977,1978,1979,1980,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Belgium,,,,,,,,,,...,2.628057228,2.611153533,2.5774434,1.596776593,1.52993013,1.513784968,1.605595415,1.666422717,2.681309827,2.108019478
76,Austria,0.61331997,0.588700326,0.581780822,0.728918317,1.056904473,1.208750988,1.495720728,1.5874903,1.689121558,...,2.666046557,2.539129672,2.376957137,2.280235269,2.037720073,1.815921408,1.592841025,1.415853602,1.378380188,1.150676327


In [7]:
# Metadata columns of the revenue table that are removed before modelling
# ('unnamed:_61' is an unnamed column present in the loaded revenue file)
revenue_columns_to_drop = [
    'country_code',
    'unit_name',
    'unit_code',
    'indicator_code',
    'global_dsd_time_series_code',
    'classification_name',
    'sector_name',
    'classification_code',
    'sector_code',
    'unnamed:_61',
    'attribute',
]

# axis=1 -> the labels are column names (a missing label raises KeyError)
revenue_europe = revenue_europe.drop(revenue_columns_to_drop, axis=1)

# Preview the first two rows of the trimmed table
revenue_europe.head(2)

Unnamed: 0,country_name,1972,1973,1974,1975,1976,1977,1978,1979,1980,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Austria,,,,,,,,,,...,0.0419578532265966,0.04165352001882,0.0413812438446675,0.0408662637069294,0.0408687725106821,0.0398633534924294,0.0411417143565825,0.0373366271003739,0.0328429302284136,0.0304437715546219
6,Austria,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Metadata columns of the debt table that are removed before modelling
# ('unnamed:_221' is an unnamed column present in the loaded debt file)
debt_columns_to_drop = [
    'country_code',
    'indicator_name',
    'indicator_code',
    'unnamed:_221',
    'attribute',
]

# axis=1 -> the labels are column names (a missing label raises KeyError)
debt_europe = debt_europe.drop(debt_columns_to_drop, axis=1)

# Preview the first two rows of the trimmed table
debt_europe.head(2)

Unnamed: 0,country_name,1800,1801,1802,1803,1804,1805,1806,1807,1808,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,Portugal,,,,,,,,,,...,61.6188,68.439083,71.666333,83.609401,96.183319,111.389679,126.20989,129.000844,130.165395,128.976822
2,Romania,,,,,,,,,,...,12.536329,12.651644,13.387073,23.347557,30.535338,33.874314,37.630485,38.826068,40.495409,39.304125


In [9]:
# Columns kept from every table: the country label followed by the
# year columns '2000' ... '2015' (column names are strings).
selected_columns = ['country_name', *map(str, range(2000, 2016))]

# Narrow each European table to `selected_columns` (country + 2000-2015);
# all other columns are discarded here.
expense_europe, revenue_europe, debt_europe = (
    table[selected_columns]
    for table in (expense_europe, revenue_europe, debt_europe)
)


In [10]:
# Replace missing yearly values with zero in all three tables.
# NOTE(review): zero-filling makes "not reported" indistinguishable from a
# genuine 0 (including in the debt columns later used as the prediction
# target) - confirm this is intended before trusting the model metrics.

# The *_europe frames are selections of larger frames. Take explicit copies so
# the .loc assignments below write to independent objects instead of raising
# SettingWithCopyWarning.
expense_europe = expense_europe.copy()
revenue_europe = revenue_europe.copy()
debt_europe = debt_europe.copy()

# columns[1:] skips the first column ('country_name') and keeps the year columns
expense_column_range = expense_europe.columns[1:]
expense_europe.loc[:, expense_column_range] = expense_europe.loc[:, expense_column_range].fillna(0)

revenue_column_range = revenue_europe.columns[1:]
revenue_europe.loc[:, revenue_column_range] = revenue_europe.loc[:, revenue_column_range].fillna(0)

debt_column_range = debt_europe.columns[1:]
debt_europe.loc[:, debt_column_range] = debt_europe.loc[:, debt_column_range].fillna(0)



In [11]:
# Join expense with revenue on the country label. The year columns exist in
# both tables, so they are disambiguated with the '_exp' / '_rev' suffixes.
# NOTE(review): 'country_name' is the only key. If a table holds several rows
# per country, every expense row is paired with every revenue row of that
# country (many-to-many join) - confirm this row multiplication is intended.
expense_revenue_df = expense_europe.merge(
    revenue_europe,
    how='inner',
    on='country_name',
    suffixes=('_exp', '_rev'),
)

# Attach the debt columns; their year names stay unsuffixed because the
# left-hand columns were already renamed above.
merged_df = expense_revenue_df.merge(debt_europe, how='inner', on='country_name')

# Preview the merged table
merged_df.head(2)


Unnamed: 0,country_name,2000_exp,2001_exp,2002_exp,2003_exp,2004_exp,2005_exp,2006_exp,2007_exp,2008_exp,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,Belgium,1.016085749,1.006374487,0.955115185,1.124213641,1.036217236,1.425862075,1.604331526,1.847220323,2.089315464,...,91.038109,87.021213,92.528538,99.563594,99.710354,102.336996,104.115402,105.186223,106.568051,106.052412
1,Belgium,1.016085749,1.006374487,0.955115185,1.124213641,1.036217236,1.425862075,1.604331526,1.847220323,2.089315464,...,91.038109,87.021213,92.528538,99.563594,99.710354,102.336996,104.115402,105.186223,106.568051,106.052412


In [12]:
# Force every column after 'country_name' to a numeric dtype.
# errors='coerce' turns any entry that cannot be parsed into NaN.
column_range = merged_df.columns[1:]  # all columns except the first

numeric_block = merged_df[column_range].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)
merged_df[column_range] = numeric_block


In [13]:
# Count the NaN entries of every column in the merged table
missing_values = merged_df.isna().sum()

print("Missing values per column:")
print(missing_values)

# True as soon as a single cell anywhere in the table is NaN
has_missing_values = merged_df.isna().to_numpy().any()

# One-line verdict for the whole table
print(
    "The DataFrame has missing values."
    if has_missing_values
    else "The DataFrame does not have any missing values."
)


Missing values per column:
country_name    0
2000_exp        0
2001_exp        0
2002_exp        0
2003_exp        0
2004_exp        0
2005_exp        0
2006_exp        0
2007_exp        0
2008_exp        0
2009_exp        0
2010_exp        0
2011_exp        0
2012_exp        0
2013_exp        0
2014_exp        0
2015_exp        0
2000_rev        0
2001_rev        0
2002_rev        0
2003_rev        0
2004_rev        0
2005_rev        0
2006_rev        0
2007_rev        0
2008_rev        0
2009_rev        0
2010_rev        0
2011_rev        0
2012_rev        0
2013_rev        0
2014_rev        0
2015_rev        0
2000            0
2001            0
2002            0
2003            0
2004            0
2005            0
2006            0
2007            0
2008            0
2009            0
2010            0
2011            0
2012            0
2013            0
2014            0
2015            0
dtype: int64
The DataFrame does not have any missing values.


In [14]:
# Feature columns are identified by the suffixes added during the merge
expense_columns = [name for name in merged_df.columns if name.endswith('_exp')]
revenue_columns = [name for name in merged_df.columns if name.endswith('_rev')]

# The debt columns kept their bare year names: '2000' ... '2015'
target_columns = [str(year) for year in range(2000, 2016)]

# Model inputs: all expense columns followed by all revenue columns
features = merged_df.loc[:, expense_columns + revenue_columns]

# Model outputs: one debt column per year (multi-output target)
target = merged_df.loc[:, target_columns]



In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Ordinary least-squares baseline fitted on the training rows
model = LinearRegression()
model.fit(X_train, y_train)

# Predicted debt values for the held-out rows
predictions = model.predict(X_test)

# Error metrics on the held-out rows; RMSE here is the square root of the
# overall MSE
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print('Linear Regressor Model Metrics:')
for metric_label, metric_value in (
    ('Mean Absolute Error', mae),
    ('Mean Squared Error', mse),
    ('RMSE', rmse),
    ('R2', r2),
):
    print(f'{metric_label}: {metric_value}')



Linear Regressor Model Metrics:
Mean Absolute Error: 23.780823293289227
Mean Squared Error: 1003.4054105742175
RMSE: 31.676575108022924
R2: 0.045713192827855846


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Decision tree capped at depth 10 to keep training time down
model2 = DecisionTreeRegressor(max_depth=10, random_state=42)

# Train the model
model2.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model2.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE as the mean of the per-target RMSEs. This is the value the former
# `squared=False` argument produced; that argument is deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is computed explicitly here.
rmse = np.sqrt(mean_squared_error(y_test, y_pred, multioutput='raw_values')).mean()
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics (header spelling corrected: "Decision")
print('Decision Tree Regressor Model Metrics:')
print(f'Mean Absolute Error (MAE): {mae:.2f}')
print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'R-squared (R2): {r2:.2f}')



Desicion Tree Regressor Model Metrics:
Mean Absolute Error (MAE): 19.14
Mean Squared Error (MSE): 776.00
Root Mean Squared Error (RMSE): 27.79
R-squared (R2): 0.26


In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Requires `merged_df` from the merge cell earlier in the notebook.

# Feature columns are identified by the suffixes added during the merge
expense_columns = [col for col in merged_df.columns if col.endswith('_exp')]
revenue_columns = [col for col in merged_df.columns if col.endswith('_rev')]

# Debt (target) columns kept their bare year names, 2000 to 2015
target_columns = [str(year) for year in range(2000, 2016)]

# Country label plus all expense and revenue columns
features_df = merged_df[['country_name'] + expense_columns + revenue_columns]

# Long format: one row per (country, column name, value); the 'year' column
# actually holds the suffixed column names such as '2000_exp'
features_melted = pd.melt(features_df, id_vars=['country_name'], var_name='year', value_name='value')

# Back to wide format with ONE row per country: pivot_table's default
# aggregation (mean) averages the repeated rows of each country
features_tabular = features_melted.pivot_table(index=['country_name'], columns='year', values='value').reset_index()

# Same reshaping for the debt targets
target_df = merged_df[['country_name'] + target_columns]
target_melted = pd.melt(target_df, id_vars=['country_name'], var_name='year', value_name='value')
target_tabular = target_melted.pivot_table(index=['country_name'], columns='year', values='value').reset_index()

# Split the per-country rows into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features_tabular.drop('country_name', axis=1), target_tabular.drop('country_name', axis=1), test_size=0.2, random_state=42)

# Create a Decision Tree Regressor
model_decision_tree = DecisionTreeRegressor(max_depth=10, random_state=42)  # Adjust hyperparameters if needed

# Train the model
model_decision_tree.fit(X_train, y_train)

# Make predictions on the test set
y_pred_decision_tree = model_decision_tree.predict(X_test)

# Evaluate the model
mae_decision_tree = mean_absolute_error(y_test, y_pred_decision_tree)
mse_decision_tree = mean_squared_error(y_test, y_pred_decision_tree)
# RMSE as the mean of the per-target RMSEs. This is the value the former
# `squared=False` argument produced; that argument is deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is computed explicitly here.
rmse_decision_tree = np.sqrt(
    mean_squared_error(y_test, y_pred_decision_tree, multioutput='raw_values')
).mean()
r2_decision_tree = r2_score(y_test, y_pred_decision_tree)

# Print the evaluation metrics for the decision tree model
print('Decision Tree Regressor Model Metrics:')
print(f'Mean Absolute Error (MAE): {mae_decision_tree:.2f}')
print(f'Mean Squared Error (MSE): {mse_decision_tree:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse_decision_tree:.2f}')
print(f'R-squared (R2): {r2_decision_tree:.2f}')


Decision Tree Regressor Model Metrics:
Mean Absolute Error (MAE): 39.67
Mean Squared Error (MSE): 2569.76
Root Mean Squared Error (RMSE): 49.45
R-squared (R2): -3.84


In [None]:
# Take a Long time to run
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

# NOTE: this cell does not build its own split; it uses the X_train / X_test /
# y_train / y_test (and DecisionTreeRegressor import) left in the kernel by a
# previously executed cell.

# Search space: integer tree depths drawn from 5 to 20 (upper bound exclusive)
param_dist = {
    'max_depth': sp_randint(5, 21),
    # Add more hyperparameters to tune if needed
}

# Randomized search: 10 sampled settings, 5-fold CV, scored by negative MSE
random_search = RandomizedSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=10,  # Adjust the number of iterations
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
    n_jobs=-1,  # Use all available processors
)

# Fit the random search to the data
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params_random = random_search.best_params_
print(f'Best Hyperparameters: {best_params_random}')

# Use the best model to make predictions
best_model_random = random_search.best_estimator_
y_pred_tuned_random = best_model_random.predict(X_test)

# Evaluate the tuned model
mae_tuned_random = mean_absolute_error(y_test, y_pred_tuned_random)
mse_tuned_random = mean_squared_error(y_test, y_pred_tuned_random)
# RMSE as the mean of the per-target RMSEs. This is the value the former
# `squared=False` argument produced; that argument is deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is computed explicitly here.
rmse_tuned_random = np.sqrt(
    mean_squared_error(y_test, y_pred_tuned_random, multioutput='raw_values')
).mean()
r2_tuned_random = r2_score(y_test, y_pred_tuned_random)

# Print the evaluation metrics for the tuned model
print('\nTuned Decision Tree Regressor Model Metrics (RandomizedSearchCV):')
print(f'Mean Absolute Error (MAE): {mae_tuned_random:.2f}')
print(f'Mean Squared Error (MSE): {mse_tuned_random:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse_tuned_random:.2f}')
print(f'R-squared (R2): {r2_tuned_random:.2f}')


In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Requires `merged_df` from the merge cell earlier in the notebook.

# Feature columns are identified by the suffixes added during the merge
expense_columns = [col for col in merged_df.columns if col.endswith('_exp')]
revenue_columns = [col for col in merged_df.columns if col.endswith('_rev')]

# Debt (target) columns kept their bare year names, 2000 to 2015
target_columns = [str(year) for year in range(2000, 2016)]

# Country label plus all expense and revenue columns
features_df = merged_df[['country_name'] + expense_columns + revenue_columns]

# Long format: one row per (country, column name, value); the 'year' column
# actually holds the suffixed column names such as '2000_exp'
features_melted = pd.melt(features_df, id_vars=['country_name'], var_name='year', value_name='value')

# Back to wide format with ONE row per country: pivot_table's default
# aggregation (mean) averages the repeated rows of each country
features_tabular = features_melted.pivot_table(index=['country_name'], columns='year', values='value').reset_index()

# Same reshaping for the debt targets
target_df = merged_df[['country_name'] + target_columns]
target_melted = pd.melt(target_df, id_vars=['country_name'], var_name='year', value_name='value')
target_tabular = target_melted.pivot_table(index=['country_name'], columns='year', values='value').reset_index()

# Split the per-country rows into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features_tabular.drop('country_name', axis=1), target_tabular.drop('country_name', axis=1), test_size=0.2, random_state=42)

# Create a Linear Regression model
model_linear = LinearRegression()

# Train the model
model_linear.fit(X_train, y_train)

# Make predictions on the test set
y_pred_linear = model_linear.predict(X_test)

# Evaluate the model
mae_linear = mean_absolute_error(y_test, y_pred_linear)
mse_linear = mean_squared_error(y_test, y_pred_linear)
# RMSE as the mean of the per-target RMSEs. This is the value the former
# `squared=False` argument produced; that argument is deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is computed explicitly here.
rmse_linear = np.sqrt(
    mean_squared_error(y_test, y_pred_linear, multioutput='raw_values')
).mean()
r2_linear = r2_score(y_test, y_pred_linear)

# Print the evaluation metrics for the linear regression model
print('Linear Regression Model Metrics:')
print(f'Mean Absolute Error (MAE): {mae_linear:.2f}')
print(f'Mean Squared Error (MSE): {mse_linear:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse_linear:.2f}')
print(f'R-squared (R2): {r2_linear:.2f}')


Linear Regression Model Metrics:
Mean Absolute Error (MAE): 55.59
Mean Squared Error (MSE): 4939.49
Root Mean Squared Error (RMSE): 68.07
R-squared (R2): -8.41


In [18]:
# Took a long time to run

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# NOTE: uses whatever `features` / `target` are currently bound in the kernel.
# Split them into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Random forest with trees capped at depth 10 to keep training time down
model_rf = RandomForestRegressor(max_depth=10, random_state=42)

# Train the model
model_rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = model_rf.predict(X_test)

# Evaluate the model
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
# RMSE as the mean of the per-target RMSEs. This is the value the former
# `squared=False` argument produced; that argument is deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is computed explicitly here.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf, multioutput='raw_values')).mean()
r2_rf = r2_score(y_test, y_pred_rf)

# Print the Random Forest Regressor model metrics
print("Random Forest Regressor Model Metrics:")
print(f'Mean Absolute Error (MAE): {mae_rf:.2f}')
print(f'Mean Squared Error (MSE): {mse_rf:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse_rf:.2f}')
print(f'R-squared (R2): {r2_rf:.2f}')



Random Forest Regressor Model Metrics:
Mean Absolute Error (MAE): 19.14
Mean Squared Error (MSE): 766.24
Root Mean Squared Error (RMSE): 27.61
R-squared (R2): 0.27


In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Requires `merged_df` from the merge cell earlier in the notebook.

# Feature columns are identified by the suffixes added during the merge
expense_columns = [col for col in merged_df.columns if col.endswith('_exp')]
revenue_columns = [col for col in merged_df.columns if col.endswith('_rev')]

# Debt (target) columns kept their bare year names, 2000 to 2015
target_columns = [str(year) for year in range(2000, 2016)]

# Country label plus all expense and revenue columns
features_df = merged_df[['country_name'] + expense_columns + revenue_columns]

# Long format: one row per (country, column name, value); the 'year' column
# actually holds the suffixed column names such as '2000_exp'
features_melted = pd.melt(features_df, id_vars=['country_name'], var_name='year', value_name='value')

# Back to wide format with ONE row per country: pivot_table's default
# aggregation (mean) averages the repeated rows of each country
features_tabular = features_melted.pivot_table(index=['country_name'], columns='year', values='value').reset_index()

# Same reshaping for the debt targets
target_df = merged_df[['country_name'] + target_columns]
target_melted = pd.melt(target_df, id_vars=['country_name'], var_name='year', value_name='value')
target_tabular = target_melted.pivot_table(index=['country_name'], columns='year', values='value').reset_index()

# Split the per-country rows into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features_tabular.drop('country_name', axis=1), target_tabular.drop('country_name', axis=1), test_size=0.2, random_state=42)

# Candidate hyperparameter values (3 x 4 = 12 combinations)
param_dist = {
    'n_estimators': [50, 100, 150],  # Adjust the number of estimators
    'max_depth': [None, 10, 20, 30],  # Adjust the maximum depth
    # Add more hyperparameters to tune if needed
}

# Create a Random Forest Regressor
base_model = RandomForestRegressor(random_state=42)

# Randomized search: 10 sampled settings, 5-fold CV, scored by negative MSE
random_search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=10,  # Adjust the number of iterations
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
    n_jobs=-1,  # Use all available processors
)

# Fit the random search to the data
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params_random = random_search.best_params_
print(f'Best Hyperparameters: {best_params_random}')

# Use the best model to make predictions
best_model_random = random_search.best_estimator_
y_pred_random_forest = best_model_random.predict(X_test)

# Evaluate the tuned model
mae_random_forest = mean_absolute_error(y_test, y_pred_random_forest)
mse_random_forest = mean_squared_error(y_test, y_pred_random_forest)
# RMSE as the mean of the per-target RMSEs. This is the value the former
# `squared=False` argument produced; that argument is deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is computed explicitly here.
rmse_random_forest = np.sqrt(
    mean_squared_error(y_test, y_pred_random_forest, multioutput='raw_values')
).mean()
r2_random_forest = r2_score(y_test, y_pred_random_forest)

# Print the evaluation metrics for the tuned model
print('\nRandom Forest Regressor Model Metrics:')
print(f'Mean Absolute Error (MAE): {mae_random_forest:.2f}')
print(f'Mean Squared Error (MSE): {mse_random_forest:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse_random_forest:.2f}')
print(f'R-squared (R2): {r2_random_forest:.2f}')


Best Hyperparameters: {'n_estimators': 50, 'max_depth': 30}

Random Forest Regressor Model Metrics:
Mean Absolute Error (MAE): 23.02
Mean Squared Error (MSE): 884.29
Root Mean Squared Error (RMSE): 29.58
R-squared (R2): -0.70


In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Requires `merged_df` from the merge cell earlier in the notebook.

# Feature columns are identified by the suffixes added during the merge
expense_columns = [col for col in merged_df.columns if col.endswith('_exp')]
revenue_columns = [col for col in merged_df.columns if col.endswith('_rev')]

# Debt (target) columns kept their bare year names, 2000 to 2015
target_columns = [str(year) for year in range(2000, 2016)]

# Country label plus all expense and revenue columns
features_df = merged_df[['country_name'] + expense_columns + revenue_columns]

# Long format: one row per (country, column name, value); the 'year' column
# actually holds the suffixed column names such as '2000_exp'
features_melted = pd.melt(features_df, id_vars=['country_name'], var_name='year', value_name='value')

# Back to wide format with ONE row per country: pivot_table's default
# aggregation (mean) averages the repeated rows of each country
features_tabular = features_melted.pivot_table(index=['country_name'], columns='year', values='value').reset_index()

# Same reshaping for the debt targets
target_df = merged_df[['country_name'] + target_columns]
target_melted = pd.melt(target_df, id_vars=['country_name'], var_name='year', value_name='value')
target_tabular = target_melted.pivot_table(index=['country_name'], columns='year', values='value').reset_index()

# Split the per-country rows into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features_tabular.drop('country_name', axis=1), target_tabular.drop('country_name', axis=1), test_size=0.2, random_state=42)

# Standardise the features; the scaler is fitted on the training rows only
# and then applied unchanged to the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fail fast if any NaN is present in the training inputs or targets
if np.isnan(X_train_scaled).sum().sum() > 0 or np.isnan(y_train.values).sum().sum() > 0:
    raise ValueError("There are missing values in the data.")

# Fail fast if features and targets disagree on the number of rows
if X_train_scaled.shape[0] != y_train.shape[0]:
    raise ValueError("Inconsistent number of samples between X_train_scaled and y_train.")

# Candidate hyperparameter values (3 x 4 = 12 combinations)
param_grid_extended = {
    'n_estimators': [100, 150, 200],  # Try higher values
    'max_depth': [20, 30, 40, None],  # Include higher values and None
    # Add more hyperparameters to tune if needed
}

# Create a Random Forest Regressor
base_model = RandomForestRegressor(random_state=42)

# Exhaustive grid search: 5-fold CV, scored by negative MSE
grid_search_extended = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid_extended,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Fit the extended grid search to the data
grid_search_extended.fit(X_train_scaled, y_train)

# Get the best hyperparameters from the extended grid search
best_params_extended = grid_search_extended.best_params_
print(f'Best Hyperparameters (Extended Grid): {best_params_extended}')

# Use the best model to make predictions
best_model_extended = grid_search_extended.best_estimator_
y_pred_random_forest_extended = best_model_extended.predict(X_test_scaled)

# Evaluate the extended grid search model
mae_random_forest_extended = mean_absolute_error(y_test, y_pred_random_forest_extended)
mse_random_forest_extended = mean_squared_error(y_test, y_pred_random_forest_extended)
# RMSE as the mean of the per-target RMSEs. This is the value the former
# `squared=False` argument produced; that argument is deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is computed explicitly here.
rmse_random_forest_extended = np.sqrt(
    mean_squared_error(y_test, y_pred_random_forest_extended, multioutput='raw_values')
).mean()
r2_random_forest_extended = r2_score(y_test, y_pred_random_forest_extended)

# Print the evaluation metrics for the extended grid search model
print('\nRandom Forest Regressor (Extended Grid) Model Metrics:')
print(f'Mean Absolute Error (MAE): {mae_random_forest_extended:.2f}')
print(f'Mean Squared Error (MSE): {mse_random_forest_extended:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse_random_forest_extended:.2f}')
print(f'R-squared (R2): {r2_random_forest_extended:.2f}')


Best Hyperparameters (Extended Grid): {'max_depth': 20, 'n_estimators': 100}

Random Forest Regressor (Extended Grid) Model Metrics:
Mean Absolute Error (MAE): 22.32
Mean Squared Error (MSE): 810.17
Root Mean Squared Error (RMSE): 28.29
R-squared (R2): -0.55


In [33]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# NOTE: this cell does not build its own split; it uses the X_train / X_test /
# y_train / y_test (and the metric imports) left in the kernel by a previously
# executed cell.

# Standardise the features; the scaler is fitted on the training rows only
# and then applied unchanged to the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate hyperparameter values (4 x 4 x 3 x 3 = 144 combinations)
param_grid_extended = {
    'n_estimators': [100, 150, 200, 250],
    'max_depth': [15, 20, 25, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # Add more hyperparameters to tune if needed
}

# Create a Random Forest Regressor
base_model = RandomForestRegressor(random_state=42)

# Exhaustive grid search: 5-fold CV, scored by negative MSE
grid_search_extended = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid_extended,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Fit the extended grid search to the data
grid_search_extended.fit(X_train_scaled, y_train)

# Get the best hyperparameters from the extended grid search
best_params_extended = grid_search_extended.best_params_
print(f'Best Hyperparameters (Extended Grid): {best_params_extended}')

# Use the best model to make predictions
best_model_extended = grid_search_extended.best_estimator_
y_pred_random_forest_extended = best_model_extended.predict(X_test_scaled)

# Evaluate the extended grid search model
mae_random_forest_extended = mean_absolute_error(y_test, y_pred_random_forest_extended)
mse_random_forest_extended = mean_squared_error(y_test, y_pred_random_forest_extended)
# RMSE as the mean of the per-target RMSEs. This is the value the former
# `squared=False` argument produced; that argument is deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is computed explicitly here.
rmse_random_forest_extended = np.sqrt(
    mean_squared_error(y_test, y_pred_random_forest_extended, multioutput='raw_values')
).mean()
r2_random_forest_extended = r2_score(y_test, y_pred_random_forest_extended)

# Print the evaluation metrics for the extended grid search model
print('\nRandom Forest Regressor (Extended Grid) Model Metrics:')
print(f'Mean Absolute Error (MAE): {mae_random_forest_extended:.2f}')
print(f'Mean Squared Error (MSE): {mse_random_forest_extended:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse_random_forest_extended:.2f}')
print(f'R-squared (R2): {r2_random_forest_extended:.2f}')


Best Hyperparameters (Extended Grid): {'max_depth': 15, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 150}

Random Forest Regressor (Extended Grid) Model Metrics:
Mean Absolute Error (MAE): 23.72
Mean Squared Error (MSE): 828.40
Root Mean Squared Error (RMSE): 28.66
R-squared (R2): -0.61


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Requires `merged_df` from the merge cell earlier in the notebook.

# Feature columns are identified by the suffixes added during the merge
expense_columns = [col for col in merged_df.columns if col.endswith('_exp')]
revenue_columns = [col for col in merged_df.columns if col.endswith('_rev')]

# Debt columns kept their bare year names, 2000 to 2015
target_columns = [str(year) for year in range(2000, 2016)]

# Model inputs: all expense columns followed by all revenue columns
features = merged_df[expense_columns + revenue_columns]

# Single-output target: each row's debt values summed over 2000-2015.
# This rebinds `features` / `target`, so cells that read them afterwards see
# these versions.
target = merged_df[target_columns].sum(axis=1)

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Standardise the features; the scaler is fitted on the training rows only
# and then applied unchanged to the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate hyperparameter values (3 x 3 x 3 x 3 = 81 combinations)
param_grid_extended = {
    'n_estimators': [100, 150, 200],
    'max_depth': [15, 20, 25],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # Add more hyperparameters to tune if needed
}

# Create a Random Forest Regressor
base_model = RandomForestRegressor(random_state=42)

# Exhaustive grid search: 5-fold CV, scored by negative MSE
grid_search_extended = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid_extended,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Fit the extended grid search to the data
grid_search_extended.fit(X_train_scaled, y_train)

# Get the best hyperparameters from the extended grid search
best_params_extended = grid_search_extended.best_params_
print(f'Best Hyperparameters (Extended Grid): {best_params_extended}')

# Use the best model to make predictions
best_model_extended = grid_search_extended.best_estimator_
y_pred_random_forest_extended = best_model_extended.predict(X_test_scaled)

# Evaluate the extended grid search model
mae_random_forest_extended = mean_absolute_error(y_test, y_pred_random_forest_extended)
mse_random_forest_extended = mean_squared_error(y_test, y_pred_random_forest_extended)
# RMSE computed explicitly: the former `squared=False` argument is deprecated
# in scikit-learn 1.4 and removed in 1.6. With this single-output target the
# expression reduces to the square root of the MSE.
rmse_random_forest_extended = np.sqrt(
    mean_squared_error(y_test, y_pred_random_forest_extended, multioutput='raw_values')
).mean()
r2_random_forest_extended = r2_score(y_test, y_pred_random_forest_extended)

# Print the evaluation metrics for the extended grid search model
print('\nRandom Forest Regressor (Extended Grid) Model Metrics:')
print(f'Mean Absolute Error (MAE): {mae_random_forest_extended:.2f}')
print(f'Mean Squared Error (MSE): {mse_random_forest_extended:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse_random_forest_extended:.2f}')
print(f'R-squared (R2): {r2_random_forest_extended:.2f}')


In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Feature scaling: fit on the training split only, then reuse for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter search space for Random Forest.
# `max_features='auto'` was deprecated in scikit-learn 1.1 and removed in 1.3
# (it now raises InvalidParameterError). For regressors it meant "use all
# features", which is spelled 1.0 today. The entry is replaced rather than
# dropped so the list keeps its length and the seeded sampler draws the same
# candidate positions as before.
param_dist = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0, 'sqrt', 'log2', None]
}

# Create Random Forest Regressor model
rf_model = RandomForestRegressor(random_state=42)

# Randomized search: 10 sampled candidates, 5-fold CV each. No `scoring` is
# given, so candidates are ranked by the estimator's default score (R^2).
random_search = RandomizedSearchCV(rf_model, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_

# RandomizedSearchCV (refit=True by default) has already retrained the winning
# configuration on the full training set with the same random_state, so reuse
# that fitted model instead of training an identical forest a second time.
model_rf_best = random_search.best_estimator_

# Make predictions on the test set
y_pred_rf_best = model_rf_best.predict(X_test_scaled)

# Evaluate the model
mae_rf_best = mean_absolute_error(y_test, y_pred_rf_best)
mse_rf_best = mean_squared_error(y_test, y_pred_rf_best)
# RMSE. `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
# sqrt of the per-output MSE, averaged over outputs, is the value that call
# returned and works on every scikit-learn version.
rmse_rf_best = np.sqrt(
    mean_squared_error(y_test, y_pred_rf_best, multioutput='raw_values')
).mean()
r2_rf_best = r2_score(y_test, y_pred_rf_best)

# Print the evaluation metrics for Random Forest with feature scaling and hyperparameter tuning
print('Random Forest Regressor Model Metrics (with Feature Scaling and Hyperparameter Tuning):')
print(f'Mean Absolute Error (MAE): {mae_rf_best:.2f}')
print(f'Mean Squared Error (MSE): {mse_rf_best:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse_rf_best:.2f}')
print(f'R-squared (R2): {r2_rf_best:.2f}')
print('Best Hyperparameters:', best_params)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Gradient Boosting with 100 boosting stages (this is scikit-learn's default
# value, stated explicitly so it is easy to tune for training time).
model_gbr = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# GradientBoostingRegressor only accepts a 1-D target, so this cell models the
# first target column alone. (An earlier note gives the frame's shape as
# (2533256, 16); that is not verified here.)
y_train_single_column = y_train.iloc[:, 0]
y_test_single_column = y_test.iloc[:, 0]

# Train the Gradient Boosting Regressor on the unscaled features
model_gbr.fit(X_train, y_train_single_column)

# Make predictions on the test set
y_pred = model_gbr.predict(X_test)

# Evaluate the model against the same single target column
mae = mean_absolute_error(y_test_single_column, y_pred)
mse = mean_squared_error(y_test_single_column, y_pred)
# RMSE. `mean_squared_error(..., squared=False)` was deprecated in
# scikit-learn 1.4 and removed in 1.6; for a 1-D target sqrt(MSE) is identical.
rmse = np.sqrt(mse)
r2 = r2_score(y_test_single_column, y_pred)

# Print the evaluation metrics
print('Gradient Boosting Regressor Model Metrics:')
print(f'Mean Absolute Error (MAE): {mae:.2f}')
print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'R-squared (R2): {r2:.2f}')


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Lasso regression on the unscaled features, fitted against the full target
# as returned by the split. alpha is the L1 regularization strength.
model_lasso = Lasso(alpha=1.0, random_state=42)  # You can adjust the regularization strength (alpha) as needed
model_lasso.fit(X_train, y_train)

# Make predictions on the test set
y_pred_lasso = model_lasso.predict(X_test)

# Evaluate the model
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
# RMSE. `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
# sqrt of the per-output MSE, averaged over outputs, reproduces the value that
# call returned (for one or many target columns) on any scikit-learn version.
rmse_lasso = np.sqrt(
    mean_squared_error(y_test, y_pred_lasso, multioutput='raw_values')
).mean()
r2_lasso = r2_score(y_test, y_pred_lasso)

# Print the evaluation metrics for Lasso regression
print('Lasso Regression Model Metrics:')
print(f'Mean Absolute Error (MAE): {mae_lasso:.2f}')
print(f'Mean Squared Error (MSE): {mse_lasso:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse_lasso:.2f}')
print(f'R-squared (R2): {r2_lasso:.2f}')


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# This variant models only the first target column, giving a 1-D target.
# (An earlier note gives the frame's shape as (2533256, 16); that is not
# verified here.)
y_train_single_column = y_train.iloc[:, 0]
y_test_single_column = y_test.iloc[:, 0]

# Lasso regression on the unscaled features; alpha is the L1 regularization strength.
# NOTE: this rebinds model_lasso and the *_lasso metric names used by the
# previous Lasso cell.
model_lasso = Lasso(alpha=1.0, random_state=42)  # You can adjust the regularization strength (alpha) as needed
model_lasso.fit(X_train, y_train_single_column)

# Make predictions on the test set
y_pred_lasso = model_lasso.predict(X_test)

# Evaluate the model against the same single target column
mae_lasso = mean_absolute_error(y_test_single_column, y_pred_lasso)
mse_lasso = mean_squared_error(y_test_single_column, y_pred_lasso)
# RMSE. `mean_squared_error(..., squared=False)` was deprecated in
# scikit-learn 1.4 and removed in 1.6; for a 1-D target sqrt(MSE) is identical.
rmse_lasso = np.sqrt(mse_lasso)
r2_lasso = r2_score(y_test_single_column, y_pred_lasso)

# Print the evaluation metrics for Lasso regression
print('Lasso Regression Model Metrics:')
print(f'Mean Absolute Error (MAE): {mae_lasso:.2f}')
print(f'Mean Squared Error (MSE): {mse_lasso:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse_lasso:.2f}')
print(f'R-squared (R2): {r2_lasso:.2f}')


In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Ridge regression on the unscaled features; alpha is the L2 regularization strength.
model_ridge = Ridge(alpha=1.0)  # You can adjust the regularization strength (alpha) as needed
model_ridge.fit(X_train, y_train)

# Make predictions on the test set
y_pred_ridge = model_ridge.predict(X_test)

# Evaluate the model
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
# RMSE. `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
# sqrt of the per-output MSE, averaged over outputs, reproduces the value that
# call returned (for one or many target columns) on any scikit-learn version.
rmse_ridge = np.sqrt(
    mean_squared_error(y_test, y_pred_ridge, multioutput='raw_values')
).mean()
r2_ridge = r2_score(y_test, y_pred_ridge)

# Print the evaluation metrics for Ridge regression
print('Ridge Regression Model Metrics:')
print(f'Mean Absolute Error (MAE): {mae_ridge:.2f}')
print(f'Mean Squared Error (MSE): {mse_ridge:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse_ridge:.2f}')
print(f'R-squared (R2): {r2_ridge:.2f}')


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Feature scaling: fit on the training split only, then reuse for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regularization strengths, spaced by factors of ten
alphas = [0.1, 1.0, 10.0, 100.0]

# Create Ridge Regression model
ridge_model = Ridge()

# Exhaustive 5-fold search over alpha. No `scoring` is given, so candidates
# are ranked by the estimator's default score (R^2).
param_grid = {'alpha': alphas}
grid_search = GridSearchCV(ridge_model, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters
best_alpha = grid_search.best_params_['alpha']

# GridSearchCV (refit=True by default) has already retrained Ridge with the
# best alpha on the full training set, so reuse that fitted model rather than
# fitting an identical one again.
model_ridge_best = grid_search.best_estimator_

# Make predictions on the test set
y_pred_ridge_best = model_ridge_best.predict(X_test_scaled)

# Evaluate the model
mae_ridge_best = mean_absolute_error(y_test, y_pred_ridge_best)
mse_ridge_best = mean_squared_error(y_test, y_pred_ridge_best)
# RMSE. `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
# sqrt of the per-output MSE, averaged over outputs, reproduces the value that
# call returned (for one or many target columns) on any scikit-learn version.
rmse_ridge_best = np.sqrt(
    mean_squared_error(y_test, y_pred_ridge_best, multioutput='raw_values')
).mean()
r2_ridge_best = r2_score(y_test, y_pred_ridge_best)

# Print the evaluation metrics for Ridge regression with feature scaling and hyperparameter tuning
print('Ridge Regression Model Metrics (with Feature Scaling and Hyperparameter Tuning):')
print(f'Mean Absolute Error (MAE): {mae_ridge_best:.2f}')
print(f'Mean Squared Error (MSE): {mse_ridge_best:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse_ridge_best:.2f}')
print(f'R-squared (R2): {r2_ridge_best:.2f}')
