# VIEWS-Nowcasting

This notebook walks you through the VIEWS-nowcasting model. This model aims to provide more timely estimates of UCDP data than the candidate data provides.


In [None]:
# Standard library
import warnings

# Third-party: numerics, data handling, plotting, statistics
import matplotlib.cbook as cbook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # the cells below refer to pandas as `pd`
import pandas as pdx  # original alias, kept in case other cells reference it
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import tabula
import xlwings as xw
from matplotlib.ticker import ScalarFormatter

# Local plotting helpers
from nowcast import *  # plot_statebased, plot_nonstate, plot_onesided

# Modelling (kept after the star import, as in the original ordering)
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

# VIEWS 3
from viewser.operations import fetch
from viewser import Queryset, Column
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
from views_runs import storage, ModelMetadata
from views_runs.storage import store, retrieve, fetch_metadata
#from views_forecasts.extensions import *

# Ingester
from ingester3.config import source_db_path
from ingester3.Country import Country
from ingester3.extensions import *
from ingester3.ViewsMonth import ViewsMonth

warnings.filterwarnings('ignore')

import os
home = os.path.expanduser("~")

## Gathering all data

The training set covers the years 2018 through 2022, **and** the test set is the year 2023. This allows all of the candidate data to be used.


In [None]:
# Load the master table indexed by (month_id, c_id) and discard the
# 'source_version' column, which none of the cells below use.
df = (
    pd.read_csv('nowcasting_master_final_data_1.csv', index_col=['month_id', 'c_id'])
    .drop(columns='source_version')
)

# Training rows: 2018 <= year < 2023, i.e. 2018 through 2022.
in_training_years = df['year'].ge(2018) & df['year'].lt(2023)
train = df[in_training_years]

# Test rows: every year after 2022.
test = df[df['year'].gt(2022)]

# Labels for both splits are read from the same target column.
y_train, y_test = (split['sb_final_best_ln'] for split in (train, test))


# Setting the training data

In [None]:
# Three-letter country-code column names.
# NOTE(review): presumably one indicator column per country in the CSV - confirm.
countries = [
    'AFG', 'AGO', 'ALB', 'ARE', 'ARG', 'ARM', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI',
    'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BOL',
    'BRA', 'BRB', 'BTN', 'BWA', 'CAF', 'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR',
    'COD', 'COG', 'COL', 'COM', 'CRI', 'CUB', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA',
    'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESP', 'EST', 'ETH', 'FIN', 'FJI',
    'FRA', 'GAB', 'GBR', 'GEO', 'GHA', 'GIN', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD',
    'GTM', 'GUY', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IND', 'IRL', 'IRN', 'IRQ',
    'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KOR',
    'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LKA', 'LTU', 'LUX', 'LVA', 'MAR', 'MDA',
    'MDG', 'MDV', 'MEX', 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MOZ', 'MRT',
    'MUS', 'MWI', 'MYS', 'NAM', 'NER', 'NGA', 'NIC', 'NLD', 'NOR', 'NPL', 'NZL',
    'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PNG', 'POL', 'PRK', 'PRT', 'PRY', 'QAT',
    'ROU', 'RUS', 'RWA', 'SAU', 'SDN', 'SEN', 'SGP', 'SLE', 'SLV', 'SOM', 'SRB',
    'SSD', 'SVK', 'SVN', 'SWE', 'SWZ', 'SYC', 'SYR', 'TCD', 'TGO', 'THA', 'TJK',
    'TKM', 'TLS', 'TTO', 'TUN', 'TUR', 'TWN', 'TZA', 'UGA', 'UKR', 'URY', 'USA',
    'UZB', 'VEN', 'VNM', 'YEM', 'ZAF', 'ZMB', 'ZWE',
]

# Widest feature selection in the notebook. The order is kept exactly as given,
# because it becomes the column order of the resulting frames.
short_features = [
    # UCDP candidate sums and counts
    'candidate_sb_best_sum_nokgi_ln1', 'candidate_os_best_sum_nokgi_ln1', 'candidate_ns_best_sum_nokgi_ln1',
    'candidate_sb_best_count_nokgi', 'candidate_ns_best_count_nokgi', 'candidate_os_best_count_nokgi',
    # ACLED columns
    'acled_sb_fat_ln', 'acled_sb_fat_ln_1', 'acled_sb_fat_ln_2',
    'acled_sb_count', 'acled_ns_count', 'acled_os_count',
    'acled_pr_count',
    # topic columns
    'topic_conflict_1', 'topic_judiciary_1', 'topic_diplomacy_1',
    # V-Dem columns
    'vdem_v2x_delibdem', 'vdem_v2x_clphy', 'vdem_v2x_rule', 'vdem_v2x_freexp',
    # further ACLED columns and the *_24 / *_12 versions of the target column
    'acled_sb_fat_ln_24', 'acled_sb_fat_ln_3', 'sb_final_best_ln_24', 'sb_final_best_ln_12',
    # calendar month and the lag1..lag3 columns of the candidate state-based sum
    'month', 'candidate_sb_best_sum_nokgi_ln1_lag1', 'candidate_sb_best_sum_nokgi_ln1_lag2', 'candidate_sb_best_sum_nokgi_ln1_lag3',
]

# Feature columns first, country columns after.
short_list = [*short_features, *countries]

# Restrict both splits to the selected columns.
train_new, test_new = train[short_list], test[short_list]

In [None]:
# Run this cell to get list of all columns

#list(train_new.columns)

# Plotting Candidate vs GED final vs ACLED

In [None]:
# plot_statebased is not defined in this notebook; per the comment on the import
# line it comes from the local `nowcast` module via `from nowcast import *`.
# NOTE(review): presumably draws the state-based series from `df` - confirm in nowcast.
plot_statebased(df)

In [None]:
# plot_nonstate is not defined in this notebook; per the comment on the import
# line it comes from the local `nowcast` module via `from nowcast import *`.
# NOTE(review): presumably draws the non-state series from `df` - confirm in nowcast.
plot_nonstate(df)

In [None]:
# plot_onesided is not defined in this notebook; per the comment on the import
# line it comes from the local `nowcast` module via `from nowcast import *`.
# NOTE(review): presumably draws the one-sided series from `df` - confirm in nowcast.
plot_onesided(df)

## Visualizing the training and test sets

In [None]:
# Report how many rows ended up in each split.
print('Length of training set:', len(train), '\nLength of test set:', len(test))

# Year span covered by the two splits together.
year_min = min(split["year"].min() for split in (train, test))
year_max = max(split["year"].max() for split in (train, test))

# Unit-width bins whose edges sit at year +/- 0.5, so each bar is centred on a year.
bin_edges = np.arange(year_min - 0.5, year_max + 1.5, 1)

# Both histograms share the same styling; only the legend label differs.
hist_style = dict(bins=bin_edges, figsize=(10, 5), alpha=0.7, rwidth=0.5)
train["year"].hist(label="training set", **hist_style)
test["year"].hist(label="test set", **hist_style)

# Decorate the current axes.
current_axes = plt.gca()
current_axes.set_title("Observations by Year in Train and Test Sets")
current_axes.set_ylim(0, 2500)
current_axes.set_xlabel("Year")
current_axes.set_ylabel("Number of Observations")
current_axes.grid(False)
current_axes.legend()

## Models

Model 1: Simple model taking only the basic conflict variables

Model 2: Simple model taking only the basic conflict variables + ACLED

Model 3: Model 2 + additional (topic) features

Model 4: Model 3 + V-Dem

In [None]:
# Model 1 inputs: UCDP candidate sums and counts, the calendar month and the
# lag1..lag3 columns of the candidate state-based sum, followed by the country
# columns.
#
# `countries` is the three-letter country-code column list declared in the
# first feature-selection cell ("Setting the training data"). It is reused here
# rather than re-declared, so the list lives in one place only.
short_features = [
    'candidate_sb_best_sum_nokgi_ln1', 'candidate_os_best_sum_nokgi_ln1', 'candidate_ns_best_sum_nokgi_ln1',
    'candidate_sb_best_count_nokgi', 'candidate_ns_best_count_nokgi', 'candidate_os_best_count_nokgi',
    'month', 'candidate_sb_best_sum_nokgi_ln1_lag1', 'candidate_sb_best_sum_nokgi_ln1_lag2', 'candidate_sb_best_sum_nokgi_ln1_lag3',
]

# Feature columns first, country columns after (this is the model's column order).
short_list = short_features + countries

# Restrict both splits to the selected columns.
train_minimal = train[short_list]
test_minimal = test[short_list]

In [None]:
# Model 1: random forest on the candidate-only feature set (train_minimal / test_minimal).

# Labels: the 'sb_final_best_ln' column of each split (plotted below as logged values).
y_train = train['sb_final_best_ln']
y_test = test['sb_final_best_ln']

X_train_1 = train_minimal
X_test_1 = test_minimal

# Fit and predict. random_state pins the forest so reruns give the same scores.
rf_model2 = RandomForestRegressor(random_state=17)
rf_model2.fit(X_train_1, y_train)
rf_m2_predictions = rf_model2.predict(X_test_1)

# Test-set error of the model.
mse = mean_squared_error(y_test, rf_m2_predictions)
rmse = np.sqrt(mse)

print("Random Forest Regressor MSE:", mse)
print("Random Forest Regressor RMSE:", rmse)
print('')

# Baseline: the candidate column compared directly against the final column.
final_candidate = mean_squared_error(test['sb_final_best_ln'], test['candidate_sb_best_sum_nokgi_ln1'])
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))

print(" ")

## PLOTTING
# Gaussian jitter separates overlapping points. It is drawn from a seeded
# generator so the figure is identical on every run.
jitter_amount = 0.15
jitter_rng = np.random.default_rng(17)
x_jittered = rf_m2_predictions + jitter_rng.normal(0, jitter_amount, size=len(rf_m2_predictions))
y_jittered = y_test + jitter_rng.normal(0, jitter_amount, size=len(y_test))

fig, ax = plt.subplots(1, 2, figsize=(16, 7))

# Left panel: raw points. Right panel: the same points with jitter.
ax[0].scatter(rf_m2_predictions, y_test, alpha=0.3, color='crimson', linewidth=1, s=30)
ax[1].scatter(x_jittered, y_jittered, alpha=0.3, color='crimson', marker='o', linewidth=1, s=30)

# Shared decoration: dashed y = x reference line, labels, fixed limits, grid.
diagonal = [y_test.min(), y_test.max()]
for panel, panel_title in zip(ax, ('Actuals vs Predictions', 'Plot with Jitter')):
    panel.plot(diagonal, diagonal, 'k--', lw=2)
    panel.set_title(panel_title)
    panel.set_xlabel('Predicted Values (Logged)')
    panel.set_ylabel('Actual Values (Logged)')
    panel.set_xlim([-1, 11])
    panel.set_ylim([-1, 11])
    panel.grid(True)
plt.show()

## Model 2: With ACLED

In [None]:
# Model 2 inputs: the Model 1 candidate columns plus the ACLED columns,
# followed by the country columns.
#
# `countries` is the three-letter country-code column list declared in the
# first feature-selection cell ("Setting the training data"). It is reused here
# rather than re-declared, so the list lives in one place only.
short_features = [
    'candidate_sb_best_sum_nokgi_ln1', 'candidate_os_best_sum_nokgi_ln1', 'candidate_ns_best_sum_nokgi_ln1',
    'candidate_sb_best_count_nokgi', 'candidate_ns_best_count_nokgi', 'candidate_os_best_count_nokgi',
    'month', 'candidate_sb_best_sum_nokgi_ln1_lag1', 'candidate_sb_best_sum_nokgi_ln1_lag2', 'candidate_sb_best_sum_nokgi_ln1_lag3',
    'acled_sb_fat_ln', 'acled_sb_fat_ln_1', 'acled_sb_fat_ln_2', 'acled_pr_count',
    'acled_sb_count', 'acled_ns_count', 'acled_os_count', 'acled_sb_fat_ln_24', 'acled_sb_fat_ln_3',
]

# Feature columns first, country columns after (this is the model's column order).
short_list = short_features + countries

# Restrict both splits to the selected columns.
train_acled = train[short_list]
test_acled = test[short_list]

In [None]:
# Model 2: random forest on the candidate + ACLED feature set (train_acled / test_acled).

# Labels: the 'sb_final_best_ln' column of each split (plotted below as logged values).
y_train = train['sb_final_best_ln']
y_test = test['sb_final_best_ln']

X_train_1 = train_acled
X_test_1 = test_acled

# Fit and predict. random_state pins the forest so reruns give the same scores.
rf_model2 = RandomForestRegressor(random_state=17)
rf_model2.fit(X_train_1, y_train)
rf_m2_predictions = rf_model2.predict(X_test_1)

# Test-set error of the model.
mse = mean_squared_error(y_test, rf_m2_predictions)
rmse = np.sqrt(mse)

print("Random Forest Regressor MSE:", mse)
print("Random Forest Regressor RMSE:", rmse)
print('')

# Baseline: the candidate column compared directly against the final column.
final_candidate = mean_squared_error(test['sb_final_best_ln'], test['candidate_sb_best_sum_nokgi_ln1'])
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))

print(" ")

## PLOTTING
# Gaussian jitter separates overlapping points. It is drawn from a seeded
# generator so the figure is identical on every run.
jitter_amount = 0.15
jitter_rng = np.random.default_rng(17)
x_jittered = rf_m2_predictions + jitter_rng.normal(0, jitter_amount, size=len(rf_m2_predictions))
y_jittered = y_test + jitter_rng.normal(0, jitter_amount, size=len(y_test))

fig, ax = plt.subplots(1, 2, figsize=(16, 7))

# Left panel: raw points. Right panel: the same points with jitter.
ax[0].scatter(rf_m2_predictions, y_test, alpha=0.3, color='crimson', linewidth=1, s=30)
ax[1].scatter(x_jittered, y_jittered, alpha=0.3, color='crimson', marker='o', linewidth=1, s=30)

# Shared decoration: dashed y = x reference line, labels, fixed limits, grid.
diagonal = [y_test.min(), y_test.max()]
for panel, panel_title in zip(ax, ('Actuals vs Predictions', 'Plot with Jitter')):
    panel.plot(diagonal, diagonal, 'k--', lw=2)
    panel.set_title(panel_title)
    panel.set_xlabel('Predicted Values (Logged)')
    panel.set_ylabel('Actual Values (Logged)')
    panel.set_xlim([-1, 11])
    panel.set_ylim([-1, 11])
    panel.grid(True)
plt.show()

## Model 3: ACLED Model with additional features

In [None]:
# Model 3 inputs: the Model 2 candidate + ACLED columns plus the three topic
# columns, followed by the country columns.
#
# `countries` is the three-letter country-code column list declared in the
# first feature-selection cell ("Setting the training data"). It is reused here
# rather than re-declared, so the list lives in one place only.
short_features = [
    'candidate_sb_best_sum_nokgi_ln1', 'candidate_os_best_sum_nokgi_ln1', 'candidate_ns_best_sum_nokgi_ln1',
    'candidate_sb_best_count_nokgi', 'candidate_ns_best_count_nokgi', 'candidate_os_best_count_nokgi',
    'month', 'candidate_sb_best_sum_nokgi_ln1_lag1', 'candidate_sb_best_sum_nokgi_ln1_lag2', 'candidate_sb_best_sum_nokgi_ln1_lag3',
    'acled_sb_fat_ln', 'acled_sb_fat_ln_1', 'acled_sb_fat_ln_2', 'acled_pr_count',
    'acled_sb_count', 'acled_ns_count', 'acled_os_count', 'acled_sb_fat_ln_24', 'acled_sb_fat_ln_3',
    'topic_conflict_1', 'topic_judiciary_1', 'topic_diplomacy_1',
]

# Feature columns first, country columns after (this is the model's column order).
short_list = short_features + countries

# Restrict both splits to the selected columns.
train_long = train[short_list]
test_long = test[short_list]
In [None]:
# Model 3: random forest on the candidate + ACLED + topic feature set (train_long / test_long).

# Labels: the 'sb_final_best_ln' column of each split (plotted below as logged values).
y_train = train['sb_final_best_ln']
y_test = test['sb_final_best_ln']

X_train_1 = train_long
X_test_1 = test_long

# Fit and predict. random_state pins the forest so reruns give the same scores.
rf_model2 = RandomForestRegressor(random_state=17)
rf_model2.fit(X_train_1, y_train)
rf_m2_predictions = rf_model2.predict(X_test_1)

# Test-set error of the model.
mse = mean_squared_error(y_test, rf_m2_predictions)
rmse = np.sqrt(mse)

print("Random Forest Regressor MSE:", mse)
print("Random Forest Regressor RMSE:", rmse)
print('')

# Baseline: the candidate column compared directly against the final column.
final_candidate = mean_squared_error(test['sb_final_best_ln'], test['candidate_sb_best_sum_nokgi_ln1'])
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))

print(" ")

## PLOTTING
# Gaussian jitter separates overlapping points. It is drawn from a seeded
# generator so the figure is identical on every run.
jitter_amount = 0.15
jitter_rng = np.random.default_rng(17)
x_jittered = rf_m2_predictions + jitter_rng.normal(0, jitter_amount, size=len(rf_m2_predictions))
y_jittered = y_test + jitter_rng.normal(0, jitter_amount, size=len(y_test))

fig, ax = plt.subplots(1, 2, figsize=(16, 7))

# Left panel: raw points. Right panel: the same points with jitter.
ax[0].scatter(rf_m2_predictions, y_test, alpha=0.3, color='crimson', linewidth=1, s=30)
ax[1].scatter(x_jittered, y_jittered, alpha=0.3, color='crimson', marker='o', linewidth=1, s=30)

# Shared decoration: dashed y = x reference line, labels, fixed limits, grid.
diagonal = [y_test.min(), y_test.max()]
for panel, panel_title in zip(ax, ('Actuals vs Predictions', 'Plot with Jitter')):
    panel.plot(diagonal, diagonal, 'k--', lw=2)
    panel.set_title(panel_title)
    panel.set_xlabel('Predicted Values (Logged)')
    panel.set_ylabel('Actual Values (Logged)')
    panel.set_xlim([-1, 11])
    panel.set_ylim([-1, 11])
    panel.grid(True)
plt.show()

## Model 4: Training long with VDEM

In [None]:
# Model 4 inputs: the Model 3 candidate + ACLED + topic columns plus the four
# V-Dem columns, followed by the country columns.
#
# `countries` is the three-letter country-code column list declared in the
# first feature-selection cell ("Setting the training data"). It is reused here
# rather than re-declared, so the list lives in one place only.
short_features = [
    'candidate_sb_best_sum_nokgi_ln1', 'candidate_os_best_sum_nokgi_ln1', 'candidate_ns_best_sum_nokgi_ln1',
    'candidate_sb_best_count_nokgi', 'candidate_ns_best_count_nokgi', 'candidate_os_best_count_nokgi',
    'month', 'candidate_sb_best_sum_nokgi_ln1_lag1', 'candidate_sb_best_sum_nokgi_ln1_lag2', 'candidate_sb_best_sum_nokgi_ln1_lag3',
    'acled_sb_fat_ln', 'acled_sb_fat_ln_1', 'acled_sb_fat_ln_2', 'acled_pr_count',
    'acled_sb_count', 'acled_ns_count', 'acled_os_count', 'acled_sb_fat_ln_24', 'acled_sb_fat_ln_3',
    'topic_conflict_1', 'topic_judiciary_1', 'topic_diplomacy_1',
    'vdem_v2x_delibdem', 'vdem_v2x_clphy', 'vdem_v2x_rule', 'vdem_v2x_freexp',
]

# Feature columns first, country columns after (this is the model's column order).
short_list = short_features + countries

# Restrict both splits to the selected columns.
train_longer = train[short_list]
test_longer = test[short_list]

In [None]:
# Model 4: random forest on the candidate + ACLED + topic + V-Dem feature set
# (train_longer / test_longer).

# Labels: the 'sb_final_best_ln' column of each split (plotted below as logged values).
y_train = train['sb_final_best_ln']
y_test = test['sb_final_best_ln']

X_train_1 = train_longer
X_test_1 = test_longer

# Fit and predict. random_state pins the forest so reruns give the same scores.
rf_model2 = RandomForestRegressor(random_state=17)
rf_model2.fit(X_train_1, y_train)
rf_m2_predictions = rf_model2.predict(X_test_1)

# Test-set error of the model.
mse = mean_squared_error(y_test, rf_m2_predictions)
rmse = np.sqrt(mse)

print("Random Forest Regressor MSE:", mse)
print("Random Forest Regressor RMSE:", rmse)
print('')

# Baseline: the candidate column compared directly against the final column.
final_candidate = mean_squared_error(test['sb_final_best_ln'], test['candidate_sb_best_sum_nokgi_ln1'])
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))

print(" ")

## PLOTTING
# Gaussian jitter separates overlapping points. It is drawn from a seeded
# generator so the figure is identical on every run.
jitter_amount = 0.15
jitter_rng = np.random.default_rng(17)
x_jittered = rf_m2_predictions + jitter_rng.normal(0, jitter_amount, size=len(rf_m2_predictions))
y_jittered = y_test + jitter_rng.normal(0, jitter_amount, size=len(y_test))

fig, ax = plt.subplots(1, 2, figsize=(16, 7))

# Left panel: raw points. Right panel: the same points with jitter.
ax[0].scatter(rf_m2_predictions, y_test, alpha=0.3, color='crimson', linewidth=1, s=30)
ax[1].scatter(x_jittered, y_jittered, alpha=0.3, color='crimson', marker='o', linewidth=1, s=30)

# Shared decoration: dashed y = x reference line, labels, fixed limits, grid.
diagonal = [y_test.min(), y_test.max()]
for panel, panel_title in zip(ax, ('Actuals vs Predictions', 'Plot with Jitter')):
    panel.plot(diagonal, diagonal, 'k--', lw=2)
    panel.set_title(panel_title)
    panel.set_xlabel('Predicted Values (Logged)')
    panel.set_ylabel('Actual Values (Logged)')
    panel.set_xlim([-1, 11])
    panel.set_ylim([-1, 11])
    panel.grid(True)
plt.show()

## Using an XGBoost Model instead

In [None]:
# XGBoost inputs: the same columns as Model 4 (candidate + ACLED + topic +
# V-Dem), followed by the country columns.
#
# `countries` is the three-letter country-code column list declared in the
# first feature-selection cell ("Setting the training data"). It is reused here
# rather than re-declared, so the list lives in one place only.
short_features = [
    'candidate_sb_best_sum_nokgi_ln1', 'candidate_os_best_sum_nokgi_ln1', 'candidate_ns_best_sum_nokgi_ln1',
    'candidate_sb_best_count_nokgi', 'candidate_ns_best_count_nokgi', 'candidate_os_best_count_nokgi',
    'month', 'candidate_sb_best_sum_nokgi_ln1_lag1', 'candidate_sb_best_sum_nokgi_ln1_lag2', 'candidate_sb_best_sum_nokgi_ln1_lag3',
    'acled_sb_fat_ln', 'acled_sb_fat_ln_1', 'acled_sb_fat_ln_2', 'acled_pr_count',
    'acled_sb_count', 'acled_ns_count', 'acled_os_count', 'acled_sb_fat_ln_24', 'acled_sb_fat_ln_3',
    'topic_conflict_1', 'topic_judiciary_1', 'topic_diplomacy_1',
    'vdem_v2x_delibdem', 'vdem_v2x_clphy', 'vdem_v2x_rule', 'vdem_v2x_freexp',
]

# Feature columns first, country columns after (this is the model's column order).
short_list = short_features + countries

# Restrict both splits to the selected columns.
train_longer = train[short_list]
test_longer = test[short_list]

In [None]:
# Baseline XGBoost regressor on the train_longer / test_longer feature set.
# NOTE(review): this cell sets eval_metric='rmsle' while the tuned model later
# in the notebook uses 'rmse' - confirm which one is intended.
xgb_m3_regressor = xgb.XGBRegressor(eval_metric='rmsle',
                                       learning_rate = 0.02,
                                       max_depth = 6,
                                       n_estimators = 250,
                                       colsample_bytree = 0.6,
                                       subsample = 0.8,
                                       #reg_alpha = 0.4,
                                       #reg_lambda =0.4,
                                       random_state = 17
                                      )

# Fit on the training split and predict the test split.
xgb_m3_regressor.fit(train_longer, y_train)
xgb_m3_predictions = xgb_m3_regressor.predict(test_longer)

# Negative predictions are floored at zero before scoring.
xgb_m3_predictions = np.maximum(xgb_m3_predictions, 0)

# Test-set error of the model.
mse_m3_xgb = mean_squared_error(y_test, xgb_m3_predictions)
print("XGBoost MSE:", mse_m3_xgb)
print("XGBoost RMSE:", np.sqrt(mse_m3_xgb))
print(' ')

# Baseline: the candidate column compared directly against the final column.
final_candidate = mean_squared_error(test['sb_final_best_ln'], test['candidate_sb_best_sum_nokgi_ln1'])
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))

## PLOTTING (2x2 layout)
# Gaussian jitter separates overlapping points. It is drawn from a seeded
# generator so the figure is identical on every run.
jitter_amount = 0.15
jitter_rng = np.random.default_rng(17)

# Jittered copies of the XGBoost predictions and the actual values
x_jittered_xgb = xgb_m3_predictions + jitter_rng.normal(0, jitter_amount, size=len(xgb_m3_predictions))
y_jittered_xgb = y_test + jitter_rng.normal(0, jitter_amount, size=len(y_test))

# Jittered copies of the candidate values and the actual values
candidate_preds = test['candidate_sb_best_sum_nokgi_ln1']
x_jittered_candidate = candidate_preds + jitter_rng.normal(0, jitter_amount, size=len(candidate_preds))
y_jittered_candidate = y_test + jitter_rng.normal(0, jitter_amount, size=len(y_test))

fig, ax = plt.subplots(2, 2, figsize=(16, 16))

# Top row: XGBoost predictions (raw, then jittered).
ax[0, 0].scatter(xgb_m3_predictions, y_test, alpha=0.3, color='crimson', linewidth=1, s=30)
ax[0, 1].scatter(x_jittered_xgb, y_jittered_xgb, alpha=0.3, color='crimson', marker='o', linewidth=1, s=30)
# Bottom row: candidate values (raw, then jittered).
ax[1, 0].scatter(candidate_preds, y_test, alpha=0.3, color='teal', marker='o', linewidth=1, s=30)
ax[1, 1].scatter(x_jittered_candidate, y_jittered_candidate, alpha=0.3, color='teal', marker='o', linewidth=1, s=30)

# Per-panel text; the reference line, limits and grid are shared by all four panels.
panel_text = [
    (ax[0, 0], 'XGBoost: Predicted vs Actual', 'Predicted Values (Logged)', 'Actual Values (Logged)'),
    (ax[0, 1], 'XGBoost: Plot with Jitter', 'Predicted Values (Logged)', 'Actual Values (Logged)'),
    (ax[1, 0], 'Candidate vs GED Final', 'Candidate Values (Logged)', 'GED Final Values (Logged)'),
    (ax[1, 1], 'Candidate: Plot with Jitter', 'Candidate Predicted Values (Logged)', 'Actual Values (Logged)'),
]
diagonal = [y_test.min(), y_test.max()]
for panel, panel_title, panel_xlabel, panel_ylabel in panel_text:
    panel.plot(diagonal, diagonal, 'k--', lw=2)
    panel.set_title(panel_title)
    panel.set_xlabel(panel_xlabel)
    panel.set_ylabel(panel_ylabel)
    panel.set_xlim([-1, 11])
    panel.set_ylim([-1, 11])
    panel.grid(True)

plt.tight_layout()
plt.show()

# FEATURE IMPORTANCE
# Importances reported by the fitted XGBoost model above.
feature_importances_xgb = xgb_m3_regressor.feature_importances_

# Names come from the frame the model was actually fitted on, so they always
# line up with the importances regardless of which earlier model cell ran last.
feature_names_xgb = train_longer.columns

# One row per feature, most important first.
feature_importance_xgb = pd.DataFrame({'Feature': feature_names_xgb, 'Importance': feature_importances_xgb})
feature_importance_xgb = feature_importance_xgb.sort_values(by='Importance', ascending=False)

# Show the top 25 features.
feature_importance_xgb.head(25)


## Tuning the XGBoost Model

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; the search below tries every
# combination of them.
param_grid = dict(
    max_depth=[6, 8, 10],
    learning_rate=[0.01, 0.02],
    subsample=[0.4, 0.6, 0.8],
    colsample_bytree=[0.5, 0.6, 0.7],
    n_estimators=[250, 500, 750],
)

# Five-fold time-series splitter.
# NOTE(review): assumes the rows of train_longer are stored in time order — TODO confirm.
tscv = TimeSeriesSplit(n_splits=5)

# Template estimator that the search clones for every parameter combination
xgb_regressor = xgb.XGBRegressor(eval_metric='rmse')

# NOTE(review): no `scoring` argument is given, so candidates are ranked by the
# estimator's default score rather than by the RMSE printed below — confirm
# this is intended.
grid_search = GridSearchCV(
    estimator=xgb_regressor,
    param_grid=param_grid,
    cv=tscv,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(train_longer, y_train)
print("Best parameters found: ", grid_search.best_params_)

# From here on `xgb_regressor` refers to the best estimator found by the search
xgb_regressor = grid_search.best_estimator_
xgb_m2_predictions = xgb_regressor.predict(test_longer)

# Hold-out RMSE of the best estimator on the test set
rmse = np.sqrt(mean_squared_error(y_test, xgb_m2_predictions))
print("RMSE: %f" % (rmse))

## XGBoost with tuned hyperparameters

In [None]:
# Hyperparameters for the tuned model, collected in one place.
# reg_alpha / reg_lambda are not set here, so the library defaults apply.
xgb_m3_params = dict(
    eval_metric='rmse',
    learning_rate=0.01,
    max_depth=6,
    n_estimators=750,
    colsample_bytree=0.6,
    subsample=0.8,
    min_child_weight=1,
    gamma=0.1,
    booster='dart',
    random_state=17,
)
xgb_m3_regressor = xgb.XGBRegressor(**xgb_m3_params)

# Train on the training window, predict the test window, and floor any
# negative prediction at zero.
xgb_m3_regressor.fit(train_longer, y_train)
xgb_m3_predictions = np.maximum(xgb_m3_regressor.predict(test_longer), 0)

# Error of the model against the final (logged) values
mse_m3_xgb = mean_squared_error(y_test, xgb_m3_predictions)
print("XGBoost MSE:", mse_m3_xgb)
print("XGBoost RMSE:", np.sqrt(mse_m3_xgb))
print(' ')

# Baseline: error of the raw candidate series against the same final values
final_candidate = mean_squared_error(
    test['sb_final_best_ln'],
    test['candidate_sb_best_sum_nokgi_ln1'],
)
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))

## PLOTTING (2x2 layout)
# Standard deviation of the Gaussian noise used to separate overlapping points
jitter_amount = 0.15

# Noisy copies of the XGBoost predictions and of the observed values
x_jittered_xgb = xgb_m3_predictions + np.random.normal(0, jitter_amount, size=len(xgb_m3_predictions))
y_jittered_xgb = y_test + np.random.normal(0, jitter_amount, size=len(y_test))

# Same treatment for the raw candidate series
candidate_preds = test['candidate_sb_best_sum_nokgi_ln1']
x_jittered_candidate = candidate_preds + np.random.normal(0, jitter_amount, size=len(candidate_preds))
y_jittered_candidate = y_test + np.random.normal(0, jitter_amount, size=len(y_test))

fig, ax = plt.subplots(2, 2, figsize=(16, 16))

# End points of the dashed y = x reference line drawn in every panel
diagonal = [y_test.min(), y_test.max()]

# One entry per panel: (axes, x, y, scatter style, title, x label, y label).
# Left column shows the raw points, right column the jittered ones.
panels = [
    (ax[0, 0], xgb_m3_predictions, y_test,
     dict(color='crimson'),
     'XGBoost: Predicted vs Actual',
     'Predicted Values (Logged)', 'Actual Values (Logged)'),
    (ax[0, 1], x_jittered_xgb, y_jittered_xgb,
     dict(color='crimson', marker='o'),
     'XGBoost: Plot with Jitter',
     'Predicted Values (Logged)', 'Actual Values (Logged)'),
    (ax[1, 0], candidate_preds, y_test,
     dict(color='teal', marker='o'),
     'Candidate vs GED Final',
     'Candidate Values (Logged)', 'GED Final Values (Logged)'),
    (ax[1, 1], x_jittered_candidate, y_jittered_candidate,
     dict(color='teal', marker='o'),
     'Candidate: Plot with Jitter',
     'Candidate Predicted Values (Logged)', 'Actual Values (Logged)'),
]

for panel_ax, xs, ys, style, title, xlabel, ylabel in panels:
    panel_ax.scatter(xs, ys, alpha=0.3, linewidth=1, s=30, **style)
    panel_ax.plot(diagonal, diagonal, 'k--', lw=2)
    panel_ax.set_title(title)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel(ylabel)
    panel_ax.set_xlim([-1, 11])
    panel_ax.set_ylim([-1, 11])
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# EVAL METRICS
# Importance scores reported by the fitted XGBoost model
feature_importances_xgb = xgb_m3_regressor.feature_importances_

# Column labels of the frame the model was trained on, paired with the scores by position
feature_names_xgb = train_longer.columns

# Tabulate name/score pairs and rank them, most important first
feature_importance_xgb = (
    pd.DataFrame({'Feature': feature_names_xgb, 'Importance': feature_importances_xgb})
    .sort_values(by='Importance', ascending=False)
)

# Ten highest-ranked features (rendered as the cell output)
feature_importance_xgb.head(10)

In [None]:
## PLOTTING (2x2 layout)
# Standard deviation of the Gaussian noise used to separate overlapping points
jitter_amount = 0.15

# Noisy copies of the XGBoost predictions and of the observed values
x_jittered_xgb = xgb_m3_predictions + np.random.normal(0, jitter_amount, size=len(xgb_m3_predictions))
y_jittered_xgb = y_test + np.random.normal(0, jitter_amount, size=len(y_test))

# Same treatment for the raw candidate series
candidate_preds = test['candidate_sb_best_sum_nokgi_ln1']
x_jittered_candidate = candidate_preds + np.random.normal(0, jitter_amount, size=len(candidate_preds))
y_jittered_candidate = y_test + np.random.normal(0, jitter_amount, size=len(y_test))

fig, ax = plt.subplots(2, 2, figsize=(16, 16))

# End points of the dashed y = x reference line drawn in every panel
diagonal = [y_test.min(), y_test.max()]

# One entry per panel: (axes, x, y, scatter style, title, x label, y label).
# Left column shows the raw points, right column the jittered ones.
panels = [
    (ax[0, 0], xgb_m3_predictions, y_test,
     dict(color='crimson'),
     'XGBoost: Predicted vs Actual',
     'Predicted Values (Logged)', 'Actual Values (Logged)'),
    (ax[0, 1], x_jittered_xgb, y_jittered_xgb,
     dict(color='crimson', marker='o'),
     'XGBoost: Plot with Jitter',
     'Predicted Values (Logged)', 'Actual Values (Logged)'),
    (ax[1, 0], candidate_preds, y_test,
     dict(color='teal', marker='o'),
     'Candidate vs GED Final',
     'Candidate Values (Logged)', 'GED Final Values (Logged)'),
    (ax[1, 1], x_jittered_candidate, y_jittered_candidate,
     dict(color='teal', marker='o'),
     'Candidate: Plot with Jitter',
     'Candidate Predicted Values (Logged)', 'Actual Values (Logged)'),
]

for panel_ax, xs, ys, style, title, xlabel, ylabel in panels:
    panel_ax.scatter(xs, ys, alpha=0.3, linewidth=1, s=30, **style)
    panel_ax.plot(diagonal, diagonal, 'k--', lw=2)
    panel_ax.set_title(title)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel(ylabel)
    panel_ax.set_xlim([-1, 11])
    panel_ax.set_ylim([-1, 11])
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

## Interactive Plots

In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import numpy as np

# Standard deviation of the Gaussian noise used to separate overlapping points
jitter_amount = 0.15

# Noisy copies of the XGBoost predictions and of the observed values
x_jittered_xgb = xgb_m3_predictions + np.random.normal(0, jitter_amount, size=len(xgb_m3_predictions))
y_jittered_xgb = y_test + np.random.normal(0, jitter_amount, size=len(y_test))

# Same treatment for the raw candidate series
candidate_preds = test['candidate_sb_best_sum_nokgi_ln1']
x_jittered_candidate = candidate_preds + np.random.normal(0, jitter_amount, size=len(candidate_preds))
y_jittered_candidate = y_test + np.random.normal(0, jitter_amount, size=len(y_test))

# Hover labels: take month_id / c_id from the index when they are index
# levels, otherwise from same-named columns.
index_levels = test.index.names
month_ids = test.index.get_level_values('month_id') if 'month_id' in index_levels else test['month_id']
c_ids = test.index.get_level_values('c_id') if 'c_id' in index_levels else test['c_id']
hover_text = [f"month_id: {m}, c_id: {c}" for m, c in zip(month_ids, c_ids)]

fig = make_subplots(rows=2, cols=2, subplot_titles=[
    "XGBoost: Predicted vs Actual",
    "XGBoost: Plot with Jitter",
    "Candidate vs GED Final",
    "Candidate: Plot with Jitter"
])

# One entry per scatter panel: (row, col, x, y, marker colour, legend name)
scatter_panels = [
    (1, 1, xgb_m3_predictions, y_test, 'crimson', 'XGB'),
    (1, 2, x_jittered_xgb, y_jittered_xgb, 'crimson', 'XGB Jitter'),
    (2, 1, candidate_preds, y_test, 'teal', 'Candidate'),
    (2, 2, x_jittered_candidate, y_jittered_candidate, 'teal', 'Candidate Jitter'),
]
for panel_row, panel_col, xs, ys, colour, label in scatter_panels:
    fig.add_trace(
        go.Scatter(
            x=xs,
            y=ys,
            mode='markers',
            marker=dict(color=colour, opacity=0.3),
            text=hover_text,
            hoverinfo='text',
            name=label,
        ),
        row=panel_row,
        col=panel_col,
    )

# Grid positions in row-major order, reused for the reference lines and the axis ranges
grid_cells = [(r, c) for r in (1, 2) for c in (1, 2)]

# Dashed 45-degree reference line in every panel
for panel_row, panel_col in grid_cells:
    fig.add_trace(
        go.Scatter(
            x=[-1, 11],
            y=[-1, 11],
            mode='lines',
            line=dict(color='black', dash='dash'),
            showlegend=False,
        ),
        row=panel_row,
        col=panel_col,
    )

fig.update_layout(
    height=900,
    width=1000,
    title_text="Interactive Model Comparison Plots",
)

# Same fixed axis window for all four panels
for panel_row, panel_col in grid_cells:
    fig.update_xaxes(range=[-1, 11], row=panel_row, col=panel_col)
    fig.update_yaxes(range=[-1, 11], row=panel_row, col=panel_col)

fig.show()


In [None]:
import plotly.express as px

# Residual = observed minus predicted (logged scale); a negative value means
# the model predicted more than was observed.
test['residual'] = y_test - xgb_m3_predictions
resid_by_country = test.groupby('c_id')['residual'].mean()

# One row per country, ordered from most negative to most positive mean residual
resid_df = resid_by_country.reset_index()
resid_df.columns = ['c_id', 'avg_residual']
resid_df = resid_df.sort_values(by='avg_residual', ascending=True)

# Interactive bar chart of the mean residual per country
fig = px.bar(
    resid_df,
    x='c_id',
    y='avg_residual',
    title='Average Residual by Country (c_id)',
    labels={'c_id': 'Country (c_id)', 'avg_residual': 'Average Residual'},
    hover_data={'c_id': True, 'avg_residual': True}
)

fig.update_layout(
    xaxis_tickangle=-45,
    height=500,
    width=1000
)

# c_id holds numeric ids (elsewhere the notebook filters on values such as
# 218), so Plotly would place the bars on a linear axis in c_id order and the
# sort above would have no visible effect. A category axis keeps the bars in
# the order of the sorted rows.
fig.update_xaxes(type='category')

fig.show()


In [None]:
# Residual = observed minus predicted, stored on `test` so it can be grouped by c_id
test['residual'] = y_test - xgb_m3_predictions

# Mean residual per country, lowest first
resid_by_country = (
    test.groupby('c_id')['residual']
    .mean()
    .sort_values()
)
resid_by_country.plot.bar(figsize=(12, 5), title='Average Residual by Country (c_id)')

## Attempting to reduce residuals

In [None]:
# Alternative configuration: deeper trees, higher learning rate, fewer rounds.
# reg_alpha / reg_lambda are not set here, so the library defaults apply.
xgb_m3_params = dict(
    eval_metric='rmse',
    learning_rate=0.02,
    max_depth=12,
    n_estimators=500,
    colsample_bytree=0.8,
    subsample=0.8,
    min_child_weight=1,
    gamma=0.1,
    booster='dart',
    random_state=17,
)
xgb_m3_regressor = xgb.XGBRegressor(**xgb_m3_params)

# Train on the training window, predict the test window, and floor any
# negative prediction at zero.
xgb_m3_regressor.fit(train_longer, y_train)
xgb_m3_predictions = np.maximum(xgb_m3_regressor.predict(test_longer), 0)

# Error of the model against the final (logged) values
mse_m3_xgb = mean_squared_error(y_test, xgb_m3_predictions)
print("XGBoost MSE:", mse_m3_xgb)
print("XGBoost RMSE:", np.sqrt(mse_m3_xgb))
print(' ')

# Baseline: error of the raw candidate series against the same final values
final_candidate = mean_squared_error(
    test['sb_final_best_ln'],
    test['candidate_sb_best_sum_nokgi_ln1'],
)
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))

## PLOTTING (2x2 layout)
# Standard deviation of the Gaussian noise used to separate overlapping points
jitter_amount = 0.15

# Noisy copies of the XGBoost predictions and of the observed values
x_jittered_xgb = xgb_m3_predictions + np.random.normal(0, jitter_amount, size=len(xgb_m3_predictions))
y_jittered_xgb = y_test + np.random.normal(0, jitter_amount, size=len(y_test))

# Same treatment for the raw candidate series
candidate_preds = test['candidate_sb_best_sum_nokgi_ln1']
x_jittered_candidate = candidate_preds + np.random.normal(0, jitter_amount, size=len(candidate_preds))
y_jittered_candidate = y_test + np.random.normal(0, jitter_amount, size=len(y_test))

fig, ax = plt.subplots(2, 2, figsize=(16, 16))

# End points of the dashed y = x reference line drawn in every panel
diagonal = [y_test.min(), y_test.max()]

# One entry per panel: (axes, x, y, scatter style, title, x label, y label).
# Left column shows the raw points, right column the jittered ones.
panels = [
    (ax[0, 0], xgb_m3_predictions, y_test,
     dict(color='crimson'),
     'XGBoost: Predicted vs Actual',
     'Predicted Values (Logged)', 'Actual Values (Logged)'),
    (ax[0, 1], x_jittered_xgb, y_jittered_xgb,
     dict(color='crimson', marker='o'),
     'XGBoost: Plot with Jitter',
     'Predicted Values (Logged)', 'Actual Values (Logged)'),
    (ax[1, 0], candidate_preds, y_test,
     dict(color='teal', marker='o'),
     'Candidate vs GED Final',
     'Candidate Values (Logged)', 'GED Final Values (Logged)'),
    (ax[1, 1], x_jittered_candidate, y_jittered_candidate,
     dict(color='teal', marker='o'),
     'Candidate: Plot with Jitter',
     'Candidate Predicted Values (Logged)', 'Actual Values (Logged)'),
]

for panel_ax, xs, ys, style, title, xlabel, ylabel in panels:
    panel_ax.scatter(xs, ys, alpha=0.3, linewidth=1, s=30, **style)
    panel_ax.plot(diagonal, diagonal, 'k--', lw=2)
    panel_ax.set_title(title)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel(ylabel)
    panel_ax.set_xlim([-1, 11])
    panel_ax.set_ylim([-1, 11])
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

# EVAL METRICS
# Importance scores reported by the XGBoost model fitted above
feature_importances_xgb = xgb_m3_regressor.feature_importances_

# The model in this cell is fitted on train_longer, so the names must come from
# that frame. Taking them from a different frame would either fail with a
# length mismatch or silently attach scores to the wrong features.
feature_names_xgb = train_longer.columns

# Tabulate name/score pairs for easier inspection
feature_importance_xgb = pd.DataFrame({'Feature': feature_names_xgb, 'Importance': feature_importances_xgb})

# Rank so that the most important features come first
feature_importance_xgb = feature_importance_xgb.sort_values(by='Importance', ascending=False)

# Show the 25 highest-ranked features
feature_importance_xgb.head(25)


In [None]:
# Residual = observed minus predicted, stored on `test` so it can be grouped by c_id
test['residual'] = y_test - xgb_m3_predictions

# Mean residual per country, lowest first
resid_by_country = (
    test.groupby('c_id')['residual']
    .mean()
    .sort_values()
)
resid_by_country.plot.bar(figsize=(12, 5), title='Average Residual by Country (c_id)')

# Improving underpredictions

In [None]:
# Three-letter country codes used as column names; kept in the original order.
countries = """
    AFG AGO ALB ARE ARG ARM ATG AUS AUT AZE BDI
    BEL BEN BFA BGD BGR BHR BHS BIH BLR BLZ BOL
    BRA BRB BTN BWA CAF CAN CHE CHL CHN CIV CMR
    COD COG COL COM CRI CUB CYP CZE DEU DJI DMA
    DNK DOM DZA ECU EGY ERI ESP EST ETH FIN FJI
    FRA GAB GBR GEO GHA GIN GMB GNB GNQ GRC GRD
    GTM GUY HND HRV HTI HUN IDN IND IRL IRN IRQ
    ISL ISR ITA JAM JOR JPN KAZ KEN KGZ KHM KOR
    KWT LAO LBN LBR LBY LKA LTU LUX LVA MAR MDA
    MDG MDV MEX MKD MLI MLT MMR MNE MNG MOZ MRT
    MUS MWI MYS NAM NER NGA NIC NLD NOR NPL NZL
    OMN PAK PAN PER PHL PNG POL PRK PRT PRY QAT
    ROU RUS RWA SAU SDN SEN SGP SLE SLV SOM SRB
    SSD SVK SVN SWE SWZ SYC SYR TCD TGO THA TJK
    TKM TLS TTO TUN TUR TWN TZA UGA UKR URY USA
    UZB VEN VNM YEM ZAF ZMB ZWE
""".split()

# Predictor columns, one per line; the order is significant because it fixes
# the column order of the model input.
short_features = [
    # candidate sums
    'candidate_sb_best_sum_nokgi_ln1',
    'candidate_os_best_sum_nokgi_ln1',
    'candidate_ns_best_sum_nokgi_ln1',
    # candidate counts
    'candidate_sb_best_count_nokgi',
    'candidate_ns_best_count_nokgi',
    'candidate_os_best_count_nokgi',
    # month column and lagged candidate sums
    'month',
    'candidate_sb_best_sum_nokgi_ln1_lag1',
    'candidate_sb_best_sum_nokgi_ln1_lag2',
    'candidate_sb_best_sum_nokgi_ln1_lag3',
    # ACLED columns
    'acled_sb_fat_ln',
    'acled_sb_fat_ln_1',
    'acled_sb_fat_ln_2',
    'acled_pr_count',
    'acled_sb_count',
    'acled_ns_count',
    'acled_os_count',
    'acled_sb_fat_ln_24',
    'acled_sb_fat_ln_3',
    # topic columns
    'topic_conflict_1',
    'topic_judiciary_1',
    'topic_diplomacy_1',
    # V-Dem columns
    'vdem_v2x_delibdem',
    'vdem_v2x_clphy',
    'vdem_v2x_rule',
    'vdem_v2x_freexp',
]

# Full column selection: predictors first, then the country-code columns
short_list = [*short_features, *countries]

# Restrict both splits to the columns named in short_list
train_longer, test_longer = train[short_list], test[short_list]

In [None]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error

# Hyperparameters shared by both passes
weighted_xgb_params = dict(
    eval_metric='rmse',
    learning_rate=0.01,
    max_depth=6,
    n_estimators=750,
    colsample_bytree=0.6,
    subsample=0.8,
    min_child_weight=1,
    gamma=0.1,
    booster='dart',
    random_state=17,
)

# A country counts as under-predicted when its mean training residual
# (actual - predicted, logged scale) exceeds this value.
UNDERPREDICTION_THRESHOLD = 0.38
# Sample weight given to rows of under-predicted countries (all others get 1.0)
UNDERPREDICTED_WEIGHT = 2.0

# === First Pass: Base Model ===
xgb_base = xgb.XGBRegressor(**weighted_xgb_params)

# Fit the base model and predict the training rows to obtain residuals
xgb_base.fit(train_longer, y_train)
train_preds = xgb_base.predict(train_longer)

# Work on a flattened copy so that c_id is available as a column.
# train_longer itself is left untouched: rebinding it to a frame that also
# carries 'residual' and 'sample_weight' would make a re-run of this cell fit
# the base model on target-derived columns.
train_weighted = train_longer.reset_index()

# Drop the index of y_train so the residuals line up row-by-row with train_weighted
y_train_reset = y_train.reset_index(drop=True)
train_residuals = y_train_reset - train_preds
train_weighted['residual'] = train_residuals

# Residual = actual - predicted, so a positive country mean means the base
# model predicts too low there. The previous condition (< -0.38) selected the
# over-predicted countries instead.
resid_by_cid = train_weighted.groupby('c_id')['residual'].mean()
underpredicted_cids = resid_by_cid[resid_by_cid > UNDERPREDICTION_THRESHOLD].index.tolist()
print("Up-weighted c_ids:", underpredicted_cids)

# Every row starts at weight 1.0; rows of under-predicted countries are up-weighted
train_weighted['sample_weight'] = 1.0
train_weighted.loc[train_weighted['c_id'].isin(underpredicted_cids), 'sample_weight'] = UNDERPREDICTED_WEIGHT

# === Second Pass: Weighted Model ===
xgb_m3_regressor = xgb.XGBRegressor(**weighted_xgb_params)

# NOTE(review): the weighted model is trained on short_features only, whereas
# the base model above also saw the country-code columns — confirm this
# difference is intended.
xgb_m3_regressor.fit(
    train_weighted[short_features],
    y_train,
    sample_weight=train_weighted['sample_weight']
)

# Predict the test window and floor negative predictions at zero
xgb_m3_predictions = xgb_m3_regressor.predict(test_longer[short_features])
xgb_m3_predictions = np.maximum(xgb_m3_predictions, 0)

# Error of the weighted model against the final values
mse_m3_xgb = mean_squared_error(y_test, xgb_m3_predictions)
print("XGBoost MSE:", mse_m3_xgb)
print("XGBoost RMSE:", np.sqrt(mse_m3_xgb))
print()

# Baseline: error of the raw candidate series against the same final values
final_candidate = mean_squared_error(test['sb_final_best_ln'], test['candidate_sb_best_sum_nokgi_ln1'])
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))



In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import numpy as np

# Standard deviation of the Gaussian noise used to separate overlapping points
jitter_amount = 0.15

# Noisy copies of the XGBoost predictions and of the observed values
x_jittered_xgb = xgb_m3_predictions + np.random.normal(0, jitter_amount, size=len(xgb_m3_predictions))
y_jittered_xgb = y_test + np.random.normal(0, jitter_amount, size=len(y_test))

# Same treatment for the raw candidate series
candidate_preds = test['candidate_sb_best_sum_nokgi_ln1']
x_jittered_candidate = candidate_preds + np.random.normal(0, jitter_amount, size=len(candidate_preds))
y_jittered_candidate = y_test + np.random.normal(0, jitter_amount, size=len(y_test))

# Hover labels: take month_id / c_id from the index when they are index
# levels, otherwise from same-named columns.
index_levels = test.index.names
month_ids = test.index.get_level_values('month_id') if 'month_id' in index_levels else test['month_id']
c_ids = test.index.get_level_values('c_id') if 'c_id' in index_levels else test['c_id']
hover_text = [f"month_id: {m}, c_id: {c}" for m, c in zip(month_ids, c_ids)]

fig = make_subplots(rows=2, cols=2, subplot_titles=[
    "XGBoost: Predicted vs Actual",
    "XGBoost: Plot with Jitter",
    "Candidate vs GED Final",
    "Candidate: Plot with Jitter"
])

# One entry per scatter panel: (row, col, x, y, marker colour, legend name)
scatter_panels = [
    (1, 1, xgb_m3_predictions, y_test, 'crimson', 'XGB'),
    (1, 2, x_jittered_xgb, y_jittered_xgb, 'crimson', 'XGB Jitter'),
    (2, 1, candidate_preds, y_test, 'teal', 'Candidate'),
    (2, 2, x_jittered_candidate, y_jittered_candidate, 'teal', 'Candidate Jitter'),
]
for panel_row, panel_col, xs, ys, colour, label in scatter_panels:
    fig.add_trace(
        go.Scatter(
            x=xs,
            y=ys,
            mode='markers',
            marker=dict(color=colour, opacity=0.3),
            text=hover_text,
            hoverinfo='text',
            name=label,
        ),
        row=panel_row,
        col=panel_col,
    )

# Grid positions in row-major order, reused for the reference lines and the axis ranges
grid_cells = [(r, c) for r in (1, 2) for c in (1, 2)]

# Dashed 45-degree reference line in every panel
for panel_row, panel_col in grid_cells:
    fig.add_trace(
        go.Scatter(
            x=[-1, 11],
            y=[-1, 11],
            mode='lines',
            line=dict(color='black', dash='dash'),
            showlegend=False,
        ),
        row=panel_row,
        col=panel_col,
    )

fig.update_layout(
    height=900,
    width=1000,
    title_text="Interactive Model Comparison Plots",
)

# Same fixed axis window for all four panels
for panel_row, panel_col in grid_cells:
    fig.update_xaxes(range=[-1, 11], row=panel_row, col=panel_col)
    fig.update_yaxes(range=[-1, 11], row=panel_row, col=panel_col)

fig.show()


In [None]:
## PLOTTING (2x2 layout)
# Standard deviation of the Gaussian noise used to separate overlapping points
jitter_amount = 0.15

# Noisy copies of the XGBoost predictions and of the observed values
x_jittered_xgb = xgb_m3_predictions + np.random.normal(0, jitter_amount, size=len(xgb_m3_predictions))
y_jittered_xgb = y_test + np.random.normal(0, jitter_amount, size=len(y_test))

# Same treatment for the raw candidate series
candidate_preds = test['candidate_sb_best_sum_nokgi_ln1']
x_jittered_candidate = candidate_preds + np.random.normal(0, jitter_amount, size=len(candidate_preds))
y_jittered_candidate = y_test + np.random.normal(0, jitter_amount, size=len(y_test))

fig, ax = plt.subplots(2, 2, figsize=(16, 16))

# End points of the dashed y = x reference line drawn in every panel
diagonal = [y_test.min(), y_test.max()]

# One entry per panel: (axes, x, y, scatter style, title, x label, y label).
# Left column shows the raw points, right column the jittered ones.
panels = [
    (ax[0, 0], xgb_m3_predictions, y_test,
     dict(color='crimson'),
     'XGBoost: Predicted vs Actual',
     'Predicted Values (Logged)', 'Actual Values (Logged)'),
    (ax[0, 1], x_jittered_xgb, y_jittered_xgb,
     dict(color='crimson', marker='o'),
     'XGBoost: Plot with Jitter',
     'Predicted Values (Logged)', 'Actual Values (Logged)'),
    (ax[1, 0], candidate_preds, y_test,
     dict(color='teal', marker='o'),
     'Candidate vs GED Final',
     'Candidate Values (Logged)', 'GED Final Values (Logged)'),
    (ax[1, 1], x_jittered_candidate, y_jittered_candidate,
     dict(color='teal', marker='o'),
     'Candidate: Plot with Jitter',
     'Candidate Predicted Values (Logged)', 'Actual Values (Logged)'),
]

for panel_ax, xs, ys, style, title, xlabel, ylabel in panels:
    panel_ax.scatter(xs, ys, alpha=0.3, linewidth=1, s=30, **style)
    panel_ax.plot(diagonal, diagonal, 'k--', lw=2)
    panel_ax.set_title(title)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel(ylabel)
    panel_ax.set_xlim([-1, 11])
    panel_ax.set_ylim([-1, 11])
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
import plotly.express as px

# Residual = observed minus predicted; averaged per country and sorted ascending
test['residual'] = y_test - xgb_m3_predictions
resid_by_country = test.groupby('c_id')['residual'].mean().sort_values()

# Two-column frame (c_id, avg_residual) in the sorted order, as Plotly Express expects
resid_df = resid_by_country.reset_index()
resid_df.columns = ['c_id', 'avg_residual']

# Interactive bar chart of the mean residual per country
fig = px.bar(
    resid_df,
    x='c_id',
    y='avg_residual',
    title='Average Residual by Country (c_id)',
    labels={'c_id': 'Country (c_id)', 'avg_residual': 'Average Residual'},
    hover_data={'c_id': True, 'avg_residual': True}
)

# Layout tweaks
fig.update_layout(
    xaxis_tickangle=-45,
    height=500,
    width=1000
)

# c_id holds numeric ids (elsewhere the notebook filters on values such as
# 218), so Plotly would place the bars on a linear axis in c_id order and the
# sort above would have no visible effect. A category axis keeps the bars in
# the order of the sorted rows.
fig.update_xaxes(type='category')

fig.show()


# Plotting these countries

In [None]:
def plot_actual_vs_predicted(df, c_id, save_path=None, show=True):
    """
    Plots actual vs predicted values over month_id for a given c_id.

    The candidate series and the ACLED series are drawn alongside for
    comparison.

    :param df: DataFrame with the columns 'c_id', 'month_id', 'Actual',
        'Predicted', 'sb_candidate_ln' and 'acled_sb_fat_ln'
    :param c_id: The c_id to filter the data
    :param save_path: Optional file path; when given, the figure is written
        there before it is shown or closed
    :param show: When True (default) the figure is displayed with plt.show();
        when False it is closed without being displayed
    :return: None
    """
    # Filter and re-index in one expression. Calling set_index(inplace=True)
    # on the filtered slice mutates a derived frame and triggers pandas'
    # SettingWithCopy warning.
    filtered_df = df[df['c_id'] == c_id].set_index('month_id')

    # Each series is plotted against the month_id index
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(filtered_df['Actual'], color='black', label='Actuals')
    ax.plot(filtered_df['Predicted'], color='#FF6700', linestyle='dashed', label='Predictions')
    ax.plot(filtered_df['sb_candidate_ln'], color='#800080', linestyle='dashed', label='Candidate')
    ax.plot(filtered_df['acled_sb_fat_ln'], color='#008000', alpha=0.3, label='ACLED')

    # Labels, title and legend
    ax.set_xlabel('Month ID')
    ax.set_ylabel('Fatalities (Logged)')
    ax.set_title(f'Actual vs Predicted Fatalities for c_id: {c_id}')
    ax.legend()

    # Save before showing: under Jupyter's inline backend plt.show() closes
    # the figure, so a savefig() issued afterwards would write a blank canvas.
    if save_path is not None:
        fig.savefig(save_path, bbox_inches='tight')

    if show:
        plt.show()
    else:
        plt.close(fig)

In [None]:
# Flatten the (month_id, c_id) index once so both keys are ordinary columns
test_flat = test_longer.reset_index()

# Start from the two identifier columns only
df_results = test_flat[['c_id', 'month_id']].copy()

# Observed values and model predictions, attached by position
df_results['Actual'] = y_test.values
df_results['Predicted'] = xgb_m3_predictions

# Comparison series: output column -> source column in test_longer.
# A source column that is missing is filled with NaN.
optional_columns = {
    'sb_candidate_ln': 'candidate_sb_best_sum_nokgi_ln1',
    'acled_sb_fat_ln': 'acled_sb_fat_ln',
}
for target_name, source_name in optional_columns.items():
    if source_name in test_longer.columns:
        df_results[target_name] = test_flat[source_name]
    else:
        df_results[target_name] = np.nan

# Persist the comparison table next to the notebook
df_results.to_csv("xgb_predictions_vs_actuals.csv", index=False)

In [None]:
# Working copy for the per-country plots; reset_index(drop=False) keeps the
# previous index as an extra column.
plot_df = df_results.copy().reset_index(drop=False)

In [None]:
from unittest import mock

# Directory where the plots are saved, resolved from the current user's home
# directory instead of a hard-coded absolute path.
save_dir = os.path.join(
    os.path.expanduser("~"),
    "Desktop", "views_projects", "nowcasting", "nowcasting", "country_plots",
)
os.makedirs(save_dir, exist_ok=True)

# c_ids that actually have rows; the others are skipped instead of producing empty plots
available_cids = set(plot_df['c_id'].tolist())

# Loop through values from 1 to 246
for i in range(1, 247):
    if i not in available_cids:
        continue
    try:
        # plot_actual_vs_predicted() ends with plt.show(). Under Jupyter's
        # inline backend that call closes the figure, so a savefig() issued
        # afterwards would write a blank image. show() is therefore silenced
        # for the duration of the call and the still-open figure is saved.
        with mock.patch.object(plt, "show"):
            plot_actual_vs_predicted(plot_df, i)
        save_path = os.path.join(save_dir, f"country_plot_{i}.png")
        plt.gcf().savefig(save_path, bbox_inches='tight')
    except Exception as e:
        print(f"Skipping {i} due to error: {e}")
    finally:
        # Free the figure whether or not saving succeeded
        plt.close()

In [None]:
def plot_actual_vs_predicted_save(df, i):
    """
    Build an actual-vs-predicted line plot for one id and return the figure.

    The figure is only created once matching rows are known to exist, so the
    "no data" path does not leave an empty figure open.

    :param df: DataFrame with the columns 'id', 'x', 'actual' and 'predicted'
    :param i: value of the 'id' column whose rows are plotted
    :return: the matplotlib Figure, or None when no row has id == i
    """
    subset = df[df['id'] == i]  # Or whatever filtering logic
    if subset.empty:
        return None  # Skip if there's no data

    fig, ax = plt.subplots()
    ax.plot(subset['x'], subset['actual'], label='Actual')
    ax.plot(subset['x'], subset['predicted'], label='Predicted')
    ax.set_title(f"Actual vs. Predicted for ID {i}")
    ax.legend()
    return fig




In [None]:
# Spot-check a hand-picked set of countries, plotted in the listed order.
# The names are the notebook author's annotations for each c_id (not verified
# here); None marks ids that were left unlabelled.
spot_check_countries = [
    (218, "Israel"),
    (47, "Burkina Faso"),
    (245, "Sudan"),
    (57, "Ethiopia"),
    (4, "Venezuela"),
    (237, "Kenya"),
    (246, None),
    (172, None),
    (17, None),
    (78, "Niger"),
    (117, "Ukraine"),
    (79, "Ukraine"),  # same label as 117 in the original annotations -- verify
    (137, "Tajikistan"),
]

for spot_c_id, spot_name in spot_check_countries:
    plot_actual_vs_predicted(plot_df, spot_c_id)


# Trying the non-logged versions

In [None]:
# Reload the master data set indexed by (month_id, c_id) and discard the
# source_version column in the same step.
df = pd.read_csv(
    'nowcasting_master_final_data_1.csv',
    index_col=['month_id', 'c_id'],
).drop(columns='source_version')

# Training set: years 2018 up to and including 2022
train = df.loc[df['year'].ge(2018) & df['year'].lt(2023)]

# Test set: every year after 2022
test = df.loc[df['year'].gt(2022)]


In [None]:
# Show every column available in the training split
train.columns.tolist()

In [None]:
# Labels for the non-logged model: the 'sb_final_best' column of each split
y_train, y_test = train['sb_final_best'], test['sb_final_best']

In [None]:
# Column names in `train` / `test` that are three-letter country codes
countries = [
    'AFG', 'AGO', 'ALB', 'ARE', 'ARG', 'ARM', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI',
    'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BOL',
    'BRA', 'BRB', 'BTN', 'BWA', 'CAF', 'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR',
    'COD', 'COG', 'COL', 'COM', 'CRI', 'CUB', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA',
    'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESP', 'EST', 'ETH', 'FIN', 'FJI',
    'FRA', 'GAB', 'GBR', 'GEO', 'GHA', 'GIN', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD',
    'GTM', 'GUY', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IND', 'IRL', 'IRN', 'IRQ',
    'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KOR',
    'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LKA', 'LTU', 'LUX', 'LVA', 'MAR', 'MDA',
    'MDG', 'MDV', 'MEX', 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MOZ', 'MRT',
    'MUS', 'MWI', 'MYS', 'NAM', 'NER', 'NGA', 'NIC', 'NLD', 'NOR', 'NPL', 'NZL',
    'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PNG', 'POL', 'PRK', 'PRT', 'PRY', 'QAT',
    'ROU', 'RUS', 'RWA', 'SAU', 'SDN', 'SEN', 'SGP', 'SLE', 'SLV', 'SOM', 'SRB',
    'SSD', 'SVK', 'SVN', 'SWE', 'SWZ', 'SYC', 'SYR', 'TCD', 'TGO', 'THA', 'TJK',
    'TKM', 'TLS', 'TTO', 'TUN', 'TUR', 'TWN', 'TZA', 'UGA', 'UKR', 'URY', 'USA',
    'UZB', 'VEN', 'VNM', 'YEM', 'ZAF', 'ZMB', 'ZWE',
]

# Non-logged predictors. The order is kept exactly as before because it fixes
# the column order of the model matrices built below.
short_features = [
    # ACLED series
    'acled_sb_fat', 'acled_sb_count',
    'acled_ns_fat', 'acled_ns_count',
    'acled_os_fat', 'acled_os_count',
    'acled_pr_count',
    # calendar month
    'month',
    # topic series
    'topic_conflict_1', 'topic_judiciary_1', 'topic_diplomacy_1',
    # V-Dem series
    'vdem_v2x_delibdem', 'vdem_v2x_clphy', 'vdem_v2x_rule', 'vdem_v2x_freexp',
    # candidate series, "best" variants
    'candidate_sb_best_sum_nokgi',
    'candidate_ns_best_sum_nokgi',
    'candidate_os_best_sum_nokgi',
    'candidate_sb_best_count_nokgi',
    'candidate_ns_best_count_nokgi',
    'candidate_os_best_count_nokgi',
    # candidate series, "high" variants
    'candidate_sb_high_sum_nokgi',
    'candidate_ns_high_sum_nokgi',
    'candidate_os_high_sum_nokgi',
    'candidate_sb_high_count_nokgi',
    'candidate_ns_high_count_nokgi',
    'candidate_os_high_count_nokgi',
]

# Full predictor list: features first, then the country-code columns
short_list = [*short_features, *countries]

# Model matrices restricted to the selected columns
train_nonlog = train.loc[:, short_list]
test_nonlog = test.loc[:, short_list]

In [None]:
# Fit XGBoost on the raw (non-logged) target and compare its error with the
# error of the UCDP candidate data itself.
xgb_m3_regressor = xgb.XGBRegressor(
    eval_metric='rmse',
    learning_rate=0.02,
    max_depth=10,
    n_estimators=500,
    colsample_bytree=0.6,
    subsample=0.8,
    #reg_alpha = 0.4,
    #reg_lambda =0.4,
    random_state=17,
)

# Fit the model on the training set
xgb_m3_regressor.fit(train_nonlog, y_train)

# Predict on the test set; clip at zero so no prediction is negative
xgb_m3_predictions = np.maximum(xgb_m3_regressor.predict(test_nonlog), 0)

# Error of the model and of the candidate data against the final values
mse_m3_xgb = mean_squared_error(y_test, xgb_m3_predictions)
print("XGBoost MSE:", mse_m3_xgb)
print("XGBoost RMSE:", np.sqrt(mse_m3_xgb))
print(' ')
candidate_preds = test['candidate_sb_best_sum_nokgi']
final_candidate = mean_squared_error(test['sb_final_best'], candidate_preds)
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))

## PLOTTING (2x2 layout)
# Standard deviation of the Gaussian jitter, in the units of the target
jitter_amount = 0.15
# Seeded generator so the jittered panels look the same on every run
jitter_rng = np.random.default_rng(17)

# Jittered predictions for XGBoost
x_jittered_xgb = xgb_m3_predictions + jitter_rng.normal(0, jitter_amount, size=len(xgb_m3_predictions))
y_jittered_xgb = y_test + jitter_rng.normal(0, jitter_amount, size=len(y_test))

# Jittered values for the candidate data
x_jittered_candidate = candidate_preds + jitter_rng.normal(0, jitter_amount, size=len(candidate_preds))
y_jittered_candidate = y_test + jitter_rng.normal(0, jitter_amount, size=len(y_test))

# Create 2x2 subplot layout
fig, ax = plt.subplots(2, 2, figsize=(16, 16))

# Dashed reference line (predicted == actual) over the range of the target
diagonal_range = [y_test.min(), y_test.max()]

# One row per panel: axes, x, y, colour, title, x label, y label.
# This section models the raw target, so the labels no longer say "(Logged)".
scatter_panels = [
    (ax[0, 0], xgb_m3_predictions, y_test, 'crimson',
     'XGBoost: Predicted vs Actual',
     'Predicted Values (Fatalities)', 'Actual Values (Fatalities)'),
    (ax[0, 1], x_jittered_xgb, y_jittered_xgb, 'crimson',
     'XGBoost: Plot with Jitter',
     'Predicted Values (Fatalities)', 'Actual Values (Fatalities)'),
    (ax[1, 0], candidate_preds, y_test, 'teal',
     'Candidate vs GED Final',
     'Candidate Values (Fatalities)', 'GED Final Values (Fatalities)'),
    (ax[1, 1], x_jittered_candidate, y_jittered_candidate, 'teal',
     'Candidate: Plot with Jitter',
     'Candidate Predicted Values (Fatalities)', 'Actual Values (Fatalities)'),
]

for panel_ax, panel_x, panel_y, panel_color, panel_title, panel_xlabel, panel_ylabel in scatter_panels:
    panel_ax.scatter(panel_x, panel_y, alpha=0.3, color=panel_color, marker='o', linewidth=1, s=30)
    panel_ax.plot(diagonal_range, diagonal_range, 'k--', lw=2)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

# FEATURE IMPORTANCE
# Importance scores of the fitted XGBoost model, one per training column
feature_importances_xgb = xgb_m3_regressor.feature_importances_
feature_names_xgb = train_nonlog.columns

# Table of features sorted from most to least important
feature_importance_xgb = pd.DataFrame(
    {'Feature': feature_names_xgb, 'Importance': feature_importances_xgb}
).sort_values(by='Importance', ascending=False)

# Display the 25 highest-scoring features
feature_importance_xgb.head(25)


# Tuning the model for non-logged predictions

In [None]:
# Hyper-parameter search for the non-logged XGBoost model.
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from xgboost import XGBRegressor
import numpy as np

# Set up time-based cross-validation.
# NOTE(review): TimeSeriesSplit cuts folds by row position, so this assumes
# train_nonlog is ordered chronologically (by month_id) -- TODO confirm.
tscv = TimeSeriesSplit(n_splits=5)

# Define your base model structure
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    booster='gbtree',
    random_state=17
)

# Define the parameter grid around your original values
param_grid = {
    'learning_rate': [0.01, 0.02, 0.05],
    'max_depth': [6, 8, 10],
    'n_estimators': [250, 500],
    'colsample_bytree': [0.6, 0.8],
    'subsample': [0.8, 1.0],
    'min_child_weight': [1, 5],
    # Uncomment to tune regularization too:
    # 'reg_alpha': [0, 0.4],
    # 'reg_lambda': [0, 0.4, 1.0],
}

# Set up the grid search
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    n_jobs=-1
)

# Run the grid search
grid_search.fit(train_nonlog, y_train)

# Print best results.
# The scorer is the NEGATED RMSE, so flipping the sign already gives the RMSE;
# taking a square root on top of that would report sqrt(RMSE) instead.
print("Best parameters found:", grid_search.best_params_)
print("Best RMSE score:", -grid_search.best_score_)


In [None]:
# Refit XGBoost on the raw (non-logged) target with a shallower, slower-learning
# configuration and compare its error with the UCDP candidate data.
xgb_m3_regressor = xgb.XGBRegressor(
    eval_metric='rmse',
    learning_rate=0.01,
    max_depth=3,
    n_estimators=1000,
    colsample_bytree=0.6,
    subsample=0.8,
    #reg_alpha = 0.4,
    #reg_lambda =0.4,
    random_state=17,
)

# Fit the model on the training set
xgb_m3_regressor.fit(train_nonlog, y_train)

# Predict on the test set; clip at zero so no prediction is negative
xgb_m3_predictions = np.maximum(xgb_m3_regressor.predict(test_nonlog), 0)

# Error of the model and of the candidate data against the final values
mse_m3_xgb = mean_squared_error(y_test, xgb_m3_predictions)
print("XGBoost MSE:", mse_m3_xgb)
print("XGBoost RMSE:", np.sqrt(mse_m3_xgb))
print(' ')
candidate_preds = test['candidate_sb_best_sum_nokgi']
final_candidate = mean_squared_error(test['sb_final_best'], candidate_preds)
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))

## PLOTTING (2x2 layout)
# Standard deviation of the Gaussian jitter, in the units of the target
jitter_amount = 0.15
# Seeded generator so the jittered panels look the same on every run
jitter_rng = np.random.default_rng(17)

# Jittered predictions for XGBoost
x_jittered_xgb = xgb_m3_predictions + jitter_rng.normal(0, jitter_amount, size=len(xgb_m3_predictions))
y_jittered_xgb = y_test + jitter_rng.normal(0, jitter_amount, size=len(y_test))

# Jittered values for the candidate data
x_jittered_candidate = candidate_preds + jitter_rng.normal(0, jitter_amount, size=len(candidate_preds))
y_jittered_candidate = y_test + jitter_rng.normal(0, jitter_amount, size=len(y_test))

# Create 2x2 subplot layout
fig, ax = plt.subplots(2, 2, figsize=(16, 16))

# Dashed reference line (predicted == actual) over the range of the target
diagonal_range = [y_test.min(), y_test.max()]

# One row per panel: axes, x, y, colour, title, x label, y label.
# This section models the raw target, so the labels no longer say "(Logged)".
scatter_panels = [
    (ax[0, 0], xgb_m3_predictions, y_test, 'crimson',
     'XGBoost: Predicted vs Actual',
     'Predicted Values (Fatalities)', 'Actual Values (Fatalities)'),
    (ax[0, 1], x_jittered_xgb, y_jittered_xgb, 'crimson',
     'XGBoost: Plot with Jitter',
     'Predicted Values (Fatalities)', 'Actual Values (Fatalities)'),
    (ax[1, 0], candidate_preds, y_test, 'teal',
     'Candidate vs GED Final',
     'Candidate Values (Fatalities)', 'GED Final Values (Fatalities)'),
    (ax[1, 1], x_jittered_candidate, y_jittered_candidate, 'teal',
     'Candidate: Plot with Jitter',
     'Candidate Predicted Values (Fatalities)', 'Actual Values (Fatalities)'),
]

for panel_ax, panel_x, panel_y, panel_color, panel_title, panel_xlabel, panel_ylabel in scatter_panels:
    panel_ax.scatter(panel_x, panel_y, alpha=0.3, color=panel_color, marker='o', linewidth=1, s=30)
    panel_ax.plot(diagonal_range, diagonal_range, 'k--', lw=2)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

# FEATURE IMPORTANCE
# Importance scores of the fitted XGBoost model, one per training column
feature_importances_xgb = xgb_m3_regressor.feature_importances_
feature_names_xgb = train_nonlog.columns

# Table of features sorted from most to least important
feature_importance_xgb = pd.DataFrame(
    {'Feature': feature_names_xgb, 'Importance': feature_importances_xgb}
).sort_values(by='Importance', ascending=False)

# Display the 25 highest-scoring features
feature_importance_xgb.head(25)


In [None]:
def plot_actual_vs_predicted(df, c_id):
    """
    Plots actual vs predicted values over month_id for a given c_id.

    Draws four lines on a new 10x6 figure (actuals, model predictions, the
    candidate series and the ACLED series) and shows it.

    :param df: DataFrame containing the columns 'c_id', 'month_id', 'Actual',
        'Predicted', 'candidate_sb_best_sum_nokgi' and 'acled_sb_fat'
    :param c_id: The c_id to filter the data
    """
    # Filter the rows for the selected c_id and index them by month_id.
    # set_index returns a new frame, so neither the caller's DataFrame nor a
    # slice of it is modified in place.
    filtered_df = df[df['c_id'] == c_id].set_index('month_id')

    # Plotting
    plt.figure(figsize=(10, 6))
    plt.plot(filtered_df['Actual'], color='black', label='Actuals')
    plt.plot(filtered_df['Predicted'], color='#FF6700', linestyle='dashed', label='Predictions')
    plt.plot(filtered_df['candidate_sb_best_sum_nokgi'], color='#800080', linestyle='dashed', label='Candidate')
    plt.plot(filtered_df['acled_sb_fat'], color='#008000', alpha=0.3, label='ACLED')

    # Adding labels and title (the plotted columns here are the non-logged ones)
    plt.xlabel('Month ID')
    plt.ylabel('Fatalities')
    plt.title(f'Actual vs Predicted Fatalities for c_id: {c_id}')
    plt.legend()

    # Show the plot
    plt.show()

In [None]:
# Build a tidy results table for the non-logged model: one row per test-set
# row with the actual target, the XGBoost prediction and two benchmark series.

# Flatten the index so c_id and month_id are available as columns, and copy
# AFTER selecting them so the columns added below are written to an
# independent frame rather than to a slice of the test set.
df_results = test_nonlog.reset_index()[['c_id', 'month_id']].copy()

# Positional assignment: y_test and xgb_m3_predictions are assumed to be in the
# same row order as test_nonlog -- TODO confirm where they are created
df_results['Actual'] = y_test.values
df_results['Predicted'] = xgb_m3_predictions

# Candidate benchmark: use 'candidate_sb_best_sum_nokgi' from test if present, otherwise NaN
if 'candidate_sb_best_sum_nokgi' in test.columns:
    df_results['candidate_sb_best_sum_nokgi'] = test['candidate_sb_best_sum_nokgi'].values
else:
    df_results['candidate_sb_best_sum_nokgi'] = np.nan

# ACLED benchmark: use 'acled_sb_fat' from test_nonlog if present, otherwise NaN
if 'acled_sb_fat' in test_nonlog.columns:
    df_results['acled_sb_fat'] = test_nonlog['acled_sb_fat'].values
else:
    df_results['acled_sb_fat'] = np.nan

# Save to CSV.
# NOTE(review): the logged results earlier in the notebook are written to this
# same file name, so running this cell overwrites them -- confirm this is intended.
df_results.to_csv("xgb_predictions_vs_actuals.csv", index=False)

# Separate frame for plotting; reset_index(drop=False) keeps the previous row
# index as an extra 'index' column.
plot_df = df_results.reset_index(drop=False)

In [None]:
# Spot-check a hand-picked set of countries, plotted in the listed order.
# The names are the notebook author's annotations for each c_id (not verified
# here); None marks ids that were left unlabelled.
spot_check_countries = [
    (218, "Israel"),
    (47, "Burkina Faso"),
    (245, "Sudan"),
    (57, "Ethiopia"),
    (4, "Venezuela"),
    (237, "Kenya"),
    (246, None),
    (172, None),
    (17, None),
    (78, "Niger"),
    (117, "Ukraine"),
    (79, None),
]

for spot_c_id, spot_name in spot_check_countries:
    plot_actual_vs_predicted(plot_df, spot_c_id)

In [None]:
# Histogram of the raw training target. `plt` comes from the notebook's import
# cell, so it is not re-imported here.
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(y_train, bins=100)
hist_ax.set_title('Distribution of Raw Target Values')
hist_ax.set_xlabel('Fatalities')
hist_ax.set_ylabel('Frequency')
hist_ax.set_yscale('log')  # log-scaled counts keep a long tail visible
plt.show()
