# VIEWS Nowcasting

This notebook outlines the "researcher" code for VIEWS Nowcasting.


In [None]:
import numpy as np
import pandas as pdx
import pandas as pd  # conventional alias; the notebook calls pd.get_dummies
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
import tabula
import xlwings as xw
import seaborn as sns
import warnings
import statsmodels.api as sm
import statsmodels.formula.api as smf
from matplotlib.ticker import ScalarFormatter
from nowcast import * #plot_statebased, plot_nonstate, plot_onesided 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# VIEWS 3
from viewser.operations import fetch
from viewser import Queryset, Column
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
from views_runs import storage, ModelMetadata
from views_runs.storage import store, retrieve, fetch_metadata
#from views_forecasts.extensions import *

# Ingester
from ingester3.config import source_db_path
from ingester3.Country import Country
from ingester3.extensions import *
from ingester3.ViewsMonth import ViewsMonth

warnings.filterwarnings('ignore')

import os
home = os.path.expanduser("~")

# Pulling the Data via VIEWSER

In [None]:
# Country-month queryset: UCDP GED final, UCDP candidate, ACLED, topic and
# V-Dem columns. Columns are registered in exactly the order they are appended
# to `_columns` below.

_CM = "country_month"
_VIOLENCE_TYPES = ("sb", "ns", "os")


def _raw(name, source, loa=_CM):
    """Untransformed column `name`, read from `source` at level of analysis `loa`."""
    return Column(name, from_loa=loa, from_column=source)


def _ln(name, source):
    """Column with the `ops.ln` transform followed by `missing.replace_na`."""
    return _raw(name, source).transform.ops.ln().transform.missing.replace_na()


def _ln_lagged(name, source, lag):
    """`_ln` column passed through `temporal.tlag(lag)` and `missing.replace_na` again."""
    return _ln(name, source).transform.temporal.tlag(lag).transform.missing.replace_na()


def _ln_filled(name, source):
    """Column with the `ops.ln` transform followed by `missing.fill`."""
    return _raw(name, source).transform.ops.ln().transform.missing.fill()


def _topic_lag1(name, source):
    """Topic column: fill, replace_na, one-step tlag, then fill again."""
    return (
        _raw(name, source)
        .transform.missing.fill()
        .transform.missing.replace_na()
        .transform.temporal.tlag(1)
        .transform.missing.fill()
    )


def _vdem_lag12(name):
    """country_year column of the same name: fill, tlag(12), fill, replace_na."""
    return (
        _raw(name, name, loa="country_year")
        .transform.missing.fill()
        .transform.temporal.tlag(12)
        .transform.missing.fill()
        .transform.missing.replace_na()
    )


# --- Country meta-data -------------------------------------------------------
_columns = [
    _raw("country_name", "name", loa="country"),
    _raw("iso_ab", "isoab", loa="country"),
    _raw("gleditsch_ward", "gwcode", loa="country"),
    _raw("year", "year_id", loa="month"),
    _raw("month", "month", loa="month"),
]

# --- UCDP GED final: fatality sums, event counts, logged sums ----------------
_columns += [_raw(f"{v}_final_best", f"ged_{v}_best_sum_nokgi") for v in _VIOLENCE_TYPES]
_columns += [_raw(f"{v}_ged_count_events", f"ged_{v}_best_count_nokgi") for v in _VIOLENCE_TYPES]
_columns += [_ln(f"{v}_final_best_ln", f"ged_{v}_best_sum_nokgi") for v in _VIOLENCE_TYPES]

# Lags of the logged state-based final count
_columns += [
    _ln_lagged(f"sb_final_best_ln_{lag}", "ged_sb_best_sum_nokgi", lag)
    for lag in (1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 24)
]

# --- ACLED -------------------------------------------------------------------
_columns.append(_raw("acled_sb_fat", "acled_sb_fat"))
_columns += [_ln_filled(f"acled_{v}_fat_ln", f"acled_{v}_fat") for v in _VIOLENCE_TYPES]
_columns += [
    _ln_lagged(f"acled_sb_fat_ln_{lag}", "acled_sb_fat", lag)
    for lag in (1, 2, 3, 12, 24)
]
_columns += [
    _raw(name, name)
    for name in (
        "acled_sb_count",
        "acled_ns_fat",
        "acled_ns_count",
        "acled_os_fat",
        "acled_os_count",
        "acled_pr_count",
    )
]
_columns += [
    _ln_filled("ln_acled_sb_reb", "acled_bat_reb_fat"),
    _ln_filled("ln_acled_sb_gov", "acled_bat_gov_fat"),
]

# --- Topic data (lagged one month) -------------------------------------------
_columns += [
    _topic_lag1("topic_conflict_1", "topic_ste_theta6"),
    _topic_lag1("topic_judiciary_1", "topic_ste_theta5"),
    _topic_lag1("topic_diplomacy_1", "topic_ste_theta2"),
]

# --- V-Dem (country_year, lagged 12) -----------------------------------------
_columns += [
    _vdem_lag12(name)
    for name in ("vdem_v2x_delibdem", "vdem_v2x_clphy", "vdem_v2x_rule", "vdem_v2x_freexp")
]

# --- UCDP candidate (nowcasting inputs) --------------------------------------
# (suffix used in the output column name, suffix used in the source column name)
_CANDIDATE_MEASURES = (
    ("best", "best"),
    ("high", "high"),
    ("count_events", "count"),
    ("high_count", "high_count"),
)

# Raw candidate columns, one measure at a time across sb / ns / os
for _label, _src in _CANDIDATE_MEASURES:
    _columns += [
        _raw(f"{v}_candidate_{_label}", f"candidate_{v}_{_src}") for v in _VIOLENCE_TYPES
    ]

# Logged candidate columns, same ordering
for _label, _src in _CANDIDATE_MEASURES:
    _columns += [
        _ln(f"{v}_candidate_{_label}_ln", f"candidate_{v}_{_src}") for v in _VIOLENCE_TYPES
    ]

# Lags 1-6 of the logged state-based candidate "best" and then "high" estimates
for _measure in ("best", "high"):
    _columns += [
        _ln_lagged(f"sb_candidate_{_measure}_ln_{lag}", f"candidate_sb_{_measure}", lag)
        for lag in range(1, 7)
    ]

# Assemble the queryset column by column, then publish and fetch it.
_queryset = Queryset("nowcasting", "country_month")
for _column in _columns:
    _queryset = _queryset.with_column(_column)

qs_nowcasting_cm = _queryset.publish().fetch()

In [None]:
# Turn the fetched frame's index levels into ordinary columns (returns a new
# frame, so `qs_nowcasting_cm` itself is left untouched).
# NOTE(review): assumes the fetched index levels are named 'month_id' and
# 'country_id', as the rename / set_index below require.
df = qs_nowcasting_cm.reset_index()

# Keep 2018-2023 only (both bounds inclusive).
df = df.loc[df['year'].between(2018, 2023)]

# Re-index on (month_id, c_id); only 'country_id' actually needs renaming.
df = df.rename(columns={'country_id': 'c_id'}).set_index(['month_id', 'c_id'])

# One indicator column per country code present in 'iso_ab'.
# `pd` is the pandas alias bound in the notebook's import cell.
one_hot = pd.get_dummies(df['iso_ab'])
df = df.join(one_hot)

df = df.drop(['country_name', 'gleditsch_ward'], axis=1)

# Correcting for geographic coverage

### UCDP candidate data coverage varies by region:

1. Africa: 2018-onwards
2. Middle East: April/2020 onwards
3. Asia: August/2020-onwards
4. Europe: August/2020-onwards
5. Global: October/2020-onwards

In [None]:
# Country-code sets for the regions whose UCDP candidate coverage starts at
# different dates. The legacy codes from the original lists are kept; the ISO
# codes that the country columns selected later in this notebook use (COD,
# ROU, SRB, MNE, TLS) are added so those countries follow their region's rule.

african_iso3_codes = {
    "AGO", "BDI", "BEN", "BFA", "BWA", "CAF", "CIV", "CMR", "COG", "COM", "CPV",
    "ERI", "ETH", "GAB", "GHA", "GIN", "GMB", "GNB", "GNQ", "KEN", "LBR", "LSO",
    "MDG", "MLI", "MOZ", "MRT", "MUS", "MWI", "NAM", "NER", "NGA", "RWA", "SDN",
    "SEN", "SLE", "SOM", "SSD", "STP", "SWZ", "SYC", "TCD", "TGO", "TZA", "UGA",
    "ZAF", "ZAR", "ZMB", "ZWE",
    "COD",  # ISO code for DR Congo (legacy "ZAR" above)
}

middle_east_iso3_codes = {
    "AFG", "ARE", "BHR", "DJI", "DZA", "EGY", "IRN", "IRQ", "ISR", "JOR", "KWT", "LBN",
    "LBY", "MAR", "MLT", "OMN", "QAT", "SAU", "SYR", "TUN", "TUR", "YEM"
}

# Europe and Asia/Pacific
new_countries_iso3 = {
    "ALB", "AND", "ARM", "AUT", "AZE", "BEL", "BGR", "BIH", "BLR", "CHE", "CYP", "CZE", "DEU",
    "DNK", "ESP", "EST", "FIN", "FRA", "FRO", "GBR", "GEO", "GRC", "GRL", "HRV", "HUN", "IRL",
    "ISL", "ITA", "KAZ", "KGZ", "KSV", "LIE", "LTU", "LUX", "LVA", "MCO", "MDA", "MKD", "MNT",
    "NLD", "NOR", "POL", "PRT", "ROM", "RUS", "SER", "SMR", "SVK", "SVN", "SWE", "TJK", "TKM",
    "UKR", "UZB", "BGD", "BTN", "IND", "LKA", "MDV", "NPL", "PAK", "ASM", "AUS",
    "BRN", "CHN", "FJI", "FSM", "GUM", "HKG", "IDN", "JPN", "KHM", "KIR", "KOR", "LAO", "MAC",
    "MHL", "MMR", "MNG", "MNP", "MYS", "NCL", "NZL", "PHL", "PLW", "PNG", "PRK", "PYF", "SGP",
    "SLB", "THA", "TMP", "TON", "TUV", "VNM", "VUT", "WSM",
    # ISO equivalents of the legacy codes ROM, SER, MNT and TMP above
    "ROU", "SRB", "MNE", "TLS",
}
# NOTE(review): 'TWN' is not in any set, so it falls under the global
# (October 2020) rule below - confirm whether it belongs with Asia instead.


# Work on a copy of the master DataFrame
master_new = df.copy()

_iso = master_new["iso_ab"]
_year = master_new["year"]
_month = master_new["month"]


def _covered_from(start_year, start_month):
    """Boolean Series: True for rows dated on or after start_year/start_month."""
    return (_year > start_year) | ((_year == start_year) & (_month >= start_month))


_in_africa = _iso.isin(african_iso3_codes)
_in_middle_east = _iso.isin(middle_east_iso3_codes)
_in_new = _iso.isin(new_countries_iso3)
_elsewhere = ~(_in_africa | _in_middle_east | _in_new)

# Rows where candidate data is treated as available, one rule per region:
#   Africa            - every row in the frame (2018 onwards)
#   Middle East       - April 2020 onwards
#   Europe / Asia     - August 2020 onwards
#   everywhere else   - October 2020 onwards (global coverage)
# The previous mask OR-ed in "every year from 2020 on", so the global start
# date was never applied and unlisted countries were kept from January 2020.
keep_mask = (
    _in_africa
    | (_in_middle_east & _covered_from(2020, 4))
    | (_in_new & _covered_from(2020, 8))
    | (_elsewhere & _covered_from(2020, 10))
)

# Blank the candidate value outside coverage, then drop every row where it is missing.
master_new.loc[~keep_mask, "sb_candidate_best_ln"] = np.nan
master_new = master_new.dropna(subset=['sb_candidate_best_ln'])

df = master_new.copy()


# Visualizing these data

In [None]:
# Overview plots of the coverage-filtered frame, one call per violence type
# (state-based, non-state, one-sided).
# The three helpers are not defined in this notebook; presumably they come from
# the `from nowcast import *` line in the import cell - verify.
plot_statebased(df)
plot_nonstate(df)
plot_onesided(df)

# Setting up the Dataframe for State-based nowcasting

### Setting up the dataframes to only include key data

In [None]:
# Country indicator columns to keep (names of the one-hot columns built from 'iso_ab').
countries = """
    AFG AGO ALB ARE ARG ARM ATG AUS AUT AZE BDI
    BEL BEN BFA BGD BGR BHR BHS BIH BLR BLZ BOL
    BRA BRB BTN BWA CAF CAN CHE CHL CHN CIV CMR
    COD COG COL COM CRI CUB CYP CZE DEU DJI DMA
    DNK DOM DZA ECU EGY ERI ESP EST ETH FIN FJI
    FRA GAB GBR GEO GHA GIN GMB GNB GNQ GRC GRD
    GTM GUY HND HRV HTI HUN IDN IND IRL IRN IRQ
    ISL ISR ITA JAM JOR JPN KAZ KEN KGZ KHM KOR
    KWT LAO LBN LBR LBY LKA LTU LUX LVA MAR MDA
    MDG MDV MEX MKD MLI MLT MMR MNE MNG MOZ MRT
    MUS MWI MYS NAM NER NGA NIC NLD NOR NPL NZL
    OMN PAK PAN PER PHL PNG POL PRK PRT PRY QAT
    ROU RUS RWA SAU SDN SEN SGP SLE SLV SOM SRB
    SSD SVK SVN SWE SWZ SYC SYR TCD TGO THA TJK
    TKM TLS TTO TUN TUR TWN TZA UGA UKR URY USA
    UZB VEN VNM YEM ZAF ZMB ZWE
""".split()

# Target first, then predictors grouped by source. Order matters: it becomes
# the column order of the reduced frame.
short_features = (
    # target
    ['sb_final_best_ln']
    # logged UCDP candidate measures (sb / ns / os)
    + ['sb_candidate_best_ln', 'ns_candidate_best_ln', 'os_candidate_best_ln']
    + ['sb_candidate_high_ln', 'ns_candidate_high_ln', 'os_candidate_high_ln']
    + ['sb_candidate_count_events_ln', 'ns_candidate_count_events_ln', 'os_candidate_count_events_ln']
    + ['sb_candidate_high_count_ln', 'ns_candidate_high_count_ln', 'os_candidate_high_count_ln']
    # ACLED
    + ['acled_sb_fat_ln', 'acled_ns_fat_ln', 'acled_os_fat_ln']
    + ['acled_sb_count', 'acled_ns_count', 'acled_os_count']
    # long lags of the target
    + ['sb_final_best_ln_12', 'sb_final_best_ln_24']
    # ACLED lags
    + ['acled_sb_fat_ln_1', 'acled_sb_fat_ln_2', 'acled_sb_fat_ln_3',
       'acled_sb_fat_ln_12', 'acled_sb_fat_ln_24']
    # topics and V-Dem
    + ['topic_conflict_1', 'topic_judiciary_1', 'topic_diplomacy_1']
    + ['vdem_v2x_delibdem', 'vdem_v2x_clphy', 'vdem_v2x_rule', 'vdem_v2x_freexp']
    # candidate lags 1-6, "best" then "high"
    + [f'sb_candidate_best_ln_{lag}' for lag in range(1, 7)]
    + [f'sb_candidate_high_ln_{lag}' for lag in range(1, 7)]
    # remaining ACLED count and calendar columns
    + ['acled_pr_count', 'year', 'month']
)

short_list = short_features + countries

# Reduce the frame to the selected columns (raises KeyError if any is absent).
df = df.loc[:, short_list]


### Setting up the training and test sets

In [None]:
# Chronological hold-out: rows from 2018-2022 are used for fitting, rows from
# 2023 for evaluation.
_target = 'sb_final_best_ln'
_in_training_years = (df['year'] >= 2018) & (df['year'] <= 2022)

df_train = df.loc[_in_training_years]
df_test = df.loc[df['year'].eq(2023)]

# Predictors are every column except the target.
X_train, y_train = df_train.drop(columns=[_target]), df_train[_target]
X_test, y_test = df_test.drop(columns=[_target]), df_test[_target]

# Visualizing the training and test sets

In [None]:
# Observations per year in the training and test sets, drawn on one explicit
# Axes instead of the implicit "current figure".
year_min = min(X_train["year"].min(), X_test["year"].min())
year_max = max(X_train["year"].max(), X_test["year"].max())

# Unit-width bins centred on each year, shared by both series so the bars line up.
bin_edges = np.arange(year_min - 0.5, year_max + 1.5, 1)

fig, ax = plt.subplots(figsize=(10, 5))
for _split, _label in ((X_train, "training set"), (X_test, "test set")):
    _split["year"].hist(bins=bin_edges, ax=ax, label=_label, alpha=0.7, rwidth=0.5)

ax.set_title("Observations by Year in Train and Test Sets")
ax.set_ylim(0, 2500)  # fixed upper limit; raise it if a year ever exceeds 2500 rows
ax.set_xlabel("Year")
ax.set_ylabel("Number of Observations")
ax.grid(False)
ax.legend()
plt.show()  # render here so the cell does not echo the Legend repr

# Nowcasting Models

    1: Simple model
    2: Added features 

### Model 1: Simple Model

In [None]:
# Model 1 inputs: the target, the logged UCDP candidate measures with their
# lags, the calendar month, and the country indicator columns.
countries = """
    AFG AGO ALB ARE ARG ARM ATG AUS AUT AZE BDI
    BEL BEN BFA BGD BGR BHR BHS BIH BLR BLZ BOL
    BRA BRB BTN BWA CAF CAN CHE CHL CHN CIV CMR
    COD COG COL COM CRI CUB CYP CZE DEU DJI DMA
    DNK DOM DZA ECU EGY ERI ESP EST ETH FIN FJI
    FRA GAB GBR GEO GHA GIN GMB GNB GNQ GRC GRD
    GTM GUY HND HRV HTI HUN IDN IND IRL IRN IRQ
    ISL ISR ITA JAM JOR JPN KAZ KEN KGZ KHM KOR
    KWT LAO LBN LBR LBY LKA LTU LUX LVA MAR MDA
    MDG MDV MEX MKD MLI MLT MMR MNE MNG MOZ MRT
    MUS MWI MYS NAM NER NGA NIC NLD NOR NPL NZL
    OMN PAK PAN PER PHL PNG POL PRK PRT PRY QAT
    ROU RUS RWA SAU SDN SEN SGP SLE SLV SOM SRB
    SSD SVK SVN SWE SWZ SYC SYR TCD TGO THA TJK
    TKM TLS TTO TUN TUR TWN TZA UGA UKR URY USA
    UZB VEN VNM YEM ZAF ZMB ZWE
""".split()

short_features = (
    # target
    ['sb_final_best_ln']
    # logged UCDP candidate measures (sb / ns / os)
    + ['sb_candidate_best_ln', 'ns_candidate_best_ln', 'os_candidate_best_ln']
    + ['sb_candidate_high_ln', 'ns_candidate_high_ln', 'os_candidate_high_ln']
    + ['sb_candidate_count_events_ln', 'ns_candidate_count_events_ln', 'os_candidate_count_events_ln']
    + ['sb_candidate_high_count_ln', 'ns_candidate_high_count_ln', 'os_candidate_high_count_ln']
    # lags 1-6 of the logged state-based candidate "best" estimate
    + [f'sb_candidate_best_ln_{lag}' for lag in range(1, 7)]
    + ['month']
)

short_list = short_features + countries

# Same column subset taken from both splits.
train_minimal = df_train.loc[:, short_list]
test_minimal = df_test.loc[:, short_list]

In [None]:
# Model 1: random forest on the minimal feature set. The target
# 'sb_final_best_ln' is the logged UCDP GED final state-based count.

# Training sets
X_train = train_minimal.drop(columns=['sb_final_best_ln'])
y_train = train_minimal['sb_final_best_ln']

# Test sets
X_test = test_minimal.drop(columns=['sb_final_best_ln'])
y_test = test_minimal['sb_final_best_ln']

# Random forest on the logged target.
# The variable is still called `rf_model2` so existing references keep
# working; the Model 2 cell rebinds the same name.
rf_model2 = RandomForestRegressor(random_state=17)
rf_model2.fit(X_train, y_train)

rf_m1_predictions = rf_model2.predict(X_test)

# Error of the model on the 2023 test set
mse = mean_squared_error(y_test, rf_m1_predictions)
rmse = np.sqrt(mse)

print("Random Forest Regressor MSE:", mse)
print("Random Forest Regressor RMSE:", rmse)
print('')

# Baseline: error of using the raw candidate value as the prediction
final_candidate = mean_squared_error(test_minimal['sb_final_best_ln'], test_minimal['sb_candidate_best_ln'])
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))

print(" ")

## PLOTTING
# Gaussian jitter so overlapping points become visible. A dedicated seeded
# generator makes the jittered panel identical on every run and leaves the
# global NumPy random state untouched.
jitter_amount = 0.15
_jitter_rng = np.random.default_rng(17)
x_jittered = rf_m1_predictions + _jitter_rng.normal(0, jitter_amount, size=len(rf_m1_predictions))
y_jittered = y_test + _jitter_rng.normal(0, jitter_amount, size=len(y_test))

fig, ax = plt.subplots(1, 2, figsize=(16, 7))

# Both panels share the same layout; only the data, title and marker kwarg differ.
_diagonal = [y_test.min(), y_test.max()]
_panels = (
    (ax[0], rf_m1_predictions, y_test, 'Actuals vs Predictions', {}),
    (ax[1], x_jittered, y_jittered, 'Plot with Jitter', {'marker': 'o'}),
)
for _axis, _x, _y, _title, _extra in _panels:
    _axis.scatter(_x, _y, alpha=0.3, color='crimson', linewidth=1, s=30, **_extra)
    # 45-degree reference line: perfect predictions fall on it
    _axis.plot(_diagonal, _diagonal, 'k--', lw=2)
    _axis.set_title(_title)
    _axis.set_xlabel('Predicted Values (Logged)')
    _axis.set_ylabel('Actual Values (Logged)')
    _axis.set_xlim([-1, 10])
    _axis.set_ylim([-1, 10])
    _axis.grid(True)
plt.show()

### Model 2: With ACLED Data

In [None]:
# Model 2 inputs: the Model 1 columns plus ACLED fatalities, ACLED lags and
# ACLED event counts, and the country indicator columns.
countries = """
    AFG AGO ALB ARE ARG ARM ATG AUS AUT AZE BDI
    BEL BEN BFA BGD BGR BHR BHS BIH BLR BLZ BOL
    BRA BRB BTN BWA CAF CAN CHE CHL CHN CIV CMR
    COD COG COL COM CRI CUB CYP CZE DEU DJI DMA
    DNK DOM DZA ECU EGY ERI ESP EST ETH FIN FJI
    FRA GAB GBR GEO GHA GIN GMB GNB GNQ GRC GRD
    GTM GUY HND HRV HTI HUN IDN IND IRL IRN IRQ
    ISL ISR ITA JAM JOR JPN KAZ KEN KGZ KHM KOR
    KWT LAO LBN LBR LBY LKA LTU LUX LVA MAR MDA
    MDG MDV MEX MKD MLI MLT MMR MNE MNG MOZ MRT
    MUS MWI MYS NAM NER NGA NIC NLD NOR NPL NZL
    OMN PAK PAN PER PHL PNG POL PRK PRT PRY QAT
    ROU RUS RWA SAU SDN SEN SGP SLE SLV SOM SRB
    SSD SVK SVN SWE SWZ SYC SYR TCD TGO THA TJK
    TKM TLS TTO TUN TUR TWN TZA UGA UKR URY USA
    UZB VEN VNM YEM ZAF ZMB ZWE
""".split()

short_features = (
    # target
    ['sb_final_best_ln']
    # logged UCDP candidate measures (sb / ns / os)
    + ['sb_candidate_best_ln', 'ns_candidate_best_ln', 'os_candidate_best_ln']
    + ['sb_candidate_high_ln', 'ns_candidate_high_ln', 'os_candidate_high_ln']
    + ['sb_candidate_count_events_ln', 'ns_candidate_count_events_ln', 'os_candidate_count_events_ln']
    + ['sb_candidate_high_count_ln', 'ns_candidate_high_count_ln', 'os_candidate_high_count_ln']
    # lags 1-6 of the logged state-based candidate "best" estimate
    + [f'sb_candidate_best_ln_{lag}' for lag in range(1, 7)]
    # ACLED: logged fatalities, lags of state-based fatalities, event counts
    + ['acled_sb_fat_ln', 'acled_ns_fat_ln', 'acled_os_fat_ln']
    + ['acled_sb_fat_ln_1', 'acled_sb_fat_ln_2', 'acled_sb_fat_ln_3', 'acled_sb_fat_ln_12']
    + ['acled_sb_count', 'acled_ns_count', 'acled_os_count']
    + ['month']
)

acled_list = short_features + countries

# Same column subset taken from both splits.
train_acled = df_train.loc[:, acled_list]
test_acled = df_test.loc[:, acled_list]

In [None]:
# --- Model 2: random forest on the ACLED column set -------------------------
# Target is the `sb_final_best_ln` column; everything else in the frame is a feature.

# Feature matrix / target vector for training and testing
X_train, y_train = train_acled.drop(columns=['sb_final_best_ln']), train_acled['sb_final_best_ln']
X_test, y_test = test_acled.drop(columns=['sb_final_best_ln']), test_acled['sb_final_best_ln']

# Fit with a fixed random_state so the forest itself is reproducible
rf_model2 = RandomForestRegressor(random_state=17)
rf_model2.fit(X_train, y_train)
rf_m2_predictions = rf_model2.predict(X_test)

# Error of the model on the test set
mse = mean_squared_error(y_test, rf_m2_predictions)
rmse = np.sqrt(mse)

print("Random Forest Regressor MSE:", mse)
print("Random Forest Regressor RMSE:", rmse)
print('')

# Baseline: error of the raw candidate column against the final column
final_candidate = mean_squared_error(test_acled['sb_final_best_ln'], test_acled['sb_candidate_best_ln'])
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))

print(" ")

# --- Plotting ----------------------------------------------------------------
# Gaussian noise (std = jitter_amount) separates overlapping points in the
# right-hand panel. The x noise is drawn before the y noise.
jitter_amount = 0.15
x_jittered = rf_m2_predictions + np.random.normal(0, jitter_amount, size=len(rf_m2_predictions))
y_jittered = y_test + np.random.normal(0, jitter_amount, size=len(y_test))

fig, ax = plt.subplots(1, 2, figsize=(16, 7))

# (axes, x values, y values, title, extra scatter kwargs) for each panel
panel_specs = (
    (ax[0], rf_m2_predictions, y_test, 'Actuals vs Predictions', {}),
    (ax[1], x_jittered, y_jittered, 'Plot with Jitter', {'marker': 'o'}),
)
for panel, xs, ys, panel_title, scatter_extra in panel_specs:
    panel.scatter(xs, ys, alpha=0.3, color='crimson', linewidth=1, s=30, **scatter_extra)
    # dashed y = x reference line over the observed range of the target
    panel.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
    panel.set_title(panel_title)
    panel.set_xlabel('Predicted Values (Logged)')
    panel.set_ylabel('Actual Values (Logged)')
    panel.set_xlim([-1, 10])
    panel.set_ylim([-1, 10])
    panel.grid(True)

plt.show()

### Model 3: Long Dataset

In [None]:
# Three-letter country codes; each one is used below as a column name of
# df_train / df_test (presumably one indicator column per country -- verify upstream).
countries = """
    AFG AGO ALB ARE ARG ARM ATG AUS AUT AZE BDI
    BEL BEN BFA BGD BGR BHR BHS BIH BLR BLZ BOL
    BRA BRB BTN BWA CAF CAN CHE CHL CHN CIV CMR
    COD COG COL COM CRI CUB CYP CZE DEU DJI DMA
    DNK DOM DZA ECU EGY ERI ESP EST ETH FIN FJI
    FRA GAB GBR GEO GHA GIN GMB GNB GNQ GRC GRD
    GTM GUY HND HRV HTI HUN IDN IND IRL IRN IRQ
    ISL ISR ITA JAM JOR JPN KAZ KEN KGZ KHM KOR
    KWT LAO LBN LBR LBY LKA LTU LUX LVA MAR MDA
    MDG MDV MEX MKD MLI MLT MMR MNE MNG MOZ MRT
    MUS MWI MYS NAM NER NGA NIC NLD NOR NPL NZL
    OMN PAK PAN PER PHL PNG POL PRK PRT PRY QAT
    ROU RUS RWA SAU SDN SEN SGP SLE SLV SOM SRB
    SSD SVK SVN SWE SWZ SYC SYR TCD TGO THA TJK
    TKM TLS TTO TUN TUR TWN TZA UGA UKR URY USA
    UZB VEN VNM YEM ZAF ZMB ZWE
""".split()

# Columns for the "long" model. The first entry is the target; it is dropped from
# the feature matrix in the next cell. Column order is kept exactly as before,
# because it determines the feature order seen by the estimator.
short_features = [
    # target
    'sb_final_best_ln',
    # UCDP candidate columns
    'sb_candidate_best_ln', 'ns_candidate_best_ln', 'os_candidate_best_ln',
    'sb_candidate_high_ln', 'ns_candidate_high_ln', 'os_candidate_high_ln',
    'sb_candidate_count_events_ln', 'ns_candidate_count_events_ln', 'os_candidate_count_events_ln',
    'sb_candidate_high_count_ln', 'ns_candidate_high_count_ln', 'os_candidate_high_count_ln',
    # ACLED columns
    'acled_sb_fat_ln', 'acled_ns_fat_ln', 'acled_os_fat_ln',
    'acled_sb_count', 'acled_ns_count', 'acled_os_count',
    # numeric-suffixed variants of the final / ACLED columns
    # (presumably lags -- verify against the feature-engineering cell)
    'sb_final_best_ln_12', 'sb_final_best_ln_24',
    'acled_sb_fat_ln_1', 'acled_sb_fat_ln_2', 'acled_sb_fat_ln_3',
    'acled_sb_fat_ln_12', 'acled_sb_fat_ln_24',
    # topic columns
    'topic_conflict_1', 'topic_judiciary_1', 'topic_diplomacy_1',
    # V-Dem columns
    'vdem_v2x_delibdem', 'vdem_v2x_clphy', 'vdem_v2x_rule', 'vdem_v2x_freexp',
    # numeric-suffixed variants of the candidate best / high estimates
    'sb_candidate_best_ln_1', 'sb_candidate_best_ln_2', 'sb_candidate_best_ln_3',
    'sb_candidate_best_ln_4', 'sb_candidate_best_ln_5', 'sb_candidate_best_ln_6',
    'sb_candidate_high_ln_1', 'sb_candidate_high_ln_2', 'sb_candidate_high_ln_3',
    'sb_candidate_high_ln_4', 'sb_candidate_high_ln_5', 'sb_candidate_high_ln_6',
    # remaining ACLED count and calendar columns
    'acled_pr_count', 'year', 'month',
]

# Full column selection: model columns first, then the country columns.
long_list = [*short_features, *countries]

# Restrict the train and test frames to the selected columns.
train_long, test_long = df_train[long_list], df_test[long_list]

In [None]:
# Model 3: random forest on the "long" column set.
# Target is the `sb_final_best_ln` column; everything else in the frame is a feature.

# Training sets
X_train = train_long.drop(columns=['sb_final_best_ln'])
y_train = train_long['sb_final_best_ln']

# Test sets
X_test = test_long.drop(columns=['sb_final_best_ln'])
y_test = test_long['sb_final_best_ln']

# Random Forest Model
# FIX: this cell used to create `rf_model3` but then call `rf_model2.fit(...)` and
# `rf_model2.predict(...)`. That left rf_model3 unfitted and silently replaced the
# Model 2 estimator with one trained on the long column set. Fit and predict with
# rf_model3 so each model object holds its own fit.
rf_model3 = RandomForestRegressor(random_state=17)
rf_model3.fit(X_train, y_train)

rf_m3_predictions = rf_model3.predict(X_test)

# Mean squared error and its square root on the test set
mse = mean_squared_error(y_test, rf_m3_predictions)
rmse = np.sqrt(mse)

print("Random Forest Regressor MSE:", mse)
print("Random Forest Regressor RMSE:", rmse)
print('')

# Baseline: error of the raw candidate column against the final column
final_candidate = mean_squared_error(test_long['sb_final_best_ln'], test_long['sb_candidate_best_ln'])
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))

print(" ")

## PLOTTING
# Standard deviation of the Gaussian noise added to each plotted coordinate
jitter_amount = 0.15
# Jittered copies of the predictions and targets, so overlapping points separate.
# NOTE(review): the noise comes from NumPy's global RNG; no seed is set in this cell,
# so the jittered panel differs slightly between runs.
x_jittered = rf_m3_predictions + np.random.normal(0, jitter_amount, size=len(rf_m3_predictions))
y_jittered = y_test + np.random.normal(0, jitter_amount, size=len(y_test))

# Two side-by-side panels: raw points on the left, jittered points on the right
fig, ax = plt.subplots(1, 2, figsize=(16, 7))

# Left panel: raw predictions vs targets
ax[0].scatter(rf_m3_predictions, y_test, alpha=0.3, color='crimson', linewidth=1, s=30)
# Dashed y = x reference line spanning the observed range of y_test
ax[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
ax[0].set_title('Actuals vs Predictions')
ax[0].set_xlabel('Predicted Values (Logged)')
ax[0].set_ylabel('Testset Values (Logged)')
ax[0].set_xlim([-1, 10])
ax[0].set_ylim([-1, 10])
ax[0].grid(True)

# Right panel: same data with jitter
ax[1].scatter(x_jittered, y_jittered, alpha=0.3, color='crimson', marker='o', linewidth=1, s=30)
ax[1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
ax[1].set_title('Plot with Jitter')
ax[1].set_xlabel('Predicted Values (Logged)')
ax[1].set_ylabel('Testset (Logged)')
ax[1].set_xlim([-1, 10])
ax[1].set_ylim([-1, 10])
ax[1].grid(True)
plt.show()

### Model 4: XGBoost Model

In [None]:
# Column selection for the XGBoost models. The country codes and column names
# here match the ones used for Model 3; they are restated so this cell can be
# run without the Model 3 cells.
countries = """
    AFG AGO ALB ARE ARG ARM ATG AUS AUT AZE BDI
    BEL BEN BFA BGD BGR BHR BHS BIH BLR BLZ BOL
    BRA BRB BTN BWA CAF CAN CHE CHL CHN CIV CMR
    COD COG COL COM CRI CUB CYP CZE DEU DJI DMA
    DNK DOM DZA ECU EGY ERI ESP EST ETH FIN FJI
    FRA GAB GBR GEO GHA GIN GMB GNB GNQ GRC GRD
    GTM GUY HND HRV HTI HUN IDN IND IRL IRN IRQ
    ISL ISR ITA JAM JOR JPN KAZ KEN KGZ KHM KOR
    KWT LAO LBN LBR LBY LKA LTU LUX LVA MAR MDA
    MDG MDV MEX MKD MLI MLT MMR MNE MNG MOZ MRT
    MUS MWI MYS NAM NER NGA NIC NLD NOR NPL NZL
    OMN PAK PAN PER PHL PNG POL PRK PRT PRY QAT
    ROU RUS RWA SAU SDN SEN SGP SLE SLV SOM SRB
    SSD SVK SVN SWE SWZ SYC SYR TCD TGO THA TJK
    TKM TLS TTO TUN TUR TWN TZA UGA UKR URY USA
    UZB VEN VNM YEM ZAF ZMB ZWE
""".split()

# The first entry is the target; it is dropped from the feature matrix in the
# next cell. Column order is kept exactly as before, because it determines the
# feature order seen by the estimator.
short_features = [
    # target
    'sb_final_best_ln',
    # UCDP candidate columns
    'sb_candidate_best_ln', 'ns_candidate_best_ln', 'os_candidate_best_ln',
    'sb_candidate_high_ln', 'ns_candidate_high_ln', 'os_candidate_high_ln',
    'sb_candidate_count_events_ln', 'ns_candidate_count_events_ln', 'os_candidate_count_events_ln',
    'sb_candidate_high_count_ln', 'ns_candidate_high_count_ln', 'os_candidate_high_count_ln',
    # ACLED columns
    'acled_sb_fat_ln', 'acled_ns_fat_ln', 'acled_os_fat_ln',
    'acled_sb_count', 'acled_ns_count', 'acled_os_count',
    # numeric-suffixed variants of the final / ACLED columns
    # (presumably lags -- verify against the feature-engineering cell)
    'sb_final_best_ln_12', 'sb_final_best_ln_24',
    'acled_sb_fat_ln_1', 'acled_sb_fat_ln_2', 'acled_sb_fat_ln_3',
    'acled_sb_fat_ln_12', 'acled_sb_fat_ln_24',
    # topic columns
    'topic_conflict_1', 'topic_judiciary_1', 'topic_diplomacy_1',
    # V-Dem columns
    'vdem_v2x_delibdem', 'vdem_v2x_clphy', 'vdem_v2x_rule', 'vdem_v2x_freexp',
    # numeric-suffixed variants of the candidate best / high estimates
    'sb_candidate_best_ln_1', 'sb_candidate_best_ln_2', 'sb_candidate_best_ln_3',
    'sb_candidate_best_ln_4', 'sb_candidate_best_ln_5', 'sb_candidate_best_ln_6',
    'sb_candidate_high_ln_1', 'sb_candidate_high_ln_2', 'sb_candidate_high_ln_3',
    'sb_candidate_high_ln_4', 'sb_candidate_high_ln_5', 'sb_candidate_high_ln_6',
    # remaining ACLED count and calendar columns
    'acled_pr_count', 'year', 'month',
]

# Full column selection: model columns first, then the country columns.
long_list = [*short_features, *countries]

# Restrict the train and test frames to the selected columns.
train_long, test_long = df_train[long_list], df_test[long_list]

In [None]:
# --- XGBoost on the "long" column set ---------------------------------------
# Target is the `sb_final_best_ln` column; everything else in the frame is a feature.
X_train, y_train = train_long.drop(columns=['sb_final_best_ln']), train_long['sb_final_best_ln']
X_test, y_test = test_long.drop(columns=['sb_final_best_ln']), test_long['sb_final_best_ln']

# Hyper-parameters are collected in one place, then passed to the regressor
xgb_m3_regressor = xgb.XGBRegressor(
    **{
        'eval_metric': 'rmsle',
        'learning_rate': 0.02,
        'max_depth': 6,
        'n_estimators': 250,
        'colsample_bytree': 0.6,
        'subsample': 0.8,
        'random_state': 17,
    }
)

# Train, then score the held-out rows
xgb_m3_regressor.fit(X_train, y_train)
xgb_m3_predictions = xgb_m3_regressor.predict(X_test)

# Model error on the test set
mse = mean_squared_error(y_test, xgb_m3_predictions)
rmse = np.sqrt(mse)

print("XGBoost Regressor MSE:", mse)
print("XGBoost Regressor RMSE:", rmse)
print('')

# Baseline: error of the raw candidate column against the final column
final_candidate = mean_squared_error(test_long['sb_final_best_ln'], test_long['sb_candidate_best_ln'])
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))

# Gaussian noise (std = jitter_amount) separates overlapping points in the
# jittered panels. The four draws happen in the order written below.
jitter_amount = 0.15

# Model predictions, jittered
x_jittered_xgb = xgb_m3_predictions + np.random.normal(0, jitter_amount, size=len(xgb_m3_predictions))
y_jittered_xgb = y_test.values + np.random.normal(0, jitter_amount, size=len(y_test))

# Candidate column used as a "prediction", jittered
candidate_preds = test_long['sb_candidate_best_ln']
x_jittered_candidate = candidate_preds + np.random.normal(0, jitter_amount, size=len(candidate_preds))
y_jittered_candidate = y_test.values + np.random.normal(0, jitter_amount, size=len(y_test))

# 2x2 grid: top row = XGBoost, bottom row = candidate; left = raw, right = jittered
fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# Shared axis limits, also used as the end points of the y = x reference line
lims = [-1, 12]

# One entry per panel, in row-major order (matches ax.flat):
# (x values, y values, scatter kwargs, title, x label, y label)
panel_specs = [
    (xgb_m3_predictions, y_test, {'color': 'crimson'},
     'XGBoost: Predicted vs Actual', 'Predicted Values (Logged)', 'Actual Values (Logged)'),
    (x_jittered_xgb, y_jittered_xgb, {'color': 'crimson', 'marker': 'o'},
     'XGBoost: Plot with Jitter', 'Predicted Values (Logged)', 'Actual Values (Logged)'),
    (candidate_preds, y_test, {'color': 'teal', 'marker': 'o'},
     'Candidate vs GED Final', 'Candidate Values (Logged)', 'GED Final Values (Logged)'),
    (x_jittered_candidate, y_jittered_candidate, {'color': 'teal', 'marker': 'o'},
     'Candidate: Plot with Jitter', 'Candidate Predicted Values (Logged)', 'Actual Values (Logged)'),
]

for panel, (xs, ys, scatter_kw, panel_title, x_label, y_label) in zip(ax.flat, panel_specs):
    panel.scatter(xs, ys, alpha=0.3, linewidth=1, s=30, **scatter_kw)
    panel.plot(lims, lims, 'k--', lw=2)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
    panel.set_xlim(lims)
    panel.set_ylim(lims)
    panel.grid(True)

plt.tight_layout()
plt.show()



### Model 5: Improved XGBoost Model

In [None]:
# XGBoost with the DART booster and a slower learning rate, on the "long" column set.
# Target is the `sb_final_best_ln` column; everything else in the frame is a feature.

# Training sets
X_train = train_long.drop(columns=['sb_final_best_ln'])
y_train = train_long['sb_final_best_ln']

# Test sets
X_test = test_long.drop(columns=['sb_final_best_ln'])
y_test = test_long['sb_final_best_ln']

# XGBoost Model
xgb_m4_regressor = xgb.XGBRegressor(eval_metric='rmse',
                                    learning_rate=0.01,
                                    max_depth=6,
                                    n_estimators=500,
                                    colsample_bytree=0.6,
                                    subsample=0.8,
                                    min_child_weight=1,
                                    gamma=0.1,
                                    booster='dart',
                                    random_state=17
)

# Fit model
xgb_m4_regressor.fit(X_train, y_train)

# Predict
# FIX: the predictions used to be assigned back to `xgb_m4_regressor`, which threw
# away the fitted model and left an ndarray under a model's name. Predictions now
# live in their own variable (mirroring `xgb_m3_predictions`), and
# `xgb_m4_regressor` stays the fitted estimator.
xgb_m4_predictions = xgb_m4_regressor.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, xgb_m4_predictions)
rmse = np.sqrt(mse)

print("XGBoost Regressor MSE:", mse)
print("XGBoost Regressor RMSE:", rmse)
print('')

# Compare with baseline (UCDP candidate best)
final_candidate = mean_squared_error(test_long['sb_final_best_ln'], test_long['sb_candidate_best_ln'])
print('MSE GED Final vs UCDP Candidate:', final_candidate)
print('RMSE GED Final vs UCDP Candidate:', np.sqrt(final_candidate))

# Standard deviation of the Gaussian noise added to each plotted coordinate.
# NOTE(review): the noise comes from NumPy's global RNG; no seed is set in this cell,
# so the jittered panels differ slightly between runs.
jitter_amount = 0.15

# Jittered predictions for XGBoost
x_jittered_xgb = xgb_m4_predictions + np.random.normal(0, jitter_amount, size=len(xgb_m4_predictions))
y_jittered_xgb = y_test.values + np.random.normal(0, jitter_amount, size=len(y_test))

# Jittered values for the candidate column used as a "prediction"
candidate_preds = test_long['sb_candidate_best_ln']
x_jittered_candidate = candidate_preds + np.random.normal(0, jitter_amount, size=len(candidate_preds))
y_jittered_candidate = y_test.values + np.random.normal(0, jitter_amount, size=len(y_test))

# 2x2 grid: top row = XGBoost, bottom row = candidate; left = raw, right = jittered
fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# Shared axis limits, also used as the end points of the y = x reference line
lims = [-1, 12]

# 1. XGBoost Predictions vs Actual
ax[0, 0].scatter(xgb_m4_predictions, y_test, alpha=0.3, color='crimson', linewidth=1, s=30)
ax[0, 0].plot(lims, lims, 'k--', lw=2)
ax[0, 0].set_title('XGBoost: Predicted vs Actual')
ax[0, 0].set_xlabel('Predicted Values (Logged)')
ax[0, 0].set_ylabel('Actual Values (Logged)')
ax[0, 0].set_xlim(lims)
ax[0, 0].set_ylim(lims)
ax[0, 0].grid(True)

# 2. XGBoost Predictions vs Actual (with Jitter)
ax[0, 1].scatter(x_jittered_xgb, y_jittered_xgb, alpha=0.3, color='crimson', marker='o', linewidth=1, s=30)
ax[0, 1].plot(lims, lims, 'k--', lw=2)
ax[0, 1].set_title('XGBoost: Plot with Jitter')
ax[0, 1].set_xlabel('Predicted Values (Logged)')
ax[0, 1].set_ylabel('Actual Values (Logged)')
ax[0, 1].set_xlim(lims)
ax[0, 1].set_ylim(lims)
ax[0, 1].grid(True)

# 3. Candidate Predictions vs Actual (Original)
ax[1, 0].scatter(candidate_preds, y_test, alpha=0.3, color='teal', marker='o', linewidth=1, s=30)
ax[1, 0].plot(lims, lims, 'k--', lw=2)
ax[1, 0].set_title('Candidate vs GED Final')
ax[1, 0].set_xlabel('Candidate Values (Logged)')
ax[1, 0].set_ylabel('GED Final Values (Logged)')
ax[1, 0].set_xlim(lims)
ax[1, 0].set_ylim(lims)
ax[1, 0].grid(True)

# 4. Candidate Predictions vs Actual (with Jitter)
ax[1, 1].scatter(x_jittered_candidate, y_jittered_candidate, alpha=0.3, color='teal', marker='o', linewidth=1, s=30)
ax[1, 1].plot(lims, lims, 'k--', lw=2)
ax[1, 1].set_title('Candidate: Plot with Jitter')
ax[1, 1].set_xlabel('Candidate Predicted Values (Logged)')
ax[1, 1].set_ylabel('Actual Values (Logged)')
ax[1, 1].set_xlim(lims)
ax[1, 1].set_ylim(lims)
ax[1, 1].grid(True)

plt.tight_layout()
plt.show()



# Plotting Predictions

In [None]:
def plot_actual_vs_predicted(df, c_id):
    """
    Plot actual, predicted, candidate and ACLED series over month_id for one c_id.

    The rows of ``df`` whose ``c_id`` column equals ``c_id`` are selected, indexed
    by ``month_id``, and four lines are drawn on a single figure. ``df`` itself is
    not modified.

    :param df: DataFrame with the columns 'c_id', 'month_id', 'Actual',
               'Predicted', 'sb_candidate_best_ln' and 'acled_sb_fat_ln'
    :param c_id: The c_id value to filter the data on
    :return: None; the figure is displayed with ``plt.show()``
    :raises KeyError: if one of the required columns is missing from ``df``
    """
    # Select this country's rows and index them by month in one chained step.
    # set_index() returns a new frame, so there is no in-place mutation of a
    # filtered slice (the earlier `inplace=True` form is what triggers pandas'
    # SettingWithCopyWarning).
    filtered_df = df.loc[df['c_id'] == c_id].set_index('month_id')

    # Plotting
    plt.figure(figsize=(10, 6))
    plt.plot(filtered_df['Actual'], color='black', label='Actuals')
    plt.plot(filtered_df['Predicted'], color='#FF6700', linestyle='dashed', label='Predictions')
    plt.plot(filtered_df['sb_candidate_best_ln'], color='#800080', linestyle='dashed', label='Candidate')
    # ACLED is drawn semi-transparent so it does not dominate the other series
    plt.plot(filtered_df['acled_sb_fat_ln'], color='#008000', alpha=0.3, label='ACLED')

    # Adding labels and title
    plt.xlabel('Month ID')
    # Label spelling corrected (was 'Fatlities')
    plt.ylabel('Fatalities (Logged)')
    plt.title(f'Actual vs Predicted Fatalities for c_id: {c_id}')
    plt.legend()

    # Show the plot
    plt.show()

In [None]:
# Build the frame consumed by plot_actual_vs_predicted: one row per test
# observation with identifiers, the target, the model prediction and two
# comparison series.

# Flatten the index once so c_id and month_id are available as ordinary columns
# (assumes they are index levels or columns of test_long -- the selection below
# raises KeyError otherwise).
test_long_flat = test_long.reset_index()

# Keep only the identifier columns. The explicit .copy() makes df_results an
# independent frame, so the column assignments below are not made on a slice.
df_results = test_long_flat[['c_id', 'month_id']].copy()

# Add actual values and predictions. `.values` strips the index so the
# assignment is positional and lines up with the reset (0..n-1) index.
df_results['Actual'] = y_test.values
# NOTE(review): this uses the predictions of the first XGBoost model
# (xgb_m3_predictions), not those of the later "improved" model -- confirm this
# is the intended series to plot.
df_results['Predicted'] = xgb_m3_predictions

# Copy each comparison series if it is present in test_long, otherwise fill with NaN
for optional_col in ('sb_candidate_best_ln', 'acled_sb_fat_ln'):
    if optional_col in test_long.columns:
        df_results[optional_col] = test_long_flat[optional_col]
    else:
        df_results[optional_col] = np.nan

# reset_index(drop=False) keeps the old positional index as an extra 'index' column
plot_df = df_results.copy()
plot_df = plot_df.reset_index(drop=False)

In [None]:
# c_id values to plot, in display order. The country names are carried over from
# the notebook's earlier inline comments and have not been checked against the
# country table -- treat them as hints only.
selected_c_ids = [
    218,  # Israel
    47,   # Burkina Faso
    245,  # Sudan
    57,   # Ethiopia
    4,    # Venezuela
    237,  # Kenya
    246,  # (no label recorded)
    172,  # (no label recorded)
    17,   # (no label recorded)
    78,   # Niger
    117,  # Ukraine
    79,   # NOTE(review): was also labelled "ukraine", duplicating 117 -- verify
    137,  # Tajikistan
]

# One figure per country
for selected_c_id in selected_c_ids:
    plot_actual_vs_predicted(plot_df, selected_c_id)