# Collect all UCDP Candidate Data 


In [None]:
import pandas as pd
import requests
from ingester3.extensions import *
from ingester3.DBWriter import DBWriter
from ingester3.scratch import cache_manager
from ingester3.config import source_db_path
from diskcache import Cache


# This cell imports the basic packages needed to run the notebook 

import numpy as np
import pandas as pdx
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
import tabula
import xlwings as xw
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Views 3
from viewser.operations import fetch
from viewser import Queryset, Column
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
#import views_dataviz
from views_runs import storage, ModelMetadata
from views_runs.storage import store, retrieve, fetch_metadata
from views_forecasts.extensions import *

# VIEWS mapper2
#from views_mapper2.label_writer import *

# Ingester
from ingester3.config import source_db_path
from ingester3.Country import Country
from ingester3.extensions import *
from ingester3.ViewsMonth import ViewsMonth
from ingester3.DBWriter import DBWriter
from ingester3.scratch import cache_manager
cache_manager()


import glob


import os

# Home directory of the user running the notebook.
home = os.path.expanduser("~")

# os.getlogin() needs a controlling terminal and raises OSError without one
# (typical for notebook kernels, cron jobs and containers), so fall back to
# getpass.getuser() instead of aborting the setup cell.
try:
    os_login = os.getlogin()
except OSError:
    import getpass

    try:
        os_login = getpass.getuser()
    except (OSError, KeyError, ImportError):
        os_login = 'unknown'

##################
print('OS Login is:', os_login)
print('OS path is set to:', home)

In [None]:
# Show the source_db_path value imported from ingester3.config.
print(source_db_path)

In [None]:
# Cache used by GedLoader._get_ged_slice to memoize downloaded API pages.
# NOTE(review): 'ged_x.cache' is presumably the on-disk cache directory -- confirm against diskcache.
ged_cache = Cache('ged_x.cache')
# Empty the cache up front so entries stored by earlier runs are not reused.
ged_cache.clear(retry=True)


class GedLoader:
    """
    Download one version of the UCDP GED events and aggregate them to ViEWS panels.

    Intended call order: ``fetch_ged()`` -> ``filter_ged()`` ->
    ``aggregate_to_pg()`` and/or ``aggregate_to_cm()``.

    Attributes:
        version: GED version string used in the API URL.
        ged: event-level DataFrame (raw after ``fetch_ged``, filtered after ``filter_ged``).
        pg_ged: subset of ``ged`` without ``where_prec`` 4 and 6, set by ``filter_ged``.
        ged_agg_pgm: result of ``aggregate_to_pg``.
        ged_agg_cm: result of ``aggregate_to_cm``.
        month_id: value returned by ``ViewsMonth.from_year_month`` for
            three-part versions (``yy.0.m``), otherwise None.
        no_val: lower-cased copy of the ``no_val`` argument; not read by any
            method of this class.
    """

    # NOTE(review): this access token was committed to the notebook. It is kept
    # only as a last-resort fallback so existing runs keep working; prefer the
    # ``token`` argument of fetch_ged() or the environment variable below.
    _FALLBACK_TOKEN = "48dda3460c347f3b"
    _TOKEN_ENV_VAR = "UCDP_ACCESS_TOKEN"

    # (connect, read) timeouts in seconds, so a dead connection cannot hang the notebook.
    _REQUEST_TIMEOUT = (30, 600)

    # Aggregated measures, in the order they appear in the output columns.
    _MEASURES = ('best_sum', 'best_count', 'high_sum', 'high_count')
    # type_of_violence code -> prefix used in the output column names.
    _VIOLENCE_TYPES = ((1, 'sb'), (2, 'ns'), (3, 'os'))

    def __init__(self, version, verbose=True, no_val=None):
        """
        Args:
            version: GED version string, e.g. ``"23.0.4"``.
            verbose: when False, progress messages from ``fetch_ged`` are suppressed.
            no_val: optional iterable of strings, stored lower-cased in ``self.no_val``.
        """
        cache_manager()
        self.version = version
        self.ged = None
        self.pg_ged = None

        self.ged_agg_pgm = None
        self.ged_agg_cm = None

        if no_val is None:
            self.no_val = []
        else:
            self.no_val = [i.lower() for i in no_val]

        self.__get_month_id()

        self.verbose_print = print if verbose else lambda *a, **k: None

    def __get_month_id(self):
        """
        If trying to load a GED Candidates dataset (20.0.x) infer what ViEWS MonthID it refers to.
        Return nothing otherwise.

        The year is ``20`` + the first version component and the month is the
        third component; ``self.month_id`` stays None for other version formats.
        """
        self.month_id = None
        if self.version.count('.') == 2:
            parts = self.version.split('.')
            year_extent = int('20' + parts[0])
            month_extent = int(parts[2])
            self.month_id = ViewsMonth.from_year_month(year_extent, month_extent)

    @staticmethod
    @ged_cache.memoize(typed=True, expire=600000, tag='ged_slice')
    def _get_ged_slice(next_page_url, token=None):
        """
        Download one page of the API and return ``(next_page_url, events, page_count)``.

        Results are memoized in ``ged_cache``. ``next_page_url`` is None when the
        response reports an empty ``NextPageUrl``.

        Raises:
            requests.HTTPError: if the API answers with an error status.
        """
        headers = {'x-ucdp-access-token': token}
        r = requests.get(next_page_url, headers=headers, timeout=GedLoader._REQUEST_TIMEOUT)
        # Fail on the HTTP status instead of on a confusing KeyError further down.
        r.raise_for_status()
        output = r.json()
        next_page_url = output['NextPageUrl'] if output['NextPageUrl'] != '' else None
        ged = pd.DataFrame(output['Result'])
        page_count = output['TotalPages']
        return next_page_url, ged, page_count

    def fetch_ged(self, pagesize=50000, token=None):
        """
        Download every page of ``self.version`` and store the events in ``self.ged``.

        Args:
            pagesize: number of events requested per page.
            token: API access token. When None, the ``UCDP_ACCESS_TOKEN``
                environment variable is used, then the legacy built-in token.

        Raises:
            ConnectionError: if more pages are delivered than the API reports
                in ``TotalPages``; the cache is cleared first.
        """
        import os

        if token is None:
            token = os.environ.get(self._TOKEN_ENV_VAR) or self._FALLBACK_TOKEN

        next_page_url = f"https://ucdpapi.pcr.uu.se/api/gedevents/{self.version}?pagesize={pagesize}&page=0"

        # Collect the pages and concatenate once; concatenating inside the loop
        # copies all previously loaded rows on every iteration.
        slices = []
        cur_page = 1
        while next_page_url:
            self.verbose_print(next_page_url)
            next_page_url, ged_slice, total_pages = self._get_ged_slice(
                next_page_url=next_page_url, token=token
            )
            slices.append(ged_slice)
            self.verbose_print(f"{cur_page} of {total_pages} pages loaded.")

            if cur_page > total_pages:
                # Drop the cached pages so a retry does not replay the bad responses.
                ged_cache.clear(retry=True)
                raise ConnectionError('The UCDP API is misbehaving. Try again later!')
            cur_page += 1

        self.ged = pd.concat(slices, ignore_index=True)

    def filter_ged(self):
        """
        Filter and normalise the events loaded by ``fetch_ged``.

        Keeps events with ``priogrid_gid >= 1``, parses ``date_end``, passes the
        frame through ``pgm.from_datetime`` (presumably what adds the
        ``month_id`` column used by the aggregations -- confirm), renames
        ``priogrid_gid`` -> ``pg_id`` and ``type_of_violence`` -> ``tv`` and
        keeps ``tv < 4``. ``self.pg_ged`` additionally excludes events with
        ``where_prec`` 4 or 6.
        """
        # Copy the filtered rows so the column assignment below does not write to a slice.
        ged = self.ged[self.ged.priogrid_gid >= 1].copy()
        ged['date_end'] = pd.to_datetime(ged['date_end'])
        ged = pd.DataFrame.pgm.from_datetime(ged, 'date_end').rename(
            columns={'priogrid_gid': 'pg_id', 'type_of_violence': 'tv'}
        )
        ged = ged[ged.tv < 4]
        self.ged = ged
        self.pg_ged = ged[(ged.where_prec != 4) & (ged.where_prec != 6)]

    @staticmethod
    def _sum_and_count(events, keys):
        """Sum and count ``best``/``high`` per ``keys`` group; returns flat ``<col>_<stat>`` columns."""
        agg = events[keys + ['best', 'high']].groupby(by=keys).aggregate(['sum', 'count'])
        # Flatten ('best', 'sum') -> 'best_sum' etc. right away.
        agg.columns = [f'{column}_{stat}' for column, stat in agg.columns]
        return agg.reset_index()

    @classmethod
    def _pivot_violence_types(cls, agg, index_cols):
        """Pivot long-form aggregates to one row per ``index_cols`` with ``ged_<type>_<measure>_nokgi`` columns."""
        measures = list(cls._MEASURES)
        wide = agg.pivot(index=index_cols, columns=['tv'], values=measures)

        # Guarantee a column for every violence type, even when a type has no
        # events in this dataset (otherwise the lookup raises KeyError).
        expected = pd.MultiIndex.from_product(
            [measures, [tv for tv, _ in cls._VIOLENCE_TYPES]]
        )
        wide = wide.reindex(columns=expected).fillna(0).astype('int64')

        # Same nesting order as ``expected``: measure outer, violence type inner.
        wide.columns = [
            f'ged_{label}_{measure}_nokgi'
            for measure in measures
            for _, label in cls._VIOLENCE_TYPES
        ]
        return wide.reset_index()

    def aggregate_to_pg(self):
        """
        Aggregate GED to Priogrid level.

        Groups ``self.pg_ged`` by type of violence, ``pg_id`` and ``month_id``,
        takes sum and count of ``best`` and ``high`` and pivots to one row per
        ``(pg_id, month_id)``. The panel is not filled in here (the original
        note says the DB automates the infill).

        Returns:
            The wide DataFrame, also stored in ``self.ged_agg_pgm``.
        """
        ged_agg = self._sum_and_count(self.pg_ged, ['tv', 'pg_id', 'month_id'])
        self.ged_agg_pgm = self._pivot_violence_types(ged_agg, ['pg_id', 'month_id'])
        return self.ged_agg_pgm

    def aggregate_to_cm(self):
        """
        Aggregate GED to country-month level.

        Groups ``self.ged`` by type of violence, ``country_id`` and
        ``month_id``, keeps only rows that ``cm.soft_validate_gwcode`` marks as
        valid, maps them through ``cm.from_gwcode`` and pivots to one row per
        ``(c_id, month_id)``.

        Returns:
            The wide DataFrame, also stored in ``self.ged_agg_cm``.
        """
        ged_cm_agg = self._sum_and_count(self.ged, ['tv', 'country_id', 'month_id'])

        # GED is sometimes faulty in terms of what countries and months it contains.
        # We need to filter out the not working data.
        ged_cm_agg = pd.DataFrame.cm.soft_validate_gwcode(ged_cm_agg, 'country_id', 'month_id')
        ged_cm_agg = ged_cm_agg[ged_cm_agg.valid_id == True]

        ged_cm_agg = pd.DataFrame.cm.from_gwcode(ged_cm_agg,
                                                 gw_col='country_id',
                                                 month_col='month_id')
        del ged_cm_agg['country_id']

        self.ged_agg_cm = self._pivot_violence_types(ged_cm_agg, ['c_id', 'month_id'])
        return self.ged_agg_cm
    


## List of all the GED Candidate Versions

In [None]:
# Monthly candidate releases "<yy>.0.<month>" for the years 18 through 23,
# months 1 through 12, in chronological order.
GED_VERSIONS = [
    f"{year}.0.{month}"
    for year in range(18, 24)
    for month in range(1, 13)
]

## Gather all the Datasets

In [None]:
# Version 2: release 21.0.5 has a data issue and is pinned to month_id 497;
# every other release keeps the month_id that occurs most often in it.
month_overrides = {'21.0.5': 497}

for version in GED_VERSIONS:
    print(f"\n Processing GED Version: {version}")

    # Download, filter and aggregate this release.
    loader = GedLoader(version)
    loader.fetch_ged()
    loader.filter_ged()
    pg_df = loader.aggregate_to_pg()
    cm_df = loader.aggregate_to_cm()

    # Pick the single month to keep for the priogrid and country-month frames.
    if version in month_overrides:
        pg_month = cm_month = month_overrides[version]
    else:
        pg_month = pg_df["month_id"].mode()[0]
        cm_month = cm_df["month_id"].mode()[0]

    pg_df_filtered = pg_df[pg_df["month_id"] == pg_month]
    cm_df_filtered = cm_df[cm_df["month_id"] == cm_month]

    # Write both frames first, then report them.
    stem = f"candidate_{version.replace('.', '_')}"
    outputs = (
        (f"{stem}_pgm.csv", pg_df_filtered),
        (f"{stem}_cm.csv", cm_df_filtered),
    )
    for path, frame in outputs:
        frame.to_csv(path, index=False)
    for path, frame in outputs:
        print(f"Saved: {path} ({len(frame)} rows)")

In [None]:
# Same export without the 21.0.5 special case: every release keeps the
# month_id that occurs most often, and the files get a "_filtered" suffix.
for version in GED_VERSIONS:
    print(f"\n Processing GED Version: {version}")

    # Download, filter and aggregate this release.
    loader = GedLoader(version)
    loader.fetch_ged()
    loader.filter_ged()
    pg_df = loader.aggregate_to_pg()
    cm_df = loader.aggregate_to_cm()

    # Most frequent month_id of each frame.
    pg_month = pg_df["month_id"].mode()[0]
    cm_month = cm_df["month_id"].mode()[0]

    pg_df_filtered = pg_df[pg_df["month_id"] == pg_month]
    cm_df_filtered = cm_df[cm_df["month_id"] == cm_month]

    # Write both frames first, then report them.
    stem = f"candidate_{version.replace('.', '_')}"
    outputs = (
        (f"{stem}_pgm_filtered.csv", pg_df_filtered),
        (f"{stem}_cm_filtered.csv", cm_df_filtered),
    )
    for path, frame in outputs:
        frame.to_csv(path, index=False)
    for path, frame in outputs:
        print(f"Saved: {path} ({len(frame)} rows)")

## Create the Dataframes

In [None]:
# Find the per-version country-month CSV files ("candidate_*_cm.csv").
# Sorted so the row order of the combined frame is reproducible between runs.
cm_files = sorted(glob.glob("candidate_*_cm.csv"))
if not cm_files:
    # Without this check pd.concat fails with an unhelpful "No objects to concatenate".
    raise FileNotFoundError("No candidate_*_cm.csv files found in the working directory.")

# List to store dataframes
df_list = []

# Load each CM file and append it to the list
for file in cm_files:
    print(f"Loading: {file}")
    df = pd.read_csv(file)
    df["source_version"] = file  # Add a column to track the source file
    df_list.append(df)

# Concatenate all DataFrames
combined_df = pd.concat(df_list, ignore_index=True)

# Save the combined DataFrame and report the file that was actually written.
combined_path = "candidates_all.csv"
combined_df.to_csv(combined_path, index=False)

print(f"Combined DataFrame saved: {combined_path} ({len(combined_df)} rows)")


## Renaming Variables

In [None]:
# Display the column names of the combined frame.
combined_df.columns

In [None]:
# Map the ged_*_nokgi aggregate columns to their candidate_* names in one pass.
# The best-estimate counts become plain "<type>_count"; the high-estimate counts
# keep "high" in the name.
column_renames = {
    'ged_sb_best_sum_nokgi': 'candidate_sb_best',
    'ged_ns_best_sum_nokgi': 'candidate_ns_best',
    'ged_os_best_sum_nokgi': 'candidate_os_best',
    'ged_sb_high_sum_nokgi': 'candidate_sb_high',
    'ged_ns_high_sum_nokgi': 'candidate_ns_high',
    'ged_os_high_sum_nokgi': 'candidate_os_high',
    'ged_sb_best_count_nokgi': 'candidate_sb_count',
    'ged_ns_best_count_nokgi': 'candidate_ns_count',
    'ged_os_best_count_nokgi': 'candidate_os_count',
    'ged_sb_high_count_nokgi': 'candidate_sb_high_count',
    'ged_ns_high_count_nokgi': 'candidate_ns_high_count',
    'ged_os_high_count_nokgi': 'candidate_os_high_count',
}
combined_df.rename(columns=column_renames, inplace=True)

# The source-file tracking column is not carried forward.
combined_df.drop(columns=['source_version'], inplace=True)

In [None]:
# Continue on a copy so combined_df itself is left as it is.
df = combined_df.copy()

# Soft Validate Data

In [None]:
# Cast the identifier columns to integers. The aggregation cells in this
# notebook emit c_id and month_id but no cm_id, and DataFrame.astype raises
# KeyError for a mapping key that is not a column, so only the id columns
# that are actually present are cast.
id_columns = [col for col in ('c_id', 'cm_id') if col in df.columns]
df = df.astype({col: 'int' for col in id_columns})

In [None]:
# NOTE(review): presumably derives the database id through ingester3's `cm` accessor;
# confirm what db_id() returns before relying on `df` downstream.
df = df.cm.db_id()

In [None]:
# Test the uniqueness of country_id and month combinations, as reported by
# the `cm` accessor's is_unique.
print('Is this CM unique?', df.cm.is_unique)

# Ingestion Step

In [None]:
#ingestion function, adjust as needed to either keep the outpanel or wipe it
def ingestion(data, table_name):
    """
    Write `data` to the table `table_name` through a "cm" DBWriter.

    The writer is created with in-panel wipe and zero switched on and
    out-panel wipe and zero switched off, and its time extents are set to the
    smallest and largest `month_id` found in `data`.

    Args:
        data: DataFrame with a `month_id` column.
        table_name: passed to the writer's transfer step as `tname`.
    """
    print("ingestion step")
    panel_options = dict(
        in_panel_wipe=True,
        out_panel_wipe=False,
        in_panel_zero=True,
        out_panel_zero=False,
    )
    writer = DBWriter(data, "cm", **panel_options)
    first_month = data.month_id.min()
    last_month = data.month_id.max()
    writer.set_time_extents_min_max(first_month, last_month)
    writer.transfer(tname=table_name)

In [None]:
# Ingest the prepared frame, using 'nowcasting' as the table name.
ingestion(df, 'nowcasting')

In [None]:
# Completion marker, printed once the cells above have run.
print('All done!')

In [None]:
#ingestion step
#ingestion(data, 'nowcasting')

In [None]:
# Shell escape to the viewser CLI: list the features at the cm level
# (presumably to check that the ingested columns are visible -- confirm).
!viewser features list cm