In [2]:
import pandas as pd
import re
import os

In [3]:
# Locate the raw crisis dataset relative to the notebook's working directory
data_dir = os.path.join('..', 'data')
file_path = os.path.join(data_dir, 'global_crisis_data.csv')

In [4]:
# Read the raw CSV into the working frame used by every later cell
df = pd.read_csv(filepath_or_buffer=file_path)

In [5]:
# Rename columns
# Raw header -> short snake_case name used by the rest of the notebook.
# NOTE(review): rename() silently skips keys that match no column. The two
# 'SOVEREIGN EXTERNAL DEBT' keys end in a literal '...', which looks like an
# abbreviation of the real headers - confirm they actually match.
column_renames = {
    'Banking Crisis ': 'banking_crisis',
    'Banking_Crisis_Notes': 'banking_notes',
    'Systemic Crisis': 'systemic_crisis',
    'Domestic_Debt_In_Default': 'domestic_debt',
    'Domestic_Debt_ Notes/Sources': 'domestic_notes',
    'SOVEREIGN EXTERNAL DEBT 1: ...': 'external_debt_1',
    'SOVEREIGN EXTERNAL DEBT 2: ...': 'external_debt_2',
    'Defaults_External_Notes': 'external_notes',
    'Currency Crises': 'currency_crisis',
    'Inflation Crises': 'inflation_crisis',
    'Independence': 'independence',
    'Gold Standard': 'gold_standard',
    'national currency': 'national_currency',
}
df.rename(columns=column_renames, inplace=True)

In [6]:
# Flag columns that get coerced to numeric before the event scan
cols_to_convert = [
    'banking_crisis',
    'systemic_crisis',
    'currency_crisis',
    'inflation_crisis',
    'independence',
    'gold_standard',
]

In [7]:
# Coerce all flag columns in one column-wise pass; unparseable cells become NaN
df[cols_to_convert] = df[cols_to_convert].apply(pd.to_numeric, errors='coerce')

# The inflation series is converted only when the column is actually present
infl_col = 'Inflation, Annual percentages of average consumer prices'
if infl_col in df:
    df[infl_col] = pd.to_numeric(df[infl_col], errors='coerce')


In [8]:
# --- Currency Parser Function ---
# One period looks like "<start>-<end>-<name>". Both separators accept a
# hyphen, en dash or em dash with optional surrounding spaces, and the
# open-ended marker "present" is matched case-insensitively.
_CURRENCY_PERIOD_RE = re.compile(
    r'(\d{4})\s*[-–—]\s*(\d{4}|present)\s*[-–—]\s*(.*)',
    re.IGNORECASE,
)


def parse_currency_periods(currency_str):
    """
    Parse a national-currency timeline into (start_year, end_year, name) tuples.

    The field is a comma-separated list of periods, for example:
    "1830-1877-French coins, 1878-1964-Algerian franc, 1964-present-dinar"

    Parameters
    ----------
    currency_str : str or missing value
        Raw cell content. Anything that is not a string (NaN, None, a stray
        number) is treated as "no information".

    Returns
    -------
    list of (int, int, str)
        One tuple per period that matches the expected layout, in input
        order. An open-ended period ("present") gets the sentinel end year
        9999. Comma-separated parts that do not match are skipped.
    """
    if not isinstance(currency_str, str):
        # Covers NaN/None as well as non-text cells that have no .split()
        return []

    entries = []
    for part in currency_str.split(','):
        match = _CURRENCY_PERIOD_RE.match(part.strip())
        if match is None:
            continue
        start_raw, end_raw, name_raw = match.groups()
        # 9999 stands in for "still in use" so numeric range checks keep working
        end_year = 9999 if end_raw.lower() == 'present' else int(end_raw)
        entries.append((int(start_raw), end_year, name_raw.strip()))
    return entries

In [9]:
# Precompute currency timelines for each country
# A single grouped pass replaces re-filtering the whole frame once per country.
# GroupBy.first() returns the first non-null currency text of each country
# (or a null when the country has none), and rows with a missing Country are
# left out of the groups. sort=False keeps countries in order of appearance.
first_currency_text = (
    df.groupby('Country', sort=False)['national_currency'].first()
)

# parse_currency_periods() maps a missing text to [], so every country gets an
# entry even when no currency information is available.
currency_timeline = {
    country: parse_currency_periods(currency_text)
    for country, currency_text in first_currency_text.items()
}

In [10]:
# Per-country memory of the most recently seen flag values, plus the
# accumulator that collects one record per detected event
last_independence, last_gold_standard = {}, {}
refined_events = list()

In [11]:
# Process each row
# Walk the frame once, collect (event, note) pairs for every country-year and
# append one output record per pair. Transition events compare each row with
# the previous row seen for the same country, so this presumably expects rows
# in chronological order within a country - TODO confirm against the CSV.
for _, row in df.iterrows():
    country = row['Country']
    year = row['Year']
    # Rows without a country or a year cannot be placed on a timeline
    if pd.isna(country) or pd.isna(year):
        continue

    domestic_notes = row.get('domestic_notes')
    external_notes = row.get('external_notes')
    events = []

    # --- Crises ---
    if row['banking_crisis'] == 1:
        banking_note = row.get('banking_notes')
        events.append(("Banking Crisis", banking_note if pd.notna(banking_note) else None))
    if row['systemic_crisis'] == 1:
        events.append(("Systemic Crisis", None))
    if row['currency_crisis'] == 1:
        events.append(("Currency Crisis", None))
    if row['inflation_crisis'] == 1:
        # The inflation column is optional (its conversion is guarded by a
        # column check), so use .get() instead of indexing to avoid a KeyError
        inflation_level = row.get(infl_col)
        inflation_note = f"Inflation level: {inflation_level}" if pd.notna(inflation_level) else None
        events.append(("Inflation Crisis", inflation_note))

    # --- Independence ---
    # Same guarded logic as the gold standard below: a missing (NaN) flag is
    # never equal to anything, so an unguarded "!=" would report a loss of
    # independence on every row where the flag is missing.
    curr_indep = row['independence']
    if country in last_independence:
        prev_indep = last_independence[country]
        if curr_indep != prev_indep:
            if curr_indep == 1:
                event = "Gain of Independence"
            elif curr_indep == 0 and prev_indep == 1:
                event = "Loss of Independence"
            else:
                # Missing or unknown value on either side: not a real transition
                event = None
            if event:
                events.append((event, None))
    last_independence[country] = curr_indep

    # --- Gold Standard ---
    curr_gold = row['gold_standard']
    if country in last_gold_standard:
        if curr_gold != last_gold_standard[country]:
            if curr_gold == 1:
                event = "Incorporation of Gold Standard"
            elif curr_gold == 0 and last_gold_standard[country] == 1:
                event = "Removing of Gold Standard"
            else:
                event = None
            if event:
                events.append((event, None))
    last_gold_standard[country] = curr_gold

    # --- Currency Change ---
    # A change is reported in the year a parsed currency period starts
    timeline = currency_timeline.get(country, [])
    for start_year, end_year, currency_name in timeline:
        if int(year) == start_year:
            events.append(("Currency Change", f"New currency: {currency_name}"))

    # Record all events for this year; the debt notes are repeated on each one
    for event_type, note in events:
        refined_events.append({
            'country': country,
            'year': year,
            'event': event_type,
            'event_notes': note,
            'domestic_notes': domestic_notes,
            'external_notes': external_notes
        })

In [12]:
# Turn the collected event records into a frame ordered by country, then year
final_df = pd.DataFrame(refined_events).sort_values(by=['country', 'year'])

In [13]:
# Preview the event table after sorting by country and year
final_df

Unnamed: 0,country,year,event,event_notes,domestic_notes,external_notes
0,Algeria,1830.0,Currency Change,New currency: French coins,,
1,Algeria,1870.0,Banking Crisis,,,
2,Algeria,1870.0,Systemic Crisis,,,
3,Algeria,1877.0,Inflation Crisis,Inflation level: 29.1,,
4,Algeria,1878.0,Incorporation of Gold Standard,,,
...,...,...,...,...,...,...
4002,Zimbabwe,2008.0,Currency Crisis,,With over 98.5 percent of domestic debt with m...,
4003,Zimbabwe,2008.0,Inflation Crisis,,With over 98.5 percent of domestic debt with m...,
4004,Zimbabwe,2009.0,Banking Crisis,Two of five commercial banks have a high level...,With over 98.5 percent of domestic debt with m...,
4005,Zimbabwe,2009.0,Systemic Crisis,,With over 98.5 percent of domestic debt with m...,


In [14]:
def _strip_if_object(column):
    """Strip surrounding whitespace on object-dtype columns; pass others through."""
    if column.dtype == "object":
        return column.str.strip()
    return column


final_df = final_df.apply(_strip_if_object)

In [15]:
# Export the stripped event table without the index column.
# NOTE(review): this file is written before the gold-standard events are
# relabelled further down, so it keeps the original labels; a second export
# ('clean_crisis_data.csv') follows the relabelling - confirm both are wanted.
cleaned_csv_path = os.path.join('..', 'data', 'cleaned_crisis_data.csv')
final_df.to_csv(cleaned_csv_path, index=False)

In [16]:
# Preview the event table after whitespace stripping and the first CSV export
final_df

Unnamed: 0,country,year,event,event_notes,domestic_notes,external_notes
0,Algeria,1830.0,Currency Change,New currency: French coins,,
1,Algeria,1870.0,Banking Crisis,,,
2,Algeria,1870.0,Systemic Crisis,,,
3,Algeria,1877.0,Inflation Crisis,Inflation level: 29.1,,
4,Algeria,1878.0,Incorporation of Gold Standard,,,
...,...,...,...,...,...,...
4002,Zimbabwe,2008.0,Currency Crisis,,With over 98.5 percent of domestic debt with m...,
4003,Zimbabwe,2008.0,Inflation Crisis,,With over 98.5 percent of domestic debt with m...,
4004,Zimbabwe,2009.0,Banking Crisis,Two of five commercial banks have a high level...,With over 98.5 percent of domestic debt with m...,
4005,Zimbabwe,2009.0,Systemic Crisis,,With over 98.5 percent of domestic debt with m...,


In [17]:
# Relabel the two gold-standard events; every other event name is left as is
event_relabels = {
    "Incorporation of Gold Standard": "Gold Standard Adoption",
    "Removing of Gold Standard": "Gold Standard Suspension",
}
final_df['event'] = final_df['event'].replace(event_relabels)


In [18]:
# Preview the event table after the gold-standard events were relabelled
final_df

Unnamed: 0,country,year,event,event_notes,domestic_notes,external_notes
0,Algeria,1830.0,Currency Change,New currency: French coins,,
1,Algeria,1870.0,Banking Crisis,,,
2,Algeria,1870.0,Systemic Crisis,,,
3,Algeria,1877.0,Inflation Crisis,Inflation level: 29.1,,
4,Algeria,1878.0,Gold Standard Adoption,,,
...,...,...,...,...,...,...
4002,Zimbabwe,2008.0,Currency Crisis,,With over 98.5 percent of domestic debt with m...,
4003,Zimbabwe,2008.0,Inflation Crisis,,With over 98.5 percent of domestic debt with m...,
4004,Zimbabwe,2009.0,Banking Crisis,Two of five commercial banks have a high level...,With over 98.5 percent of domestic debt with m...,
4005,Zimbabwe,2009.0,Systemic Crisis,,With over 98.5 percent of domestic debt with m...,


In [21]:
# Export the relabelled event table without the index column.
# NOTE(review): an earlier cell writes 'cleaned_crisis_data.csv' with the
# pre-relabelling event names - confirm which file downstream code reads.
clean_csv_path = os.path.join('..', 'data', 'clean_crisis_data.csv')
final_df.to_csv(clean_csv_path, index=False)