#### Import relevant packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno

%matplotlib inline

# Raise pandas' display limits so that long/wide frames are rendered in full.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

## Data

##### Source:
Wimmer and Min (2006) – “From empire to nation-state: Explaining war in the modern world, 1816-2001”, American Sociological Review 71(6):867-897, 2006.

In [2]:
# Load the full country-year dataset (path is relative to the notebook's working directory).
complete_data = pd.read_csv(filepath_or_buffer='../data/original/colonizers.csv')

### Viewing the data

In [3]:
# Preview the first five rows of the raw data.
complete_data.head(n=5)

Unnamed: 0,yearc,year,cowcode,country,onset,war,warname,warno,wartype,yrbeg,yrend,anarc,anarcl,anoc,anocl,area2001,asia,autoc,autocl,democ,democl,eeurop,ethfrac,gdp,gdppc,gdppcl,implag,imppower,instab,instabl,lamerica,lmtnest,lnpop,lnpopl,milperc,milpercl,nafrme,nbcivil,nbconq,nbinter,nbnatind,nbnonind,nsflag,nsfyear,ocivil,oconq,oil,oilpc,oilpcl,ointer,ointrap,onatind,ononind,pdemnb,pocivil,poconq,pointer,pointrap,poldisc,poldiscl,ponatind,pononind,ponset,pop,relfrac,ssafrica,western
0,18162,1816,2,United States of America,0,0,,,,,,0,,0,,9629090.0,0.0,0,,1,,0.0,0.35,,,,,,1,,0.0,3.21,,,0.0,,0.0,0,0,0,0,0,-49,1865,0,0,0.0,0.0,,0,0,0,0,0.0,0,0,0,0,,,0,0,0,,0.59,0.0,1.0
1,18172,1817,2,United States of America,0,0,,,,,,0,0.0,0,0.0,9629090.0,0.0,0,0.0,1,1.0,0.0,0.35,,,,,,1,1.0,0.0,3.21,,,-0.06,0.0,0.0,0,0,0,1,0,-48,1865,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0,0,,,0,0,0,,0.59,0.0,1.0
2,18182,1818,2,United States of America,0,0,,,,,,0,0.0,0,0.0,9629090.0,0.0,0,0.0,1,1.0,0.0,0.35,,,,,,0,1.0,0.0,3.21,,,-0.08,-0.06,0.0,0,0,0,1,0,-47,1865,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0,0,,,0,0,0,,0.59,0.0,1.0
3,18192,1819,2,United States of America,0,0,,,,,,0,0.0,0,0.0,9629090.0,0.0,0,0.0,1,1.0,0.0,0.35,,,,,,0,0.0,0.0,3.21,,,-0.11,-0.08,0.0,0,0,0,0,0,-46,1865,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0,0,,,0,0,0,,0.59,0.0,1.0
4,18202,1820,2,United States of America,0,0,,,,,,0,0.0,0,0.0,9629090.0,0.0,0,0.0,1,1.0,0.0,0.35,12548.0,1257.18,,,,0,0.0,0.0,3.21,9.208389,,0.01,-0.11,0.0,0,0,0,0,0,-45,1865,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0,0,,,0,0,0,9980.51,0.59,0.0,1.0


## Creating df focusing on countries and their colonizers

In [4]:
# Keep only the `country` and `imppower` columns of the full dataset.
imppowerdata = complete_data.filter(items=['country', 'imppower'], axis='columns')

In [5]:
# Distinct values found in the `imppower` column (NaN included).
imppowerdata.loc[:, 'imppower'].unique()

array([nan, 'United Kingdom', 'Spain', 'USA', 'Haiti', 'Colombia',
       'Netherlands', 'Bolivia', 'Portugal', 'Brazil', 'Germany',
       'MIXED RULE', 'Austria-Hungary', 'Russia', 'Czechoslovakia',
       'Turkey', 'Yugoslavia', 'Sweden', 'Denmark', 'France', 'Sudan',
       'Belgium', 'Egypt', 'Italy', 'Ethiopia', 'South Africa', 'China',
       'Korea', 'Japan', 'Pakistan', 'Thailand', 'Australia'],
      dtype=object)

##### Creating df with correct format

In [6]:
# One row per distinct (country, imppower) pair: rows with a missing value in
# either column are discarded first, then repeated pairs are collapsed to
# their first occurrence (the original row index is kept).
colonies_df = (
    imppowerdata
    .dropna(axis=0, how='any')
    .drop_duplicates(keep='first')
)
colonies_df

Unnamed: 0,country,imppower
186,Canada,United Kingdom
372,Cuba,Spain
454,Cuba,USA
559,Haiti,USA
745,Dominican Republic,Spain
751,Dominican Republic,Haiti
795,Dominican Republic,USA
931,Mexico,Spain
1120,Belize,United Kingdom
1306,Guatemala,Spain


##### How many countries are in the colonies df?

In [7]:
# Number of distinct country names in colonies_df
# (dropna=False keeps the count identical to len(Series.unique())).
colonies_df['country'].nunique(dropna=False)

128

In [17]:
# Load the HFI dataset (path is relative to the notebook's working directory).
hfi_df = pd.read_csv(filepath_or_buffer='../data/original/hfi_cc_2018.csv')

TypeError: parser_f() got an unexpected keyword argument 'index'

##### How many countries in the HFI df?

In [9]:
# Number of distinct country names in hfi_df
# (dropna=False keeps the count identical to len(Series.unique())).
hfi_df['countries'].nunique(dropna=False)

162

There are more countries in the HFI df than in the colonies df.
This could be due to several reasons:
- The countries present in the HFI df but not in the colonies df were never colonized
- There isn't HFI data for the countries that are in the colonies df but not in the HFI df
- The country names do not match between the two dfs

##### Which countries are in the hfi df but not in the colonies df?

In [10]:
# Country names that appear in hfi_df but have no row in colonies_df, alphabetically.
sorted(set(hfi_df['countries']).difference(colonies_df['country']))

['Argentina',
 'Bahamas',
 'Bahrain',
 'Barbados',
 'Bhutan',
 'Brunei Darussalam',
 'Cape Verde',
 'Central Afr. Rep.',
 'China',
 'Congo, Dem. R.',
 'Congo, Rep. Of',
 "Cote d'Ivoire",
 'Czech Rep.',
 'Denmark',
 'Dominican Rep.',
 'Ethiopia',
 'Fiji',
 'France',
 'Gambia, The',
 'Germany',
 'Hong Kong',
 'Iran',
 'Jamaica',
 'Japan',
 'Korea, South',
 'Kuwait',
 'Kyrgyz Republic',
 'Liberia',
 'Libya',
 'Luxembourg',
 'Malta',
 'Mauritius',
 'Montenegro',
 'Nepal',
 'Oman',
 'Pap. New Guinea',
 'Paraguay',
 'Portugal',
 'Russia',
 'Saudi Arabia',
 'Serbia',
 'Seychelles',
 'Singapore',
 'Slovak Rep.',
 'Spain',
 'Sweden',
 'Switzerland',
 'Thailand',
 'Timor-Leste',
 'Trinidad and Tobago',
 'Turkey',
 'United Arab Emirates',
 'United Kingdom',
 'United States',
 'Yemen, Rep.']

##### Which countries are in the colonies df but not in the hfi df?

In [11]:
# Country names that appear in colonies_df but have no row in hfi_df, alphabetically.
sorted(set(colonies_df['country']).difference(hfi_df['countries']))

['Central African Republic',
 'Congo',
 'Cuba',
 'Czech Republic',
 'Democratic Republic of the Congo',
 'Djibouti',
 'Dominican Republic',
 'Equatorial Guinea',
 'Eritrea',
 'Gambia',
 'Ivory Coast',
 'Kyrgyzstan',
 'North Korea',
 'Papua New Guinea',
 'Slovakia',
 'Somalia',
 'South Korea',
 'Turkmenistan',
 'Uzbekistan',
 'Yemen',
 'Yugoslavia']

**The hfi df does not contain the following countries:**
North Korea, Djibouti, Equatorial Guinea, Eritrea, Cuba, Uzbekistan, Yugoslavia, Somalia, Turkmenistan

In [12]:
# Maps a country name as spelled in colonies_df (key) to the spelling used in
# the `countries` column of hfi_df (value), so the two frames share one naming.
country_names_replace = {
    'Yemen': 'Yemen, Rep.',
    'Papua New Guinea': 'Pap. New Guinea',
    'Kyrgyzstan': 'Kyrgyz Republic',
    'Central African Republic': 'Central Afr. Rep.',
    'South Korea': 'Korea, South',
    'Slovakia': 'Slovak Rep.',
    'Gambia': 'Gambia, The',
    'Dominican Republic': 'Dominican Rep.',
    'Congo': 'Congo, Rep. Of',
    'Ivory Coast': "Cote d'Ivoire",
    'Democratic Republic of the Congo': 'Congo, Dem. R.',
    'Czech Republic': 'Czech Rep.',
}

In [13]:
# Rewrite country names in the `country` column only, using the
# country_names_replace mapping; names without an entry are left as they are.
colonies_df = colonies_df.replace(to_replace={'country': country_names_replace})

In [18]:
# Persist colonies_df as a semicolon-separated file, without the row index.
colonies_df.to_csv(path_or_buf='../data/temp.csv', sep=';', index=False)

In [15]:
# For every country, collect the array of distinct `imppower` values recorded
# for it; the result is a one-column frame indexed by country.
(
    imppowerdata
    .dropna()
    .groupby('country')['imppower']
    .unique()
    .to_frame()
)

Unnamed: 0_level_0,imppower
country,Unnamed: 1_level_1
Albania,[Turkey]
Algeria,[France]
Angola,[Portugal]
Armenia,"[MIXED RULE, Russia]"
Australia,[United Kingdom]
Austria,[Germany]
Azerbaijan,[Russia]
Bangladesh,"[United Kingdom, Pakistan]"
Belarus,[Russia]
Belgium,"[Netherlands, Germany]"


In [16]:
# Distinct (country, imppower) pairs in long form: drop rows with a missing
# value, then keep the first occurrence of each remaining pair.
imppowerdata.dropna(axis=0, how='any').drop_duplicates(keep='first')

Unnamed: 0,country,imppower
186,Canada,United Kingdom
372,Cuba,Spain
454,Cuba,USA
559,Haiti,USA
745,Dominican Republic,Spain
751,Dominican Republic,Haiti
795,Dominican Republic,USA
931,Mexico,Spain
1120,Belize,United Kingdom
1306,Guatemala,Spain
