#### Import relevant packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno

%matplotlib inline

# Raise pandas' display limits: up to 500 rows, 500 columns and a
# 1000-character output width.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Data

##### Source:
Wimmer and Min (2006) – “From Empire to Nation-State: Explaining Wars in the Modern World, 1816–2001”, American Sociological Review 71(6): 867–897.

In [2]:
# Read the CSV (path is relative to the current working directory) into a DataFrame.
complete_data = pd.read_csv(filepath_or_buffer='colonizers.csv')

In [3]:
# Preview the first five rows (five is head()'s default).
complete_data.head(n=5)

Unnamed: 0,yearc,year,cowcode,country,onset,war,warname,warno,wartype,yrbeg,yrend,anarc,anarcl,anoc,anocl,area2001,asia,autoc,autocl,democ,democl,eeurop,ethfrac,gdp,gdppc,gdppcl,implag,imppower,instab,instabl,lamerica,lmtnest,lnpop,lnpopl,milperc,milpercl,nafrme,nbcivil,nbconq,nbinter,nbnatind,nbnonind,nsflag,nsfyear,ocivil,oconq,oil,oilpc,oilpcl,ointer,ointrap,onatind,ononind,pdemnb,pocivil,poconq,pointer,pointrap,poldisc,poldiscl,ponatind,pononind,ponset,pop,relfrac,ssafrica,western
0,18162,1816,2,United States of America,0,0,,,,,,0,,0,,9629090.0,0.0,0,,1,,0.0,0.35,,,,,,1,,0.0,3.21,,,0.0,,0.0,0,0,0,0,0,-49,1865,0,0,0.0,0.0,,0,0,0,0,0.0,0,0,0,0,,,0,0,0,,0.59,0.0,1.0
1,18172,1817,2,United States of America,0,0,,,,,,0,0.0,0,0.0,9629090.0,0.0,0,0.0,1,1.0,0.0,0.35,,,,,,1,1.0,0.0,3.21,,,-0.06,0.0,0.0,0,0,0,1,0,-48,1865,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0,0,,,0,0,0,,0.59,0.0,1.0
2,18182,1818,2,United States of America,0,0,,,,,,0,0.0,0,0.0,9629090.0,0.0,0,0.0,1,1.0,0.0,0.35,,,,,,0,1.0,0.0,3.21,,,-0.08,-0.06,0.0,0,0,0,1,0,-47,1865,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0,0,,,0,0,0,,0.59,0.0,1.0
3,18192,1819,2,United States of America,0,0,,,,,,0,0.0,0,0.0,9629090.0,0.0,0,0.0,1,1.0,0.0,0.35,,,,,,0,0.0,0.0,3.21,,,-0.11,-0.08,0.0,0,0,0,0,0,-46,1865,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0,0,,,0,0,0,,0.59,0.0,1.0
4,18202,1820,2,United States of America,0,0,,,,,,0,0.0,0,0.0,9629090.0,0.0,0,0.0,1,1.0,0.0,0.35,12548.0,1257.18,,,,0,0.0,0.0,3.21,9.208389,,0.01,-0.11,0.0,0,0,0,0,0,-45,1865,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0,0,,,0,0,0,9980.51,0.59,0.0,1.0


In [10]:
# Narrow the full dataset to the year, country and imppower columns.
# filter(items=...) skips any listed label that is absent instead of raising.
imppowerdata = complete_data.filter(
    items=['year', 'country', 'imppower'],
    axis='columns',
)

In [11]:
# Distinct values of the imppower column (NaN is reported as a value too).
imppowerdata.loc[:, 'imppower'].unique()

array([nan, 'United Kingdom', 'Spain', 'USA', 'Haiti', 'Colombia',
       'Netherlands', 'Bolivia', 'Portugal', 'Brazil', 'Germany',
       'MIXED RULE', 'Austria-Hungary', 'Russia', 'Czechoslovakia',
       'Turkey', 'Yugoslavia', 'Sweden', 'Denmark', 'France', 'Sudan',
       'Belgium', 'Egypt', 'Italy', 'Ethiopia', 'South Africa', 'China',
       'Korea', 'Japan', 'Pakistan', 'Thailand', 'Australia'],
      dtype=object)

In [15]:
# Show only the rows that have no missing value in any column.
imppowerdata.dropna(axis='index', how='any')

Unnamed: 0,year,country,imppower
186,1816,Canada,United Kingdom
187,1817,Canada,United Kingdom
188,1818,Canada,United Kingdom
189,1819,Canada,United Kingdom
190,1820,Canada,United Kingdom
191,1821,Canada,United Kingdom
192,1822,Canada,United Kingdom
193,1823,Canada,United Kingdom
194,1824,Canada,United Kingdom
195,1825,Canada,United Kingdom


In [28]:
# For each country, the distinct imppower values found in rows without
# missing data, shown as a one-column DataFrame.
(
    imppowerdata
    .dropna()
    .groupby('country')['imppower']
    .unique()
    .to_frame()
)

Unnamed: 0_level_0,imppower
country,Unnamed: 1_level_1
Albania,[Turkey]
Algeria,[France]
Angola,[Portugal]
Armenia,"[MIXED RULE, Russia]"
Australia,[United Kingdom]
Austria,[Germany]
Azerbaijan,[Russia]
Bangladesh,"[United Kingdom, Pakistan]"
Belarus,[Russia]
Belgium,"[Netherlands, Germany]"


In [26]:
# Number of distinct country values; dropna=False counts NaN as a value,
# which matches taking len() of unique().
imppowerdata['country'].nunique(dropna=False)

150

In [31]:
# Export, per country, the distinct imppower values to a ';'-separated file.
# The values are joined into one plain ', '-delimited string per country:
# writing the raw arrays returned by .unique() would store their numpy repr
# (e.g. ['MIXED RULE' 'Russia']) in the CSV, which cannot be read back as a
# list. NOTE(review): confirm no consumer of temp.csv relies on the old
# bracketed form.
(
    imppowerdata
    .dropna()
    .groupby('country')['imppower']
    .agg(lambda powers: ', '.join(map(str, powers.unique())))
    .to_frame()
    .to_csv('temp.csv', sep=';')
)

In [35]:
# Every distinct country value in the full dataset, in order of first appearance.
pd.unique(complete_data['country'])

array(['United States of America', 'Canada', 'Cuba', 'Haiti',
       'Dominican Republic', 'Mexico', 'Belize', 'Guatemala', 'Honduras',
       'El Salvador', 'Nicaragua', 'Costa Rica', 'Panama', 'Colombia',
       'Venezuela', 'Guyana', 'Suriname', 'Ecuador', 'Peru', 'Brazil',
       'Bolivia', 'Paraguay', 'Chile', 'Argentina', 'Uruguay',
       'United Kingdom', 'Ireland', 'Netherlands', 'Belgium', 'France',
       'Switzerland', 'Spain', 'Portugal', 'Germany', 'Poland', 'Austria',
       'Hungary', 'Czech Republic', 'Slovakia', 'Italy', 'Albania',
       'Macedonia', 'Croatia', 'Yugoslavia', 'Bosnia and Herzegovina',
       'Slovenia', 'Greece', 'Cyprus', 'Bulgaria', 'Moldova', 'Romania',
       'Russia', 'Estonia', 'Latvia', 'Lithuania', 'Ukraine', 'Belarus',
       'Armenia', 'Georgia', 'Azerbaijan', 'Finland', 'Sweden', 'Norway',
       'Denmark', 'Iceland', 'Guinea-Bissau', 'Equatorial Guinea',
       'Gambia', 'Mali', 'Senegal', 'Benin', 'Mauritania', 'Niger',
       'Ivory Coas

In [39]:
# All rows whose country is exactly 'Argentina'.
is_argentina = imppowerdata['country'].eq('Argentina')
imppowerdata.loc[is_argentina]

Unnamed: 0,year,country,imppower
4287,1816,Argentina,
4288,1817,Argentina,
4289,1818,Argentina,
4290,1819,Argentina,
4291,1820,Argentina,
4292,1821,Argentina,
4293,1822,Argentina,
4294,1823,Argentina,
4295,1824,Argentina,
4296,1825,Argentina,


In [41]:
# NOTE(review): this cell repeats an earlier cell of the notebook verbatim.
# Distinct imppower values per country (rows containing NaN are dropped
# first), displayed as a DataFrame.
imppower_per_country = imppowerdata.dropna().groupby('country')['imppower']
imppower_per_country.unique().to_frame()

Unnamed: 0_level_0,imppower
country,Unnamed: 1_level_1
Albania,[Turkey]
Algeria,[France]
Angola,[Portugal]
Armenia,"[MIXED RULE, Russia]"
Australia,[United Kingdom]
Austria,[Germany]
Azerbaijan,[Russia]
Bangladesh,"[United Kingdom, Pakistan]"
Belarus,[Russia]
Belgium,"[Netherlands, Germany]"
