In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
FV_data = pd.read_csv("V-Dem_Frasier_data_1970_alliances.csv")
regime_data = pd.read_csv("Regime_types_in_1970.csv")

In [3]:
regime_countries = regime_data['country'].tolist()
regime_codes = regime_data['abbreviation'].tolist()
FV_entities1 = FV_data['Countries'].tolist()
FV_entities2 = FV_data['Entity'].tolist()
FV_codes = FV_data['ISO_Code'].tolist()

In [4]:
only_regimes=[i for i in regime_countries if i not in FV_entities2]
print(only_regimes)
print(len(only_regimes))

['UNITED STATES OF AMERICA', 'CANADA', 'CUBA', 'HAITI', 'DOMINICAN REP.', 'JAMAICA', 'TRINIDAD&TOBAGO', 'BARBADOS', 'MEXICO', 'GUATEMALA', 'HONDURAS', 'EL SALVADOR', 'NICARAGUA', 'COSTA RICA', 'PANAMA', 'COLOMBIA', 'VENEZUELA', 'GUYANA', 'ECUADOR', 'PERU', 'BRAZIL', 'BOLIVIA', 'PARAGUAY', 'CHILE', 'ARGENTINA', 'URUGUAY', 'UNITED KINGDOM', 'IRELAND', 'NETHERLANDS', 'BELGIUM', 'LUXEMBOURG', 'FRANCE', 'MONACO', 'LIECHTENSTEIN', 'SWITZERLAND', 'SPAIN', 'PORTUGAL', 'GERMANY WEST', 'GERMANY EAST', 'POLAND', 'AUSTRIA', 'HUNGARY', 'CZECHOSLOVAKIA', 'ITALY', 'SAN MARINO', 'MALTA', 'ALBANIA', 'YUGOSLAVIA', 'GREECE', 'CYPRUS', 'BULGARIA', 'ROMANIA', 'USSR', 'ESTONIA', 'LATVIA', 'LITHUANIA', 'FINLAND', 'SWEDEN', 'NORWAY', 'DENMARK', 'ICELAND', 'EQUATORIAL GUINEA', 'GAMBIA', 'MALI', 'SENEGAL', 'BENIN', 'MAURITANIA', 'NIGER', "COTE D'IVOIRE", 'GUINEA', 'BURKINA FASO', 'LIBERIA', 'SIERRA LEONE', 'GHANA', 'TOGO', 'CAMEROON', 'NIGERIA', 'GABON', 'CENTRAL AFR.R.', 'CHAD', 'CONGO, REP.', 'CONGO, DEM. REP

The trouble is, they are all capitalized so that will be fixed with the following code:

In [5]:
regime_countries= list(map(lambda x: x.title(), regime_countries))

In [6]:
only_regimes=[i for i in regime_countries if i not in FV_entities2]
print(only_regimes)
print(len(only_regimes))

['United States Of America', 'Dominican Rep.', 'Trinidad&Tobago', 'Monaco', 'Liechtenstein', 'Germany West', 'Germany East', 'Czechoslovakia', 'San Marino', 'Yugoslavia', 'Ussr', "Cote D'Ivoire", 'Central Afr.R.', 'Congo, Rep.', 'Congo, Dem. Rep.', 'Ethiopia  (Incl. Erit)', 'Yemen Arab Republic', "Yemen, People'S Dem. Rep.", 'Korea North', 'Korea South', 'Pakistan  (Incl. Banglad', 'Vietnam North', 'Vietnam South', 'Tonga', 'Nauru', 'Samoa']
26


In [7]:
only_regime_codes=[i for i in regime_codes if i not in FV_codes]
print(only_regime_codes)
print(len(only_regime_codes))

['MCO', 'LIE', 'DDR', 'CSK', 'SMR', 'SER', 'ROM', 'SVU', 'SWD', 'ZAR', 'ETF', 'YDR', 'ROK', 'VDR', 'SVR', 'TON', 'NRU', 'WSM']
18


The most efficient, and methodologically sound thing to do will be to see where the inconsistencies lie with the country codes. There could be 3 possible reasons for the inconsistencies:
1) Different codes are used between the two datasets
2) Entities are missing from one dataset or the other
3) There is genuine confusion about entities (e.g., data for CZECHOSLOVAKIA, vs. Czechia, and Slovakia, stored as separate 
entities) 

In [8]:
inconsistent_countries = []
inconsistent_country_code_pairs = {}
for i in only_regime_codes:
    country=((regime_data[regime_data['abbreviation']==i]['country']).values[0])
    inconsistent_countries.append(country)
    inconsistent_country_code_pairs[i]=country
inconsistent_countries= list(map(lambda x: x.title(), inconsistent_countries))
print(inconsistent_countries)

['Monaco', 'Liechtenstein', 'Germany East', 'Czechoslovakia', 'San Marino', 'Yugoslavia', 'Romania', 'Ussr', 'Sweden', 'Congo, Dem. Rep.', 'Ethiopia  (Incl. Erit)', "Yemen, People'S Dem. Rep.", 'Korea South', 'Vietnam North', 'Vietnam South', 'Tonga', 'Nauru', 'Samoa']


It will be good to find out which of these entities ARE included in in the main Dataframe

In [9]:
included_countries=[i for i in inconsistent_countries if i in FV_entities1 or i in FV_entities2]
print(included_countries)
print(len(included_countries))

['Romania', 'Sweden', 'Congo, Dem. Rep.']
3


Those are perfect matches and one may also look for partial matches

In [10]:
FV_entities1_filtered=[x for x in FV_entities1 if not isinstance(x, float)]

In [11]:
FV_entities2_filtered=[x for x in FV_entities2 if not isinstance(x, float)]

In [12]:
inconsistencies = {}
for i in inconsistent_countries:
    inconsistencies[i]=(list(filter(lambda x: i[0:3] in x, FV_entities1_filtered)))
print(inconsistencies)    

{'Monaco': ['Mongolia', 'Montenegro'], 'Liechtenstein': [], 'Germany East': ['Germany'], 'Czechoslovakia': ['Czech Republic'], 'San Marino': [], 'Yugoslavia': [], 'Romania': ['Romania'], 'Ussr': [], 'Sweden': ['Sweden'], 'Congo, Dem. Rep.': ['Congo, Rep.', 'Congo, Dem. Rep.'], 'Ethiopia  (Incl. Erit)': ['Ethiopia'], "Yemen, People'S Dem. Rep.": ['Yemen, Rep.'], 'Korea South': ['Korea, Rep.'], 'Vietnam North': ['Vietnam'], 'Vietnam South': ['Vietnam'], 'Tonga': [], 'Nauru': [], 'Samoa': []}


In [13]:
inconsistencies = {}
for i in inconsistent_countries:
    inconsistencies[i]=(list(filter(lambda x: i[0:3] in x, FV_entities2_filtered)))
print(inconsistencies)  
print(inconsistent_country_code_pairs)

{'Monaco': ['Mongolia'], 'Liechtenstein': [], 'Germany East': ['East Germany', 'Germany'], 'Czechoslovakia': ['Czechia'], 'San Marino': [], 'Yugoslavia': [], 'Romania': ['Romania'], 'Ussr': [], 'Sweden': ['Sweden'], 'Congo, Dem. Rep.': ['Congo', 'Democratic Republic of Congo'], 'Ethiopia  (Incl. Erit)': ['Ethiopia'], "Yemen, People'S Dem. Rep.": ['Yemen', "Yemen People's Republic"], 'Korea South': ['North Korea', 'South Korea'], 'Vietnam North': ['Republic of Vietnam', 'Vietnam'], 'Vietnam South': ['Republic of Vietnam', 'Vietnam'], 'Tonga': [], 'Nauru': [], 'Samoa': []}
{'MCO': 'MONACO', 'LIE': 'LIECHTENSTEIN', 'DDR': 'GERMANY EAST', 'CSK': 'CZECHOSLOVAKIA', 'SMR': 'SAN MARINO', 'SER': 'YUGOSLAVIA', 'ROM': 'ROMANIA', 'SVU': 'USSR', 'SWD': 'SWEDEN', 'ZAR': 'CONGO, DEM. REP.', 'ETF': 'ETHIOPIA  (INCL. ERIT)', 'YDR': "YEMEN, PEOPLE'S DEM. REP.", 'ROK': 'KOREA SOUTH', 'VDR': 'VIETNAM NORTH', 'SVR': 'VIETNAM SOUTH', 'TON': 'TONGA', 'NRU': 'NAURU', 'WSM': 'SAMOA'}


With the above output, we can get a clear idea of what entities are just being named differently (e.g., "Korea South" vs. "South Korea"), Where some entities are genuinely missing (e.g., the regime type data has Monaco, and the main dataframe does not have it) and where there is genuine confusion re: what entities are what (e.g., the main dataframe distinguishes Czechia and Slovakia, and the regime type data merges them, in accordance with the fact that they were once included together in a union: Czechoslovakia)
These latter cases will be the toughest to deal with, as they will require research to determine how best to deal with these cases.

First I will pair the inconsistent country codes with the appropriate country codes based on the output above.

In [14]:
((FV_data[FV_data['Entity']=='East Germany']['ISO_Code']).values[0])
((FV_data[FV_data['Entity']=='Romania']['ISO_Code']).values[0])
((FV_data[FV_data['Entity']=='Sweden']['ISO_Code']).values[0])
((FV_data[FV_data['Countries']=='Congo, Dem. Rep.']['ISO_Code']).values[0])
((FV_data[FV_data['Entity']=="Yemen People's Republic"]['ISO_Code']).values[0])
((FV_data[FV_data['Entity']=="South Korea"]['ISO_Code']).values[0])
#north Vietnam
((FV_data[FV_data['Entity']=="Vietnam"]['ISO_Code']).values[0])
#South Vietnam
((FV_data[FV_data['Entity']=='Republic of Vietnam']['ISO_Code']).values[0])



'OWID_RVN'

In [15]:
regime_data["abbreviation"]=regime_data["abbreviation"].replace('DDR', ((FV_data[FV_data['Entity']=='East Germany']['ISO_Code']).values[0]))
regime_data["abbreviation"]=regime_data["abbreviation"].replace('ROM', ((FV_data[FV_data['Entity']=='Romania']['ISO_Code']).values[0]))
regime_data["abbreviation"]=regime_data["abbreviation"].replace('SWD',((FV_data[FV_data['Entity']=='Sweden']['ISO_Code']).values[0]) )
regime_data["abbreviation"]=regime_data["abbreviation"].replace('ZAR',((FV_data[FV_data['Countries']=='Congo, Dem. Rep.']['ISO_Code']).values[0]) )
regime_data["abbreviation"]=regime_data["abbreviation"].replace('YDR',((FV_data[FV_data['Entity']=="Yemen People's Republic"]['ISO_Code']).values[0]) )
regime_data["abbreviation"]=regime_data["abbreviation"].replace('ROK',((FV_data[FV_data['Entity']=="South Korea"]['ISO_Code']).values[0]))
regime_data["abbreviation"]=regime_data["abbreviation"].replace('VDR',((FV_data[FV_data['Entity']=="Vietnam"]['ISO_Code']).values[0]) )
regime_data["abbreviation"]=regime_data["abbreviation"].replace('SVR',((FV_data[FV_data['Entity']=='Republic of Vietnam']['ISO_Code']).values[0]) )

Now after changing the codes to something more consistent with the main data we can explore the remaining inconsistencies that will be trickier to deal with. In summary the Regime data makes note of the fact that many independent countries today were in unions back in 1970. It provides data for the unions, whereas the main data looks at the countries as independent. This is a change in the underlying "entities" that is going to be tough to deal with no matter what. A decent strategy is just apply the regime type for the union to all union members.

In [16]:
regime_codes = regime_data['abbreviation'].tolist()
only_regime_codes=[i for i in regime_codes if i not in FV_codes]
print(only_regime_codes)
print(len(only_regime_codes))


['MCO', 'LIE', 'CSK', 'SMR', 'SER', 'SVU', 'ETF', 'TON', 'NRU', 'WSM']
10


It will be useful to see the other side of the coin; what is included in FV_codes but not regime_codes

In [17]:
only_FV_codes=[i for i in FV_codes if i not in regime_codes]
print(only_FV_codes)
print(len(only_FV_codes))


['AGO', 'ARM', 'AZE', 'BHR', 'BGD', 'BLR', 'BIH', 'CPV', 'COM', 'HRV', 'CZE', 'DJI', 'ERI', 'ETH', 'GEO', 'GNB', 'HKG', 'KAZ', 'KGZ', 'MDA', 'MOZ', 'NAM', 'MKD', 'PSE_? ', 'PNG', 'QAT', 'RUS', 'STP', 'SRB', 'SYC', 'SVK', 'SVN', 'SLB', 'SSD', 'SUR', 'TJK', 'TLS', 'TKM', 'UKR', 'UZB', 'VUT', 'OWID_ZAN', 'ARE', 'BHS', 'BLZ', 'BRN', 'MNE']
47


In [18]:
inconsistent_countries = []
for i in only_FV_codes:
    country=((FV_data[FV_data['ISO_Code']==i]['Entity']).values[0])
    inconsistent_countries.append(country)
print(inconsistent_countries)

['Angola', 'Armenia', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Belarus', 'Bosnia and Herzegovina', 'Cape Verde', 'Comoros', 'Croatia', 'Czechia', 'Djibouti', 'Eritrea', 'Ethiopia', 'Georgia', 'Guinea-Bissau', 'Hong Kong', 'Kazakhstan', 'Kyrgyzstan', 'Moldova', 'Mozambique', 'Namibia', 'North Macedonia', 'Palestine/West Bank', 'Papua New Guinea', 'Qatar', 'Russia', 'Sao Tome and Principe', 'Serbia', 'Seychelles', 'Slovakia', 'Slovenia', 'Solomon Islands', 'South Sudan', 'Suriname', 'Tajikistan', 'Timor', 'Turkmenistan', 'Ukraine', 'Uzbekistan', 'Vanuatu', 'Zanzibar', nan, nan, nan, nan, nan]


The trouble is that some of these countries may genuinely be missing from the "regime data" but many of them are simply included under "USSR" or take your pick in the regime data. I hate to say it, but the most methodologically sound thing to do would seem to be to apply the USSR Regime classification to all it's member republics. The Czechoslovakia regime classification to both Czechia and Slovakia.

Bahrain was a protectorate of Britain and an absolute monarchy under the same family simultaneously 
A major problem we're running into here is countries like Hong Kong that were British Colonies but had their own very unique system.
Crucially Bahrain and Hong Kong were both British colonies, and both not listed as their own regimes in the regime data, but they both had very different systems. Hong Kong the ultimate libertarian Republic in many ways and Bahrain an absolute monarchy. So this creates real problems for labelling regimes. I guess one strategy is see what they were labeled post independence But I am honestly inclined to give NA or "unknown" to the colonies.
Apparently Qatar was an indepedent sovereign state and absolute monarchy although it is not included in the regime data. 

I am going to opt to classify the regime as unknown in the case of all overseas colonies for European powers including the British ones, French ones, and the numerous ones possessed by Portugal. The clear problem is two overseas colonies belonging to the same European country could be governed very, very differently (e.g., Hong Kong vs. Bahrain). Given my lack of subject expertise these will be classified as unknown until such a time as a subject matter expert can shed more light on these.

In [19]:
inconsistent_countries = []
for i in only_FV_codes:
    country=((FV_data[FV_data['ISO_Code']==i]['Countries']).values[0])
    inconsistent_countries.append(country)
print(inconsistent_countries)

['Angola', 'Armenia', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Belarus', 'Bosnia and Herzegovina', 'Cabo Verde', 'Comoros', 'Croatia', 'Czech Republic', 'Djibouti', nan, 'Ethiopia', 'Georgia', 'Guinea-Bissau', 'Hong Kong SAR, China', 'Kazakhstan', 'Kyrgyz Republic', 'Moldova', 'Mozambique', 'Namibia', 'North Macedonia', nan, 'Papua New Guinea', 'Qatar', 'Russian Federation', nan, 'Serbia', 'Seychelles', 'Slovak Republic', 'Slovenia', nan, nan, 'Suriname', 'Tajikistan', 'Timor-Leste', nan, 'Ukraine', nan, nan, nan, 'United Arab Emirates', 'Bahamas, The', 'Belize', 'Brunei Darussalam', 'Montenegro']


In [20]:
Portuguese_territory = ['Angola','Cape Verde','Guinea-Bissau', 'Mozambique', 'Sao Tome and Principe','Timor' ]
USSR = ['Armenia','Azerbaijan', 'Belarus', 'Georgia', 'Kazakhstan', 'Kyrgyzstan', 'Moldova','Russia','Tajikistan','Turkmenistan','Ukraine', 'Uzbekistan']
Czechoslovakia = ['Czechia','Slovakia']
included_with_pakistan = ['Bangladesh', ]
included_with_Yugoslavia = ['Bosnia and Herzegovina','Croatia', 'North Macedonia','Serbia','Slovenia','Montenegro']
French_territory = ['Comoros','Djibouti',]
included_with_ethiopia = ['Eritrea','Ethiopia']
included_with_South_Africa = ['Namibia', ]
governed_by_jordan = ['Palestine/West Bank']
governed_by_Australia = ['Papua New Guinea']
British_territory = ['Bahrain','Hong Kong','Qatar', 'Seychelles','Solomon Islands','United Arab Emirates','Bahamas, The','Belize','Brunei Darussalam',]
included_With_Republic_of_Sudan = ['South Sudan']
Dutch_colony = ['Suriname']
other = ['Vanuatu']
included_with_Tanzania = ['Zanzibar']

The plan is to add entries to the regime data for every country in the USSR, Czechoslovakia, included_with_pakistan, included_with_Yugoslavia, included_with_ethiopia, included_with_South_Africa, included_With_Republic_of_Sudan, included_with_Tanzania. Whatever regime that overarching entity is classified as, the members will be classified as the same. This is arguably a problematic approach but if critiqued it can be re-done. Conversely I don't have any good way to label the former colonies of the European countries accurately for what kind of regime the constitute. The reason being is that regime types could vary wildly among colonies ruled by the same European country, for that reason they should all be classified as unknown. 

In [21]:
#adding appropriate rows for USSR
for i in USSR:
    new_row = {'country': i, 'abbreviation':((FV_data[FV_data['Entity']==i]['ISO_Code']).values[0]), 'regimenarrowcat':4}
    regime_data.loc[len(regime_data)] = new_row

In [22]:
#adding appropriate rows for Czechoslovakia
for i in Czechoslovakia:
    new_row = {'country': i, 'abbreviation':((FV_data[FV_data['Entity']==i]['ISO_Code']).values[0]), 'regimenarrowcat':4}
    regime_data.loc[len(regime_data)] = new_row

In [23]:
#adding appropriate rows for Yugoslavia
for i in included_with_Yugoslavia:
    new_row = {'country': i, 'abbreviation':((FV_data[FV_data['Entity']==i]['ISO_Code']).values[0]), 'regimenarrowcat':4}
    regime_data.loc[len(regime_data)] = new_row

IndexError: index 0 is out of bounds for axis 0 with size 0

In [24]:
new_row = {'country': 'Montenegro', 'abbreviation':((FV_data[FV_data['Countries']=='Montenegro']['ISO_Code']).values[0]), 'regimenarrowcat':4}
regime_data.loc[len(regime_data)] = new_row

In [25]:
new_row = {'country': 'Bangladesh', 'abbreviation':((FV_data[FV_data['Entity']=='Bangladesh']['ISO_Code']).values[0]), 'regimenarrowcat':7}
regime_data.loc[len(regime_data)] = new_row
new_row = {'country': 'Eritrea', 'abbreviation':((FV_data[FV_data['Entity']=='Eritrea']['ISO_Code']).values[0]), 'regimenarrowcat':8}
regime_data.loc[len(regime_data)] = new_row
new_row = {'country': 'Namibia', 'abbreviation':((FV_data[FV_data['Entity']=='Namibia']['ISO_Code']).values[0]), 'regimenarrowcat':10}
regime_data.loc[len(regime_data)] = new_row
new_row = {'country': 'South Sudan', 'abbreviation':((FV_data[FV_data['Entity']=='South Sudan']['ISO_Code']).values[0]), 'regimenarrowcat':7}
regime_data.loc[len(regime_data)] = new_row
new_row = {'country': 'Zanzibar', 'abbreviation':((FV_data[FV_data['Entity']=='Zanzibar']['ISO_Code']).values[0]), 'regimenarrowcat':4}
regime_data.loc[len(regime_data)] = new_row


In [26]:
regime_data.to_csv("Regime_data_entries_added.csv") 

In [27]:
regime_codes = regime_data['abbreviation'].tolist()
only_regime_codes=[i for i in regime_codes if i not in FV_codes]
print(only_regime_codes)
print(len(only_regime_codes))

['MCO', 'LIE', 'CSK', 'SMR', 'SER', 'SVU', 'ETF', 'TON', 'NRU', 'WSM']
10


In [28]:
regime_data["abbreviation"]=regime_data["abbreviation"].replace('ETF',((FV_data[FV_data['Entity']=='Ethiopia']['ISO_Code']).values[0]) )

In [29]:
only_FV_codes=[i for i in FV_codes if i not in regime_codes]
print(only_FV_codes)
print(len(only_FV_codes))

['AGO', 'BHR', 'CPV', 'COM', 'DJI', 'ETH', 'GNB', 'HKG', 'MOZ', 'PSE_? ', 'PNG', 'QAT', 'STP', 'SYC', 'SLB', 'SUR', 'TLS', 'VUT', 'ARE', 'BHS', 'BLZ', 'BRN']
22


In [30]:
inconsistent_countries = []
for i in only_FV_codes:
    country=((FV_data[FV_data['ISO_Code']==i]['Entity']).values[0])
    inconsistent_countries.append(country)
print(inconsistent_countries)

['Angola', 'Bahrain', 'Cape Verde', 'Comoros', 'Djibouti', 'Ethiopia', 'Guinea-Bissau', 'Hong Kong', 'Mozambique', 'Palestine/West Bank', 'Papua New Guinea', 'Qatar', 'Sao Tome and Principe', 'Seychelles', 'Solomon Islands', 'Suriname', 'Timor', 'Vanuatu', nan, nan, nan, nan]


Seems to be an acceptable state of affairs so we can do the merge.

In [31]:
regime_data.rename(columns={"abbreviation": 'ISO_Code'}, inplace=True)

In [32]:
regime_data.head(10)

Unnamed: 0.1,Unnamed: 0,country,ISO_Code,democracy,monarchy,regimenarrowcat
0,170.0,UNITED STATES OF AMERICA,USA,1.0,0,2
1,323.0,CANADA,CAN,1.0,1,0
2,488.0,CUBA,CUB,0.0,#NULL!,4
3,688.0,HAITI,HTI,0.0,#NULL!,6
4,864.0,DOMINICAN REP.,DOM,1.0,0,2
5,922.0,JAMAICA,JAM,1.0,1,0
6,980.0,TRINIDAD&TOBAGO,TTO,1.0,1,0
7,1034.0,BARBADOS,BRB,1.0,1,0
8,1475.0,MEXICO,MEX,0.0,#NULL!,5
9,1695.0,GUATEMALA,GTM,1.0,0,2


In [33]:
merge_try = pd.merge(FV_data,regime_data,on='ISO_Code',how='left')

  merge_try = pd.merge(FV_data,regime_data,on='ISO_Code',how='left')


In [34]:
merge_try.to_csv("Frasier_Vdem_regime&alliance.csv") 