In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import chardet

In [2]:
# In the datasets we are using, each country is associated with a country code.
# Here, we are reading in the csv that contains these codes and creating
# a dictionary so that we can access countries by their code rather than their name.
# This is important because the names of countries have changed over the time period
# we are looking at (e.g. USSR -> Russia, Ottoman Empire -> Turkey).
# It is important to keep this in mind during our analysis as this will show up as
# "Russia" having fought in WW2, for example, when really "USSR" fought in WW2.
# This is okay for our purposes because we are not primarily concerned about
# individual states but rather the quantitative things about states that are correlated with war.

# Build the code table in one chain: read the CSV, drop repeated rows, and
# index by the numeric country code ("CCode").
country_codes = (
    pd.read_csv("correlates_of_war/COW-country-codes.csv")
    .drop_duplicates()
    .set_index("CCode")
)

# Map each CCode to a dict of that row's remaining columns, so a state can be
# looked up by code rather than by name.
country_dict = country_codes.to_dict(orient="index")

In [3]:
# Here we are checking that this worked correctly by plugging in some of
# the country codes and making sure they are associated with the right country.

# Spot-check known codes; expected order: USA, Canada, Russia.
for code in (2, 20, 365):
    print(country_dict[code])

# No country is associated with code 3, so this lookup should raise KeyError.
try:
    entry = country_dict[3]
except KeyError:
    print("Not a valid country code")
else:
    print(entry)

{'StateAbb': 'USA', 'StateNme': 'United States of America'}
{'StateAbb': 'CAN', 'StateNme': 'Canada'}
{'StateAbb': 'RUS', 'StateNme': 'Russia'}
Not a valid country code


In [4]:
# Now we are going to read in the rest of our csv files. Some of the files use Latin-1 encoding, which
# causes problems when we try to read them with pandas, because pandas assumes UTF-8 by default. To make
# this simpler, I wrote a short function that figures out the encoding of a file. It isn't the most
# efficient approach, but it's not too slow and it works.
def get_encoding(filename):
    """Guess the text encoding of a file with chardet.

    The whole file is read as raw bytes and handed to ``chardet.detect``.

    Parameters
    ----------
    filename : str
        Path of the file to inspect.

    Returns
    -------
    The value stored under the ``"encoding"`` key of chardet's detection
    result, suitable for passing as ``encoding=`` to ``pd.read_csv``.
    """
    with open(filename, "rb") as handle:
        raw_bytes = handle.read()
    detection = chardet.detect(raw_bytes)
    return detection["encoding"]

# Root folder that holds every Correlates of War dataset used below.
COW_DATA_DIR = "correlates_of_war"


def read_cow_csv(relative_path):
    """Read one Correlates of War CSV, detecting its text encoding first.

    Parameters
    ----------
    relative_path : str
        Path of the CSV relative to ``COW_DATA_DIR``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, decoded with the encoding reported by ``get_encoding``.
    """
    filename = COW_DATA_DIR + "/" + relative_path
    return pd.read_csv(filename, encoding=get_encoding(filename))


# War data
interstate_war = read_cow_csv("COW War Data/Inter-StateWarData_v4.0.csv")
intrastate_war = read_cow_csv(
    "COW War Data/Intra-State-Wars-v5.1/INTRA-STATE_State_participants v5.1 CSV.csv"
)
extrastate_war = read_cow_csv("COW War Data/Extra-StateWarData_v4.0.csv")

# Diplomacy, contiguity and alliances
diplomatic_exchanges = read_cow_csv("Diplomatic Exchange/Diplomatic_Exchange_2006v1.csv")
colonial_contiguity = read_cow_csv("Colonial Contiguity/contcol.csv")
direct_contiguity = read_cow_csv("Direct Contiguity/contdir.csv")
formal_alliances = read_cow_csv("Formal Alliances/alliance_v4.1_by_member_yearly.csv")

# Militarized interstate disputes and their locations
mid = read_cow_csv("Militarized Interstate Disputes/MIDIP 5.0.csv")
midl = read_cow_csv("Militarized Interstate Dispute Locations/MIDLOCI_2.1.csv")

# State-level attributes
nmc = read_cow_csv("National Material Capabilities/NMC-60-abridged/NMC-60-abridged.csv")
major_powers = read_cow_csv("State System Membership/majors2016.csv")
territorial_change = read_cow_csv("Territorial Change/tc2018.csv")

# Trade
national_trade = read_cow_csv("Trade/National_COW_4.0.csv")
dyadic_trade = read_cow_csv("Trade/Dyadic_COW_4.0.csv")

# Religion
national_religion = read_cow_csv("World Religion/WRP_national.csv")

# **Data Cleaning**

Luckily for us, the Correlates of War Project has spent a lot of time making their data as easy to use as possible: missing values are sometimes interpolated and are otherwise explicitly flagged as missing. For example, -9 typically means missing data. I've looked through the codebooks provided with each dataset, and everywhere a negative integer occurs it signifies that the data is missing or not applicable, so we can safely replace all negative integers with NaN. We do this so that these placeholder codes don't distort our analysis later.

In [8]:
# It's important that we check that it is an int, as negative floats have meaning 
# (e.g. in location coordinates) and if we try to compare a string to 0 we get an error.
def replace_missing(df):
    """Return a copy of ``df`` with negative integer codes replaced by NaN.

    Only integer values are touched: negative floats are left alone, and
    non-numeric values such as strings are passed through without being
    compared to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose negative integers mark missing or not-applicable data.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    def _negative_int_to_nan(value):
        # Accept NumPy integer scalars as well as Python ints, since either
        # can be stored in an object column. bool is an int subclass but is
        # never < 0, so it needs no special case.
        # NOTE(review): a missing code stored as a float (e.g. -9.0) is NOT
        # replaced -- confirm no dataset relies on that.
        if isinstance(value, (int, np.integer)) and value < 0:
            return np.nan
        return value

    # DataFrame.applymap was deprecated in pandas 2.1 in favour of
    # DataFrame.map; prefer the new name and fall back for older releases.
    elementwise = df.map if hasattr(df, "map") else df.applymap
    return elementwise(_negative_int_to_nan)

# Swap the negative-integer missing-data codes for NaN in each dataset.
# NOTE(review): major_powers is not passed through replace_missing here --
# confirm that is intentional.
interstate_war = replace_missing(interstate_war)
intrastate_war = replace_missing(intrastate_war)
extrastate_war = replace_missing(extrastate_war)
diplomatic_exchanges = replace_missing(diplomatic_exchanges)
colonial_contiguity = replace_missing(colonial_contiguity)
direct_contiguity = replace_missing(direct_contiguity)
formal_alliances = replace_missing(formal_alliances)
mid = replace_missing(mid)
midl = replace_missing(midl)
nmc = replace_missing(nmc)
national_trade = replace_missing(national_trade)
dyadic_trade = replace_missing(dyadic_trade)
national_religion = replace_missing(national_religion)

# territorial_change also uses '.' to indicate missing values. DataFrame.replace
# performs the exact-match swap directly, avoiding the element-wise
# DataFrame.applymap call that pandas deprecated in 2.1.
# NOTE(review): columns containing '.' are presumably read as strings, so any
# negative codes in them would be text (e.g. "-9") and left untouched by
# replace_missing -- verify against the data.
territorial_change = replace_missing(territorial_change).replace(".", np.nan)

In [None]:
# TODO: Decide how to handle the missing data. If we simply leave it as NaN,
# we need to explain why; alternatively, we could interpolate here.

# **Exploratory Data Analysis**

**TODO:** What do we want to do?

In [11]:
# Count rows per country code in each war dataset. Rows whose code is NaN are
# dropped by groupby, so they are not counted.
num_interwars = interstate_war.groupby("ccode").size()

# Intra- and extra-state wars carry two code columns ("CcodeA", "CcodeB");
# count each one separately.
num_intrawarsA, num_intrawarsB = (
    intrastate_war.groupby(code_column).size() for code_column in ("CcodeA", "CcodeB")
)
num_extrawarsA, num_extrawarsB = (
    extrastate_war.groupby(code_column).size() for code_column in ("CcodeA", "CcodeB")
)
# TODO: want to combine these into one