In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
import datetime as dt
import re

Cleaning

In [2]:
data = pd.read_csv("UN DATA.csv", engine='python')
# They throw problems, manual inspection
data.drop("COTE D’IVOIRE", axis=1, inplace=True)
data.drop(" UNITED KINGDOM", axis=1, inplace=True)
data.drop(" UNITED STATES", axis=1, inplace=True)
data.drop(" CHILE", axis=1, inplace=True)
data.drop(" SOMALIA", axis=1, inplace=True)
data.drop(" ALGERIA", axis=1, inplace=True)
data.drop(" BELGIUM", axis=1, inplace=True)
data.drop("Aa UNITED STATES", axis=1, inplace=True)
data.drop("AY UNION OF SOUTH AFRICA", axis=1, inplace=True)
data.drop("AY DENMARK", axis=1, inplace=True)
data.drop("AY SWEDEN", axis=1, inplace=True)

Keep only alphanumericalnames of countries to then use as a mapping 

In [3]:
colnames_country = {i: re.sub('[^A-Za-z0-9]+', '', i) for i in data.columns.tolist()}
data = data.rename(columns=colnames_country)
countries = data.columns[11:]

Create country-country DataFrame

In [4]:
from itertools import product

ctr_ctr = pd.DataFrame(list(product(countries, countries)))

Remove duplicated country pairs

In [5]:
# https://stackoverflow.com/a/40475008
ctr_ctr = pd.DataFrame(np.sort(ctr_ctr.values, axis=1), columns=ctr_ctr.columns).drop_duplicates()

Remove same country pairs

In [6]:
# https://stackoverflow.com/a/43951580
ctr_ctr = ctr_ctr[ctr_ctr[0] != ctr_ctr[1]].rename(columns={0: "countryA",1: "countryB"})

Get list of countries in use

In [7]:
countryA = ctr_ctr.groupby(by="countryA").count().index.tolist()
ctr_ctr.head

<bound method NDFrame.head of                         countryA                       countryB
1                       MALAYSIA                        SENEGAL
2                        SENEGAL  VENEZUELABOLIVARIANREPUBLICOF
3                        SENEGAL                  UNITEDKINGDOM
4              RUSSIANFEDERATION                        SENEGAL
5                          JAPAN                        SENEGAL
...                          ...                            ...
60758        DEMOCRATICCAMPUCHEA       SAINTCHRISTOPHERANDNEVIS
60759  CONGODEMOCRATICREPUBLICOF       SAINTCHRISTOPHERANDNEVIS
61006        DEMOCRATICCAMPUCHEA                    TKAZAKHSTAN
61007  CONGODEMOCRATICREPUBLICOF                    TKAZAKHSTAN
61255  CONGODEMOCRATICREPUBLICOF            DEMOCRATICCAMPUCHEA

[30628 rows x 2 columns]>

Compute similarity

In [8]:
ctr_ctr["similarity"] = np.nan
for idx, cA in enumerate(tqdm(countryA)):
    for cB in countryA[idx+1:]:
        # Find data of 2 countries voting on the same resolution
        currentData = pd.concat([data[cA], data[cB]], axis=1).dropna()
        total = currentData[cA].count() # number of votes
        if total > 100:
            same = currentData[cA].eq(currentData[cB]).sum() # ocassions when they have the same vote
            ctr_ctr.loc[(ctr_ctr.countryA == cA) & (ctr_ctr.countryB == cB),'similarity'] = same/total

ctr_ctr.dropna(inplace=True) # remove countries with non-relevant similarities

  0%|          | 0/247 [00:00<?, ?it/s]

In [12]:
ctr_ctr.sort_values(by=["countryA"])

Unnamed: 0,countryA,countryB,similarity
1375,AFGHANISTAN,JAPAN,0.489941
2615,AFGHANISTAN,NEW ZEALAND,0.516345
32375,AFGHANISTAN,SAO TOME AND PRINCIPE,0.606853
15759,AFGHANISTAN,CABO VERDE,0.870886
6583,AFGHANISTAN,UNITED REPUBLIC OF TANZANIA,0.798859
...,...,...,...
12365,YEMEN,ZAIRE,0.786008
10713,YEMEN,ZAMBIA,0.839006
10865,YUGOSLAVIA,ZAMBIA,0.759430
50061,YUGOSLAVIA,ZAIRE,0.846914


Map name of countries back to their original values

In [13]:
colnames_country = {v: k for k, v in colnames_country.items()}
colnames_country_func = lambda x: colnames_country[x] if x in colnames_country.keys() else x
ctr_ctr['countryA'] = ctr_ctr['countryA'].map(colnames_country_func)
ctr_ctr['countryB'] = ctr_ctr['countryB'].map(colnames_country_func)


In [14]:
ctr_ctr.to_csv("neo4j_votes.csv", encoding='utf-8', index=False)