# CPI Analysis

In [1]:
# import the necessary libraries, set options, and define utility functions
import pandas as pd
import numpy as np

# show every column when a wide DataFrame is displayed (None removes the limit)
pd.set_option("display.max_columns", None)

def overview(df):
    """Print a quick textual summary of ``df``.

    Three sections are printed to stdout, in this order: the first five
    rows, the column index, and the ``describe()`` statistics.
    Returns None.
    """
    # callables keep the evaluation lazy: each section is computed right
    # before it is printed, one after the other
    sections = (
        lambda: df.head(5),
        lambda: df.columns,
        df.describe,
    )
    for build_section in sections:
        print(build_section())

In [2]:
# import CPI data from 2012-2021
# one CSV per year; `header` is the 0-based row that pandas uses for the column names
# the offset differs between files (2, 1, or the default 0) — presumably because some
# exports carry title rows above the table; TODO confirm against the CSV files
CPI_2021 = pd.read_csv("datasets/CPI/CPI2021-Table 1.csv", header=2)
CPI_2020 = pd.read_csv("datasets/CPI/CPI2020-Table 1.csv", header=2)
CPI_2019 = pd.read_csv("datasets/CPI/CPI2019-Table 1.csv", header=2)
CPI_2018 = pd.read_csv("datasets/CPI/CPI2018-Table 1.csv", header=2)
CPI_2017 = pd.read_csv("datasets/CPI/CPI2017-Table 1.csv", header=2)
CPI_2016 = pd.read_csv("datasets/CPI/CPI2016-Table 1.csv")
CPI_2015 = pd.read_csv("datasets/CPI/CPI2015-Table 1.csv")
CPI_2014 = pd.read_csv("datasets/CPI/CPI2014-Table 1.csv")
CPI_2013 = pd.read_csv("datasets/CPI/CPI2013-Table 1.csv", header=1)
CPI_2012 = pd.read_csv("datasets/CPI/CPI2012-Table 1.csv")

# Reduce every yearly table to its leading columns (country identifiers plus
# that year's CPI score and rank), harmonise the column names, and outer-merge
# the years one by one into a single wide frame called CPI.

# 2021-2017 are merged on all three identifying columns
id_cols = ["Country", "ISO3", "Region"]

# 2021
CPI_2021 = CPI_2021.iloc[:, :5].rename(
    columns={"Country / Territory": "Country", "Rank": "Rank 2021"}
)

# 2020
CPI_2020 = CPI_2020.iloc[:, :5].rename(columns={"Rank": "Rank 2020"})
CPI = CPI_2021.merge(CPI_2020, how="outer", on=id_cols)

# 2019
CPI_2019 = CPI_2019.iloc[:, :5].rename(columns={"Rank": "Rank 2019"})
CPI = CPI.merge(CPI_2019, how="outer", on=id_cols)

# 2018 (the rank key below ends with a space on purpose)
CPI_2018 = CPI_2018.iloc[:, :5].rename(
    columns={"Rank ": "Rank 2018", "CPI Score 2018": "CPI score 2018"}
)
CPI = CPI.merge(CPI_2018, how="outer", on=id_cols)

# 2017 (same trailing-space rank key as 2018)
CPI_2017 = CPI_2017.iloc[:, :5].rename(
    columns={"Rank ": "Rank 2017", "CPI Score 2017": "CPI score 2017"}
)
CPI = CPI.merge(CPI_2017, how="outer", on=id_cols)

# From 2016 backwards the merge key is Country only; columns that clash with
# ones already in CPI receive a "_<year>" suffix.

# 2016: only the first four columns are kept; the suffixed Region copy is discarded
CPI_2016 = CPI_2016.iloc[:, :4].rename(
    columns={"CPI2016": "CPI score 2016", "Rank": "Rank 2016"}
)
CPI = CPI.merge(
    CPI_2016, how="outer", on=["Country"], suffixes=["", "_2016"]
).drop(columns="Region_2016")

# 2015
CPI_2015 = (
    CPI_2015.iloc[:, :5]
    .drop(columns="Region")
    .rename(
        columns={
            "CPI 2015 Score": "CPI score 2015",
            "Country Rank": "Rank 2015",
            "Country/Territory": "Country",
            "Country Code": "ISO3",
        }
    )
)
CPI = CPI.merge(CPI_2015, how="outer", on=["Country"], suffixes=["", "_2015"])

# 2014
CPI_2014 = (
    CPI_2014.iloc[:, :5]
    .drop(columns="Region")
    .rename(
        columns={
            "CPI 2014": "CPI score 2014",
            "Country Rank": "Rank 2014",
            "Country/Territory": "Country",
            "Country Code": "ISO3",
        }
    )
)
CPI = CPI.merge(CPI_2014, how="outer", on=["Country"], suffixes=["", "_2014"])

# 2013: seven leading columns, three of which are not needed
CPI_2013 = (
    CPI_2013.iloc[:, :7]
    .drop(columns=["Region", "IFS Code", "Country Rank.1"])
    .rename(
        columns={
            "CPI 2013 Score": "CPI score 2013",
            "Country Rank": "Rank 2013",
            "Country / Territory": "Country",
            "Country Code": "ISO3",
            "WB Code": "ISO3",
        }
    )
)
CPI = CPI.merge(CPI_2013, how="outer", on=["Country"], suffixes=["", "_2013"])

# 2012
CPI_2012 = (
    CPI_2012.iloc[:, :4]
    .drop(columns=["Unnamed: 2"])
    .rename(
        columns={
            "CPI 2012 Score": "CPI score 2012",
            "Country Rank": "Rank 2012",
            "Country / Territory": "Country",
        }
    )
)
CPI = CPI.merge(CPI_2012, how="outer", on=["Country"], suffixes=["", "_2012"])


# sanity check
CPI

Unnamed: 0,Country,ISO3,Region,CPI score 2021,Rank 2021,CPI score 2020,Rank 2020,CPI score 2019,Rank 2019,CPI score 2018,Rank 2018,CPI score 2017,Rank 2017,CPI score 2016,Rank 2016,Rank 2015,ISO3_2015,CPI score 2015,Rank 2014,ISO3_2014,CPI score 2014,Rank 2013,ISO3_2013,CPI score 2013,Rank 2012,CPI score 2012
0,Denmark,DNK,WE/EU,88.0,1.0,88.0,1.0,87.0,1.0,88.0,1.0,88.0,2.0,90.0,1.0,1.0,DNK,91.0,1.0,DNK,92.0,1.0,DNK,91.0,1.0,90.0
1,Finland,FIN,WE/EU,88.0,1.0,85.0,3.0,86.0,3.0,85.0,3.0,85.0,3.0,89.0,3.0,3.0,FIN,90.0,3.0,FIN,89.0,3.0,FIN,89.0,1.0,90.0
2,New Zealand,NZL,AP,88.0,1.0,88.0,1.0,87.0,1.0,87.0,2.0,89.0,1.0,90.0,1.0,1.0,NZL,91.0,2.0,NZL,91.0,1.0,NZL,91.0,1.0,90.0
3,Norway,NOR,WE/EU,85.0,4.0,84.0,7.0,84.0,7.0,84.0,7.0,85.0,3.0,85.0,6.0,5.0,NOR,88.0,5.0,NOR,86.0,5.0,NOR,86.0,7.0,85.0
4,Singapore,SGP,AP,85.0,4.0,85.0,3.0,85.0,4.0,85.0,3.0,84.0,6.0,84.0,7.0,7.0,SGP,85.0,7.0,SGP,84.0,5.0,SGP,86.0,5.0,87.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,Korea (North),,,,,,,,,,,,,12.0,174.0,167.0,PRK,8.0,174.0,PRK,8.0,175.0,PRK,8.0,174.0,8.0
199,United States,,,,,,,,,,,,,,,16.0,USA,76.0,17.0,USA,74.0,19.0,USA,73.0,19.0,73.0
200,Côte d´Ivoire,,,,,,,,,,,,,,,106.0,CIV,32.0,115.0,CIV,32.0,136.0,CIV,27.0,130.0,29.0
201,Congo Republic,,,,,,,,,,,,,,,146.0,COG,23.0,152.0,COG,23.0,154.0,COG,22.0,144.0,26.0


In [3]:
# preview the first five rows of the merged frame
CPI.head(5)

Unnamed: 0,Country,ISO3,Region,CPI score 2021,Rank 2021,CPI score 2020,Rank 2020,CPI score 2019,Rank 2019,CPI score 2018,Rank 2018,CPI score 2017,Rank 2017,CPI score 2016,Rank 2016,Rank 2015,ISO3_2015,CPI score 2015,Rank 2014,ISO3_2014,CPI score 2014,Rank 2013,ISO3_2013,CPI score 2013,Rank 2012,CPI score 2012
0,Denmark,DNK,WE/EU,88.0,1.0,88.0,1.0,87.0,1.0,88.0,1.0,88.0,2.0,90.0,1.0,1.0,DNK,91.0,1.0,DNK,92.0,1.0,DNK,91.0,1.0,90.0
1,Finland,FIN,WE/EU,88.0,1.0,85.0,3.0,86.0,3.0,85.0,3.0,85.0,3.0,89.0,3.0,3.0,FIN,90.0,3.0,FIN,89.0,3.0,FIN,89.0,1.0,90.0
2,New Zealand,NZL,AP,88.0,1.0,88.0,1.0,87.0,1.0,87.0,2.0,89.0,1.0,90.0,1.0,1.0,NZL,91.0,2.0,NZL,91.0,1.0,NZL,91.0,1.0,90.0
3,Norway,NOR,WE/EU,85.0,4.0,84.0,7.0,84.0,7.0,84.0,7.0,85.0,3.0,85.0,6.0,5.0,NOR,88.0,5.0,NOR,86.0,5.0,NOR,86.0,7.0,85.0
4,Singapore,SGP,AP,85.0,4.0,85.0,3.0,85.0,4.0,85.0,3.0,84.0,6.0,84.0,7.0,7.0,SGP,85.0,7.0,SGP,84.0,5.0,SGP,86.0,5.0,87.0


In [4]:
# preview the last five rows; these are the rows that only exist in the older files
CPI.tail(5)

Unnamed: 0,Country,ISO3,Region,CPI score 2021,Rank 2021,CPI score 2020,Rank 2020,CPI score 2019,Rank 2019,CPI score 2018,Rank 2018,CPI score 2017,Rank 2017,CPI score 2016,Rank 2016,Rank 2015,ISO3_2015,CPI score 2015,Rank 2014,ISO3_2014,CPI score 2014,Rank 2013,ISO3_2013,CPI score 2013,Rank 2012,CPI score 2012
198,Korea (North),,,,,,,,,,,,,12.0,174.0,167.0,PRK,8.0,174.0,PRK,8.0,175.0,PRK,8.0,174.0,8.0
199,United States,,,,,,,,,,,,,,,16.0,USA,76.0,17.0,USA,74.0,19.0,USA,73.0,19.0,73.0
200,Côte d´Ivoire,,,,,,,,,,,,,,,106.0,CIV,32.0,115.0,CIV,32.0,136.0,CIV,27.0,130.0,29.0
201,Congo Republic,,,,,,,,,,,,,,,146.0,COG,23.0,152.0,COG,23.0,154.0,COG,22.0,144.0,26.0
202,Puerto Rico,,,,,,,,,,,,,,,,,,31.0,PRI,63.0,33.0,PRI,62.0,33.0,63.0


In [5]:
# list the column labels produced by the merges (note the suffixed ISO3_<year> columns)
CPI.columns

Index(['Country', 'ISO3', 'Region', 'CPI score 2021', 'Rank 2021',
       'CPI score 2020', 'Rank 2020', 'CPI score 2019', 'Rank 2019',
       'CPI score 2018', 'Rank 2018', 'CPI score 2017', 'Rank 2017',
       'CPI score 2016', 'Rank 2016', 'Rank 2015', 'ISO3_2015',
       'CPI score 2015', 'Rank 2014', 'ISO3_2014', 'CPI score 2014',
       'Rank 2013', 'ISO3_2013', 'CPI score 2013', 'Rank 2012',
       'CPI score 2012'],
      dtype='object')

In [6]:
# summary statistics of the numeric score and rank columns
CPI.describe()

Unnamed: 0,CPI score 2021,Rank 2021,CPI score 2020,Rank 2020,CPI score 2019,Rank 2019,CPI score 2018,Rank 2018,CPI score 2017,Rank 2017,CPI score 2016,Rank 2016,Rank 2015,CPI score 2015,Rank 2014,CPI score 2014,Rank 2013,CPI score 2013,Rank 2012,CPI score 2012
count,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,181.0,180.0,176.0,176.0,168.0,168.0,175.0,175.0,177.0,177.0,176.0,176.0
mean,43.266667,89.072222,43.344444,88.966667,43.166667,88.966667,43.116667,89.127778,43.07221,89.116667,42.948864,87.198864,83.160714,42.595238,86.571429,43.268571,87.644068,42.655367,87.170455,43.267045
std,18.780591,51.881798,18.875162,51.759532,18.960264,51.656353,19.142008,51.74832,18.998547,51.765185,19.436716,50.543562,48.265152,20.195819,50.400058,19.84051,50.863597,19.922859,50.518618,19.674269
min,11.0,1.0,12.0,1.0,9.0,1.0,10.0,1.0,9.0,1.0,10.0,1.0,1.0,8.0,1.0,8.0,1.0,8.0,1.0,8.0
25%,29.75,45.0,29.0,45.0,29.0,44.0,29.0,45.0,29.0,45.0,29.0,44.0,42.25,28.0,44.0,29.0,45.0,28.0,44.5,28.75
50%,39.0,87.0,40.0,86.0,39.5,88.0,38.0,89.0,39.0,88.0,38.0,87.0,83.0,37.0,85.0,38.0,83.0,38.0,88.0,37.0
75%,55.0,130.0,56.0,134.0,56.0,130.0,57.0,132.0,56.0,135.0,57.0,131.0,123.0,54.25,126.0,55.0,127.0,56.0,130.75,56.25
max,88.0,180.0,88.0,179.0,87.0,180.0,88.0,180.0,89.0,180.0,90.0,176.0,167.0,91.0,174.0,92.0,175.0,91.0,174.0,90.0


In [7]:
# non-null counts and dtypes per column; used to spot missing values
CPI.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203 entries, 0 to 202
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         202 non-null    object 
 1   ISO3            186 non-null    object 
 2   Region          186 non-null    object 
 3   CPI score 2021  180 non-null    float64
 4   Rank 2021       180 non-null    float64
 5   CPI score 2020  180 non-null    float64
 6   Rank 2020       180 non-null    float64
 7   CPI score 2019  180 non-null    float64
 8   Rank 2019       180 non-null    float64
 9   CPI score 2018  180 non-null    float64
 10  Rank 2018       180 non-null    float64
 11  CPI score 2017  181 non-null    float64
 12  Rank 2017       180 non-null    float64
 13  CPI score 2016  176 non-null    float64
 14  Rank 2016       176 non-null    float64
 15  Rank 2015       168 non-null    float64
 16  ISO3_2015       168 non-null    object 
 17  CPI score 2015  168 non-null    flo

In [8]:
# there are some null values in the final dataframe
# let's see if we can clean them up

# frequency table of the Country column; dropna=False counts NaN as its own entry

CPI["Country"].value_counts(dropna=False)

Denmark               1
Mauritania            1
Djibouti              1
Dominican Republic    1
Laos                  1
                     ..
South Africa          1
Tunisia               1
Ghana                 1
Hungary               1
Puerto Rico           1
Name: Country, Length: 203, dtype: int64

In [9]:
# column labels of the merged CPI frame
CPI.columns

Index(['Country', 'ISO3', 'Region', 'CPI score 2021', 'Rank 2021',
       'CPI score 2020', 'Rank 2020', 'CPI score 2019', 'Rank 2019',
       'CPI score 2018', 'Rank 2018', 'CPI score 2017', 'Rank 2017',
       'CPI score 2016', 'Rank 2016', 'Rank 2015', 'ISO3_2015',
       'CPI score 2015', 'Rank 2014', 'ISO3_2014', 'CPI score 2014',
       'Rank 2013', 'ISO3_2013', 'CPI score 2013', 'Rank 2012',
       'CPI score 2012'],
      dtype='object')

In [10]:
# the 2015, 2014 and 2013 merges each brought in their own suffixed ISO3 column

# the ISO3 code for certain countries has changed some time in between 2012 to 2021
# e.g. Montenegro's ISO3 was once MON but it changed to MNE after the CPI 2015 data was published
# trivia: actually, the ISO3 change for Montenegro happened in 2007, but the CPI dataset still used the old ISO3 code

# keep the newest ISO3 (the unsuffixed column) and discard the older copies
CPI = CPI.drop(columns=["ISO3_2015", "ISO3_2014", "ISO3_2013"])
CPI.columns
# to inspect rows without an ISO3 use CPI["ISO3"].isnull(); comparing with `== np.nan` never matches

Index(['Country', 'ISO3', 'Region', 'CPI score 2021', 'Rank 2021',
       'CPI score 2020', 'Rank 2020', 'CPI score 2019', 'Rank 2019',
       'CPI score 2018', 'Rank 2018', 'CPI score 2017', 'Rank 2017',
       'CPI score 2016', 'Rank 2016', 'Rank 2015', 'CPI score 2015',
       'Rank 2014', 'CPI score 2014', 'Rank 2013', 'CPI score 2013',
       'Rank 2012', 'CPI score 2012'],
      dtype='object')

In [11]:
# there's a null value we haven't checked
# value_counts(dropna=False) does count the NaN entry (Length: 203), but it sits in the
# part of the frequency table that the truncated display above elides with ".."

# so we retrieve the offending row directly with isnull
CPI[CPI["Country"].isnull()]



Unnamed: 0,Country,ISO3,Region,CPI score 2021,Rank 2021,CPI score 2020,Rank 2020,CPI score 2019,Rank 2019,CPI score 2018,Rank 2018,CPI score 2017,Rank 2017,CPI score 2016,Rank 2016,Rank 2015,CPI score 2015,Rank 2014,CPI score 2014,Rank 2013,CPI score 2013,Rank 2012,CPI score 2012
186,,,,,,,,,,,,,,,,,,,,,,,


In [12]:
# row with index 186 has NaN for all its columns
# it probably came from an empty row in one of the original datasets
# we can safely drop it
# drop it by condition instead of by the hard-coded label 186: once the index is reset
# further down, label 186 belongs to a real country, so re-running a label-based drop
# would silently delete valid data; dropna(how="all") only ever removes fully empty rows
CPI = CPI.dropna(how="all")
CPI.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 202 entries, 0 to 202
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         202 non-null    object 
 1   ISO3            186 non-null    object 
 2   Region          186 non-null    object 
 3   CPI score 2021  180 non-null    float64
 4   Rank 2021       180 non-null    float64
 5   CPI score 2020  180 non-null    float64
 6   Rank 2020       180 non-null    float64
 7   CPI score 2019  180 non-null    float64
 8   Rank 2019       180 non-null    float64
 9   CPI score 2018  180 non-null    float64
 10  Rank 2018       180 non-null    float64
 11  CPI score 2017  181 non-null    float64
 12  Rank 2017       180 non-null    float64
 13  CPI score 2016  176 non-null    float64
 14  Rank 2016       176 non-null    float64
 15  Rank 2015       168 non-null    float64
 16  CPI score 2015  168 non-null    float64
 17  Rank 2014       175 non-null    flo

In [13]:
# while looking for the null country, we also discovered a country named GLOBAL AVARAGE below the previous row with index 186
# there's barely any data in the GLOBAL AVARAGE and we don't need any of them
# we decided to drop it

# dropping rows doesn't renumber the index, so reset it in the same step
# NOTE: 187 is an index label that is only valid before this reset; run this cell once
# per kernel session, a second run would remove whichever row is labelled 187 by then
CPI = CPI.drop(index=187).reset_index(drop=True)

In [14]:
# there are some other peculiarities with the Country column
# the USA is spread over several rows because its name differs between the yearly files
# 1. United States of America
# 2. The United States of America
# 3. United States

# str.match anchors at the start of the name; the leading "The " is optional
CPI.loc[CPI["Country"].str.match(r"(The )?United States")]

Unnamed: 0,Country,ISO3,Region,CPI score 2021,Rank 2021,CPI score 2020,Rank 2020,CPI score 2019,Rank 2019,CPI score 2018,Rank 2018,CPI score 2017,Rank 2017,CPI score 2016,Rank 2016,Rank 2015,CPI score 2015,Rank 2014,CPI score 2014,Rank 2013,CPI score 2013,Rank 2012,CPI score 2012
27,United States of America,USA,AME,67.0,27.0,67.0,25.0,69.0,23.0,71.0,22.0,75.0,16.0,,,,,,,,,,
186,The United States of America,,,,,,,,,,,,,74.0,18.0,,,,,,,,
197,United States,,,,,,,,,,,,,,,16.0,76.0,17.0,74.0,19.0,73.0,19.0,73.0


In [15]:
# we'll stick with the row that has the most recent data, i.e. row 27 with the name United States of America,
# and fill the missing values using data from the older rows

# this is a recurring issue, so it pays to create a function for it
# replace values of row with keep_index with values from row with copy_index based on the column names passed
#
# finish by dropping the copied rows and reset index
def clean_inconsistent_names(df, keep_indexes, copy_dicts, verbose=True):
    """Fold rows that describe the same country under different names into one row.

    For every pair ``(keep_index, copy_dict)`` the values of the listed columns
    are copied from each source row into the row labelled ``keep_index``.
    Afterwards all source rows are dropped and the index is reset.

    ``df`` is modified in place and nothing is returned. Because the index is
    reset at the end, row labels may have shifted after the call; look the
    labels up again before calling the function a second time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean (mutated in place).
    keep_indexes : sequence of index labels
        One label per entry in ``copy_dicts``: the row that survives.
    copy_dicts : sequence of dict
        Each dict maps a source row label to the column label(s) whose values
        are copied from that row into the matching ``keep_indexes`` row.
    verbose : bool, default True
        Print the list of dropped row labels (the historical behaviour).

    Raises
    ------
    ValueError
        If ``keep_indexes`` and ``copy_dicts`` differ in length, or if a column
        list names the same column twice. Both are checked before ``df`` is
        touched, so a failed call leaves the frame unchanged.
    """
    keep_indexes = list(keep_indexes)
    copy_dicts = list(copy_dicts)

    # zip() would silently ignore the surplus entries, yet their rows would
    # still be dropped below without having been copied anywhere
    if len(keep_indexes) != len(copy_dicts):
        raise ValueError(
            "keep_indexes and copy_dicts must have the same length "
            "({} != {})".format(len(keep_indexes), len(copy_dicts))
        )

    # a repeated column label makes the .loc assignment fail with an opaque
    # pandas error halfway through; it is almost always a typo, so report it
    for copy_dict in copy_dicts:
        for copy_index, cols in copy_dict.items():
            if isinstance(cols, str):
                continue  # a single column label cannot repeat
            seen = set()
            repeated = []
            for col in cols:
                if col in seen and col not in repeated:
                    repeated.append(col)
                seen.add(col)
            if repeated:
                raise ValueError(
                    "duplicate column labels for row {}: {}".format(copy_index, repeated)
                )

    # operate on the frame that was passed in, not on a notebook global
    for keep_index, copy_dict in zip(keep_indexes, copy_dicts):
        for copy_index, cols in copy_dict.items():
            df.loc[keep_index, cols] = df.loc[copy_index, cols]

    drop_rows = [copy_index for copy_dict in copy_dicts for copy_index in copy_dict]
    if verbose:
        print(drop_rows)
    df.drop(drop_rows, axis=0, inplace=True)
    df.reset_index(drop=True, inplace=True)
    
# row 27 ("United States of America") is kept; the 2016 figures come from row 186
# ("The United States of America") and the 2012-2015 figures from row 197 ("United States")
usa_backfill = {
    186: ["CPI score 2016", "Rank 2016"],
    197: [
        "CPI score 2015", "Rank 2015",
        "CPI score 2014", "Rank 2014",
        "CPI score 2013", "Rank 2013",
        "CPI score 2012", "Rank 2012",
    ],
}
# the helper also drops the two source rows and resets the index afterwards
clean_inconsistent_names(CPI, [27], [usa_backfill])

# only the consolidated row should be left
CPI.loc[CPI["Country"].str.match(r"(The )?United States")]

[186, 197]


Unnamed: 0,Country,ISO3,Region,CPI score 2021,Rank 2021,CPI score 2020,Rank 2020,CPI score 2019,Rank 2019,CPI score 2018,Rank 2018,CPI score 2017,Rank 2017,CPI score 2016,Rank 2016,Rank 2015,CPI score 2015,Rank 2014,CPI score 2014,Rank 2013,CPI score 2013,Rank 2012,CPI score 2012
27,United States of America,USA,AME,67.0,27.0,67.0,25.0,69.0,23.0,71.0,22.0,75.0,16.0,74.0,18.0,16.0,76.0,17.0,74.0,19.0,73.0,19.0,73.0


In [16]:
# Country now has no null values and no exact duplicates
# (differently spelled names of the same country are not caught by this check)
CPI.info()
CPI.loc[CPI.duplicated(subset="Country")]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         199 non-null    object 
 1   ISO3            186 non-null    object 
 2   Region          186 non-null    object 
 3   CPI score 2021  180 non-null    float64
 4   Rank 2021       180 non-null    float64
 5   CPI score 2020  180 non-null    float64
 6   Rank 2020       180 non-null    float64
 7   CPI score 2019  180 non-null    float64
 8   Rank 2019       180 non-null    float64
 9   CPI score 2018  180 non-null    float64
 10  Rank 2018       180 non-null    float64
 11  CPI score 2017  180 non-null    float64
 12  Rank 2017       180 non-null    float64
 13  CPI score 2016  176 non-null    float64
 14  Rank 2016       176 non-null    float64
 15  Rank 2015       168 non-null    float64
 16  CPI score 2015  168 non-null    float64
 17  Rank 2014       175 non-null    flo

Unnamed: 0,Country,ISO3,Region,CPI score 2021,Rank 2021,CPI score 2020,Rank 2020,CPI score 2019,Rank 2019,CPI score 2018,Rank 2018,CPI score 2017,Rank 2017,CPI score 2016,Rank 2016,Rank 2015,CPI score 2015,Rank 2014,CPI score 2014,Rank 2013,CPI score 2013,Rank 2012,CPI score 2012


In [17]:
# ISO3 has some null values as well
# duplicates will be checked once the null values are handled

CPI.loc[CPI["ISO3"].isnull()]

Unnamed: 0,Country,ISO3,Region,CPI score 2021,Rank 2021,CPI score 2020,Rank 2020,CPI score 2019,Rank 2019,CPI score 2018,Rank 2018,CPI score 2017,Rank 2017,CPI score 2016,Rank 2016,Rank 2015,CPI score 2015,Rank 2014,CPI score 2014,Rank 2013,CPI score 2013,Rank 2012,CPI score 2012
186,Saint Vincent and The Grenadines,,,,,,,,,,,,,60.0,35.0,,,,,,,,
187,Cape Verde,,,,,,,,,,,,,59.0,38.0,40.0,55.0,43.0,57.0,41.0,58.0,39.0,60.0
188,Brunei,,,,,,,,,,,,,58.0,41.0,,,,,38.0,60.0,46.0,55.0
189,Korea (South),,,,,,,,,,,,,53.0,52.0,43.0,54.0,44.0,55.0,46.0,55.0,45.0,56.0
190,The FYR of Macedonia,,,,,,,,,,,,,37.0,90.0,66.0,42.0,64.0,45.0,,,,
191,Côte d’Ivoire,,,,,,,,,,,,,34.0,108.0,,,,,,,,
192,The Democratic Republic of Congo,,,,,,,,,,,,,21.0,156.0,,,,,,,,
193,Republic of Congo,,,,,,,,,,,,,20.0,159.0,,,,,,,,
194,Guinea-Bissau,,,,,,,,,,,,,16.0,168.0,158.0,17.0,161.0,19.0,163.0,19.0,150.0,25.0
195,Korea (North),,,,,,,,,,,,,12.0,174.0,167.0,8.0,174.0,8.0,175.0,8.0,174.0,8.0


In [18]:
# reviewing the rows without ISO3 revealed Country duplicates that the exact-match check missed
# e.g. Côte d’Ivoire is slightly different than Côte d´Ivoire because of the use of Greek oxia character in place of apostrophe
#
# Republic of Congo has the same naming issue as the United States
#
# South and North Korea possibly have inconsistent names as well

# start with the two Koreas: list every row whose name contains "Korea"

CPI.loc[CPI["Country"].str.contains("Korea")]

Unnamed: 0,Country,ISO3,Region,CPI score 2021,Rank 2021,CPI score 2020,Rank 2020,CPI score 2019,Rank 2019,CPI score 2018,Rank 2018,CPI score 2017,Rank 2017,CPI score 2016,Rank 2016,Rank 2015,CPI score 2015,Rank 2014,CPI score 2014,Rank 2013,CPI score 2013,Rank 2012,CPI score 2012
31,"Korea, South",KOR,AP,62.0,32.0,61.0,33.0,59.0,39.0,57.0,45.0,54.0,51.0,,,,,,,,,,
174,"Korea, North",PRK,AP,16.0,174.0,18.0,170.0,17.0,172.0,14.0,176.0,17.0,171.0,,,,,,,,,,
189,Korea (South),,,,,,,,,,,,,53.0,52.0,43.0,54.0,44.0,55.0,46.0,55.0,45.0,56.0
195,Korea (North),,,,,,,,,,,,,12.0,174.0,167.0,8.0,174.0,8.0,175.0,8.0,174.0,8.0


In [19]:
# copy the 2012-2016 figures into the rows that carry the most recent Korea data

# both countries need exactly the same set of columns back-filled
older_year_cols = [
    "CPI score 2016", "Rank 2016", "CPI score 2015", "Rank 2015",
    "CPI score 2014", "Rank 2014",
    "CPI score 2013", "Rank 2013",
    "CPI score 2012", "Rank 2012",
]

# 189 = "Korea (South)", 195 = "Korea (North)"; each dict gets its own copy of the list
skorea_clean_dict = {189: list(older_year_cols)}
nkorea_clean_dict = {195: list(older_year_cols)}

# 31 = "Korea, South", 174 = "Korea, North" are the rows that are kept
clean_inconsistent_names(CPI, [31, 174], [skorea_clean_dict, nkorea_clean_dict])

CPI.loc[CPI["Country"].str.contains("Korea")]

[189, 195]


Unnamed: 0,Country,ISO3,Region,CPI score 2021,Rank 2021,CPI score 2020,Rank 2020,CPI score 2019,Rank 2019,CPI score 2018,Rank 2018,CPI score 2017,Rank 2017,CPI score 2016,Rank 2016,Rank 2015,CPI score 2015,Rank 2014,CPI score 2014,Rank 2013,CPI score 2013,Rank 2012,CPI score 2012
31,"Korea, South",KOR,AP,62.0,32.0,61.0,33.0,59.0,39.0,57.0,45.0,54.0,51.0,53.0,52.0,43.0,54.0,44.0,55.0,46.0,55.0,45.0,56.0
174,"Korea, North",PRK,AP,16.0,174.0,18.0,170.0,17.0,172.0,14.0,176.0,17.0,171.0,12.0,174.0,167.0,8.0,174.0,8.0,175.0,8.0,174.0,8.0


In [20]:
# list the rows that still lack an ISO3 code before targeting Côte d’Ivoire
CPI.loc[CPI["ISO3"].isnull()]

Unnamed: 0,Country,ISO3,Region,CPI score 2021,Rank 2021,CPI score 2020,Rank 2020,CPI score 2019,Rank 2019,CPI score 2018,Rank 2018,CPI score 2017,Rank 2017,CPI score 2016,Rank 2016,Rank 2015,CPI score 2015,Rank 2014,CPI score 2014,Rank 2013,CPI score 2013,Rank 2012,CPI score 2012
186,Saint Vincent and The Grenadines,,,,,,,,,,,,,60.0,35.0,,,,,,,,
187,Cape Verde,,,,,,,,,,,,,59.0,38.0,40.0,55.0,43.0,57.0,41.0,58.0,39.0,60.0
188,Brunei,,,,,,,,,,,,,58.0,41.0,,,,,38.0,60.0,46.0,55.0
189,The FYR of Macedonia,,,,,,,,,,,,,37.0,90.0,66.0,42.0,64.0,45.0,,,,
190,Côte d’Ivoire,,,,,,,,,,,,,34.0,108.0,,,,,,,,
191,The Democratic Republic of Congo,,,,,,,,,,,,,21.0,156.0,,,,,,,,
192,Republic of Congo,,,,,,,,,,,,,20.0,159.0,,,,,,,,
193,Guinea-Bissau,,,,,,,,,,,,,16.0,168.0,158.0,17.0,161.0,19.0,163.0,19.0,150.0,25.0
194,Côte d´Ivoire,,,,,,,,,,,,,,,106.0,32.0,115.0,32.0,136.0,27.0,130.0,29.0
195,Congo Republic,,,,,,,,,,,,,,,146.0,23.0,152.0,23.0,154.0,22.0,144.0,26.0


In [21]:
# Côte d’Ivoire next: show every spelling variant that starts with "Côte"
CPI.loc[CPI["Country"].str.contains("Côte")]


Unnamed: 0,Country,ISO3,Region,CPI score 2021,Rank 2021,CPI score 2020,Rank 2020,CPI score 2019,Rank 2019,CPI score 2018,Rank 2018,CPI score 2017,Rank 2017,CPI score 2016,Rank 2016,Rank 2015,CPI score 2015,Rank 2014,CPI score 2014,Rank 2013,CPI score 2013,Rank 2012,CPI score 2012
181,Côte d'Ivoire,CIV,SSA,,,36.0,104.0,,,,,,,,,,,,,,,,
185,Côte D'Ivoire,CIV,SSA,,,,,,,,,36.0,103.0,,,,,,,,,,
190,Côte d’Ivoire,,,,,,,,,,,,,34.0,108.0,,,,,,,,
194,Côte d´Ivoire,,,,,,,,,,,,,,,106.0,32.0,115.0,32.0,136.0,27.0,130.0,29.0


In [22]:
# exact-match duplicate check on Country; spelling variants are not detected here
CPI.loc[CPI.duplicated(subset="Country")]

Unnamed: 0,Country,ISO3,Region,CPI score 2021,Rank 2021,CPI score 2020,Rank 2020,CPI score 2019,Rank 2019,CPI score 2018,Rank 2018,CPI score 2017,Rank 2017,CPI score 2016,Rank 2016,Rank 2015,CPI score 2015,Rank 2014,CPI score 2014,Rank 2013,CPI score 2013,Rank 2012,CPI score 2012


In [23]:
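# clean_inconsistent_names fails for Côte d’Ivoire with the ValueError shown at the end of this cell:
# the column list for row 194 below repeats "CPI score 2014", "CPI score 2013" and "CPI score 2012"
# (the matching "Rank ..." columns were presumably intended), so .loc is handed duplicate labels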

# def clean_inconsistent_names(df, keep_indexes, copy_dicts):
#     for keep_index, copy_dict in zip(keep_indexes, copy_dicts):
#         for copy_index, cols in copy_dict.items():
#             CPI.loc[keep_index, cols] = CPI.loc[copy_index, cols]
    
#     drop_rows = []
#     for dict in copy_dicts:
#             drop_rows += dict.keys()
#     drop_rows = list(drop_rows)
#     # DEBUG
#     print(drop_rows)
#     CPI.drop(drop_rows, axis=0, inplace=True)
#     try:
#         CPI.reset_index(drop=True, inplace=True)
#     except e:
#         print("Failed to reset index{}".format(e))
    

# ivorycoast_clean_dicts = {
#                             185: ["CPI score 2017", "Rank 2017"],
#                             190: ["CPI score 2016", "Rank 2016"],
#                             194: ["CPI score 2015", "Rank 2015", "CPI score 2014", "CPI score 2014", 
#                                   "CPI score 2013", "CPI score 2013", "CPI score 2012", "CPI score 2012"]
# }

# clean_inconsistent_names(CPI, [181], [ivorycoast_clean_dicts])
# # CPI.reset_index(drop=True, inplace=True)
# CPI[CPI["Country"].str.contains("Côte")]

  CPI.loc[keep_index, cols] = CPI.loc[copy_index, cols]


ValueError: cannot reindex on an axis with duplicate labels

In [None]:
# ignore this

# we decide to progressively lower the range until we can identify the problematic row's index
# we're doing it manually, but a function is more appropriate here

# this shows no nulls
# CPI[:100].info()

# this shows a null
# CPI[100:].info()

# this shows no nulls
# CPI[100:150].info()

# this shows a null
# CPI[150:].info()

# this shows no nulls
# CPI[150:175].info()

# this shows a null
# CPI[175:].info()

# explore the rows in the identified range
# CPI[175:]

# # index 186 is the problem. Why weren't we able to filter its Country?
# elusive_country = CPI.iloc[186]["Country"] 

# the value prints as "nan" but it's not a string (presumably a float NaN), so filtering by text can't find it

# type(elusive_country)



# and checking with math.isnan()
# type(CPI.iloc[186]["Country"])