# Import libraries

In [1]:
import numpy as np
import csv

# Load data

In [2]:
# Read the whole CSV inside a context manager so the file handle is closed as soon
# as the rows are loaded; newline="" is what the csv module asks for on its input file
with open("countries of the world.csv", newline="") as file:
    raw_data = list(csv.reader(file))

In [4]:
raw_data[:2]

[['Country',
  'Region',
  'Population',
  'Area (sq. mi.)',
  'Pop. Density (per sq. mi.)',
  'Coastline (coast/area ratio)',
  'Net migration',
  'Infant mortality (per 1000 births)',
  'GDP ($ per capita)',
  'Literacy (%)',
  'Phones (per 1000)',
  'Arable (%)',
  'Crops (%)',
  'Other (%)',
  'Climate',
  'Birthrate',
  'Deathrate',
  'Agriculture',
  'Industry',
  'Service'],
 ['Afghanistan ',
  'ASIA (EX. NEAR EAST)         ',
  '31056997',
  '647500',
  '48,0',
  '0,00',
  '23,06',
  '163,07',
  '700',
  '36,0',
  '3,2',
  '12,13',
  '0,22',
  '87,65',
  '1',
  '46,6',
  '20,34',
  '0,38',
  '0,24',
  '0,38']]

# Separate the header and the actual data

In [5]:
# The first row of the CSV holds the column names
header = raw_data[0]

In [6]:
header

['Country',
 'Region',
 'Population',
 'Area (sq. mi.)',
 'Pop. Density (per sq. mi.)',
 'Coastline (coast/area ratio)',
 'Net migration',
 'Infant mortality (per 1000 births)',
 'GDP ($ per capita)',
 'Literacy (%)',
 'Phones (per 1000)',
 'Arable (%)',
 'Crops (%)',
 'Other (%)',
 'Climate',
 'Birthrate',
 'Deathrate',
 'Agriculture',
 'Industry',
 'Service']

In [7]:
# Every row after the header is one country's record
data = raw_data[1:]

In [8]:
data[0]

['Afghanistan ',
 'ASIA (EX. NEAR EAST)         ',
 '31056997',
 '647500',
 '48,0',
 '0,00',
 '23,06',
 '163,07',
 '700',
 '36,0',
 '3,2',
 '12,13',
 '0,22',
 '87,65',
 '1',
 '46,6',
 '20,34',
 '0,38',
 '0,24',
 '0,38']

# Cleaning Data

## Remove white space at the end of strings

In [9]:
"   python   ".strip()

'python'

In [10]:
# Trim leading and trailing whitespace from every field of every row
data = [[field.strip() for field in row] for row in data]

In [11]:
data[3]

['American Samoa',
 'OCEANIA',
 '57794',
 '199',
 '290,4',
 '58,29',
 '-20,71',
 '9,27',
 '8000',
 '97,0',
 '259,5',
 '10',
 '15',
 '75',
 '2',
 '22,46',
 '3,27',
 '',
 '',
 '']

## Convert numerical data to type float 

In [12]:
float("0.38")

0.38

In [13]:
# Columns 0-1 (country, region) stay as text. From column 2 onward the values are
# numbers written with a decimal comma, so swap it for a dot before calling float().
# Empty strings are left untouched so the missing-value filter can find them later.
# The isinstance check makes the cell safe to re-run: values that are already floats
# are passed through instead of failing on .replace().
data = [
    [
        float(c.replace(",", ".")) if i >= 2 and isinstance(c, str) and c != "" else c
        for i, c in enumerate(country)
    ]
    for country in data
]

In [14]:
data[3]

['American Samoa',
 'OCEANIA',
 57794.0,
 199.0,
 290.4,
 58.29,
 -20.71,
 9.27,
 8000.0,
 97.0,
 259.5,
 10.0,
 15.0,
 75.0,
 2.0,
 22.46,
 3.27,
 '',
 '',
 '']

In [15]:
# How many countries are in our dataset?
len(data)

227

## Remove rows with missing values 

In [16]:
def no_missing_value(country):
    """Tell whether a row is complete.

    Takes one list (a row of the dataset) and returns True when none of its
    items equals the empty string "", which marks a missing value; returns
    False as soon as one such item is found.
    """
    return not any(item == "" for item in country)

In [17]:
# Keep only the rows in which every field is filled in
data_no_missing = [country for country in data if no_missing_value(country)]

In [18]:
# How many countries are left after filtering?
len(data_no_missing)

180

# Find countries that are similar to Vietnam

## Create a list of countries

In [19]:
# Country names (column 0), in the same order as the rows of data_no_missing
countries = [row[0] for row in data_no_missing]

In [20]:
countries[:5]

['Afghanistan', 'Albania', 'Algeria', 'Anguilla', 'Antigua & Barbuda']

In [21]:
countries.index("Vietnam")

176

In [22]:
data_no_missing[countries.index("Vietnam")]

['Vietnam',
 'ASIA (EX. NEAR EAST)',
 84402966.0,
 329560.0,
 256.1,
 1.05,
 -0.45,
 25.95,
 2500.0,
 90.3,
 187.7,
 19.97,
 5.95,
 74.08,
 2.0,
 16.86,
 6.22,
 0.209,
 0.41,
 0.381]

## Create a 2-d array of numerical data

In [23]:
# Drop the two text columns (country, region); columns 2 onward are the numeric statistics
num_data = np.array([row[2:] for row in data_no_missing])

In [24]:
num_data.shape

(180, 18)

*num_data* is a 2-dimensional array where each row holds the data of one country and each column is one statistic. There are 180 rows (countries) and 18 columns (statistics). To access the data of a country we need to know its row index. The names of the 180 countries are stored, in the same order, in the list *countries*, so we can get the row index of a country with `countries.index(name)`.

In [25]:
# Row of num_data at the position where "Vietnam" appears in countries
vietnam = num_data[countries.index("Vietnam")]

In [26]:
countries.index("Vietnam")

176

In [27]:
vietnam

array([ 8.4402966e+07,  3.2956000e+05,  2.5610000e+02,  1.0500000e+00,
       -4.5000000e-01,  2.5950000e+01,  2.5000000e+03,  9.0300000e+01,
        1.8770000e+02,  1.9970000e+01,  5.9500000e+00,  7.4080000e+01,
        2.0000000e+00,  1.6860000e+01,  6.2200000e+00,  2.0900000e-01,
        4.1000000e-01,  3.8100000e-01])

## Calculating the Euclidean distances between Vietnam and other countries

In [28]:
# Sum of squared differences per row, i.e. the squared Euclidean distance; the
# square root is skipped, which does not change the ordering of the distances
euclidean_distance = np.square(vietnam - num_data).sum(axis=1)

## Top 5 countries similar to Vietnam

In [29]:
# Row indices of the 5 smallest distances, skipping position 0 of the ranking
# (presumably Vietnam itself, whose distance to itself is 0)
top5_indices = euclidean_distance.argsort()[1:6]

In [30]:
top5_indices

array([ 65, 130,  51,  56, 165], dtype=int64)

In [31]:
# Turn the list of names into an array so it can be indexed with an array of positions
country_array = np.asarray(countries)
country_array[top5_indices]

array(['Germany', 'Philippines', 'Egypt', 'Ethiopia', 'Turkey'],
      dtype='<U32')

## What's wrong here ?

We don't think Vietnam is similar to Germany and Turkey in terms of all the statistics in our dataset. So what is the problem? First, look at the data of Vietnam, Germany, and Turkey.

In [32]:
# Look the rows up by name instead of hard-coding 176/65/165, which would silently
# point at other countries if the loading or filtering steps above ever changed
num_data[[countries.index(name) for name in ("Vietnam", "Germany", "Turkey")]]

array([[ 8.4402966e+07,  3.2956000e+05,  2.5610000e+02,  1.0500000e+00,
        -4.5000000e-01,  2.5950000e+01,  2.5000000e+03,  9.0300000e+01,
         1.8770000e+02,  1.9970000e+01,  5.9500000e+00,  7.4080000e+01,
         2.0000000e+00,  1.6860000e+01,  6.2200000e+00,  2.0900000e-01,
         4.1000000e-01,  3.8100000e-01],
       [ 8.2422299e+07,  3.5702100e+05,  2.3090000e+02,  6.7000000e-01,
         2.1800000e+00,  4.1600000e+00,  2.7600000e+04,  9.9000000e+01,
         6.6790000e+02,  3.3850000e+01,  5.9000000e-01,  6.5560000e+01,
         3.0000000e+00,  8.2500000e+00,  1.0620000e+01,  9.0000000e-03,
         2.9600000e-01,  6.9500000e-01],
       [ 7.0413958e+07,  7.8058000e+05,  9.0200000e+01,  9.2000000e-01,
         0.0000000e+00,  4.1040000e+01,  6.7000000e+03,  8.6500000e+01,
         2.6950000e+02,  3.0930000e+01,  3.3100000e+00,  6.5760000e+01,
         3.0000000e+00,  1.6620000e+01,  5.9700000e+00,  1.1700000e-01,
         2.9800000e-01,  5.8500000e-01]])

Notice the first value of each row, which is the population of the country. We see that the population of Vietnam is quite close to that of Germany and Turkey. In addition, the scale of the first and second columns (population and area) is significantly larger than the scale of the other columns. As a result, the Euclidean distances we computed are dominated by the difference in population, and then in area. In other words, the result should be understood as the top 5 countries most similar to Vietnam in population.

To take all statistics into account, we need all columns to be on the same scale. One approach is to scale each column's values by the column's maximum value.

In [33]:
# Divide every column by its own maximum so all statistics have a comparable magnitude
scale_data = num_data / np.max(num_data, axis=0)

In [34]:
scale_data[countries.index("Vietnam")]

array([ 0.0642349 ,  0.03421718,  0.01582525,  0.00120598, -0.01951431,
        0.15913411,  0.06613757,  0.903     ,  0.20902004,  0.32152632,
        0.12152778,  0.7408    ,  0.5       ,  0.33234772,  0.20914593,
        0.27178153,  0.45253863,  0.39937107])

In [35]:
# Repeat the nearest-neighbour search, this time on the max-scaled statistics
vietnam = scale_data[countries.index("Vietnam")]
euclidean_distance = np.sum((vietnam - scale_data) ** 2, axis=1)
# Position 0 of the ranking is skipped (presumably Vietnam itself, at distance 0)
top5_indices = euclidean_distance.argsort()[1:6]
country_array[top5_indices]

array(['Thailand', 'Indonesia', 'Korea, North', 'Sri Lanka',
       'Dominican Republic'], dtype='<U32')

# Create a function 

In [36]:
def top_5_similar(country_name, n=5):
    """Return the names of the countries most similar to ``country_name``.

    Similarity is the squared Euclidean distance between rows of the
    module-level ``scale_data`` array; the row of a country is found through
    its position in the module-level ``countries`` list, and names are read
    from ``country_array``.

    Parameters
    ----------
    country_name : str
        Name of the country of interest, exactly as it appears in ``countries``.
    n : int, optional
        How many neighbours to return (default 5).

    Returns
    -------
    numpy.ndarray
        Up to ``n`` country names, closest first. The queried country itself
        is never part of the result.

    Raises
    ------
    ValueError
        If ``country_name`` is not in ``countries`` (raised by ``list.index``).
    """
    target_idx = countries.index(country_name)
    squared_distance = ((scale_data - scale_data[target_idx]) ** 2).sum(axis=1)
    # A stable sort breaks ties by dataset order, so results are reproducible
    ranking = np.argsort(squared_distance, kind="stable")
    # Drop the queried country by index rather than assuming it is ranked first:
    # another row at distance 0 could otherwise push it into the result
    ranking = ranking[ranking != target_idx]
    return country_array[ranking[:n]]

In [37]:
top_5_similar("Vietnam")

array(['Thailand', 'Indonesia', 'Korea, North', 'Sri Lanka',
       'Dominican Republic'], dtype='<U32')

In [38]:
top_5_similar("United States")

array(['Australia', 'Sweden', 'Switzerland', 'Bermuda', 'Germany'],
      dtype='<U32')

In [39]:
top_5_similar("Korea, South")

array(['Israel', 'Portugal', 'Spain', 'New Zealand', 'Uruguay'],
      dtype='<U32')

In [40]:
top_5_similar("Thailand")

array(['Vietnam', 'Dominican Republic', 'Cuba', 'Philippines', 'Turkey'],
      dtype='<U32')