# The University of Hong Kong
## DASC7600 Data Science Project 2024
## EDA - Global - Latitude and Longitude

# Import Modules and Settings

In [1]:
import os
import sys

# Make the project root importable so that project-local modules can be imported
# by later cells. The notebook is assumed to be launched from a sub-folder of the
# project, so the parent of the current working directory is used as the root.
project_dir = os.path.dirname(os.getcwd())
# Guard the append so that re-running this cell does not add duplicate entries.
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [2]:
import pandas as pd
import warnings

import covid_module

# Notebook-wide configuration
# Show every column when a dataframe is displayed (no truncation).
pd.options.display.max_columns = None
# Silence all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

# Load Data

In [3]:
# Load the country latitude/longitude reference table (countries.csv)
## Source: https://developers.google.com/public-data/docs/canonical/countries_csv
countries_csv_path = project_dir + '/data/raw_data/global/countries.csv'
global_lat_long_df = pd.read_csv(countries_csv_path, encoding='unicode_escape')

In [4]:
# Basic Information of Data Set

In [5]:
# Preview the first ten rows of the table
global_lat_long_df.iloc[:10]

Unnamed: 0,country,latitude,longitude,name
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.93911,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla
5,AL,41.153332,20.168331,Albania
6,AM,40.069099,45.038189,Armenia
7,AN,12.226079,-69.060087,Netherlands Antilles
8,AO,-11.202692,17.873887,Angola
9,AQ,-75.250973,-0.071389,Antarctica


In [6]:
# Summarise the dataframe structure: row count, column names,
# non-null count per column and column dtypes
global_lat_long_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245 entries, 0 to 244
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    244 non-null    object 
 1   latitude   244 non-null    float64
 2   longitude  244 non-null    float64
 3   name       245 non-null    object 
dtypes: float64(2), object(2)
memory usage: 7.8+ KB


There are 245 records in this csv file. <br>
Each of the columns 'country', 'latitude' and 'longitude' has 1 NULL value.

# Exploratory Data Analysis (EDA)

## Univariate Analysis

## Field - country

In [7]:
# Select the rows whose country code was read as a missing value
country_is_missing = global_lat_long_df['country'].isna()
print('The record with missing value in country column is:')
global_lat_long_df.loc[country_is_missing]

The record with missing value in country column is:


Unnamed: 0,country,latitude,longitude,name
156,,-22.95764,18.49041,Namibia


The record with a missing value in the 'country' column belongs to Namibia. <br>
Namibia's country code is 'NA', which pandas treats as a missing-value marker by default when reading a CSV file, so the code was mistakenly parsed as a missing value. <br>
Thus, we should replace the missing value in the 'country' column with the string 'NA'.

In [8]:
# The country code of Namibia was parsed as a missing value;
# write the literal string 'NA' back into every missing country cell
country_is_missing = global_lat_long_df['country'].isna()
global_lat_long_df.loc[country_is_missing, 'country'] = 'NA'

In [9]:
# Country codes are all distinct when there are as many distinct codes as rows
n_distinct_codes = global_lat_long_df['country'].nunique()
n_records = len(global_lat_long_df)
print('All values are distinct:', n_distinct_codes == n_records)

All values are distinct: True


In [10]:
# Check whether every country code has the same number of characters
code_lengths = global_lat_long_df['country'].str.len()
min_len = code_lengths.min()
max_len = code_lengths.max()
if min_len == max_len:
    print(f'All country codes are having length {min_len}.')
else:
    print(f'The length of country codes ranges from {min_len} to {max_len}.')

All country codes are having length 2.


## Fields - latitude and longitude

In [11]:
# Select the rows that have no latitude value
latitude_is_missing = global_lat_long_df['latitude'].isna()
print('The record with missing value in latitude column is:')
global_lat_long_df.loc[latitude_is_missing]

The record with missing value in latitude column is:


Unnamed: 0,country,latitude,longitude,name
226,UM,,,U.S. Minor Outlying Islands


The U.S. Minor Outlying Islands comprise nine United States insular areas: <br>
eight in the Pacific Ocean (Baker Island, Howland Island, Jarvis Island, Johnston Atoll, Kingman Reef, Midway Atoll, Palmyra Atoll, and Wake Island) and <br>
one in the Caribbean Sea (Navassa Island). <br>
<br>
Because these islands are spread across two oceans, no single latitude and longitude is assigned to UM.

In [12]:
# Report the observed span of each coordinate column.
# Each column gets its own pair of names so that the latitude bounds are not
# overwritten by the longitude bounds when the cell finishes.

# Range of latitude
min_lat, max_lat = global_lat_long_df['latitude'].agg(['min', 'max']).to_list()
print(f'The latitude range from {min_lat:.2f} to {max_lat:.2f}.')

# Range of longitude
min_long, max_long = global_lat_long_df['longitude'].agg(['min', 'max']).to_list()
print(f'The longitude range from {min_long:.2f} to {max_long:.2f}.')

The latitude range from -75.25 to 77.55.
The longitude range from -177.16 to 179.41.


## Missing Value Analysis

In [13]:
# Number of missing values in each column
# print_missing_val_count is provided by the imported covid_module; judging by
# the recorded output, it lists each column that has missing values together
# with the count and percentage of missing entries.
covid_module.print_missing_val_count(global_lat_long_df)

The following columns have missing values:
latitude: 1 (0.4%)
longitude: 1 (0.4%)


As shown in the univariate analysis: <br>
The 'NA' value in the 'country' column is not a missing value; it is the country code of Namibia and has been replaced with the string 'NA'. <br>
The missing values in the 'latitude' and 'longitude' columns both come from the record with country code 'UM' (U.S. Minor Outlying Islands).