<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas-and-load-the-weather-station-and-country-data" data-toc-modified-id="Import-pandas-and-load-the-weather-station-and-country-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas and load the weather station and country data</a></span></li><li><span><a href="#Set-the-index-for-the-weather-station-(locations)-and-country-data" data-toc-modified-id="Set-the-index-for-the-weather-station-(locations)-and-country-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Set the index for the weather station (locations) and country data</a></span></li><li><span><a href="#Perform-a-left-join-of-countries-and-locations-using-join" data-toc-modified-id="Perform-a-left-join-of-countries-and-locations-using-join-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Perform a left join of countries and locations using join</a></span></li><li><span><a href="#Check-that-the-merge-by-column-matches" data-toc-modified-id="Check-that-the-merge-by-column-matches-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Check that the merge-by column matches</a></span></li><li><span><a href="#Show-the-rows-in-one-file-but-not-the-other" data-toc-modified-id="Show-the-rows-in-one-file-but-not-the-other-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Show the rows in one file but not the other</a></span></li><li><span><a href="#Merge-the-locations-and-countries-DataFrames" data-toc-modified-id="Merge-the-locations-and-countries-DataFrames-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Merge the locations and countries DataFrames</a></span></li></ul></div>

# Import pandas and load the weather station and country data 

In [1]:
import pandas as pd

In [2]:
# Load the watermark IPython extension and print environment information.
# -iv reports the versions of the imported packages (shown in the output below).
# NOTE(review): -n and -i look like date/time flags — confirm against the
# watermark documentation.
import watermark
%load_ext watermark

%watermark -n -i -iv

watermark: 2.1.0
pandas   : 1.2.1
json     : 2.0.9



In [3]:
# Read the country table and the weather-station (locations) table from the
# local data folder, countries first.
countries, locations = (
    pd.read_csv('data/ltcountries.csv'),
    pd.read_csv('data/ltlocations.csv'),
)

# Set the index for the weather station (locations) and country data

In [4]:
# Move countryid into the index of both frames; DataFrame.join (used in the
# next section) aligns the two frames on their indexes.
# The result is reassigned rather than modified with inplace=True, so the
# statement reads as a plain "new value from old value" step.
countries = countries.set_index(['countryid'])
locations = locations.set_index(['countryid'])

In [5]:
# Preview the first five countries, now indexed by countryid.
countries.head(5)

Unnamed: 0_level_0,country
countryid,Unnamed: 1_level_1
AC,Antigua and Barbuda
AE,United Arab Emirates
AF,Afghanistan
AG,Algeria
AJ,Azerbaijan


In [6]:
# True when the number of distinct countryid index values equals the row count.
len(countries.index) == countries.index.nunique()

True

In [7]:
# First ten stations; the countryid index repeats because a country can have
# several stations.
locations.loc[:, ['locationid', 'latitude', 'stnelev']].head(10)

Unnamed: 0_level_0,locationid,latitude,stnelev
countryid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AC,ACW00011604,57.7667,18.0
AE,AE000041196,25.333,34.0
AE,AEM00041184,25.617,31.0
AE,AEM00041194,25.255,10.4
AE,AEM00041216,24.43,3.0
AE,AEM00041217,24.433,26.8
AE,AEM00041218,24.262,264.9
AF,AF000040930,35.317,3366.0
AF,AFM00040911,36.7,378.0
AF,AFM00040938,34.21,977.2


# Perform a left join of countries and locations using join

In [8]:
# Index-on-index join; how='left' (the default) keeps every row of countries.
stations = countries.join(locations, how='left')

In [9]:
# Inspect a few station columns next to the country name brought in by the join.
stations.loc[:, ['locationid', 'latitude', 'stnelev', 'country']].head(10)

Unnamed: 0_level_0,locationid,latitude,stnelev,country
countryid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AC,ACW00011604,57.7667,18.0,Antigua and Barbuda
AE,AE000041196,25.333,34.0,United Arab Emirates
AE,AEM00041184,25.617,31.0,United Arab Emirates
AE,AEM00041194,25.255,10.4,United Arab Emirates
AE,AEM00041216,24.43,3.0,United Arab Emirates
AE,AEM00041217,24.433,26.8,United Arab Emirates
AE,AEM00041218,24.262,264.9,United Arab Emirates
AF,AF000040930,35.317,3366.0,Afghanistan
AF,AFM00040911,36.7,378.0,Afghanistan
AF,AFM00040938,34.21,977.2,Afghanistan


# Check that the merge-by column matches

In [10]:
# Re-read both files so that countryid is an ordinary column again
# (it was moved into the index earlier in the notebook).
countries, locations = (
    pd.read_csv('data/ltcountries.csv'),
    pd.read_csv('data/ltlocations.csv'),
)

In [15]:
def checkmerge(dfleft, dfright, idvar):
    """Report how the merge-by values of two DataFrames line up.

    Performs an outer merge on ``idvar`` using only the key column of each
    frame, then prints:

    * a crosstab of ``inleft`` by ``inright`` ('Y'/'N') counting merged rows
      found in both frames, only in the left frame, or only in the right
      frame;
    * the merged rows whose key is missing from one side.

    Parameters
    ----------
    dfleft : pandas.DataFrame
        Left frame; must contain the column ``idvar``.
    dfright : pandas.DataFrame
        Right frame; must contain the column ``idvar``.
    idvar : str
        Name of the merge-by column present in both frames.

    Returns
    -------
    None
        The results are printed rather than returned.

    Notes
    -----
    The input frames are not modified: the 'Y' flags are added to key-only
    copies instead of being written into ``dfleft`` / ``dfright``.
    """
    # Work on key-only copies so the caller's frames never gain flag columns.
    leftkeys = dfleft[[idvar]].assign(inleft='Y')
    rightkeys = dfright[[idvar]].assign(inright='Y')

    dfboth = pd.merge(leftkeys, rightkeys, on=[idvar], how='outer')

    # After the outer merge a missing flag means "absent on that side".
    # Fill only the flag columns so a missing key value is not turned into 'N'.
    flagcols = ['inleft', 'inright']
    dfboth[flagcols] = dfboth[flagcols].fillna('N')

    print(pd.crosstab(dfboth['inleft'], dfboth['inright']))
    print(dfboth.loc[(dfboth['inleft'] == 'N') | (dfboth['inright'] == 'N')])

In [16]:
# Compare countryid values across the two files; copies are passed so the
# loaded frames are left as they were read.
checkmerge(dfleft=countries.copy(), dfright=locations.copy(), idvar='countryid')

inright  N      Y
inleft           
N        0      1
Y        2  27472
      countryid inleft inright
9715         LQ      Y       N
13103        ST      Y       N
27474        FO      N       Y


# Show the rows in one file but not the other

In [17]:
# Countries the check above flagged as having no matching station rows.
countries[countries['countryid'].isin(['LQ', 'ST'])]

Unnamed: 0,countryid,country
124,LQ,Palmyra Atoll [United States]
195,ST,Saint Lucia


In [18]:
# Station rows whose countryid ('FO') has no entry in the countries file.
locations.loc[locations['countryid'].eq('FO')]

Unnamed: 0,locationid,latitude,longitude,stnelev,station,countryid
7363,FOM00006009,61.4,-6.667,102.0,AKRABERG,FO


# Merge the locations and countries DataFrames

In [19]:
# Left merge on the countryid column: every country is kept, and countries
# without stations get missing values in the station columns.
stations = countries.merge(locations, on=['countryid'], how='left')

In [20]:
# Same preview as after the join, now with a default integer index.
stations.loc[:, ['locationid', 'latitude', 'stnelev', 'country']].head(10)

Unnamed: 0,locationid,latitude,stnelev,country
0,ACW00011604,57.7667,18.0,Antigua and Barbuda
1,AE000041196,25.333,34.0,United Arab Emirates
2,AEM00041184,25.617,31.0,United Arab Emirates
3,AEM00041194,25.255,10.4,United Arab Emirates
4,AEM00041216,24.43,3.0,United Arab Emirates
5,AEM00041217,24.433,26.8,United Arab Emirates
6,AEM00041218,24.262,264.9,United Arab Emirates
7,AF000040930,35.317,3366.0,Afghanistan
8,AFM00040911,36.7,378.0,Afghanistan
9,AFM00040938,34.21,977.2,Afghanistan


In [21]:
# (rows, columns) of the left-merged frame.
stations.shape

(27474, 7)

In [22]:
# For the two countries without stations, count missing values per column:
# the station columns should be empty, the country columns populated.
stations[stations['countryid'].isin(['LQ', 'ST'])].isna().sum()

countryid     0
country       0
locationid    2
latitude      2
longitude     2
stnelev       2
station       2
dtype: int64