<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas-and-NumPy,-as-well-as-the-os-module" data-toc-modified-id="Import-pandas-and-NumPy,-as-well-as-the-os-module-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas and NumPy, as well as the os module</a></span></li><li><span><a href="#Load-the-data-from-Cameroon-and-Poland" data-toc-modified-id="Load-the-data-from-Cameroon-and-Poland-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load the data from Cameroon and Poland</a></span></li><li><span><a href="#Concatenate-the-Cameroon-and-Poland-data" data-toc-modified-id="Concatenate-the-Cameroon-and-Poland-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Concatenate the Cameroon and Poland data</a></span></li><li><span><a href="#Concatenate-all-the-country-data-files" data-toc-modified-id="Concatenate-all-the-country-data-files-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Concatenate all the country data files</a></span></li><li><span><a href="#Show-some-of-the-combined-data" data-toc-modified-id="Show-some-of-the-combined-data-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Show some of the combined data</a></span></li><li><span><a href="#Check-the-values-in-the-concatenated-data" data-toc-modified-id="Check-the-values-in-the-concatenated-data-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Check the values in the concatenated data</a></span></li><li><span><a href="#Fix-the-missing-values" data-toc-modified-id="Fix-the-missing-values-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Fix the missing values</a></span></li></ul></div>

# Import pandas and NumPy, as well as the os module

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Optional pandas display settings, left disabled; uncomment a line to apply it
# to the rendered output of the cells below.
# pd.set_option('display.width', 200)
# pd.set_option('display.max_columns', 35)
# pd.set_option('display.max_rows', 50)
# pd.options.display.float_format = '{:,.0f}'.format

In [3]:
# Record the environment this notebook was run in.
# NOTE(review): the explicit import is presumably what makes watermark's own
# version show up in the -iv listing below — confirm before removing it.
import watermark
%load_ext watermark

# -iv reports the versions of the imported packages (see the output below).
# NOTE(review): -n and -i are assumed to add date/time information — confirm
# against the installed watermark version.
%watermark -n -i -iv

pandas   : 1.2.1
json     : 2.0.9
numpy    : 1.19.2
watermark: 2.1.0



# Load the data from Cameroon and Poland

In [4]:
# Read the two per-country temperature files that are combined by hand first.
cameroon_csv = 'data/ltcountry/ltcameroon.csv'
poland_csv = 'data/ltcountry/ltpoland.csv'
ltcameroon, ltpoland = pd.read_csv(cameroon_csv), pd.read_csv(poland_csv)

# Concatenate the Cameroon and Poland data

In [5]:
# (rows, columns) of the Cameroon data, for checking the row total after the concat.
ltcameroon.shape

(48, 11)

In [6]:
# (rows, columns) of the Poland data; the column count should match Cameroon's.
ltpoland.shape

(120, 11)

In [7]:
# Column names in the Poland data; these need to match Cameroon's for a clean row-wise concat.
ltpoland.columns

Index(['locationid', 'year', 'month', 'temperature', 'latitude', 'longitude',
       'elevation', 'station', 'countryid', 'country', 'latabs'],
      dtype='object')

In [8]:
# Column names in the Cameroon data, to compare against the Poland columns.
ltcameroon.columns

Index(['locationid', 'year', 'month', 'temperature', 'latitude', 'longitude',
       'elevation', 'station', 'countryid', 'country', 'latabs'],
      dtype='object')

In [9]:
# Stack the two country frames row-wise; each keeps its original index labels.
country_frames = [ltcameroon, ltpoland]
ltall = pd.concat(country_frames)

In [10]:
# Rows per country in the combined frame; these should equal the two shapes above.
country_col = ltall['country']
country_col.value_counts()

Poland      120
Cameroon     48
Name: country, dtype: int64

# Concatenate all the country data files

In [11]:
# Folder that is scanned for country CSV files in the next cell.
directory = 'data/ltcountry'
# Reset ltall so the two-country result built above is not carried over
# into the all-country data.
ltall = pd.DataFrame()

In [12]:
# Read every CSV in `directory`, report its row count and any column
# mismatches, then stack all of the frames with a single concat.
frames = []                # one DataFrame per file, concatenated once after the loop
reference_columns = None   # columns of the first file read; later files are compared to it

# os.listdir() returns names in arbitrary order; sorting keeps the row order of
# ltall (and therefore any seeded sample taken from it) reproducible.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.csv'):
        continue
    fileloc = os.path.join(directory, filename)
    # read_csv opens and closes the file itself, so no open() wrapper is needed
    ltnew = pd.read_csv(fileloc)
    print(filename + ' has ' + str(ltnew.shape[0]) + ' rows.')
    # check for differences in columns BEFORE combining: once a frame has been
    # concatenated its extra columns are part of the union and can no longer
    # be detected, so only missing columns would ever be reported
    if reference_columns is None:
        reference_columns = ltnew.columns
    columndiff = reference_columns.symmetric_difference(ltnew.columns)
    if (not columndiff.empty):
        print('',
              'Different column names for: ',
              filename,
              columndiff,
              sep='\n')
    frames.append(ltnew)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory holds no CSV files.
ltall = pd.concat(frames) if frames else pd.DataFrame()

ltbrazil.csv has 1104 rows.
ltcameroon.csv has 48 rows.
ltindia.csv has 1056 rows.
ltjapan.csv has 1800 rows.
ltmexico.csv has 852 rows.
ltoman.csv has 288 rows.

Different column names for: 
ltoman.csv
Index(['latabs'], dtype='object')
ltpoland.csv has 120 rows.


# Show some of the combined data

In [13]:
# Show five randomly chosen rows (fixed seed) from a handful of key columns.
preview_cols = ['country', 'station', 'month', 'temperature', 'latitude']
ltall[preview_cols].sample(n=5, random_state=1)

Unnamed: 0,country,station,month,temperature,latitude
717,Brazil,TAGUATINGA,8,27.75,-12.4
649,Japan,MATSUMOTO,5,17.25,36.25
172,Oman,BURAIMI_AUT,8,37.4,24.233
1029,India,JAIPUR_SANGANER,12,14.92,26.817
351,Mexico,SN_CRISTOBAL_LAS_CASASCHIS,5,19.3,16.75


# Check the values in the concatenated data

In [14]:
# Rows per country, ordered by country name so they line up with the
# per-file row counts printed while loading.
rows_per_country = ltall['country'].value_counts()
rows_per_country.sort_index()

Brazil      1104
Cameroon      48
India       1056
Japan       1800
Mexico       852
Oman         288
Poland       120
Name: country, dtype: int64

In [15]:
# Per-country summary of temperature and latabs; a count of 0 exposes a
# column that is entirely missing for that country.
summary_stats = ['min', 'mean', 'max', 'count']
by_country = ltall.groupby(['country'])
by_country.agg({col: summary_stats for col in ('temperature', 'latabs')})

Unnamed: 0_level_0,temperature,temperature,temperature,temperature,latabs,latabs,latabs,latabs
Unnamed: 0_level_1,min,mean,max,count,min,mean,max,count
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Brazil,12.2,25.470196,33.93,969,0.051,13.859272,33.517,1104
Cameroon,21.87,27.227941,36.0,34,4.02,7.78925,10.451,48
India,1.95,26.113477,37.4,1044,8.3,21.104516,34.083,1056
Japan,-7.15,15.495854,30.14,1797,24.3,35.664822,45.417,1800
Mexico,7.0,23.230397,34.45,806,14.917,22.114901,32.4,852
Oman,12.1,27.753463,38.39,205,,,,0
Poland,-3.85,10.46275,23.25,120,50.078,52.4985,54.75,120


# Fix the missing values

In [16]:
# The Oman file has no latabs column, so its rows are NaN after the concat.
# Use latitude for those rows and keep the existing latabs everywhere else.
# NOTE(review): presumably latabs is the absolute latitude, which equals
# latitude only while the Oman latitudes are non-negative — confirm.
oman_rows = ltall['country'].eq('Oman')
ltall['latabs'] = np.where(oman_rows, ltall['latitude'], ltall['latabs'])

In [17]:
# Re-run the per-country summary to confirm latabs is now populated for Oman.
check_stats = ['min', 'mean', 'max', 'count']
grouped = ltall.groupby(['country'])
grouped.agg({col: check_stats for col in ('temperature', 'latabs')})

Unnamed: 0_level_0,temperature,temperature,temperature,temperature,latabs,latabs,latabs,latabs
Unnamed: 0_level_1,min,mean,max,count,min,mean,max,count
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Brazil,12.2,25.470196,33.93,969,0.051,13.859272,33.517,1104
Cameroon,21.87,27.227941,36.0,34,4.02,7.78925,10.451,48
India,1.95,26.113477,37.4,1044,8.3,21.104516,34.083,1056
Japan,-7.15,15.495854,30.14,1797,24.3,35.664822,45.417,1800
Mexico,7.0,23.230397,34.45,806,14.917,22.114901,32.4,852
Oman,12.1,27.753463,38.39,205,16.933,21.997875,26.167,288
Poland,-3.85,10.46275,23.25,120,50.078,52.4985,54.75,120
