<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas-and-pyarrow-and-adjust-the-display" data-toc-modified-id="Import-pandas-and-pyarrow-and-adjust-the-display-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas and pyarrow and adjust the display</a></span></li><li><span><a href="#Load-the-land-temperatures-CSV-file-into-pandas,-drop-rows-with-missing-data,-and-set-an-index" data-toc-modified-id="Load-the-land-temperatures-CSV-file-into-pandas,-drop-rows-with-missing-data,-and-set-an-index-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load the land temperatures CSV file into pandas, drop rows with missing data, and set an index</a></span></li><li><span><a href="#Write-extreme-values-for-temperature-to-CSV-and-Excel-files" data-toc-modified-id="Write-extreme-values-for-temperature-to-CSV-and-Excel-files-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Write extreme values for temperature to CSV and Excel files</a></span></li><li><span><a href="#Save-to-pickle-and-feather-files" data-toc-modified-id="Save-to-pickle-and-feather-files-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Save to pickle and feather files</a></span></li><li><span><a href="#Load-the-pickle-and-feather-files-we-just-saved" data-toc-modified-id="Load-the-pickle-and-feather-files-we-just-saved-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Load the pickle and feather files we just saved</a></span></li></ul></div>

# Import pandas and pyarrow and adjust the display

In [1]:
# import pandas and pyarrow
import pandas as pd
import pyarrow

In [2]:
# Optional display settings, left disabled (commented out) in this run:
#   - float_format: render floats with thousands separators and two decimals
#   - display.width: limit the display width to 68 characters
#   - display.max_columns: show at most 3 columns
# pd.options.display.float_format = '{:,.2f}'.format
# pd.set_option('display.width', 68)
# pd.set_option('display.max_columns', 3)

In [3]:
# Import the module itself as well as loading the extension; presumably this is
# why watermark's own version shows up in the -iv report below.
import watermark
# Register the %watermark line magic with IPython.
%load_ext watermark

# Record the environment this notebook ran in: -v prints the Python and IPython
# versions, -g the current Git hash, -iv the versions of imported modules.
# NOTE(review): -n is expected to add the date with day/month names — confirm
# against the watermark documentation.
%watermark -n -v -g -iv

Python implementation: CPython
Python version       : 3.7.9
IPython version      : 7.20.0

Git hash: 8a3e97ca83bad4351eec3ea3d9f803908bc0100f

pandas   : 1.2.1
json     : 2.0.9
watermark: 2.1.0
pyarrow  : 0.13.0



# Load the land temperatures CSV file into pandas, drop rows with missing data, and set an index

In [4]:
# Read the land temperature sample. The first line of the file is skipped
# (presumably the header row) and the column names are supplied explicitly.
# The nested parse_dates list merges the 'month' and 'year' columns into a
# single parsed column named 'month_year'.
# NOTE(review): combining columns through parse_dates is deprecated in newer
# pandas releases — confirm before upgrading from the pinned version.
landtemps = pd.read_csv(
    'data/landtempssample.csv',
    skiprows=1,
    names=[
        'stationid',
        'year',
        'month',
        'avgtemp',
        'latitude',
        'longitude',
        'elevation',
        'station',
        'countryid',
        'country',
    ],
    parse_dates=[['month', 'year']],
    low_memory=False,
)

In [5]:
# Give the combined date column a meaningful name. Reassign the result instead
# of using inplace=True so the step reads as an explicit transformation.
landtemps = landtemps.rename(columns={'month_year': 'measuredate'})

In [6]:
# Drop rows that have no temperature reading. Reassign the result instead of
# using inplace=True; rows with missing values in other columns are kept.
landtemps = landtemps.dropna(subset=['avgtemp'])

In [7]:
# Check the column dtypes; measuredate should have been parsed as a datetime.
landtemps.dtypes

measuredate    datetime64[ns]
stationid              object
avgtemp               float64
latitude              float64
longitude             float64
elevation             float64
station                object
countryid              object
country                object
dtype: object

In [8]:
# Index the frame by (measuredate, stationid). Reassign the result instead of
# using inplace=True. Re-running this cell on its own raises a KeyError because
# the two columns have already been moved into the index.
landtemps = landtemps.set_index(['measuredate', 'stationid'])

In [9]:
# Preview the first five rows of the indexed frame.
landtemps.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,avgtemp,latitude,longitude,elevation,station,countryid,country
measuredate,stationid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-04-01,USS0010K01S,5.27,39.9,-110.75,2773.7,INDIAN_CANYON,US,United States
1940-05-01,CI000085406,18.04,-18.35,-70.333,58.0,ARICA,CI,Chile
2013-12-01,USC00036376,6.22,34.3703,-91.1242,61.0,SAINT_CHARLES,US,United States
1963-02-01,ASN00024002,22.93,-34.2833,140.6,65.5,BERRI_IRRIGATION,AS,Australia
1991-04-01,USW00024151,5.59,42.1492,-112.2872,1362.5,MALAD_CITY,US,United States


# Write extreme values for temperature to CSV and Excel files

In [10]:
# Keep only the outlier rows: avgtemp strictly below the 0.001 quantile or
# strictly above the 0.999 quantile (the 1-in-1,000 level at each tail).

extremevals = landtemps.loc[
    lambda df: df['avgtemp'].lt(df['avgtemp'].quantile(.001))
    | df['avgtemp'].gt(df['avgtemp'].quantile(.999))
]

In [11]:
# Number of outlier rows and data columns (the two index levels are not counted).
extremevals.shape

(171, 7)

In [12]:
# Show seven randomly chosen outlier rows. random_state pins the draw so that
# Restart & Run All displays the same rows every time.
extremevals.sample(n=7, random_state=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,avgtemp,latitude,longitude,elevation,station,countryid,country
measuredate,stationid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-06-01,MUM00041304,34.99,18.133,55.183,273.0,MARMUL,MU,Oman
2001-12-01,RSM00024871,-35.82,61.867,135.5,141.0,OHOTSKIJPEREVOZ,RS,Russia
1984-01-01,RSM00031054,-35.05,59.183,135.15,212.0,UST_JUDOMA,RS,Russia
1938-12-01,RSM00025138,-35.35,68.12,164.17,98.0,OSTROVNOE,RS,Russia
2001-05-01,MUM00041262,35.48,22.35,56.483,170.0,FAHUD_AUT,MU,Oman
2018-09-01,AYM00089606,-63.35,-78.45,106.867,3488.0,VOSTOK,AY,Antarctica
2018-09-01,AYW00090001,-62.4,-90.0,0.0,9999.0,AMUNDSEN_SCOTT,AY,Antarctica


In [13]:
# Write the outlier rows to an Excel workbook under views/.
extremevals.to_excel(excel_writer='views/tempext.xlsx')

In [14]:
# Write the same outlier rows to a CSV file under views/.
extremevals.to_csv(path_or_buf='views/tempext.csv')

# Save to pickle and feather files

In [15]:
# Persist the full frame as a pickle. At this point it still carries the
# (measuredate, stationid) index, so the index is saved along with the data.
landtemps.to_pickle(path='data/landtemps.pkl')

In [16]:
# Move measuredate and stationid back into ordinary columns; to_feather is
# documented as requiring a default index. Reassign the result instead of
# using inplace=True.
landtemps = landtemps.reset_index()

In [17]:
# Feather export is disabled in this environment because pandas rejects the
# installed pyarrow with the message below. Upgrade pyarrow, then uncomment the
# to_feather line to write the file.
# Pandas requires version '0.15.0' or newer of 'pyarrow' (version '0.13.0' currently installed).
# landtemps.to_feather('data/landtemps.ftr')

# Load the pickle and feather files we just saved

In [18]:
# Reload the pickled frame, replacing the in-memory one. The pickle was written
# before the index was reset, so the reloaded frame has the
# (measuredate, stationid) index again.
landtemps = pd.read_pickle(filepath_or_buffer='data/landtemps.pkl')

In [19]:
# Show the first two rows transposed, so each column reads as a row.
landtemps.head(n=2).transpose()

measuredate,2000-04-01,1940-05-01
stationid,USS0010K01S,CI000085406
avgtemp,5.27,18.04
latitude,39.9,-18.35
longitude,-110.75,-70.333
elevation,2773.7,58.0
station,INDIAN_CANYON,ARICA
countryid,US,CI
country,United States,Chile


In [20]:
# Feather import is disabled in this environment because pandas rejects the
# installed pyarrow with the message below. Upgrade pyarrow, then uncomment the
# lines that follow to load and preview the feather file.
# Pandas requires version '0.15.0' or newer of 'pyarrow' (version '0.13.0' currently installed).
# landtemps = pd.read_feather('data/landtemps.ftr')
# landtemps.head(2).T