In [2]:
import pandas as pd
import pyarrow

In [6]:
# Load the land temperature sample. The file's own header row is skipped
# (skiprows=1) and replaced by the column names listed here.
column_names = [
    'stationid', 'year', 'month', 'avgtemp',
    'latitude', 'longitude', 'elevation',
    'station', 'countryid', 'country',
]
landtemps = pd.read_csv(
    'data/landtempssample.csv',
    names=column_names,
    skiprows=1,
    low_memory=False,
)

# Combine year and month into a single datetime column; the day is fixed at 1.
date_parts = landtemps[['year', 'month']].assign(day=1)
landtemps['measuredate'] = pd.to_datetime(date_parts)

# The separate year/month columns are now redundant, and rows without a
# temperature reading are discarded.
landtemps = landtemps.drop(columns=['month', 'year'])
landtemps = landtemps.dropna(subset=['avgtemp'])

landtemps.dtypes

stationid              object
avgtemp               float64
latitude              float64
longitude             float64
elevation             float64
station                object
countryid              object
country                object
measuredate    datetime64[ns]
dtype: object

In [7]:
# Move measuredate and stationid out of the columns and into a two-level row index.
landtemps = landtemps.set_index(keys=['measuredate', 'stationid'])

In [8]:
# Preview the first five rows of the re-indexed frame.
landtemps.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,avgtemp,latitude,longitude,elevation,station,countryid,country
measuredate,stationid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-04-01,USS0010K01S,5.27,39.9,-110.75,2773.7,INDIAN_CANYON,US,United States
1940-05-01,CI000085406,18.04,-18.35,-70.333,58.0,ARICA,CI,Chile
2013-12-01,USC00036376,6.22,34.3703,-91.1242,61.0,SAINT_CHARLES,US,United States
1963-02-01,ASN00024002,22.93,-34.2833,140.6,65.5,BERRI_IRRIGATION,AS,Australia
1991-04-01,USW00024151,5.59,42.1492,-112.2872,1362.5,MALAD_CITY,US,United States


In [9]:
# Keep only the readings strictly below the 0.1th percentile or strictly
# above the 99.9th percentile of avgtemp.
low_cutoff = landtemps.avgtemp.quantile(.001)
high_cutoff = landtemps.avgtemp.quantile(.999)
is_extreme = (landtemps.avgtemp < low_cutoff) | (landtemps.avgtemp > high_cutoff)
extremevals = landtemps[is_extreme]
extremevals.shape

(171, 7)

In [10]:
# Show seven randomly chosen extreme readings. The fixed random_state makes
# the sample reproducible, so Restart & Run All displays the same rows each
# time; change the seed to look at a different draw.
extremevals.sample(7, random_state=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,avgtemp,latitude,longitude,elevation,station,countryid,country
measuredate,stationid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-06-01,SAM00041140,34.8,16.901,42.586,6.1,KING_ABDULLAH_BIN_ABDULAZIZ,SA,Saudi Arabia
2015-05-01,SAM00041030,35.2,21.483,39.833,310.0,MAKKAH_MECCA,SA,Saudi Arabia
1959-12-01,RSM00024639,-36.52,63.283,118.333,119.0,NJURBA,RS,Russia
1998-02-01,RSM00025325,-35.71,66.55,159.42,127.0,UST_OLOJ,RS,Russia
1980-07-01,ER000063043,36.38,13.067,42.717,14.0,ASSAB,ER,Eritrea
1987-07-01,SUM00062700,35.5,16.7,33.4333,360.0,SHENDI,SU,Sudan
1965-12-01,RSXLT854187,-35.59,70.6667,154.15,2.0,ALAZEYA,RS,Russia


In [12]:
# Write the extreme readings out for review: first as an Excel workbook,
# then as a CSV file.
excel_path = 'views/tempext.xlsx'
csv_path = 'views/tempext.csv'
extremevals.to_excel(excel_path)
extremevals.to_csv(csv_path)

In [13]:
# Persist the frame in two formats.
# - Pickle stores the frame exactly as it is, index included.
# - Feather requires a default index, so it is given a copy whose index
#   levels have been moved back into ordinary columns.
# The reset is done on a temporary copy rather than in place, so `landtemps`
# is not modified here and re-running this cell writes the same two files.
landtemps.to_pickle('data/landtemps.pkl')
landtemps.reset_index().to_feather('data/landtemps.ftr')

In [15]:
# Reload the pickled frame and preview its first two rows.
landtempspkl = pd.read_pickle('data/landtemps.pkl')
# The preview is transposed (rows become columns and columns become rows)
# so each field is listed on its own line.
landtempspkl.head(n=2).transpose()

measuredate,2000-04-01,1940-05-01
stationid,USS0010K01S,CI000085406
avgtemp,5.27,18.04
latitude,39.9,-18.35
longitude,-110.75,-70.333
elevation,2773.7,58.0
station,INDIAN_CANYON,ARICA
countryid,US,CI
country,United States,Chile


In [16]:
# Reload the feather file and preview its first two rows, transposed so
# each field is listed on its own line.
landtempsftr = pd.read_feather('data/landtemps.ftr')
landtempsftr.head(n=2).transpose()

Unnamed: 0,0,1
measuredate,2000-04-01 00:00:00,1940-05-01 00:00:00
stationid,USS0010K01S,CI000085406
avgtemp,5.27,18.04
latitude,39.9,-18.35
longitude,-110.75,-70.333
elevation,2773.7,58.0
station,INDIAN_CANYON,ARICA
countryid,US,CI
country,United States,Chile
