# Modern Pandas (Part 4)

* see: https://github.com/TomAugspurger/effective-pandas
* see: https://tomaugspurger.github.io/modern-4-performance

In [3]:
%matplotlib inline

import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn style/context applied to any figures drawn in this notebook.
sns.set_style('ticks')
sns.set_context('talk')
# Cap displayed DataFrames/Series at 10 rows so outputs stay short.
pd.options.display.max_rows = 10

# Constructors

In [4]:
# glob returns matches in arbitrary, filesystem-dependent order; sort so that
# anything built by iterating over `files` has the same order on every run.
files = sorted(glob.glob("weather/*.csv"))

In [6]:
len(files)

51

In [8]:
# Column names passed as `names=` when reading each weather CSV.
columns = [
    'station', 'date',
    'tmpf', 'relh', 'sped', 'mslp', 'p01i', 'vsby', 'gust_mph',
    'skyc1', 'skyc2', 'skyc3',
]

In [20]:
%%time
weather = pd.DataFrame(columns=columns)

for fp in files:
    city = pd.read_csv(fp, names=columns)
    weather = pd.concat([weather, city])



CPU times: user 12.3 s, sys: 5.02 s, total: 17.3 s
Wall time: 18.7 s


In [21]:
weather.head()

Unnamed: 0,station,date,tmpf,relh,sped,mslp,p01i,vsby,gust_mph,skyc1,skyc2,skyc3
0,station,date,tmpf,relh,sped,mslp,p01i,vsby,gust_mph,skyc1,skyc2,skyc3
1,BXK,2014-01-01 00:15:00,62.6,31.56,0.0,,0.0,10.0,,CLR,M,M
2,BXK,2014-01-01 00:35:00,62.6,31.56,0.0,,0.0,10.0,,CLR,M,M
3,BXK,2014-01-01 00:55:00,59.0,35.87,0.0,,0.0,6.0,,CLR,M,M
4,BXK,2014-01-01 01:15:00,57.2,38.26,0.0,,0.0,10.0,,CLR,M,M


In [22]:
weather.shape

(3303698, 12)

In [23]:
%%time

weather_dfs = [pd.read_csv(fp, names=columns) for fp in files]
weather = pd.concat(weather_dfs)

  call = lambda f, *a, **k: f(*a, **k)


CPU times: user 5.04 s, sys: 1.33 s, total: 6.37 s
Wall time: 7.49 s


In [24]:
weather.head()

Unnamed: 0,station,date,tmpf,relh,sped,mslp,p01i,vsby,gust_mph,skyc1,skyc2,skyc3
0,station,date,tmpf,relh,sped,mslp,p01i,vsby,gust_mph,skyc1,skyc2,skyc3
1,BXK,2014-01-01 00:15:00,62.6,31.56,0.0,,0.0,10.0,,CLR,M,M
2,BXK,2014-01-01 00:35:00,62.6,31.56,0.0,,0.0,10.0,,CLR,M,M
3,BXK,2014-01-01 00:55:00,59.0,35.87,0.0,,0.0,6.0,,CLR,M,M
4,BXK,2014-01-01 01:15:00,57.2,38.26,0.0,,0.0,10.0,,CLR,M,M


In [25]:
weather.shape

(3303698, 12)

# Iteration, Apply, And Vectorization

In [26]:
# Load the on-time flight extract with default parsing options.
# NOTE(review): the displayed frame ends with an empty 'Unnamed: 32' column,
# presumably from a trailing delimiter on each line of the CSV — confirm, and
# drop it if it is not needed.
df = pd.read_csv('data/756883233_T_ONTIME.csv')

In [27]:
df.head()

Unnamed: 0,FL_DATE,UNIQUE_CARRIER,AIRLINE_ID,TAIL_NUM,FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,...,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32
0,2017-01-01,AA,19805,N787AA,1,12478,1247803,31703,JFK,"New York, NY",...,1209.0,27.0,0.0,,27.0,0.0,0.0,0.0,0.0,
1,2017-01-01,AA,19805,N783AA,2,12892,1289204,32575,LAX,"Los Angeles, CA",...,1809.0,42.0,0.0,,34.0,0.0,8.0,0.0,0.0,
2,2017-01-01,AA,19805,N791AA,4,12892,1289204,32575,LAX,"Los Angeles, CA",...,2040.0,42.0,0.0,,7.0,0.0,0.0,0.0,35.0,
3,2017-01-01,AA,19805,N391AA,5,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",...,1749.0,97.0,0.0,,77.0,0.0,20.0,0.0,0.0,
4,2017-01-01,AA,19805,N346AA,6,13830,1383002,33830,OGG,"Kahului, HI",...,642.0,42.0,0.0,,0.0,0.0,42.0,0.0,0.0,


In [28]:
# Departure-delay column as a Series; used by the nlargest / sort_values comparison.
delays = df['DEP_DELAY']

In [30]:
delays.to_frame().head()

Unnamed: 0,DEP_DELAY
0,31.0
1,34.0
2,51.0
3,77.0
4,0.0


In [31]:
# Five largest delays via nlargest, then sorted ascending so the result lines
# up with the sort_values(...).tail(5) variant.
delays.nlargest(5).sort_values()

116202    1480.0
152242    1545.0
147971    1934.0
70194     1970.0
50292     2755.0
Name: DEP_DELAY, dtype: float64

In [41]:
# Same five values via a full sort. na_position='first' places NaNs at the
# front, so they cannot occupy the tail.
delays.sort_values(na_position='first').tail(5)

116202    1480.0
152242    1545.0
147971    1934.0
70194     1970.0
50292     2755.0
Name: DEP_DELAY, dtype: float64

In [34]:
# Time the nlargest-based selection of the five largest delays.
%timeit delays.nlargest(5).sort_values()

9.22 ms ± 416 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [42]:
# Time the full-sort alternative that yields the same five values.
%timeit delays.sort_values(na_position='first').tail(5)

52 ms ± 4.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [44]:
from utils import download_airports
import zipfile

In [45]:
# Fetch the airports archive only when it is not already on disk.
# NOTE(review): the recorded run of this cell ended in a FileNotFoundError for
# 'data/airports.csv.zip', so download_airports() (imported from utils, not
# visible here) presumably did not produce the file — confirm before relying on it.
if not os.path.exists("data/airports.csv.zip"):
    download_airports()

FileNotFoundError: [Errno 2] No such file or directory: 'data/airports.csv.zip'

TODO: download the airports data (`data/airports.csv.zip`); the cell above currently fails with a `FileNotFoundError`.

# Categoricals

In [46]:
import string

# 100,000 letters drawn at random from the 52 ASCII letters. A locally seeded
# RandomState makes the sample identical on every run without touching
# NumPy's global random state.
s = pd.Series(np.random.RandomState(0).choice(list(string.ascii_letters), 100000))
# index=False: report the values only, leaving the index out of the total.
print('{:0.2f} KB'.format(s.memory_usage(index=False) / 1000))

800.00 KB


In [47]:
# Convert the Series to categorical dtype and print its memory footprint
# (index excluded) in the same format as for `s`, for comparison.
c = s.astype('category')
print('{:0.2f} KB'.format(c.memory_usage(index=False) / 1000))

102.98 KB
