# 25 Pandas Tricks
From Kevin at Data School
<br>
https://www.youtube.com/watch?v=RlIiVeig3hc

In [5]:
import os
from glob import glob

import numpy as np
import pandas as pd

In [6]:
# Load the example datasets used throughout the notebook (all fetched over HTTP).
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
movies = pd.read_csv('http://bit.ly/imdbratings')
orders = pd.read_csv('http://bit.ly/chiporders', sep='\t')
# Strip the dollar sign as a literal character: regex=False guarantees '$' is
# never interpreted as the end-of-string anchor, whatever the pandas default is.
orders['item_price'] = orders.item_price.str.replace('$', '', regex=False).astype('float')
stocks = pd.read_csv('http://bit.ly/smallstocks', parse_dates=['Date'])
titanic = pd.read_csv('http://bit.ly/kaggletrain')
ufo = pd.read_csv('http://bit.ly/uforeports', parse_dates=['Time'])

**1. Show installed versions**

In [7]:
pd.__version__

'0.24.2'

In [8]:
pd.show_versions()


INSTALLED VERSIONS
------------------
commit: None
python: 3.7.3.final.0
python-bits: 64
OS: Linux
OS-release: 5.0.0-37-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8

pandas: 0.24.2
pytest: 4.3.1
pip: 19.0.3
setuptools: 40.8.0
Cython: 0.29.6
numpy: 1.16.2
scipy: 1.2.1
pyarrow: None
xarray: None
IPython: 7.4.0
sphinx: 1.8.5
patsy: 0.5.1
dateutil: 2.8.0
pytz: 2018.9
blosc: None
bottleneck: 1.2.1
tables: 3.5.1
numexpr: 2.6.9
feather: None
matplotlib: 3.0.3
openpyxl: 2.6.1
xlrd: 1.2.0
xlwt: 1.3.0
xlsxwriter: 1.1.5
lxml.etree: 4.3.2
bs4: 4.7.1
html5lib: 1.0.1
sqlalchemy: 1.3.1
pymysql: None
psycopg2: 2.7.6.1 (dt dec pq3 ext lo64)
jinja2: 2.10
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None
gcsfs: None


**2. Create an example DataFrame**

In [27]:
# Build a small frame from a dict mapping each column label to its values.
df = pd.DataFrame(
    {
        'col one': [100, 200],
        'col two': [300, 400],
    }
)
df

Unnamed: 0,col one,col two
0,100,300
1,200,400


In [10]:
pd.DataFrame(np.random.rand(4,8))

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.82307,0.109714,0.493437,0.42357,0.149847,0.525941,0.921783,0.670169
1,0.018844,0.499902,0.956774,0.308028,0.573294,0.999051,0.861936,0.722586
2,0.613682,0.858237,0.959239,0.357055,0.699301,0.151704,0.276886,0.138323
3,0.839396,0.745525,0.205082,0.537481,0.116064,0.470845,0.664668,0.614291


In [15]:
pd.DataFrame(np.random.rand(4,8), columns=list('abcdefgh'))

Unnamed: 0,a,b,c,d,e,f,g,h
0,0.112133,0.649338,0.768312,0.943106,0.670004,0.729593,0.227514,0.267639
1,0.91605,0.702997,0.521782,0.777097,0.434498,0.91231,0.553917,0.411501
2,0.420433,0.711815,0.728395,0.65291,0.375788,0.805566,0.710373,0.311225
3,0.728516,0.48714,0.088293,0.133865,0.790623,0.945295,0.221959,0.656236


**3. Rename Columns**

In [28]:
# original
df

Unnamed: 0,col one,col two
0,100,300
1,200,400


In [20]:
# method 1: rename() with an old-label -> new-label mapping; the renamed frame
# is returned and df itself is not modified
df.rename(columns={'col one': 'col_one', 'col two': 'col_two'})

Unnamed: 0,col_one,col_two
0,100,300
1,200,400


In [29]:
# original
df

Unnamed: 0,col one,col two
0,100,300
1,200,400


In [22]:
# method 2: overwrite every column label at once by assigning a new list to
# df.columns (this changes df in place)
df.columns = ['col_one', 'col_two']

In [23]:
df

Unnamed: 0,col_one,col_two
0,100,300
1,200,400


In [30]:
# original -- rebuilt here because an earlier cell overwrote df.columns in place,
# so on a top-to-bottom run df would no longer carry the space-separated labels
df = pd.DataFrame({'col one': [100, 200], 'col two': [300, 400]})
df

Unnamed: 0,col one,col two
0,100,300
1,200,400


In [32]:
# method 3: swap spaces for underscores in every column label via the .str accessor
df.columns = df.columns.str.replace(pat=' ', repl='_')
df

Unnamed: 0,col_one,col_two
0,100,300
1,200,400


In [33]:
df.add_prefix('X_')

Unnamed: 0,X_col_one,X_col_two
0,100,300
1,200,400


In [35]:
df.add_suffix('_Y')

Unnamed: 0,col_one_Y,col_two_Y
0,100,300
1,200,400


**4. Reverse row order** 

In [36]:
drinks.head(5)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [37]:
drinks.loc[::-1].head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
192,Zimbabwe,64,18,4,4.7,Africa
191,Zambia,32,19,4,2.5,Africa
190,Yemen,6,0,0,0.1,Asia
189,Vietnam,111,2,1,2.0,Asia
188,Venezuela,333,100,3,7.7,South America


In [38]:
drinks.loc[::-1].reset_index(drop=True).head() # ::-1 means reverse order

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Zimbabwe,64,18,4,4.7,Africa
1,Zambia,32,19,4,2.5,Africa
2,Yemen,6,0,0,0.1,Asia
3,Vietnam,111,2,1,2.0,Asia
4,Venezuela,333,100,3,7.7,South America


**5. Reverse column order**

In [39]:
drinks.loc[:, ::-1].head() # ::-1 means reverse order

Unnamed: 0,continent,total_litres_of_pure_alcohol,wine_servings,spirit_servings,beer_servings,country
0,Asia,0.0,0,0,0,Afghanistan
1,Europe,4.9,54,132,89,Albania
2,Africa,0.7,14,0,25,Algeria
3,Europe,12.4,312,138,245,Andorra
4,Africa,5.9,45,57,217,Angola


**6. Select columns by data type**

In [40]:
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [41]:
drinks.select_dtypes(include='number').head()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,0,0,0,0.0
1,89,132,54,4.9
2,25,0,14,0.7
3,245,138,312,12.4
4,217,57,45,5.9


In [42]:
drinks.select_dtypes(include='object').head()

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa


In [43]:
drinks.select_dtypes(include=['number', 'object', 'category', 'datetime']).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [44]:
drinks.select_dtypes(exclude = 'number').head()

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa


**7. Convert strings to numbers**

In [45]:
# Every value is stored as a string; the '-' in col_three is the one entry
# that cannot be parsed as a number.
df = pd.DataFrame({
    'col_one': ['1.1', '2.2', '3.3'],
    'col_two': ['4.4', '5.5', '6.6'],
    'col_three': ['7.7', '8.8', '-'],
})
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.4,7.7
1,2.2,5.5,8.8
2,3.3,6.6,-


In [46]:
df.dtypes

col_one      object
col_two      object
col_three    object
dtype: object

In [47]:
df.astype({'col_one':'float', 'col_two':'float'}).dtypes

col_one      float64
col_two      float64
col_three     object
dtype: object

In [48]:
pd.to_numeric(df.col_three, errors='coerce')

0    7.7
1    8.8
2    NaN
Name: col_three, dtype: float64

In [52]:
pd.to_numeric(df.col_three, errors='coerce').fillna(0)

0    7.7
1    8.8
2    0.0
Name: col_three, dtype: float64

In [55]:
# Convert the entire DataFrame in one pass.
df = (
    df
    .apply(pd.to_numeric, errors='coerce')  # unparseable entries become NaN
    .fillna(0)                              # ...which are then replaced by 0
)
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.4,7.7
1,2.2,5.5,8.8
2,3.3,6.6,0.0


In [57]:
df.dtypes # now they are all floats

col_one      float64
col_two      float64
col_three    float64
dtype: object

**8. Reduce DataFrame Size**

In [58]:
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 30.4 KB


In [60]:
# Load only the columns listed in `cols` rather than the whole file, then
# report the deep memory footprint of the result.
cols = ['beer_servings', 'continent']
small_drinks = pd.read_csv(
    'http://bit.ly/drinksbycountry',
    usecols=cols,
)
small_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
beer_servings    193 non-null int64
continent        193 non-null object
dtypes: int64(1), object(1)
memory usage: 13.6 KB


In [63]:
# Reading the object column as 'category' cuts memory usage further.
dtypes = {'continent': 'category'}
smaller_drinks = pd.read_csv(
    'http://bit.ly/drinksbycountry',
    usecols=cols,
    dtype=dtypes,
)
smaller_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
beer_servings    193 non-null int64
continent        193 non-null category
dtypes: category(1), int64(1)
memory usage: 2.3 KB


**9. Build a DataFrame from multiple files (row-wise)**

In [65]:
stocks

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
3,2016-10-04,113.0,29736800,AAPL
4,2016-10-04,57.24,20085900,MSFT
5,2016-10-04,31.35,18460400,CSCO
6,2016-10-05,57.64,16726400,MSFT
7,2016-10-05,31.59,11808600,CSCO
8,2016-10-05,113.05,21453100,AAPL


In [70]:
# Make sure the output folder exists first: to_csv does not create missing
# directories, so this cell would fail on a fresh checkout without data/.
os.makedirs('data', exist_ok=True)
# The row index is written as the first (unnamed) CSV column, which is why the
# files show an 'Unnamed: 0' column when they are read back.
stocks.loc[stocks.Date=='2016-10-03', :].to_csv('data/stocks1.csv')

In [71]:
# Save the rows dated 2016-10-04 to their own CSV file.
stocks[stocks.Date == '2016-10-04'].to_csv('data/stocks2.csv')

In [72]:
# Save the rows dated 2016-10-05 to their own CSV file.
stocks[stocks.Date == '2016-10-05'].to_csv('data/stocks3.csv')

In [78]:
# Read the first file back and discard the saved index column.
pd.read_csv('data/stocks1.csv').drop(columns='Unnamed: 0')

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT


In [79]:
# Read the second file back and discard the saved index column.
pd.read_csv('data/stocks2.csv').drop(columns='Unnamed: 0')

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-04,113.0,29736800,AAPL
1,2016-10-04,57.24,20085900,MSFT
2,2016-10-04,31.35,18460400,CSCO


In [80]:
# Read the third file back and discard the saved index column.
pd.read_csv('data/stocks3.csv').drop(columns='Unnamed: 0')

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-05,57.64,16726400,MSFT
1,2016-10-05,31.59,11808600,CSCO
2,2016-10-05,113.05,21453100,AAPL


In [81]:
# Collect the per-day CSV paths (glob is imported in the notebook's import cell).
# sorted() pins the order, because glob() returns matches in arbitrary order.
stock_files = sorted(glob('data/stocks*.csv'))
stock_files

['data/stocks1.csv', 'data/stocks2.csv', 'data/stocks3.csv']

In [82]:
# Stack the per-file frames row-wise; each piece keeps its own 0..n-1 index.
pd.concat([pd.read_csv(csv_path) for csv_path in stock_files])

Unnamed: 0.1,Unnamed: 0,Date,Close,Volume,Symbol
0,0,2016-10-03,31.5,14070500,CSCO
1,1,2016-10-03,112.52,21701800,AAPL
2,2,2016-10-03,57.42,19189500,MSFT
0,3,2016-10-04,113.0,29736800,AAPL
1,4,2016-10-04,57.24,20085900,MSFT
2,5,2016-10-04,31.35,18460400,CSCO
0,6,2016-10-05,57.64,16726400,MSFT
1,7,2016-10-05,31.59,11808600,CSCO
2,8,2016-10-05,113.05,21453100,AAPL


In [83]:
# Same stacking, but ignore_index=True renumbers the rows 0..N-1 across all files.
pd.concat([pd.read_csv(csv_path) for csv_path in stock_files], ignore_index=True)

Unnamed: 0.1,Unnamed: 0,Date,Close,Volume,Symbol
0,0,2016-10-03,31.5,14070500,CSCO
1,1,2016-10-03,112.52,21701800,AAPL
2,2,2016-10-03,57.42,19189500,MSFT
3,3,2016-10-04,113.0,29736800,AAPL
4,4,2016-10-04,57.24,20085900,MSFT
5,5,2016-10-04,31.35,18460400,CSCO
6,6,2016-10-05,57.64,16726400,MSFT
7,7,2016-10-05,31.59,11808600,CSCO
8,8,2016-10-05,113.05,21453100,AAPL


In [84]:
# Stack the files, renumber the rows, and discard the saved index column.
pd.concat(
    [pd.read_csv(csv_path) for csv_path in stock_files],
    ignore_index=True,
).drop(columns='Unnamed: 0')

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
3,2016-10-04,113.0,29736800,AAPL
4,2016-10-04,57.24,20085900,MSFT
5,2016-10-04,31.35,18460400,CSCO
6,2016-10-05,57.64,16726400,MSFT
7,2016-10-05,31.59,11808600,CSCO
8,2016-10-05,113.05,21453100,AAPL


**10. Build a DataFrame from multiple files (column-wise)**