# Manipulating DataFrames with pandas

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

## 1. Extracting and transforming data

In [44]:
df = pd.read_csv('data/election2012.csv')
df = df.loc[df.state == 'PA']
df.head()

Unnamed: 0,state,county,Obama,Romney,winner,total,margin,turnout
2959,PA,Adams,35.5,63.1,Romney,41383,27.6,1.4
2960,PA,Allegheny,56.6,42.2,Obama,607455,14.4,1.2
2961,PA,Armstrong,30.7,67.9,Romney,27925,37.2,1.4
2962,PA,Beaver,46.0,52.6,Romney,78951,6.6,1.4
2963,PA,Bedford,22.1,77.0,Romney,21239,54.9,0.9


### indexing & slicing

In [45]:
election = df.set_index('county')
election.head()

Unnamed: 0_level_0,state,Obama,Romney,winner,total,margin,turnout
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Adams,PA,35.5,63.1,Romney,41383,27.6,1.4
Allegheny,PA,56.6,42.2,Obama,607455,14.4,1.2
Armstrong,PA,30.7,67.9,Romney,27925,37.2,1.4
Beaver,PA,46.0,52.6,Romney,78951,6.6,1.4
Bedford,PA,22.1,77.0,Romney,21239,54.9,0.9


In [25]:
p_counties = election.loc['Perry':'Potter']
p_counties

Unnamed: 0_level_0,state,Obama,Romney,winner,total,margin,turnout
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Perry,PA,29.8,68.6,Romney,17941,38.8,1.6
Philadelphia,PA,85.2,14.1,Obama,648864,71.1,0.7
Pike,PA,43.9,54.9,Romney,22883,11.0,1.2
Potter,PA,26.3,72.2,Romney,7091,45.9,1.5


In [26]:
p_counties = election.loc['Potter':'Perry':-1]   # reverse
p_counties

Unnamed: 0_level_0,state,Obama,Romney,winner,total,margin,turnout
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Potter,PA,26.3,72.2,Romney,7091,45.9,1.5
Pike,PA,43.9,54.9,Romney,22883,11.0,1.2
Philadelphia,PA,85.2,14.1,Obama,648864,71.1,0.7
Perry,PA,29.8,68.6,Romney,17941,38.8,1.6


### filtering

In [46]:
turnout_df = election[election['turnout'] < 1]
turnout_df

Unnamed: 0_level_0,state,Obama,Romney,winner,total,margin,turnout
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bedford,PA,22.1,77.0,Romney,21239,54.9,0.9
Philadelphia,PA,85.2,14.1,Obama,648864,71.1,0.7


In [47]:
low_margin = election[(election.margin < 3) & (election.turnout < 3)]
low_margin

Unnamed: 0_level_0,state,Obama,Romney,winner,total,margin,turnout
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Berks,PA,48.9,49.5,Romney,160752,0.6,1.6
Bucks,PA,50.0,48.8,Obama,315474,1.2,1.2
Centre,PA,48.9,49.0,Romney,67374,0.1,2.1
Chester,PA,49.2,49.7,Romney,245512,0.5,1.1
Mercer,PA,48.0,50.6,Romney,47386,2.6,1.4


In [40]:
# 득표율 차이가 3% 이내이면 winner 없음 처리.
too_close_row = election['margin'] < 3
election.loc[too_close_row, 'winner'] = np.nan
election.info()

<class 'pandas.core.frame.DataFrame'>
Index: 67 entries, Adams to York
Data columns (total 7 columns):
state      67 non-null object
Obama      67 non-null float64
Romney     67 non-null float64
winner     62 non-null object
total      67 non-null int64
margin     67 non-null float64
turnout    67 non-null float64
dtypes: float64(4), int64(1), object(2)
memory usage: 4.2+ KB


## Trasforming DataFrames

In [65]:
sales = pd.DataFrame({'month':['Jan','Feb','Mar','Apr','May','Jun'],
                  'eggs':[47,110,226,82,132,210],
                  'salt':[12,50,89,87,np.nan,60],
                  'spam':[17,31,72,20,52,55]})
sales = sales.set_index('month')
sales

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,226,89.0,72
Apr,82,87.0,20
May,132,,52
Jun,210,60.0,55


In [52]:
# convert to dozon unit function
def dozens(n):
    return n//12

In [66]:
sales.apply(dozens)

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


In [67]:
sales.apply(lambda n: n//12)

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


In [68]:
sales['dozen_spam'] = sales.spam.apply(dozens)
sales['salty_spam'] = sales.salt + sales.spam
sales

Unnamed: 0_level_0,eggs,salt,spam,dozen_spam,salty_spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Jan,47,12.0,17,1,29.0
Feb,110,50.0,31,2,81.0
Mar,226,89.0,72,6,161.0
Apr,82,87.0,20,1,107.0
May,132,,52,4,
Jun,210,60.0,55,4,115.0


### map() with a dictionary

In [58]:
red_vs_blue = {'Obama':'blue', 'Romney':'red'}
election['color'] = election.winner.map(red_vs_blue)

election.head()

Unnamed: 0_level_0,state,Obama,Romney,winner,total,margin,turnout,color
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adams,PA,35.5,63.1,Romney,41383,27.6,1.4,red
Allegheny,PA,56.6,42.2,Obama,607455,14.4,1.2,blue
Armstrong,PA,30.7,67.9,Romney,27925,37.2,1.4,red
Beaver,PA,46.0,52.6,Romney,78951,6.6,1.4,red
Bedford,PA,22.1,77.0,Romney,21239,54.9,0.9,red


In [60]:
from scipy.stats import zscore

In [64]:
turnout_zscore = zscore(election['turnout'])    # z-score
election['turnout_zscore'] = turnout_zscore
election.tail()

Unnamed: 0_level_0,state,Obama,Romney,winner,total,margin,turnout,color,turnout_zscore
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Washington,PA,42.7,56.0,Romney,88958,13.3,1.3,red,-0.507754
Wayne,PA,38.8,59.8,Romney,20669,21.0,1.4,red,-0.133913
Westmoreland,PA,37.6,61.3,Romney,166809,23.7,1.1,red,-1.255436
Wyoming,PA,42.9,55.2,Romney,11001,12.3,1.9,red,1.735292
York,PA,38.7,59.9,Romney,183702,21.2,1.4,red,-0.133913


## 2. Advanced indexing

### Change Index

In [75]:
sales.index = [idx.upper() for idx in sales.index]
sales.head(3)

Unnamed: 0,eggs,salt,spam,dozen_spam,salty_spam
JAN,47,12.0,17,1,29.0
FEB,110,50.0,31,2,81.0
MAR,226,89.0,72,6,161.0


In [74]:
sales.index.name = 'MONTHS'
sales.head(3)

Unnamed: 0_level_0,eggs,salt,spam,dozen_spam,salty_spam
MONTHS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
JAN,47,12.0,17,1,29.0
FEB,110,50.0,31,2,81.0
MAR,226,89.0,72,6,161.0


### Hierarchical indexing

In [84]:
sales = pd.DataFrame({'state':['CA','CA','TX','TX','NY','NY'], 'month':[1,2,1,2,1,2], 'eggs':[47,110,221,77,69,88], 
                      'salt':[12,50,89,87,73,49], 'spam':[17,31,72,20,37,56]})
sales = sales.set_index(['state','month'])
sales

Unnamed: 0_level_0,Unnamed: 1_level_0,eggs,salt,spam
state,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,1,47,12,17
CA,2,110,50,31
TX,1,221,89,72
TX,2,77,87,20
NY,1,69,73,37
NY,2,88,49,56


In [83]:
sales.loc['CA':'TX']

Unnamed: 0_level_0,Unnamed: 1_level_0,eggs,salt,spam
state,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,1,47,12,17
CA,2,110,50,31
TX,1,221,89,72
TX,2,77,87,20


In [90]:
sales.loc[('NY', 1)]

eggs    69
salt    73
spam    37
Name: (NY, 1), dtype: int64

In [93]:
sales.loc[(['CA','TX'], 2), :]

KeyError: 'MultiIndex Slicing requires the index to be fully lexsorted tuple len (2), lexsort depth (0)'

## 3. Rearranging and reshaping data

## 4. Grouping data

## 5. Recap