 ## Series as dictionary

In [2]:
import pandas as pd

# Four float values, each labelled with a single-letter string key.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=list('abcd'),
)
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [3]:
# dictionary-style lookup: fetch the value stored under the label 'b'
data['b']

0.5

In [4]:
# like dict.keys(): returns the index labels of the Series
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [5]:
# like dict.items(): iterate over (label, value) pairs
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [7]:
# A Series object is mutable: assigning to a label that does not exist yet
# adds a new entry, just like adding a key to a dict.
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

## Series as one-dimensional array

In [14]:
# slicing by explicit index labels: unlike positional slicing,
# the end label ('c') is included in the result
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [9]:
# masking: keep only the entries whose value lies strictly between 0.3 and 0.8
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [15]:
# fancy indexing: pass a list of labels to select several entries at once
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

## Indexers: loc and iloc (ix was deprecated and removed in pandas 1.0)

In [19]:
# String values under an explicit *integer* index (1, 3, 5), so that
# label-based and position-based lookups give different answers.
data = pd.Series(
    data=list('abc'),
    index=[1, 3, 5],
)
data

1    a
3    b
5    c
dtype: object

In [21]:
# loc references the explicit index: the slice 1:3 selects the labels
# 1 through 3, with the end label included
data.loc[1:3]

1    a
3    b
dtype: object

In [22]:
# iloc references the implicit, Python-style positional index: the slice 1:3
# selects positions 1 and 2 (end excluded), i.e. the entries labelled 3 and 5
data.iloc[1:3]

3    b
5    c
dtype: object

## Data Selection in DataFrame

In [2]:
import pandas as pd

# Population per state (no entry for Montana).
population_dict = {
    'California': 111111,
    'Texas': 22222,
    'Florida': 33333,
    'Oregon': 88888,
}
pop = pd.Series(population_dict)

# Area per state (includes Montana, which has no population entry).
area_dict = {
    'California': 1,
    'Texas': 2,
    'Florida': 3,
    'Oregon': 8,
    'Montana': 0,
}
area = pd.Series(area_dict)

# Combine both Series column-wise; rows are aligned on the state labels and
# a state missing from one Series gets NaN in that column.
data = pd.DataFrame({'area': area, 'pop': pop})
data.head()

Unnamed: 0,area,pop
California,1,111111.0
Florida,3,33333.0
Montana,0,
Oregon,8,88888.0
Texas,2,22222.0


In [7]:
# attribute accessor: data.area refers to the same object as data['area']
data.area is data['area']

True

In [9]:
# Note: data.pop resolves to the DataFrame.pop method rather than the 'pop'
# column, so dot notation cannot always replace data['pop'].
data.pop is data['pop']

False

In [10]:
# dictionary-style assignment adds a new column; the division is
# performed element-wise, row by row
data['density'] = data['pop'] / data['area']

In [13]:
# boolean mask on the derived column: keep the rows whose density exceeds 11110
data[data.density > 11110]

Unnamed: 0,area,pop,density
California,1,111111.0,111111.0
Florida,3,33333.0,11111.0
Oregon,8,88888.0,11111.0
Texas,2,22222.0,11111.0


## Operating on Data in Pandas (p115)

In [11]:
import numpy as np
import pandas as pd

# Seeded generator so the random draws below are repeatable.
rng = np.random.RandomState(seed=42)

# Four random integers drawn from [0, 10).
ser = pd.Series(rng.randint(low=0, high=10, size=4))
ser


0    6
1    3
2    7
3    4
dtype: int64

In [16]:
# 3x4 table of random integers drawn from [0, 10), columns labelled A-D.
# The draws continue from rng's current state, so the numbers shown depend
# on how many values were drawn from rng before this cell ran.
df = pd.DataFrame(
    rng.randint(low=0, high=10, size=(3, 4)),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
0,3,1,7,3
1,1,5,5,9
2,3,5,1,9


In [22]:
# a NumPy ufunc can be applied directly to the DataFrame; the row and column
# labels generated by pandas are preserved in the result
np.exp(df)


Unnamed: 0,A,B,C,D
0,20.085537,2.718282,1096.633158,20.085537
1,2.718282,148.413159,148.413159,8103.083928
2,20.085537,148.413159,2.718282,8103.083928


In [24]:
# Area per state; `name` labels the Series itself.
# NOTE(review): California's area equals the New York population used in the
# next cell -- possibly a transcription slip; verify against the data source.
area = pd.Series(
    {
        'Alaska': 1723337,
        'Texas': 695662,
        'California': 19651127,
    },
    name='area',
)
area

Alaska         1723337
California    19651127
Texas           695662
Name: area, dtype: int64

In [25]:
# Population per state; only California and Texas also appear in `area`.
pop = pd.Series(
    {
        'California': 38332521,
        'Texas': 26448193,
        'New York': 19651127,
    },
    name='population',
)
pop

California    38332521
New York      19651127
Texas         26448193
Name: population, dtype: int64

In [26]:
# note the graceful NaN for labels that appear in only one of the two Series
pop/area

Alaska              NaN
California     1.950653
New York            NaN
Texas         38.018740
dtype: float64

In [28]:
# With divide() we can pass a fill_value that stands in for an entry missing
# from one side: Alaska (no population) becomes 0 / area = 0, while
# New York (no area) becomes population / 0 = inf.
pop.divide(area, fill_value=0)

Alaska         0.000000
California     1.950653
New York            inf
Texas         38.018740
dtype: float64