# Intrdoducing Pandas Objects

## The Pandas Series Object

In [1]:
import pandas as pd
data = pd.Series([0.25,0.5,0.75,1.0])
print(data)

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64


In [2]:
data.values

array([ 0.25,  0.5 ,  0.75,  1.  ])

In [3]:
data[1]

0.5

In [4]:
data[1:3]

1    0.50
2    0.75
dtype: float64

### Series as generalized Numpy array

In [5]:
data = pd.Series([0.25,0.5,0.75,1.0],index=['a','b','c','d'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [6]:
data['b']

0.5

In [7]:
data = pd.Series([0.25,0.5,0.75,1.0],index=[2,3,5,7])
data

2    0.25
3    0.50
5    0.75
7    1.00
dtype: float64

In [8]:
data[5]

0.75

### Series as specialized dictionary

In [9]:
population_dict = {'California':38332521,
             'Texas':26448193,
             'New York':19651127,
             'Florida':19552860,
             'Illionis':12882135}
population = pd.Series(population_dict)
population

California    38332521
Florida       19552860
Illionis      12882135
New York      19651127
Texas         26448193
dtype: int64

In [10]:
population['Texas']

26448193

In [11]:
population['California':'Illionis']

California    38332521
Florida       19552860
Illionis      12882135
dtype: int64

### Constructing Series objects

In [12]:
pd.Series([2,4,6])

0    2
1    4
2    6
dtype: int64

In [13]:
pd.Series(5,index=[100,200,300])

100    5
200    5
300    5
dtype: int64

In [14]:
pd.Series({2:'a',1:'b',3:'c'})

1    b
2    a
3    c
dtype: object

In [15]:
pd.Series({2:'a',1:'b',3:'c'},index = [3,2])

3    c
2    a
dtype: object

## The Pandas DataFrame Object

### DataFrame as a generalized Numpy array

In [16]:
area_dict = {'California':423976,
             'Texas':695662,
             'New York':141297,
             'Florida':170312,
             'Illionis':149995}
area = pd.Series(area_dict)
area

California    423976
Florida       170312
Illionis      149995
New York      141297
Texas         695662
dtype: int64

In [17]:
stats = pd.DataFrame({'population':population,'area':area})
stats

Unnamed: 0,area,population
California,423976,38332521
Florida,170312,19552860
Illionis,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [18]:
stats.index

Index(['California', 'Florida', 'Illionis', 'New York', 'Texas'], dtype='object')

In [19]:
stats.columns

Index(['area', 'population'], dtype='object')

### DataFrame as specialized dictionary

In [20]:
stats['area']

California    423976
Florida       170312
Illionis      149995
New York      141297
Texas         695662
Name: area, dtype: int64

### Consttucting DataFrame objects

#### From a single Series object

In [21]:
pd.DataFrame(population,columns=['population'])

Unnamed: 0,population
California,38332521
Florida,19552860
Illionis,12882135
New York,19651127
Texas,26448193


#### From a list of dicts

In [22]:
data = [{'a':i,'b':2*i} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


#### From a dictionary of Series objects

In [23]:
pd.DataFrame({'population':population,'area':area})

Unnamed: 0,area,population
California,423976,38332521
Florida,170312,19552860
Illionis,149995,12882135
New York,141297,19651127
Texas,695662,26448193


#### From a two-dimensional NumPy array

In [24]:
import numpy as np

In [25]:
pd.DataFrame(np.random.rand(3,2),columns=['foo','bar'],index=['a','b','c'])

Unnamed: 0,foo,bar
a,0.645224,0.82407
b,0.3316,0.403604
c,0.820552,0.317264


#### From a Numpy structured array

In [26]:
A = np.zeros(3,dtype=[('A','i8'),('B','f8')])
A

array([(0,  0.), (0,  0.), (0,  0.)],
      dtype=[('A', '<i8'), ('B', '<f8')])

In [27]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


## The Pandas Index Object

In [28]:
ind = pd.Index([2,3,5,7,11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

### Index as immutable array

In [29]:
ind[1]

3

In [30]:
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [31]:
print(ind.size,ind.shape,ind.ndim,ind.dtype)

5 (5,) 1 int64


In [32]:
#error ,cannot be modified via the normal means
#int[1]=3

TypeError: 'type' object does not support item assignment

In [33]:
indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])

In [34]:
indA & indB

Int64Index([3, 5, 7], dtype='int64')

In [35]:
indA | indB

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [37]:
indA ^ indB

Int64Index([1, 2, 9, 11], dtype='int64')

# Data Indexing and Selection

## Data Selection in Series

### Series as dictionary

In [54]:
data = pd.Series([0.25,0.5,0.75,1.0],index=['a','b','c','d'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [44]:
data['b']

0.5

In [45]:
'a' in data

True

In [46]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [47]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [62]:
data['e']=1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [63]:
data['b']=1.2
data

a    0.25
b    1.20
c    0.75
d    1.00
e    1.25
dtype: float64

### Series as one-dimensional array

In [55]:
#slicing by explicit index 显式索引切片
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [56]:
#slicing by implicit integer index 隐式索引切片
data[0:2]

a    0.25
b    0.50
dtype: float64

In [64]:
#masking
data[(data>0.3) & (data<0.8)]

c    0.75
dtype: float64

In [65]:
#fancy indexing
data[['a','e']]

a    0.25
e    1.25
dtype: float64

### indexers:loc,iloc,and ix

In [66]:
data = pd.Series(['a','b','c'],index = [1,3,5])
data

1    a
3    b
5    c
dtype: object

In [69]:
#explicit index when indexing
data[3]

'b'

In [70]:
#implicit index when indexing
data[1:3]

3    b
5    c
dtype: object

In [71]:
#索引值loc  显式  recommend
data.loc[1]

'a'

In [72]:
data.loc[1:3]

1    a
3    b
dtype: object

In [76]:
#索引位置iloc 隐式
data.iloc[2]

'c'

In [77]:
data.iloc[1:3]

3    b
5    c
dtype: object

## Data Selection in DataFrame

### DataFrame as a dictionary

In [80]:
area = pd.Series({'California':423976,
             'Texas':695662,
             'New York':141297,
             'Florida':170312,
             'Illionis':149995})
pop = pd.Series({'California':38332521,
             'Texas':26448193,
             'New York':19651127,
             'Florida':19552860,
             'Illionis':12882135})
data = pd.DataFrame({'area':area,'pop':pop})
data

Unnamed: 0,area,pop
California,423976,38332521
Florida,170312,19552860
Illionis,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [81]:
#accessed via dictionary-style indexing fo the column name
data['area']

California    423976
Florida       170312
Illionis      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [82]:
#attribute-style access with column names
#not recommend
data.area

California    423976
Florida       170312
Illionis      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [83]:
data.area is data['area']

True

In [84]:
#不推荐使用属性方式 会与自带函数名称冲突
#data.pop in data['pop']

TypeError: 'DataFrame' objects are mutable, thus they cannot be hashed

In [86]:
data['density'] = data['pop']/data['area']
data

Unnamed: 0,area,pop,density
California,423976,38332521,90.412007
Florida,170312,19552860,114.806121
Illionis,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


### DataFrame as two-dimensional array

In [87]:
data.values

array([[  4.23976000e+05,   3.83325210e+07,   9.04120068e+01],
       [  1.70312000e+05,   1.95528600e+07,   1.14806121e+02],
       [  1.49995000e+05,   1.28821350e+07,   8.58837628e+01],
       [  1.41297000e+05,   1.96511270e+07,   1.39076746e+02],
       [  6.95662000e+05,   2.64481930e+07,   3.80187404e+01]])

In [88]:
#transpose
data.T

Unnamed: 0,California,Florida,Illionis,New York,Texas
area,423976.0,170312.0,149995.0,141297.0,695662.0
pop,38332520.0,19552860.0,12882140.0,19651130.0,26448190.0
density,90.41201,114.8061,85.88376,139.0767,38.01874


In [89]:
data.values[0]

array([  4.23976000e+05,   3.83325210e+07,   9.04120068e+01])

In [90]:
data['area']

California    423976
Florida       170312
Illionis      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [93]:
#二元list隐式索引获取 iloc
data.iloc[:3,:2]
#data.iloc[0:3,0:2]

Unnamed: 0,area,pop
California,423976,38332521
Florida,170312,19552860
Illionis,149995,12882135


In [92]:
#二元list显式索引获取 loc
data.loc[:'Illionis',:'pop']

Unnamed: 0,area,pop
California,423976,38332521
Florida,170312,19552860
Illionis,149995,12882135


In [94]:
##二元list混合索引获取 ix
data.ix[:3,:'pop']

Unnamed: 0,area,pop
California,423976,38332521
Florida,170312,19552860
Illionis,149995,12882135


In [95]:
#combine masking and fancy indexing
data.loc[data.density >100,['pop','density']]

Unnamed: 0,pop,density
Florida,19552860,114.806121
New York,19651127,139.076746


In [97]:
data.iloc[0,2] = 90
data

Unnamed: 0,area,pop,density
California,423976,38332521,90.0
Florida,170312,19552860,114.806121
Illionis,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


### Additional indeing conventions

In [98]:
data['Florida':'Illionis']

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illionis,149995,12882135,85.883763


In [99]:
data[1:3]

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illionis,149995,12882135,85.883763


In [100]:
#bool slices
data[data['density'] > 100]

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
New York,141297,19651127,139.076746


# Operating on Data in Pandas

## Ufuncs:Index Preservation

In [101]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0,10,4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [102]:
df = pd.DataFrame(rng.randint(0,10,(3,4)),columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [103]:
#指数
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

## UFuncs:Index Alignment

### Index alignment in Series

In [105]:
area = pd.Series({'Alaska':1723337,'Texas':695662,'California':423976},name ='area')
population = pd.Series({'California':38332521,
             'Texas':26448193,
             'New York':19651127},name='population')
population/area

Alaska              NaN
California    90.412007
New York            NaN
Texas         38.018740
dtype: float64

In [106]:
area.index | population.index

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [107]:
A = pd.Series([2,4,6],index=[0,1,2])
B = pd.Series([1,3,5],index = [1,2,3])
A+B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [108]:
A.add(B,fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

### Index alignment in DataFrame

In [110]:
A = pd.DataFrame(rng.randint(0,20,(2,2)),columns=list('AB'))
A

Unnamed: 0,A,B
0,0,11
1,11,16


In [111]:
B = pd.DataFrame(rng.randint(0,10,(3,3)),columns=list('BAC'))
B

Unnamed: 0,B,A,C
0,9,2,6
1,3,8,2
2,4,2,6


In [112]:
A+B

Unnamed: 0,A,B,C
0,2.0,20.0,
1,19.0,19.0,
2,,,


In [115]:
#均值填充
fill = A.stack().mean()
A.add(B,fill_value=fill)

Unnamed: 0,A,B,C
0,2.0,20.0,15.5
1,19.0,19.0,11.5
2,11.5,13.5,15.5


### Ufuncs：Operations Between DataFrame and Series

In [116]:
A = rng.randint(10,size=(3,4))
A

array([[4, 8, 6, 1],
       [3, 8, 1, 9],
       [8, 9, 4, 1]])

In [117]:
A -A[0]

array([[ 0,  0,  0,  0],
       [-1,  0, -5,  8],
       [ 4,  1, -2,  0]])

In [119]:
# - 0row
df = pd.DataFrame(A,columns=list('QRST'))
print(df)
df -df.iloc[0]

   Q  R  S  T
0  4  8  6  1
1  3  8  1  9
2  8  9  4  1


Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-1,0,-5,8
2,4,1,-2,0


In [120]:
# - R col
df.subtract(df['R'],axis = 0 )

Unnamed: 0,Q,R,S,T
0,-4,0,-2,-7
1,-5,0,-7,1
2,-1,0,-5,-8


In [122]:
#half row
halfrow = df.iloc[0,::2]
halfrow

Q    4
S    6
Name: 0, dtype: int64

In [124]:
print (df)
print (halfrow)
print (df - halfrow)

   Q  R  S  T
0  4  8  6  1
1  3  8  1  9
2  8  9  4  1
Q    4
S    6
Name: 0, dtype: int64
     Q   R    S   T
0  0.0 NaN  0.0 NaN
1 -1.0 NaN -5.0 NaN
2  4.0 NaN -2.0 NaN


# Handing Missing Data

## None:Pythonic missing data

In [125]:
vals1 = np.array([1,None,3,4])
vals1

array([1, None, 3, 4], dtype=object)

In [126]:
for dtype in ['object','int']:
    print ("dtype = ",dtype)
    %timeit np.arange(1E6,dtype=dtype).sum()
    print()


dtype =  object
85.5 ms ± 1.82 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype =  int
1.69 ms ± 21.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)



In [127]:
#None sum error
#vals1.sum()

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

## NaN:Missing numerical data

In [128]:
vals2 = np.array([1,np.nan,3,4])
vals2.dtype

dtype('float64')

In [129]:
1+np.nan

nan

In [131]:
vals2.sum(),vals2.min(),vals2.max()

(nan, nan, nan)