# Pandas - DataFrame

In [2]:
import pandas as pd
import numpy as np
from numpy.random import randn

In [3]:
# Seed NumPy's global RNG so the randn() draws used below give the same numbers on a fresh top-to-bottom run.
np.random.seed(101)

In [4]:
# 5x4 frame of random draws with explicit row labels and column labels.
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'E', 'D'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [5]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
E,0.188695,-0.758872,-0.933237,0.955057
D,0.190794,1.978757,2.605967,0.683509


### Access Columns

In [6]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
E    0.188695
D    0.190794
Name: W, dtype: float64

In [7]:
type(df['W'])

pandas.core.series.Series

In [8]:
df[['W', 'X']]

Unnamed: 0,W,X
A,2.70685,0.628133
B,0.651118,-0.319318
C,-2.018168,0.740122
E,0.188695,-0.758872
D,0.190794,1.978757


In [9]:
# Assigning to a new label appends a derived column: the row-wise sum of X and Y.
df['XY'] = df['X'].add(df['Y'])
df

Unnamed: 0,W,X,Y,Z,XY
A,2.70685,0.628133,0.907969,0.503826,1.536102
B,0.651118,-0.319318,-0.848077,0.605965,-1.167395
C,-2.018168,0.740122,0.528813,-0.589001,1.268936
E,0.188695,-0.758872,-0.933237,0.955057,-1.692109
D,0.190794,1.978757,2.605967,0.683509,4.584725


### Drop data

In [10]:
# columns=... drops by column label (same as axis=1). The result is a new
# frame; df itself still has 'XY' afterwards.
df.drop(columns='XY')
# index=... drops by row label (same as axis=0), e.g. df.drop(index='A')

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
E,0.188695,-0.758872,-0.933237,0.955057
D,0.190794,1.978757,2.605967,0.683509


In [11]:
# Remove 'XY' for good by rebinding df to the dropped result (preferred over
# inplace=True). errors='ignore' keeps this cell safe to re-run once the
# column is already gone.
df = df.drop(columns='XY', errors='ignore')
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
E,0.188695,-0.758872,-0.933237,0.955057
D,0.190794,1.978757,2.605967,0.683509


> Rows are axis 0 (`axis=0`, `df.shape[0]`)

> Columns are axis 1 (`axis=1`, `df.shape[1]`)

In [12]:
df.shape

(5, 4)

### Access Rows

> Use label

In [13]:
df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

> Use integer position

In [14]:
df.iloc[0]

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [15]:
df.iloc[0:2]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965


### Access cells

In [16]:
df.loc['A', 'W']

2.706849839399938

In [17]:
df.loc[['A', 'C'],['W', 'X']]

Unnamed: 0,W,X
A,2.70685,0.628133
C,-2.018168,0.740122


# DataFrame Operators

### Condition selection

In [18]:
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
E,True,False,False,True
D,True,True,True,True


In [19]:
df[df > 0]
# Entries where the condition is False are shown as NaN

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
E,0.188695,,,0.955057
D,0.190794,1.978757,2.605967,0.683509


In [20]:
df['W'] > 0

A     True
B     True
C    False
E     True
D     True
Name: W, dtype: bool

In [21]:
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
E,0.188695,-0.758872,-0.933237,0.955057
D,0.190794,1.978757,2.605967,0.683509


> ### Multiple Conditions

In [22]:
df[(df['W']>0) & (df['Y']>0)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
D,0.190794,1.978757,2.605967,0.683509


In [23]:
df[(df['W']<0) | (df['Y']<0)]

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
E,0.188695,-0.758872,-0.933237,0.955057


> ### Reset the index to default 0-based integers and keep the old index as a column

In [24]:
# Returns a new frame (the default, non-inplace behaviour); df keeps its label index.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,E,0.188695,-0.758872,-0.933237,0.955057
4,D,0.190794,1.978757,2.605967,0.683509


# MultiIndex

In [33]:
# Index levels: two parallel label lists, the outer group and the inner
# number for each row.
out_side = ['L1', 'L1', 'L1', 'L2', 'L2', 'L2']
in_side = [1, 2, 3, 4, 5, 6]
# Build the two-level index straight from the level arrays, so hier_index is
# only ever a MultiIndex (never a temporary list of tuples).
hier_index = pd.MultiIndex.from_arrays([out_side, in_side])

In [34]:
hier_index

MultiIndex([('L1', 1),
            ('L1', 2),
            ('L1', 3),
            ('L2', 4),
            ('L2', 5),
            ('L2', 6)],
           )

In [35]:
# 6x2 frame of random draws, indexed by the two-level hier_index.
df1 = pd.DataFrame(data=randn(6, 2), index=hier_index, columns=['A', 'B'])

In [36]:
df1

Unnamed: 0,Unnamed: 1,A,B
L1,1,-0.497104,-0.75407
L1,2,-0.943406,0.484752
L1,3,-0.116773,1.901755
L2,4,0.238127,1.996652
L2,5,-0.993263,0.1968
L2,6,-1.136645,0.000366


In [40]:
df1.loc['L1']

Unnamed: 0,A,B
1,-0.497104,-0.75407
2,-0.943406,0.484752
3,-0.116773,1.901755


In [42]:
df1.index.names

FrozenList([None, None])

In [43]:
# Name the two index levels with rename_axis, which returns a new frame
# instead of mutating the existing index object.
# NOTE(review): that index object may be shared with hier_index, so an
# in-place rename could leak into it — confirm.
df1 = df1.rename_axis(index=['Group', 'Num'])

In [44]:
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
L1,1,-0.497104,-0.75407
L1,2,-0.943406,0.484752
L1,3,-0.116773,1.901755
L2,4,0.238127,1.996652
L2,5,-0.993263,0.1968
L2,6,-1.136645,0.000366


In [52]:
# Within group 'L2', take row position 1 and column position 1 in a single
# .iloc call. A bare integer key on the label-indexed row Series would rely
# on a deprecated positional fallback.
df1.loc['L2'].iloc[1, 1]

0.19679950499134005

In [60]:
# One .loc lookup (row key tuple, column label) instead of three chained [] calls.
df1.loc[('L1', 1), 'B']

-0.7540697010400628

In [69]:
# Cross-section: the full (outer, inner) key picks a single row, returned as a Series.
df1.xs(('L1',2))

A   -0.943406
B    0.484752
Name: (L1, 2), dtype: float64