# Pandas DataFrame: Basics

In [1]:
import pandas as pd
import numpy as np

pd.__version__, np.__version__

('0.25.3', '1.17.4')

### Initializing a DataFrame from a Dictionary

In [9]:
# One entry per person, holding [age, sex, profession].
# Each key becomes a column of the DataFrame; each list position becomes a row.
data = dict(
    vivek=[35, 'M', 'Trainer'],
    john=[22, 'M', 'Gardener'],
    jill=[19, 'F', 'Accountant'],
    ravi=[42, 'M', 'Sales'],
)

df1 = pd.DataFrame(data=data)

df1

Unnamed: 0,vivek,john,jill,ravi
0,35,22,19,42
1,M,M,F,M
2,Trainer,Gardener,Accountant,Sales


In [41]:
df1.columns

Index(['vivek', 'john', 'jill', 'ravi'], dtype='object', name='name')

In [42]:
df1.values

array([[35, 22, 19, 42],
       ['M', 'M', 'F', 'M'],
       ['Trainer', 'Gardener', 'Accountant', 'Sales']], dtype=object)

In [43]:
type(df1.values)

numpy.ndarray

#### Changing the Index

There are many ways to do this; this is just one of them!

In [44]:
# Labels for the three rows, in row order; stored as a regular column first.
row_labels = ['age', 'sex', 'profession']
df1['characteristics'] = row_labels

df1

name,vivek,john,jill,ravi,characteristics
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
age,35,22,19,42,age
sex,M,M,F,M,sex
profession,Trainer,Gardener,Accountant,Sales,profession


In [45]:
# Promote the 'characteristics' column to the row index.
# Reassigning the returned frame (instead of inplace=True) keeps the
# transformation explicit: df1 is now the re-indexed frame.
df1 = df1.set_index('characteristics')

df1

name,vivek,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,35,22,19,42
sex,M,M,F,M
profession,Trainer,Gardener,Accountant,Sales


In [46]:
# Give the columns axis the name 'name' (shown in the header of the frame).
df1.columns = df1.columns.rename('name')

df1

name,vivek,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,35,22,19,42
sex,M,M,F,M
profession,Trainer,Gardener,Accountant,Sales


### From a Series

In [48]:
# Seeded generator so the random values (and the frame displayed below)
# are the same on every Restart & Run All.
rng = np.random.RandomState(0)

# s1 holds 7 values and s2 holds 8, so s1 has no entry for index label 7.
s1 = pd.Series(rng.randint(0, 100, 7))

s2 = pd.Series(rng.randint(0, 100, 8))

# side_by_side function from Wes McKinney, author of Pandas

def side_by_side(*objs, **kwds):
    """Print the reprs of ``objs`` next to each other.

    Parameters
    ----------
    *objs
        Objects to show; ``repr(obj)`` is split into lines for each one.
    **kwds
        Only ``space`` is read (default 4); it is passed to pandas'
        ``adjoin`` as its first argument.

    Returns
    -------
    None
        The adjoined text is printed, not returned.
    """
    from pandas.io.formats.printing import adjoin
    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    print(adjoin(space, *reprs))

# The Series are aligned on their index; s1 has no label 7, so that cell is NaN.
df2 = pd.DataFrame({'s1': s1, 's2': s2})

df2

Unnamed: 0,s1,s2
0,44.0,49
1,83.0,24
2,26.0,94
3,38.0,22
4,89.0,87
5,88.0,78
6,62.0,30
7,,39


In [50]:
# Seeded generator so the random values (and the frame displayed below)
# are the same on every Restart & Run All.
rng = np.random.RandomState(1)

# Both Series now hold 8 values, so every index label 0..7 has a value in each.
s1 = pd.Series(rng.randint(0, 100, 8))

s2 = pd.Series(rng.randint(0, 100, 8))

# side_by_side is already defined in the previous cell; it is not redefined
# here, to avoid a duplicate definition shadowing the first one.

df2 = pd.DataFrame({'s1': s1, 's2': s2})

df2

Unnamed: 0,s1,s2
0,7,82
1,16,33
2,85,29
3,9,59
4,93,11
5,57,91
6,57,7
7,34,20


### Even a single Series can be used

In [51]:
pd.DataFrame(s1)

Unnamed: 0,0
0,7
1,16
2,85
3,9
4,93
5,57
6,57
7,34


### Head and Tail

In [52]:
df1.head(3)

name,vivek,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,35,22,19,42
sex,M,M,F,M
profession,Trainer,Gardener,Accountant,Sales


In [53]:
df1.tail(3)

name,vivek,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,35,22,19,42
sex,M,M,F,M
profession,Trainer,Gardener,Accountant,Sales


### Accessing Values in a DataFrame

In [54]:
df1

name,vivek,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,35,22,19,42
sex,M,M,F,M
profession,Trainer,Gardener,Accountant,Sales


#### Fetch a Column

In [55]:
df1['john']

characteristics
age                 22
sex                  M
profession    Gardener
Name: john, dtype: object

In [56]:
type(df1['john'])

pandas.core.series.Series

#### Fetch Columns

In [57]:
df1

name,vivek,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,35,22,19,42
sex,M,M,F,M
profession,Trainer,Gardener,Accountant,Sales


In [65]:
 
df1[['john','jill']]

name,john,jill
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1
age,22,19
sex,M,F
profession,Gardener,Accountant


In [64]:
# df1[[1,2]] raises an error: 1 and 2 are not column labels (use df1.iloc[:, [1,2]] to select columns by position)

### Fetch a Row or a Column by Label

#### Fetch Row by Label

.xs http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.xs.html

In [77]:

df1.xs('age')

name
vivek    35
john     22
jill     19
ravi     42
Name: age, dtype: object

In [67]:
type(df1.xs('age'))

pandas.core.series.Series

In [68]:
df1

name,vivek,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,35,22,19,42
sex,M,M,F,M
profession,Trainer,Gardener,Accountant,Sales


#### Fetch Column by Label

In [79]:
df1.xs('vivek', axis=1)


characteristics
age                35
sex                 M
profession    Trainer
Name: vivek, dtype: object

In [80]:
df1.xs('age',axis=0)

name
vivek    35
john     22
jill     19
ravi     42
Name: age, dtype: object

### Fetch by Label, Falling Back to Integer Position (`.ix`, deprecated)

http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.ix.html

In [82]:
# `.ix` is deprecated (and removed in pandas 1.0); a row label is looked up with `.loc`.
df1.loc['age']


name
vivek    35
john     22
jill     19
ravi     42
Name: age, dtype: object

In [81]:
# `.ix` is deprecated (and removed in pandas 1.0); an integer position is looked up with `.iloc`.
df1.iloc[2] # 3rd Row


name
vivek       Trainer
john       Gardener
jill     Accountant
ravi          Sales
Name: profession, dtype: object

#### Fetch Rows

In [83]:
df1[1:3] # Fetch rows 1 and 2 (0 indexed)

name,vivek,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
sex,M,M,F,M
profession,Trainer,Gardener,Accountant,Sales


#### .loc and .iloc

In [84]:
df1

name,vivek,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,35,22,19,42
sex,M,M,F,M
profession,Trainer,Gardener,Accountant,Sales


In [85]:
df1.loc['age']

name
vivek    35
john     22
jill     19
ravi     42
Name: age, dtype: object

In [86]:
df1.loc[['age', 'profession']]

name,vivek,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,35,22,19,42
profession,Trainer,Gardener,Accountant,Sales


In [92]:
type(df1.loc[['age', 'profession']])

pandas.core.frame.DataFrame

In [87]:
df2

Unnamed: 0,s1,s2
0,7,82
1,16,33
2,85,29
3,9,59
4,93,11
5,57,91
6,57,7
7,34,20


In [88]:
df2.iloc[[2,4,7]]

Unnamed: 0,s1,s2
2,85,29
4,93,11
7,34,20


Search by Index Label or Location with .ix

In [93]:
df2

Unnamed: 0,s1,s2
0,7,82
1,16,33
2,85,29
3,9,59
4,93,11
5,57,91
6,57,7
7,34,20


In [94]:
# df2 has an integer index, so 4 is an index label here.
# `.loc` replaces the deprecated `.ix` (removed in pandas 1.0).
df2.loc[4]


s1    93
s2    11
Name: 4, dtype: int64

In [95]:
df1

name,vivek,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,35,22,19,42
sex,M,M,F,M
profession,Trainer,Gardener,Accountant,Sales


In [96]:
# `.ix` is deprecated (and removed in pandas 1.0); a row label is looked up with `.loc`.
df1.loc['sex']


name
vivek    M
john     M
jill     F
ravi     M
Name: sex, dtype: object

#### Fetching using Row,Column Combinations

In [97]:
df1

name,vivek,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,35,22,19,42
sex,M,M,F,M
profession,Trainer,Gardener,Accountant,Sales


In [98]:
# Row position 1 is the 'sex' row.
#   - column position 2 ('jill') holds 'F'
#   - column label 'ravi' holds 'M'
# `.ix` is deprecated (and removed in pandas 1.0): use `.iloc` for pure positions,
# and turn the row position into its label with df1.index[1] when it has to be
# combined with a column label in `.loc`.

df1.iloc[1, 2], df1.loc[df1.index[1], 'ravi']


('F', 'M')

In [99]:
df1.iloc[1,2], df1.loc['sex','ravi']

('F', 'M')

In [100]:
df1

name,vivek,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,35,22,19,42
sex,M,M,F,M
profession,Trainer,Gardener,Accountant,Sales


In [101]:
# We want the rows at positions 1 and 2 and the columns 'john' and 'ravi'.
# `.ix` is deprecated (and removed in pandas 1.0): translate the row positions
# into labels with df1.index[1:3], then select everything by label with `.loc`.

df1.loc[df1.index[1:3], ['john', 'ravi']]


name,john,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1
sex,M,M
profession,Gardener,Sales


In [102]:
df1.loc[['sex', 'profession'], ['john', 'ravi']]

name,john,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1
sex,M,M
profession,Gardener,Sales


In [103]:
df1.iloc[1:3, 1:3]

name,john,jill
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1
sex,M,F
profession,Gardener,Accountant


In [104]:
# Columns in above do not need to be adjacent
# (`.ix` is deprecated and removed in pandas 1.0; the row position is translated
# into its label with df1.index[1:2] so that `.loc` can be used.)

df1.loc[df1.index[1:2], ['john', 'vivek']]


name,john,vivek
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1
sex,M,M


In [105]:
# column numbers can be used as well (axis=1)
# (`.ix` is deprecated and removed in pandas 1.0; purely positional selection uses `.iloc`.)

df1.iloc[0:2, 1:4]


name,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
age,22,19,42
sex,M,F,M


In [106]:
# just like the above, with a list of column positions instead of a slice
# (`.ix` is deprecated and removed in pandas 1.0; purely positional selection uses `.iloc`.)

df1.iloc[1:3, [0, 3]]


name,vivek,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1
sex,M,M
profession,Trainer,Sales


In [107]:
# List of rows and columns can be provided as well
# (`.ix` is deprecated and removed in pandas 1.0; purely positional selection uses `.iloc`.)

df1.iloc[[1, 2], [1, 2]]


name,john,jill
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1
sex,M,F
profession,Gardener,Accountant


#### Fast label based lookup with .at 

http://pandas.pydata.org/pandas-docs/version/0.17.0/generated/pandas.DataFrame.at.html

In [108]:
df1.at['sex', 'john']

'M'

In [109]:
df1.at['profession', 'ravi']

'Sales'

#### .iat for using Location Based

In [110]:
df1

name,vivek,john,jill,ravi
characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,35,22,19,42
sex,M,M,F,M
profession,Trainer,Gardener,Accountant,Sales


In [111]:
df1.iat[2, 3]

'Sales'