# Pandas DataFrame: Filters and Operations

In [1]:
import pandas as pd
import numpy as np

pd.__version__, np.__version__

('0.25.3', '1.17.4')

In [2]:
data = {
    
     'vivek' : [35, 'M', 'Trainer'] ,
     'john'  : [22, 'M', 'Gardener'] ,
     'jill'  : [19, 'F', 'Accountant'] ,
     'ravi'  : [42, 'M', 'Sales'] 
}

df1 = pd.DataFrame(data, index=['age', 'sex', 'profession'])

df1

Unnamed: 0,vivek,john,jill,ravi
age,35,22,19,42
sex,M,M,F,M
profession,Trainer,Gardener,Accountant,Sales


### Transpose

In [3]:
df2 = df1.T

df2

Unnamed: 0,age,sex,profession
vivek,35,M,Trainer
john,22,M,Gardener
jill,19,F,Accountant
ravi,42,M,Sales


In [5]:
df2.loc['vivek']

age                35
sex                 M
profession    Trainer
Name: vivek, dtype: object

In [6]:
df2.age < 30

vivek    False
john      True
jill      True
ravi     False
Name: age, dtype: bool

In [7]:
df2[ df2.age < 30 ]

Unnamed: 0,age,sex,profession
john,22,M,Gardener
jill,19,F,Accountant


In [8]:
df2[ (df2.age < 30) & ( df2.sex == 'M')]

Unnamed: 0,age,sex,profession
john,22,M,Gardener


### Adding a new Column

In [9]:
df2['salary'] = [ 9000 , 3000 , 25000 , 10000 ]

df2

Unnamed: 0,age,sex,profession,salary
vivek,35,M,Trainer,9000
john,22,M,Gardener,3000
jill,19,F,Accountant,25000
ravi,42,M,Sales,10000


#### Dervied Columns

In [10]:
df2['young'] = df2.age < 30

df2

Unnamed: 0,age,sex,profession,salary,young
vivek,35,M,Trainer,9000,False
john,22,M,Gardener,3000,True
jill,19,F,Accountant,25000,True
ravi,42,M,Sales,10000,False


In [11]:
del df2['young']  # hate being told I am getting older

df2

Unnamed: 0,age,sex,profession,salary
vivek,35,M,Trainer,9000
john,22,M,Gardener,3000
jill,19,F,Accountant,25000
ravi,42,M,Sales,10000


In [12]:
df2.loc['jim'] = [ 56, 'M', 'Hacker', 0]

df2

Unnamed: 0,age,sex,profession,salary
vivek,35,M,Trainer,9000
john,22,M,Gardener,3000
jill,19,F,Accountant,25000
ravi,42,M,Sales,10000
jim,56,M,Hacker,0


In [13]:
df3 = df2.drop('jim', axis=0)  # Drop is not inplace like del

df2

Unnamed: 0,age,sex,profession,salary
vivek,35,M,Trainer,9000
john,22,M,Gardener,3000
jill,19,F,Accountant,25000
ravi,42,M,Sales,10000
jim,56,M,Hacker,0


In [14]:
df3

Unnamed: 0,age,sex,profession,salary
vivek,35,M,Trainer,9000
john,22,M,Gardener,3000
jill,19,F,Accountant,25000
ravi,42,M,Sales,10000


### Statistics

In [15]:
df2

Unnamed: 0,age,sex,profession,salary
vivek,35,M,Trainer,9000
john,22,M,Gardener,3000
jill,19,F,Accountant,25000
ravi,42,M,Sales,10000
jim,56,M,Hacker,0


In [16]:
df2.describe()

Unnamed: 0,salary
count,5.0
mean,9400.0
std,9659.192513
min,0.0
25%,3000.0
50%,9000.0
75%,10000.0
max,25000.0


In [18]:
df2.age.describe()   #no space in column name

count      5
unique     5
top       56
freq       1
Name: age, dtype: int64

In [19]:
df2.age.mean()

34.8

In [20]:
df2.T

Unnamed: 0,vivek,john,jill,ravi,jim
age,35,22,19,42,56
sex,M,M,F,M,M
profession,Trainer,Gardener,Accountant,Sales,Hacker
salary,9000,3000,25000,10000,0


In [23]:
df2.T.ix[0].mean()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  df2.T.ix[0].mean()


34.8

### Resetting Index

In [24]:
df2

Unnamed: 0,age,sex,profession,salary
vivek,35,M,Trainer,9000
john,22,M,Gardener,3000
jill,19,F,Accountant,25000
ravi,42,M,Sales,10000
jim,56,M,Hacker,0


In [25]:
df2.reset_index()

Unnamed: 0,index,age,sex,profession,salary
0,vivek,35,M,Trainer,9000
1,john,22,M,Gardener,3000
2,jill,19,F,Accountant,25000
3,ravi,42,M,Sales,10000
4,jim,56,M,Hacker,0


In [26]:
# Let's make sure the new column gets a good name by default 

df2.index.name = 'name'

df4 = df2.reset_index()

df4

Unnamed: 0,name,age,sex,profession,salary
0,vivek,35,M,Trainer,9000
1,john,22,M,Gardener,3000
2,jill,19,F,Accountant,25000
3,ravi,42,M,Sales,10000
4,jim,56,M,Hacker,0


In [28]:
df2

Unnamed: 0_level_0,age,sex,profession,salary
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
vivek,35,M,Trainer,9000
john,22,M,Gardener,3000
jill,19,F,Accountant,25000
ravi,42,M,Sales,10000
jim,56,M,Hacker,0


In [29]:
df2.reset_index(inplace=True)

df2

Unnamed: 0,name,age,sex,profession,salary
0,vivek,35,M,Trainer,9000
1,john,22,M,Gardener,3000
2,jill,19,F,Accountant,25000
3,ravi,42,M,Sales,10000
4,jim,56,M,Hacker,0


In [30]:
df2.set_index('name', inplace=True)

df2

Unnamed: 0_level_0,age,sex,profession,salary
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
vivek,35,M,Trainer,9000
john,22,M,Gardener,3000
jill,19,F,Accountant,25000
ravi,42,M,Sales,10000
jim,56,M,Hacker,0
