# 1. Create a DataFrame and its Property

## Four ways to create a DataFrame

In [2]:
import pandas as pd
import numpy as np

--

In [17]:
# Take a 2D array as input to your DataFrame 
my_2darray = np.array([[1, 2, 3], [4, 5, 6]])
print(pd.DataFrame(my_2darray, index = ['obs1', 'obs2'], columns = ['one', 'two', 'three']))
print("\n")

# Take a dictionary as input to your DataFrame 
my_dict = {1: ['1', '3'], 2: ['1', '2'], 3: ['2', '4']}
print(pd.DataFrame(my_dict, index = ['one', 'two']))
print("\n")

# Take a DataFrame as input to your DataFrame 
my_df = pd.DataFrame(data=[4,5,6,7], index=range(0,4), columns=['A'])
print(pd.DataFrame(my_df))
print("\n")

# Take a Series as input to your DataFrame
my_series = pd.Series({"Belgium":"Brussels", "India":"New Delhi", "United Kingdom":"London", "United States":"Washington"})
print(pd.DataFrame(my_series))
print("\n")

      one  two  three
obs1    1    2      3
obs2    4    5      6


     1  2  3
one  1  1  2
two  3  2  4


   A
0  4
1  5
2  6
3  7


                         0
Belgium           Brussels
India            New Delhi
United Kingdom      London
United States   Washington




## Properties of Data Frame

In [127]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
df = pd.DataFrame(data)  # change the default index
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [20]:
#shape
df.shape

(6, 3)

In [21]:
#index
df.index

RangeIndex(start=0, stop=6, step=1)

In [30]:
#column
df.columns

['state', 'year', 'pop']

In [31]:
#values of the label
print(df.columns.values)
print(df.index.values)

['state' 'year' 'pop']
[0 1 2 3 4 5]


---

# 2. How To Select an Index or Column From a Pandas DataFrame

In [128]:
df.index = ['obs1', 'obs2', 'obs3', 'obs4', 'obs5', 'obs6']
df

Unnamed: 0,state,year,pop
obs1,Ohio,2000,1.5
obs2,Ohio,2001,1.7
obs3,Ohio,2002,3.6
obs4,Nevada,2001,2.4
obs5,Nevada,2002,2.9
obs6,Nevada,2003,3.2


### Various ways to get 1.5

In [129]:
# Approach 1
df.loc['obs1', 'pop']

1.5

In [130]:
# Approach 2
df.iloc[0][2]  ## be care full, it is the chain slicing.

1.5

In [131]:
# Approach 3
df['pop'][0]

1.5

In [132]:
# Approach 4
df.at['obs1', 'pop']

1.5

In [133]:
# Approach 5
df.iat[0,2]

1.5

# 3. How To Add an Index, Row or Column to a Pandas DataFrame

In [134]:
df

Unnamed: 0,state,year,pop
obs1,Ohio,2000,1.5
obs2,Ohio,2001,1.7
obs3,Ohio,2002,3.6
obs4,Nevada,2001,2.4
obs5,Nevada,2002,2.9
obs6,Nevada,2003,3.2


### Add a row
You shoud use the loc[] only, rather than the iloc[].

In [135]:
df.loc['obs7'] = ['PA', 2001, 3.0] 
df

Unnamed: 0,state,year,pop
obs1,Ohio,2000,1.5
obs2,Ohio,2001,1.7
obs3,Ohio,2002,3.6
obs4,Nevada,2001,2.4
obs5,Nevada,2002,2.9
obs6,Nevada,2003,3.2
obs7,PA,2001,3.0


In [136]:
df.loc['obs8'] = ['PA', 2002, 3.5]
df.loc['obs9'] = ['PA', 2002, 3.5]
df

Unnamed: 0,state,year,pop
obs1,Ohio,2000,1.5
obs2,Ohio,2001,1.7
obs3,Ohio,2002,3.6
obs4,Nevada,2001,2.4
obs5,Nevada,2002,2.9
obs6,Nevada,2003,3.2
obs7,PA,2001,3.0
obs8,PA,2002,3.5
obs9,PA,2002,3.5


In [137]:
df.ix['obs9'] = ['PA', 2002, 3.5]
df

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,state,year,pop
obs1,Ohio,2000,1.5
obs2,Ohio,2001,1.7
obs3,Ohio,2002,3.6
obs4,Nevada,2001,2.4
obs5,Nevada,2002,2.9
obs6,Nevada,2003,3.2
obs7,PA,2001,3.0
obs8,PA,2002,3.5
obs9,PA,2002,3.5


### Add a column

In [138]:
gdp_pc = range(11000, 20000, 1000)
gdp_pc

range(11000, 20000, 1000)

In [139]:
df['gdppc'] = gdp_pc
df

Unnamed: 0,state,year,pop,gdppc
obs1,Ohio,2000,1.5,11000
obs2,Ohio,2001,1.7,12000
obs3,Ohio,2002,3.6,13000
obs4,Nevada,2001,2.4,14000
obs5,Nevada,2002,2.9,15000
obs6,Nevada,2003,3.2,16000
obs7,PA,2001,3.0,17000
obs8,PA,2002,3.5,18000
obs9,PA,2002,3.5,19000


In [140]:
df.drop('gdppc', axis = 1, inplace = True)
df

Unnamed: 0,state,year,pop
obs1,Ohio,2000,1.5
obs2,Ohio,2001,1.7
obs3,Ohio,2002,3.6
obs4,Nevada,2001,2.4
obs5,Nevada,2002,2.9
obs6,Nevada,2003,3.2
obs7,PA,2001,3.0
obs8,PA,2002,3.5
obs9,PA,2002,3.5


In [141]:
df.loc[:, 'gdppc'] = gdp_pc
df

Unnamed: 0,state,year,pop,gdppc
obs1,Ohio,2000,1.5,11000
obs2,Ohio,2001,1.7,12000
obs3,Ohio,2002,3.6,13000
obs4,Nevada,2001,2.4,14000
obs5,Nevada,2002,2.9,15000
obs6,Nevada,2003,3.2,16000
obs7,PA,2001,3.0,17000
obs8,PA,2002,3.5,18000
obs9,PA,2002,3.5,19000


In [142]:
print(df.index.values)

['obs1' 'obs2' 'obs3' 'obs4' 'obs5' 'obs6' 'obs7' 'obs8' 'obs9']


### Removing columns

In [143]:
df.drop('gdppc', axis = 1, inplace = True)
df

Unnamed: 0,state,year,pop
obs1,Ohio,2000,1.5
obs2,Ohio,2001,1.7
obs3,Ohio,2002,3.6
obs4,Nevada,2001,2.4
obs5,Nevada,2002,2.9
obs6,Nevada,2003,3.2
obs7,PA,2001,3.0
obs8,PA,2002,3.5
obs9,PA,2002,3.5


### Removing columns

In [144]:
df2 = df.drop('obs9', axis = 0, inplace = False)
df2

Unnamed: 0,state,year,pop
obs1,Ohio,2000,1.5
obs2,Ohio,2001,1.7
obs3,Ohio,2002,3.6
obs4,Nevada,2001,2.4
obs5,Nevada,2002,2.9
obs6,Nevada,2003,3.2
obs7,PA,2001,3.0
obs8,PA,2002,3.5


In [145]:
df3 = df.drop_duplicates(keep = 'last')
df3

Unnamed: 0,state,year,pop
obs1,Ohio,2000,1.5
obs2,Ohio,2001,1.7
obs3,Ohio,2002,3.6
obs4,Nevada,2001,2.4
obs5,Nevada,2002,2.9
obs6,Nevada,2003,3.2
obs7,PA,2001,3.0
obs9,PA,2002,3.5


In [146]:
df4 = df.drop(df.index[1])
df4

Unnamed: 0,state,year,pop
obs1,Ohio,2000,1.5
obs3,Ohio,2002,3.6
obs4,Nevada,2001,2.4
obs5,Nevada,2002,2.9
obs6,Nevada,2003,3.2
obs7,PA,2001,3.0
obs8,PA,2002,3.5
obs9,PA,2002,3.5


# 5. How to Rename the Index or Columns of a Pandas DataFrame

In [147]:
df

Unnamed: 0,state,year,pop
obs1,Ohio,2000,1.5
obs2,Ohio,2001,1.7
obs3,Ohio,2002,3.6
obs4,Nevada,2001,2.4
obs5,Nevada,2002,2.9
obs6,Nevada,2003,3.2
obs7,PA,2001,3.0
obs8,PA,2002,3.5
obs9,PA,2002,3.5


In [148]:
# approach 1
df.index = range(9)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2
6,PA,2001,3.0
7,PA,2002,3.5
8,PA,2002,3.5


In [149]:
# approach 2
df.rename(index = {8:10}, inplace = True)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2
6,PA,2001,3.0
7,PA,2002,3.5
10,PA,2002,3.5


In [150]:
# rename the column
# approach 1
df.columns = ['STATE', 'YEAR', 'POP']
df

Unnamed: 0,STATE,YEAR,POP
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2
6,PA,2001,3.0
7,PA,2002,3.5
10,PA,2002,3.5


In [153]:
# approach 2
df.rename(columns = {'STATE':'state', 'YEAR':'year', 'POP':'pop'}, inplace = True)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2
6,PA,2001,3.0
7,PA,2002,3.5
10,PA,2002,3.5


---

# 6. How To Format The Data in Your Pandas DataFrame

## Replacing All Occurrences of a String in a DataFrame

In [154]:
data = {'student 1': ['ok', 'awful', 'good'], 'student 2': ['ok', 'awful', 'good'] ,'student 3': ['ok', 'awful', 'good']}

In [159]:
evaluation = pd.DataFrame(data)
evaluation.rename(index = {0:'test 1', 1: 'test 2', 2: 'test 3'} , inplace = True)
evaluation

Unnamed: 0,student 1,student 2,student 3
test 1,ok,ok,ok
test 2,awful,awful,awful
test 3,good,good,good


In [161]:
df2 = evaluation.replace(['ok', 'awful', 'good'], [1,0,2]) 
df2

Unnamed: 0,student 1,student 2,student 3
test 1,1,1,1
test 2,0,0,0
test 3,2,2,2


In [163]:
df3 = evaluation.replace({'ok':1})
df3

Unnamed: 0,student 1,student 2,student 3
test 1,1,1,1
test 2,awful,awful,awful
test 3,good,good,good


In [165]:
#### Using regular expression
import re

In [168]:
df3 = evaluation.replace({"^o": 1}  , regex = True)
df3

Unnamed: 0,student 1,student 2,student 3
test 1,1,1,1
test 2,awful,awful,awful
test 3,good,good,good
