## to use pandas , it is required to install numpy first


In [3]:
import numpy as np
import pandas as pd
pd.__version__

'0.23.0'

In [5]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [6]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [7]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
print(data[1])
print(data[1:3])

0.5
1    0.50
2    0.75
dtype: float64


In [9]:
data = pd.Series([0.25, 0.5, 0.74, 1.0], index=['a','b','c','d'])
data

a    0.25
b    0.50
c    0.74
d    1.00
dtype: float64

In [13]:
print(data['b'])
print(data[1])
data['b'] == data[1]

0.5
0.5


True

In [14]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
index = [2,5,3,7]
print(data)

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64


In [15]:
population_dict = {'Cal': 12345, 'Texas': 298999, 'NY': 2143343, 'Florida': 12348, 'Ill': 1394839247}
population = pd.Series(population_dict)
population

Cal             12345
Texas          298999
NY            2143343
Florida         12348
Ill        1394839247
dtype: int64

In [16]:
print(population['Cal'])
print(population['Cal':'NY'])

12345
Cal        12345
Texas     298999
NY       2143343
dtype: int64


In [17]:
pd.Series({2:'a',1:'b',3:'c'})

2    a
1    b
3    c
dtype: object

## pandas dataframe - generalized (collection of) numpy array

In [18]:
population_dict = {'Cal': 12345, 'Texas': 298999, 'NY': 2143343, 'Florida': 12348, 'Ill': 1394839247}
population = pd.Series(population_dict)
population

Cal             12345
Texas          298999
NY            2143343
Florida         12348
Ill        1394839247
dtype: int64

In [19]:
area_dict = {'Cal': 4983090, 'Texas': 392399, 'NY': 2233343, 'Florida': 1989848, 'Ill': 839247}
area = pd.Series(area_dict)
area

Cal        4983090
Texas       392399
NY         2233343
Florida    1989848
Ill         839247
dtype: int64

In [21]:
states = pd.DataFrame({'population':population, 'area':area})
states

Unnamed: 0,population,area
Cal,12345,4983090
Texas,298999,392399
NY,2143343,2233343
Florida,12348,1989848
Ill,1394839247,839247


In [22]:
states.index

Index(['Cal', 'Texas', 'NY', 'Florida', 'Ill'], dtype='object')

In [23]:
states['area']

Cal        4983090
Texas       392399
NY         2233343
Florida    1989848
Ill         839247
Name: area, dtype: int64

In [24]:
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
Cal,12345
Texas,298999
NY,2143343
Florida,12348
Ill,1394839247


In [26]:
pd.DataFrame([{'a':1, 'b':2},{'b':3,'c':4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [27]:
pd.DataFrame(np.random.rand(3,2), columns=['foo','bar'],index=['a','b','c'])

Unnamed: 0,foo,bar
a,0.385997,0.435525
b,0.615209,0.33254
c,0.214406,0.082811


In [28]:
ind = pd.Index([2,3,5,7,11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [29]:
print(ind[1])
print(ind[::2])
print(ind.size, ind.shape, ind.ndim, ind.dtype)

3
Int64Index([2, 5, 11], dtype='int64')
5 (5,) 1 int64


In [30]:
inda = pd.Index([1,3,5,7,9])
indb = pd.Index([2,3,5,7,11])
inda & indb

Int64Index([3, 5, 7], dtype='int64')

In [31]:
inda | indb

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [32]:
inda^indb

Int64Index([1, 2, 9, 11], dtype='int64')

In [33]:
states.keys()

Index(['population', 'area'], dtype='object')

In [37]:
list(states.population.items())

[('Cal', 12345),
 ('Texas', 298999),
 ('NY', 2143343),
 ('Florida', 12348),
 ('Ill', 1394839247)]

## indexer : loc, iloc, ix

In [41]:
states.loc['Texas','population']

298999

In [42]:
states.iloc[1,0]

298999

In [44]:
states.loc[:,'population']

Cal             12345
Texas          298999
NY            2143343
Florida         12348
Ill        1394839247
Name: population, dtype: int64

In [45]:
states.iloc[:,0]

Cal             12345
Texas          298999
NY            2143343
Florida         12348
Ill        1394839247
Name: population, dtype: int64

## Dataframe functions - selection of data

In [47]:
area = pd.Series({'Cal':34533, 'Texas':39820, 'New York': 193849, 'Florida':19293, 'Illinois':123934})
pop = pd.Series({'Cal':988293, 'Texas':456720, 'New York': 2452349, 'Florida':345454, 'Illinois':435430})
density = pd.Series({'Cal':533, 'Texas':820, 'New York': 849, 'Florida':293, 'Illinois':934})
data = pd.DataFrame({'area':area, 'pop':pop, 'density':density})
data

Unnamed: 0,area,pop,density
Cal,34533,988293,533
Texas,39820,456720,820
New York,193849,2452349,849
Florida,19293,345454,293
Illinois,123934,435430,934


In [48]:
data.values

array([[  34533,  988293,     533],
       [  39820,  456720,     820],
       [ 193849, 2452349,     849],
       [  19293,  345454,     293],
       [ 123934,  435430,     934]])

In [49]:
data.T

Unnamed: 0,Cal,Texas,New York,Florida,Illinois
area,34533,39820,193849,19293,123934
pop,988293,456720,2452349,345454,435430
density,533,820,849,293,934


In [50]:
data.values[0]

array([ 34533, 988293,    533])

In [51]:
data.area

Cal          34533
Texas        39820
New York    193849
Florida      19293
Illinois    123934
Name: area, dtype: int64

In [52]:
data.iloc[:3,:2]

Unnamed: 0,area,pop
Cal,34533,988293
Texas,39820,456720
New York,193849,2452349


In [53]:
data.loc[:'Illinois', :'pop']

Unnamed: 0,area,pop
Cal,34533,988293
Texas,39820,456720
New York,193849,2452349
Florida,19293,345454
Illinois,123934,435430


In [54]:
data.ix[:3,:'pop']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,area,pop
Cal,34533,988293
Texas,39820,456720
New York,193849,2452349


In [55]:
data.loc[data.density>100, ['pop','density']]

Unnamed: 0,pop,density
Cal,988293,533
Texas,456720,820
New York,2452349,849
Florida,345454,293
Illinois,435430,934


## universal function

In [57]:
import pandas as pd
import numpy as np
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0,10,4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [59]:
df = pd.DataFrame(rng.randint(0,10,(3,4)), columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,1,7,5,1
1,4,0,9,5
2,8,0,9,2


In [60]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [62]:
np.sin(df*np.pi/4)

Unnamed: 0,A,B,C,D
0,0.7071068,-0.707107,-0.707107,0.707107
1,1.224647e-16,0.0,0.707107,-0.707107
2,-2.449294e-16,0.0,0.707107,1.0


In [63]:
## pandas method
## add()
## sub(), subtract()
## mul(), multiply()
## truediv(), div(), divide()
## floordiv()
## mod()
## pow()


## missing data handling

In [64]:
## None
import numpy as np
import pandas as pd

In [65]:
vals1 = np.array([1, None, 3,4])
vals1

array([1, None, 3, 4], dtype=object)