# Data Structures in pandas
1. Series: 1-dimensional labeled array
2. DataFrame: 2-dimensional labeled data structure
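3. Panel: 3-dimensional labeled container (deprecated in pandas 0.20 and removed in 0.25; use a DataFrame with a MultiIndex instead)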

In [1]:
import pandas as pd
import numpy as np

# Series

In [2]:
# build a five-element int8 Series of random integers drawn from {0, 1, 2},
# labelled 'a' through 'e'
se = pd.Series(
    data=np.random.randint(0, 3, size=5),
    index=list('abcde'),
    dtype=np.int8,
)
se

a    1
b    0
c    1
d    2
e    2
dtype: int8

In [3]:
# access the index: the labels attached to each element of the Series
se.index

Index([u'a', u'b', u'c', u'd', u'e'], dtype='object')

In [4]:
# a dict maps directly onto a Series: keys become the index labels,
# values become the data
d = dict(a=0, b=1, c=2)
pd.Series(data=d)

a    0
b    1
c    2
dtype: int64

In [5]:
# accessing elements
# print() with a single argument behaves the same on Python 2 and Python 3.
# Positional access goes through .iloc: plain se[0] / se[-1] on a label-indexed
# Series relies on a positional fallback that pandas has deprecated.
print(se.iloc[0])    # the first element (by position)
print(se.iloc[1:3])  # the second and third elements
print(se.iloc[:2])   # the first two elements
print(se.iloc[-1])   # the last element
print(se['b'])       # select a value by its index label

1
b    0
c    1
dtype: int8
a    1
b    0
dtype: int8
2
0


In [6]:
# vectorized operations: arithmetic and NumPy functions are applied
# element-wise and the index labels are preserved
# print() with a single argument behaves the same on Python 2 and Python 3.
print(se + se)
print(se * 2)
print(np.exp(se))  # the int8 input yields a float16 result here, hence the coarse precision in the output

a    2
b    0
c    2
d    4
e    4
dtype: int8
a    2
b    0
c    2
d    4
e    4
dtype: int8
a    2.718750
b    1.000000
c    2.718750
d    7.390625
e    7.390625
dtype: float16


In [7]:
# get the underlying values as a NumPy array; useful for a quick conversion
# when a machine-learning library requires NumPy arrays as input
se.values

array([1, 0, 1, 2, 2], dtype=int8)

# DataFrame
1. Supported input formats:
   + dict of ndarrays, lists, Series, ...
   + 2-D numpy.ndarray
   + structured or record ndarray
   + Series
   + another DataFrame

In [8]:
# create a DataFrame from a dict whose values are, in turn,
# an ndarray (A), a list (B), a Series (C) and a tuple (D)
df = pd.DataFrame(data=dict(
    A=np.random.randint(0, 3, size=5),
    B=list('abcde'),
    C=pd.Series([1, 4, 5, 6, 2]),
    D=('one', 1, 'two', 2, np.nan),
))
df

Unnamed: 0,A,B,C,D
0,0,a,1,one
1,2,b,4,1
2,2,c,5,two
3,2,d,6,2
4,1,e,2,


In [9]:
# access the three components of a DataFrame
# print() with a single argument behaves the same on Python 2 and Python 3.
print(df.index)    # row labels
print(df.columns)  # column labels
print(df.values)   # the data as a 2-D NumPy array

RangeIndex(start=0, stop=5, step=1)
Index([u'A', u'B', u'C', u'D'], dtype='object')
[[0 'a' 1L 'one']
 [2 'b' 4L 1]
 [2 'c' 5L 'two']
 [2 'd' 6L 2]
 [1 'e' 2L nan]]


### DataFrame column selection, addition, deletion

In [10]:
# select column B; indexing with a single column name returns a Series
df['B'] 

0    a
1    b
2    c
3    d
4    e
Name: B, dtype: object

In [11]:
# boolean-mask selection: keep only the rows where column A is greater than 1
df[df['A'] > 1]

Unnamed: 0,A,B,C,D
1,2,b,4,1
2,2,c,5,two
3,2,d,6,2


In [12]:
# delete column D in place with the `del` statement, then display the result
# (this mutates df, so re-running the cell fails because D is already gone)
del df['D']
df

Unnamed: 0,A,B,C
0,0,a,1
1,2,b,4
2,2,c,5
3,2,d,6
4,1,e,2


In [13]:
# another method: drop one or more columns (axis=1 means columns)
# drop() returns a new DataFrame and leaves df itself unchanged
df.drop(['B', 'C'], axis=1) # to make the change persist, set inplace=True or reassign the result

Unnamed: 0,A
0,0
1,2
2,2
3,2
4,1


In [14]:
# add one column: assigning a scalar fills every row with that value
df['D'] = 0
df

Unnamed: 0,A,B,C,D
0,0,a,1,0
1,2,b,4,0
2,2,c,5,0
3,2,d,6,0
4,1,e,2,0


In [15]:
# insert() places a new column at a chosen position (here position 1,
# i.e. right after A) and modifies df in place
df.insert(loc=1, column='insert_col', value=np.ones(5))
df

Unnamed: 0,A,insert_col,B,C,D
0,0,1.0,a,1,0
1,2,1.0,b,4,0
2,2,1.0,c,5,0
3,2,1.0,d,6,0
4,1,1.0,e,2,0


### DataFrame indexing / Selection

In [16]:
# select a single column by name
df['B']

0    a
1    b
2    c
3    d
4    e
Name: B, dtype: object

In [17]:
# select a row by its index label; the result is a Series keyed by column name
# (df has a default RangeIndex, so label 2 is also the row at position 2)
df.loc[2]

A             2
insert_col    1
B             c
C             5
D             0
Name: 2, dtype: object

In [18]:
# select a row by its integer position (0-based), regardless of the index labels
df.iloc[2]

A             2
insert_col    1
B             c
C             5
D             0
Name: 2, dtype: object

In [19]:
# slice rows: an integer slice inside [] selects rows by position,
# with the end of the slice excluded (rows 0 and 1 here)
df[0:2]

Unnamed: 0,A,insert_col,B,C,D
0,0,1.0,a,1,0
1,2,1.0,b,4,0
