In [2]:
import pandas as pd
import numpy as np
ser = pd.Series([7,4,-1,3])

In [2]:
ser

0    7
1    4
2   -1
3    3
dtype: int64

In [3]:
ser.values

array([ 7,  4, -1,  3])

In [4]:
ser.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
ser[1]

4

In [6]:
ser[[2,1,3]]

2   -1
1    4
3    3
dtype: int64

[2,1,3] interpreted as a list of indices

In [10]:
np.exp(pd.Series([1,1,1,1]))

0    2.718282
1    2.718282
2    2.718282
3    2.718282
dtype: float64

In [11]:
1 in ser  # ser.index just like the dict's key

True

Create a Series object from a Python dict 

In [12]:
d = {'name':"zjtprince","age":32,"profession":"engineer"}

In [13]:
obj = pd.Series(d)

In [14]:
obj

name          zjtprince
age                  32
profession     engineer
dtype: object

In [16]:
index = ['name','age','gender']
obj2 = pd.Series(d, index=index)


In [17]:
obj2

name      zjtprince
age              32
gender          NaN
dtype: object

In [18]:
obj+obj2

age                           64
gender                       NaN
name          zjtprincezjtprince
profession                   NaN
dtype: object

A useful Series feature for many applications  is that it automatically asigns by index label in arithmatic operations

In [21]:
obj.name='me'
# obj.index.name='info'
obj


info
name          zjtprince
age                  32
profession     engineer
Name: me, dtype: object

In [23]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


pd.DataFrame in jupyter notebook displayed as a more browser-frendly HTML table

In [24]:
frame.index=['one','two','three','four','five','six']

In [25]:
frame

Unnamed: 0,state,year,pop
one,Ohio,2000,1.5
two,Ohio,2001,1.7
three,Ohio,2002,3.6
four,Nevada,2001,2.4
five,Nevada,2002,2.9
six,Nevada,2003,3.2


In [26]:
frame['pop']

one      1.5
two      1.7
three    3.6
four     2.4
five     2.9
six      3.2
Name: pop, dtype: float64

The returned Series have the same index of the DataFrame , and their name has been appropriately set.

In [31]:
pop = frame['pop']

In [32]:
pop[:]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pop[:]=1


In [33]:
pop

one      1.0
two      1.0
three    1.0
four     1.0
five     1.0
six      1.0
Name: pop, dtype: float64

In [34]:
frame

Unnamed: 0,state,year,pop
one,Ohio,2000,1.0
two,Ohio,2001,1.0
three,Ohio,2002,1.0
four,Nevada,2001,1.0
five,Nevada,2002,1.0
six,Nevada,2003,1.0


In [35]:
frame.T

Unnamed: 0,one,two,three,four,five,six
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
year,2000,2001,2002,2001,2002,2003
pop,1,1,1,1,1,1


In [36]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [37]:
df3 = pd.DataFrame(pop)

In [38]:
df3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [39]:
df3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [42]:
type (df3.values
     )

numpy.ndarray

#reindex

In [43]:
s = pd.Series(['puple','yellow','blue'], index=[0,2,4])

In [44]:
s

0     puple
2    yellow
4      blue
dtype: object

In [45]:
s.reindex(range(6), method='ffill')

0     puple
1     puple
2    yellow
3    yellow
4      blue
5      blue
dtype: object

In [46]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),

    index=['Ohio', 'Colorado', 'Utah', 'New York'],

    columns=['one', 'two', 'three', 'four'])

In [53]:
df = pd.DataFrame(data)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [48]:
df['one']  # returned a Series

Ohio         0
Colorado     4
Utah         8
New York    12
Name: one, dtype: int64

In [51]:
df[['one']]  # returned a DataFrame

Unnamed: 0,one
Ohio,0
Colorado,4
Utah,8
New York,12


In [52]:
df[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [54]:
data['Ohio':'Utah'] 

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11


Slicing with labels behaves differently than normal Python siicing in that the end-point is inclusive

In [55]:
df[1:3]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11


In [60]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [62]:
data.loc['Colorado',['one','two']]

one    4
two    5
Name: Colorado, dtype: int64

In [64]:
type(data.loc[['Colorado','Utah'],['one','two']])

pandas.core.frame.DataFrame

In [70]:
data.loc[:,:][data['two'] > 5]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [None]:
Data Alignment 
In the case of DataFrame, alignment is performed on both the rows and the columns


In [6]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
index=['Ohio', 'Texas', 'Colorado'])
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [7]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),

index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [5]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [8]:
df1.add(df2,fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


Operations between DataFrame and Series

In [9]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [10]:
arr[0]

array([0., 1., 2., 3.])

In [11]:
arr - arr[0]  #broadcasting

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [13]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),

columns=list('bde'),

index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [15]:
series = frame.iloc[0]
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [16]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [18]:
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [24]:
frame.sub(series3, axis=0)


Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [26]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),

index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.059721,-0.208271,-0.233786
Ohio,-0.618692,1.385125,0.03348
Texas,-0.436104,-0.860228,-0.844626
Oregon,0.943906,-1.519166,-1.007058


In [28]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.059721,0.208271,0.233786
Ohio,0.618692,1.385125,0.03348
Texas,0.436104,0.860228,0.844626
Oregon,0.943906,1.519166,1.007058


In [31]:
frame.apply(lambda x : x.max()- x.min(),axis=1)

Utah      0.293508
Ohio      2.003816
Texas     0.424124
Oregon    2.463072
dtype: float64

The function passed to apply need not return a scalar value ;it can return a Series with multiple values.


In [34]:
def f (x) :
    return pd.Series([x.max(), x.min()] , index=['max','min'])

In [35]:
frame.apply(f)

Unnamed: 0,b,d,e
max,0.943906,1.385125,0.03348
min,-0.618692,-1.519166,-1.007058


In [None]:
format = lambda x : '%.2f' % x

In [37]:
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.0597212375273662,-0.2082710765171152,-0.2337863920839318
Ohio,-0.6186916488199996,1.385124830626148,0.0334800397909709
Texas,-0.4361044602843293,-0.8602280422760598,-0.8446256725566547
Oregon,0.9439061131866706,-1.5191660456172489,-1.0070582911213162


In [38]:
frame['e'].map(format)

Utah      -0.23378639208393184
Ohio      0.033480039790970986
Texas      -0.8446256725566547
Oregon     -1.0070582911213162
Name: e, dtype: object

In [39]:
frame['e']

Utah     -0.233786
Ohio      0.033480
Texas    -0.844626
Oregon   -1.007058
Name: e, dtype: float64

Sorting and Ranking

In [41]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),

index=['three', 'one'],

columns=['d', 'a', 'b', 'c'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [42]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [47]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [48]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [49]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [50]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [58]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],

'c': [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [62]:
frame.rank(method='first')

Unnamed: 0,b,a,c,b_rank,c_rank,d_rank
0,3.0,1.0,2.0,3.0,1.0,2.0
1,4.0,3.0,3.0,4.0,3.0,3.0
2,1.0,2.0,4.0,1.0,2.0,4.0
3,2.0,4.0,1.0,2.0,4.0,1.0


In [63]:
frame[['b_rank','c_rank','d_rank']] = frame.rank(method='first')

ValueError: Columns must be same length as key

In [61]:
frame

Unnamed: 0,b,a,c,b_rank,c_rank,d_rank
0,4.3,0,-2.0,3.0,1.0,2.0
1,7.0,1,5.0,4.0,3.0,3.0
2,-3.0,0,8.0,1.0,2.0,4.0
3,2.0,1,-2.5,2.0,4.0,1.0


In [64]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],

'Qu2': [2, 3, 1, 2, 3],

'Qu3': [1, 5, 2, 4, 4]})

In [65]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [67]:
pd.value_counts(data['Qu1'])

4    2
3    2
1    1
Name: Qu1, dtype: int64

values_counts :Return a Series containing unique values as its index and frequencies as its values, ordered count in
descending order