# Pandas
- table-shaped 2-D arrays
- DataFrame, Series
- useful functions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Minimal example: every key maps to a one-element list.
dic = dict(city=['seoul'], year=['2017'], temp=[25.2])
dic

{'city': ['seoul'], 'year': ['2017'], 'temp': [25.2]}

In [22]:
# Five observations per key; 'year' is stored as strings and 'temp' mixes ints and floats.
dic2 = dict(
    city=['seoul', 'busan', 'incheon', 'daejun', 'kwangju'],
    year=['2017', '2018', '2019', '2020', '2022'],
    temp=[25.2, 27, 29, 25.4, 26.7],
)
dic2

{'city': ['seoul', 'busan', 'incheon', 'daejun', 'kwangju'],
 'year': ['2017', '2018', '2019', '2020', '2022'],
 'temp': [25.2, 27, 29, 25.4, 26.7]}

In [23]:
# Build a DataFrame from a dict of equal-length lists: the keys become the
# column names and the rows get the default integer labels 0..4.
pd.DataFrame(dic2)

Unnamed: 0,city,year,temp
0,seoul,2017,25.2
1,busan,2018,27.0
2,incheon,2019,29.0
3,daejun,2020,25.4
4,kwangju,2022,26.7


In [24]:
# Same data, but with explicit row labels 'a'..'e' instead of the default 0..4.
df = pd.DataFrame(dic2, index=list('abcde'))
df

Unnamed: 0,city,year,temp
a,seoul,2017,25.2
b,busan,2018,27.0
c,incheon,2019,29.0
d,daejun,2020,25.4
e,kwangju,2022,26.7


In [25]:
# A Series built from a dict uses the keys as its index. Each value here is a
# one-element list, so the elements are lists and the dtype is object.
pd.Series(dic)

city    [seoul]
year     [2017]
temp     [25.2]
dtype: object

In [26]:
# The same dict as a DataFrame: each one-element list becomes a column with a single row.
pd.DataFrame(dic)

Unnamed: 0,city,year,temp
0,seoul,2017,25.2


In [27]:
# Column labels, returned as an Index object.
df.columns

Index(['city', 'year', 'temp'], dtype='object')

In [28]:
# Row labels, returned as an Index object.
df.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [29]:
# Underlying data as a 2-D NumPy array; because the columns mix strings and
# floats, the array falls back to dtype=object.
df.values

array([['seoul', '2017', 25.2],
       ['busan', '2018', 27.0],
       ['incheon', '2019', 29.0],
       ['daejun', '2020', 25.4],
       ['kwangju', '2022', 26.7]], dtype=object)

In [30]:
# Display the full frame for reference before selecting columns from it.
df

Unnamed: 0,city,year,temp
a,seoul,2017,25.2
b,busan,2018,27.0
c,incheon,2019,29.0
d,daejun,2020,25.4
e,kwangju,2022,26.7


In [31]:
# Bracket selection with a single column label returns that column.
df['city']

a      seoul
b      busan
c    incheon
d     daejun
e    kwangju
Name: city, dtype: object

In [32]:
# 'year' was built from strings (see dic2), so the column dtype is object
# rather than an integer type.
df['year']

a    2017
b    2018
c    2019
d    2020
e    2022
Name: year, dtype: object

In [33]:
# A single selected column is a pandas Series.
type(df['city'])

pandas.core.series.Series

In [34]:
# Attribute access (df.city) returns the same column as df['city'].
# It only works when the column name is a valid Python identifier that does
# not clash with an existing DataFrame attribute or method.
df['city'], df.city

(a      seoul
 b      busan
 c    incheon
 d     daejun
 e    kwangju
 Name: city, dtype: object,
 a      seoul
 b      busan
 c    incheon
 d     daejun
 e    kwangju
 Name: city, dtype: object)

In [37]:
# A list of labels selects several columns and returns a DataFrame.
df[['city','year']]

Unnamed: 0,city,year
a,seoul,2017
b,busan,2018
c,incheon,2019
d,daejun,2020
e,kwangju,2022


In [38]:
# The columns come back in the order given in the list.
df[['year','city']]

Unnamed: 0,year,city
a,2017,seoul
b,2018,busan
c,2019,incheon
d,2020,daejun
e,2022,kwangju


In [39]:
# Replace the row labels in place (one new label per row).
# NOTE: this mutates df, so outputs shown by earlier cells still carry the old labels.
df.index =  ['x','y','z','w','q']

In [40]:
# Confirm the new row labels.
df

Unnamed: 0,city,year,temp
x,seoul,2017,25.2
y,busan,2018,27.0
z,incheon,2019,29.0
w,daejun,2020,25.4
q,kwangju,2022,26.7


In [41]:
# set_index returns a new frame with 'city' promoted to the row index.
# The result is not assigned, so df itself keeps 'city' as a regular column.
df.set_index('city')

Unnamed: 0_level_0,year,temp
city,Unnamed: 1_level_1,Unnamed: 2_level_1
seoul,2017,25.2
busan,2018,27.0
incheon,2019,29.0
daejun,2020,25.4
kwangju,2022,26.7


In [42]:
# Restore the 'a'..'e' row labels (in-place assignment).
df.index =  ['a','b','c','d','e']

In [43]:
# df still has 'city' as a regular column: the earlier set_index call did not modify it.
df

Unnamed: 0,city,year,temp
a,seoul,2017,25.2
b,busan,2018,27.0
c,incheon,2019,29.0
d,daejun,2020,25.4
e,kwangju,2022,26.7


In [44]:
# .loc selects by label; a single row label returns that row as a Series
# indexed by the column names.
df.loc['c']

city    incheon
year       2019
temp         29
Name: c, dtype: object

In [45]:
# A list of row labels returns a DataFrame containing just those rows.
df.loc[['c','e']]

Unnamed: 0,city,year,temp
c,incheon,2019,29.0
e,kwangju,2022,26.7


In [46]:
# Select rows 'c' and 'e' and the 'temp' column in a single .loc call
# (row labels first, then the column label). This avoids chained indexing,
# which builds an intermediate frame and is unsafe to assign through.
df.loc[['c','e'], 'temp']

c    29.0
e    26.7
Name: temp, dtype: float64

In [47]:
# Several rows of a single column come back as a Series. The rows and the
# column are selected in one .loc call rather than by chaining two lookups.
type(df.loc[['c','e'], 'temp'])

pandas.core.series.Series

In [48]:
# Attribute access on the row subset: the 'temp' column of rows 'c' and 'e'.
df.loc[['c','e']].temp

c    29.0
e    26.7
Name: temp, dtype: float64

In [50]:
# Label-based slices include both endpoints, so row 'd' is part of the result.
df.loc['b':'d']  # end point included

Unnamed: 0,city,year,temp
b,busan,2018,27.0
c,incheon,2019,29.0
d,daejun,2020,25.4


In [52]:
# .iloc selects by integer position, .loc by label; position 0 is the row labelled 'a',
# so both expressions return the same row.
df.iloc[0], df.loc['a']

(city    seoul
 year     2017
 temp     25.2
 Name: a, dtype: object,
 city    seoul
 year     2017
 temp     25.2
 Name: a, dtype: object)

In [54]:
# The same three rows both ways: the label slice 'b':'d' includes 'd', while the
# positional slice 1:4 stops before position 4.
df.loc['b':'d'], df.iloc[1:4]  # end point included / excluded

(      city  year  temp
 b    busan  2018  27.0
 c  incheon  2019  29.0
 d   daejun  2020  25.4,
       city  year  temp
 b    busan  2018  27.0
 c  incheon  2019  29.0
 d   daejun  2020  25.4)

In [55]:
# Add a new column by assignment, one value per row.
# NOTE(review): 'pop' is also the name of a DataFrame method, so df.pop refers to
# the method; read this column with df['pop'].
df['pop'] = [1000, 350, 120, 440, 180]

In [56]:
# Show the frame with the new 'pop' column.
df

Unnamed: 0,city,year,temp,pop
a,seoul,2017,25.2,1000
b,busan,2018,27.0,350
c,incheon,2019,29.0,120
d,daejun,2020,25.4,440
e,kwangju,2022,26.7,180


In [57]:
# Drop rows by label. drop returns a new frame; df is left untouched because
# the result is not assigned back.
df.drop(['d','e'])

Unnamed: 0,city,year,temp,pop
a,seoul,2017,25.2,1000
b,busan,2018,27.0,350
c,incheon,2019,29.0,120


In [58]:
# df still has all five rows: the drop above did not modify it.
df

Unnamed: 0,city,year,temp,pop
a,seoul,2017,25.2,1000
b,busan,2018,27.0,350
c,incheon,2019,29.0,120
d,daejun,2020,25.4,440
e,kwangju,2022,26.7,180


In [61]:
# axis=1 makes drop look the label up among the columns instead of the rows.
df.drop('pop', axis = 1)

Unnamed: 0,city,year,temp
a,seoul,2017,25.2
b,busan,2018,27.0
c,incheon,2019,29.0
d,daejun,2020,25.4
e,kwangju,2022,26.7


In [65]:
# Rebind df to a 3x4 frame holding the integers 0..11, filled row by row,
# with rows labelled 'a'..'c' and columns labelled 'A'..'D'.
df = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=list('abc'),
    columns=list('ABCD'),
)

In [66]:
# Display the new numeric frame.
df

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [67]:
# Pull the frame's data out as a plain NumPy array, then display it.
arr = df.values
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [68]:
# With no axis argument, the NumPy reductions run over every element of the array.
arr.sum(), arr.mean()

(66, 5.5)

In [69]:
# DataFrame.sum reduces down each column by default, giving one total per column.
df.sum()

A    12
B    15
C    18
D    21
dtype: int64

In [70]:
# axis=0 is the default, so this is identical to df.sum().
df.sum(axis = 0)

A    12
B    15
C    18
D    21
dtype: int64

In [71]:
# axis=1 sums across the columns, giving one total per row.
df.sum(axis = 1)

a     6
b    22
c    38
dtype: int64

In [72]:
# Column-wise mean, max and min, displayed together as a tuple of three Series.
df.mean(), df.max(), df.min()

(A    4.0
 B    5.0
 C    6.0
 D    7.0
 dtype: float64,
 A     8
 B     9
 C    10
 D    11
 dtype: int32,
 A    0
 B    1
 C    2
 D    3
 dtype: int32)

In [73]:
# List every attribute name available on the frame. The column labels 'A'..'D'
# appear in the listing because columns are also exposed as attributes.
# NOTE(review): the output is very long; consider showing only public names,
# e.g. [name for name in dir(df) if not name.startswith('_')].
dir(df)

['A',
 'B',
 'C',
 'D',
 'T',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '_

In [74]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` may be any object exposing ``.max()`` and ``.min()``; in this
    notebook it is one column or one row passed in by ``DataFrame.apply``.
    """
    return x.max() - x.min()
# apply calls f once per column by default, so the result is each column's max minus min.
df.apply(f)

A    8
B    8
C    8
D    8
dtype: int64

In [75]:
# axis=1 passes each row to f instead, giving one max-minus-min value per row.
df.apply(f, axis = 1)

a    3
b    3
c    3
dtype: int64

In [76]:
# Same computation, passing the lambda inline rather than through the name f.
df.apply(lambda x: x.max() - x.min())

A    8
B    8
C    8
D    8
dtype: int64

In [77]:
# sorting
# A Series whose index labels are deliberately out of alphabetical order,
# holding the values 0..3.
ser = pd.Series(np.arange(4), index=list('dabc'))
ser

d    0
a    1
b    2
c    3
dtype: int32

In [78]:
# Sort by index label; the result is a new Series.
ser.sort_index()

a    1
b    2
c    3
d    0
dtype: int32

In [79]:
# Sort by value (ascending by default); the values are already in order, so nothing moves.
ser.sort_values()

d    0
a    1
b    2
c    3
dtype: int32

In [80]:
# Sort by value, largest first.
ser.sort_values(ascending = False)

c    3
b    2
a    1
d    0
dtype: int32

In [81]:
# ser itself is unchanged: each sort call above returned a new Series.
ser

d    0
a    1
b    2
c    3
dtype: int32

In [82]:
# Display df before its rows are relabelled.
df

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [83]:
# Give the rows out-of-order labels (in place) so that sort_index has something to reorder.
df.index = ['d','a','b']

In [84]:
# Confirm the new row labels.
df

Unnamed: 0,A,B,C,D
d,0,1,2,3
a,4,5,6,7
b,8,9,10,11


In [85]:
# Sort the rows by index label (ascending by default).
df.sort_index()

Unnamed: 0,A,B,C,D
a,4,5,6,7
b,8,9,10,11
d,0,1,2,3


In [86]:
# Sort the rows by index label in descending order.
df.sort_index(ascending = False)

Unnamed: 0,A,B,C,D
d,0,1,2,3
b,8,9,10,11
a,4,5,6,7


In [87]:
# df keeps its original row order: sort_index returned new frames.
df

Unnamed: 0,A,B,C,D
d,0,1,2,3
a,4,5,6,7
b,8,9,10,11


In [88]:
# axis=1 sorts the column labels instead of the row labels (here in descending order).
df.sort_index(axis = 1, ascending = False)

Unnamed: 0,D,C,B,A
d,3,2,1,0
a,7,6,5,4
b,11,10,9,8


In [89]:
# Order the rows by the values in column 'C', largest first.
df.sort_values(by='C', ascending = False)

Unnamed: 0,A,B,C,D
b,8,9,10,11
a,4,5,6,7
d,0,1,2,3


In [90]:
# rank
# argsort returns, for each row of arr, the positions that would sort that row.
# These are sort indices, not ranks; every row of arr is already ascending, hence 0..3.
arr.argsort()

array([[0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3]], dtype=int64)

In [94]:
# Sample values containing a tie (23 appears twice), used to show how rank() treats ties.
ser2 = pd.Series(data=[100, 23, 13, 91, 23])
ser2

0    100
1     23
2     13
3     91
4     23
dtype: int64

In [95]:
# Rank ascending (the smallest value gets 1). Tied values share the average of
# their ranks, so both 23s get 2.5.
ser2.rank()

0    5.0
1    2.5
2    1.0
3    4.0
4    2.5
dtype: float64

In [96]:
# method='first' breaks ties by order of appearance, so the two 23s get 2.0 and 3.0.
ser2.rank(method = 'first')

0    5.0
1    2.0
2    1.0
3    4.0
4    3.0
dtype: float64

In [93]:
# Rank in descending order: the largest value gets rank 1.
ser2.rank(ascending = False)

0    1.0
1    3.5
2    5.0
3    2.0
4    3.5
dtype: float64

In [103]:
# Small integer frame whose columns are deliberately out of alphabetical order (b, a, c);
# column 'a' contains a tie (4 appears twice).
df2 = pd.DataFrame(dict(b=[3, 5, 2, 1], a=[4, 9, 4, 5], c=[5, 3, 7, 9]))
df2

Unnamed: 0,b,a,c
0,3,4,5
1,5,9,3
2,2,4,7
3,1,5,9


In [104]:
# Rank within each column; ties get the average rank (the two 4s in 'a' both get 1.5).
df2.rank()

Unnamed: 0,b,a,c
0,3.0,1.5,2.0
1,4.0,4.0,1.0
2,2.0,1.5,3.0
3,1.0,3.0,4.0


In [105]:
# Descending rank within each column: the largest value in a column gets 1.
df2.rank(ascending = False)

Unnamed: 0,b,a,c
0,2.0,3.5,3.0
1,1.0,1.0,4.0
2,3.0,3.5,2.0
3,4.0,2.0,1.0


In [106]:
# axis=1 ranks the values within each row, across the columns.
df2.rank(axis = 1)

Unnamed: 0,b,a,c
0,1.0,2.0,3.0
1,2.0,3.0,1.0
2,1.0,2.0,3.0
3,1.0,2.0,3.0


In [107]:
# Frame with one missing value (the last entry of 'c') for the NaN-handling examples.
df3 = pd.DataFrame(dict(b=[4, 7, 3, 2], a=[4, 9, 2, 5], c=[5, 3, 7, np.nan]))
df3

Unnamed: 0,b,a,c
0,4,4,5.0
1,7,9,3.0
2,3,2,7.0
3,2,5,


In [110]:
# NaN is skipped by default: column 'c' sums to 15 and averages 15 / 3 = 5.0
# over its three non-missing values.
df3.sum(), df3.mean()

(b    16.0
 a    20.0
 c    15.0
 dtype: float64,
 b    4.0
 a    5.0
 c    5.0
 dtype: float64)

In [112]:
# skipna=False lets the missing value propagate, so both results for column 'c' are NaN.
df3.sum(skipna = False) , df3.mean(skipna = False)

(b    16.0
 a    20.0
 c     NaN
 dtype: float64,
 b    4.0
 a    5.0
 c    NaN
 dtype: float64)

In [113]:
# max also skips NaN by default: column 'c' reports 7.0.
df3.max()

b    7.0
a    9.0
c    7.0
dtype: float64

In [114]:
# Row label of each column's maximum and of each column's minimum; the NaN in 'c' is ignored.
df3.idxmax(), df3.idxmin()

(b    1
 a    1
 c    2
 dtype: int64,
 b    3
 a    2
 c    1
 dtype: int64)

In [115]:
# Ten single-letter labels with repeats, used for the unique / value_counts examples.
ser3 = pd.Series(list('cacbadbbcc'))
ser3

0    c
1    a
2    c
3    b
4    a
5    d
6    b
7    b
8    c
9    c
dtype: object

In [119]:
# Distinct values in order of first appearance, returned as an array.
uq = ser3.unique()

In [120]:
# Display the unique values.
uq

array(['c', 'a', 'b', 'd'], dtype=object)

In [117]:
# A Python set also deduplicates, but sets are unordered, so the display order is not guaranteed.
set(ser3)

{'a', 'b', 'c', 'd'}

In [121]:
# The array's sort() works in place and returns None, hence the separate `uq` to display the result.
uq.sort(); uq

array(['a', 'b', 'c', 'd'], dtype=object)

In [122]:
# ser3 is unaffected by sorting uq.
ser3

0    c
1    a
2    c
3    b
4    a
5    d
6    b
7    b
8    c
9    c
dtype: object

In [123]:
# Count the occurrences of each distinct value, most frequent first.
ser3.value_counts()

c    4
b    3
a    2
d    1
dtype: int64