# p05. Pandas

#### : 표 형식의 데이터나 다양한 형태의 데이터를 다루는 데 초점 (index, name, name)
####     : Excel 형식과 비슷
####    (NumPy: 단일 산술 배열 데이터를 다루는 데 특화) : data만 존재, index는 암묵적

## 1. Series   : 1차원 배열

In [12]:
import numpy as np
import pandas as pd

# Build a 1-D Series from a list; no index is given, so the default 0..3 RangeIndex is used.
obj=pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [3]:
# The underlying values as a NumPy array.
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [13]:
# The index labels (a RangeIndex here, since none was supplied).
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Data type of the stored values.
obj.dtype

dtype('int64')

In [7]:
# A Series with an explicit string index instead of the default positions.
obj2 = pd.Series(
    data=[4, 7, -5, 3],
    index=list('dbac'),
)
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [6]:
# dictionary => Series: the keys become the index, the values become the data.
sdata = {
    'kim': 35000,
    'hong1': 67000,
    'h2': 12000,
    'h3': 40000,
}
obj3 = pd.Series(data=sdata)
obj3

kim      35000
hong1    67000
h2       12000
h3       40000
dtype: int64

In [8]:
# Add metadata to the existing Series: a name for the values and a name for the index.
obj3.name='Salary'
obj3.index.name="Names"
obj3

Names
kim      35000
hong1    67000
h2       12000
h3       40000
Name: Salary, dtype: int64

In [9]:
# Change the index: assign a new list of labels of the same length.
# The new index carries no name, so the 'Names' header is gone in the output.
obj3.index=['A','B','C','D']
obj3

A    35000
B    67000
C    12000
D    40000
Name: Salary, dtype: int64

## 2. DataFrame : 2차원 배열

In [10]:
# Create data with a row/column structure: each dict key becomes a column,
# and each list supplies that column's values row by row.
data = {
    'name': ['hong1', 'hong2', 'hong3', 'hong4', 'hong5'],
    'year': list(range(2015, 2020)),
    'points': [1.5, 1.7, 3.6, 2.4, 2.9],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,name,year,points
0,hong1,2015,1.5
1,hong2,2016,1.7
2,hong3,2017,3.6
3,hong4,2018,2.4
4,hong5,2019,2.9


In [11]:
# Row labels (the index along the rows).
df.index

RangeIndex(start=0, stop=5, step=1)

In [12]:
# Column labels (the index along the columns).
df.columns

Index(['name', 'year', 'points'], dtype='object')

In [13]:
# Get the values as a 2-D NumPy array (dtype is object in the output: the columns hold mixed types).
df.values

array([['hong1', 2015, 1.5],
       ['hong2', 2016, 1.7],
       ['hong3', 2017, 3.6],
       ['hong4', 2018, 2.4],
       ['hong5', 2019, 2.9]], dtype=object)

In [14]:
# Give each axis a name: 'Num' for the row index, 'Info' for the column index.
df.index.name='Num'
df.columns.name='Info'
df

Info,name,year,points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,hong1,2015,1.5
1,hong2,2016,1.7
2,hong3,2017,3.6
3,hong4,2018,2.4
4,hong5,2019,2.9


In [16]:
# Set the column order and the row labels while constructing the DataFrame.
# NaN: 'penalty' is not a key of `data`, so that column has no values.
df2=pd.DataFrame(data, columns=['year','name','points','penalty'],
                  index=['one', 'two', 'three', 'four', 'five'])
df2

Unnamed: 0,year,name,points,penalty
one,2015,hong1,1.5,
two,2016,hong2,1.7,
three,2017,hong3,3.6,
four,2018,hong4,2.4,
five,2019,hong5,2.9,


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max); only the numeric
# columns 'year' and 'points' appear in the output.
df2.describe()

Unnamed: 0,year,points
count,5.0,5.0
mean,2017.0,2.42
std,1.581139,0.864292
min,2015.0,1.5
25%,2016.0,1.7
50%,2017.0,2.4
75%,2018.0,2.9
max,2019.0,3.6


In [None]:
### 3. DataFrame Indexing

In [2]:
import numpy as np
import pandas as pd

# Sample records for the indexing examples.
data = {
    'names': ['h1', 'h2', 'h3', 'h4', 'h5'],
    'year': list(range(2015, 2020)),
    'points': [1.5, 1.7, 3.6, 2.4, 2.9],
}

# Explicit column order and row labels; 'penalty' is not a key of `data`,
# so that column is created empty (NaN).
df = pd.DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'names', 'points', 'penalty'],
)

df

Unnamed: 0,year,names,points,penalty
one,2015,h1,1.5,
two,2016,h2,1.7,
three,2017,h3,3.6,
four,2018,h4,2.4,
five,2019,h5,2.9,


In [3]:
# Select and manipulate columns of a DataFrame.
# Indexing with a single column label returns that column as a Series.
df['year']

one      2015
two      2016
three    2017
four     2018
five     2019
Name: year, dtype: int64

In [4]:
# Attribute-style access; gives the same Series as df['year'].
df.year

one      2015
two      2016
three    2017
four     2018
five     2019
Name: year, dtype: int64

In [6]:
# Select several columns at once by passing a list of labels.
# NOTE(review): the selection is not assigned, and the trailing `df` is the cell's
# last expression, so the recorded output shows the whole frame rather than the
# two selected columns. Remove the `df` line to display the selection itself.
df[['year','names']]
df

Unnamed: 0,year,names,points,penalty
one,2015,h1,1.5,
two,2016,h2,1.7,
three,2017,h3,3.6,
four,2018,h4,2.4,
five,2019,h5,2.9,


In [7]:
# Assigning a scalar fills every row of the column with that value.
df['penalty']= 0.5
df

Unnamed: 0,year,names,points,penalty
one,2015,h1,1.5,0.5
two,2016,h2,1.7,0.5
three,2017,h3,3.6,0.5
four,2018,h4,2.4,0.5
five,2019,h5,2.9,0.5


In [8]:
# Assigning a list sets the column row by row (one value per row, 5 here).
df['penalty']=[0.1, 0.2, 0.3, 0.4, 0.5]
df

Unnamed: 0,year,names,points,penalty
one,2015,h1,1.5,0.1
two,2016,h2,1.7,0.2
three,2017,h3,3.6,0.3
four,2018,h4,2.4,0.4
five,2019,h5,2.9,0.5


In [9]:
# Add a new column (column name: abc, values: 0-4).
df['abc']=np.arange(5)
df

Unnamed: 0,year,names,points,penalty,abc
one,2015,h1,1.5,0.1,0
two,2016,h2,1.7,0.2,1
three,2017,h3,3.6,0.3,2
four,2018,h4,2.4,0.4,3
five,2019,h5,2.9,0.5,4


In [11]:
# Add a Series as a column.
# Values are stored by matching index labels (this is where it differs from NumPy);
# rows without a matching label ('one', 'three') become NaN.

val=pd.Series([-1.2, -1.5, -1.7],  index=['two','four','five'])
df['bcd'] = val
df

Unnamed: 0,year,names,points,penalty,abc,bcd
one,2015,h1,1.5,0.1,0,
two,2016,h2,1.7,0.2,1,-1.2
three,2017,h3,3.6,0.3,2,
four,2018,h4,2.4,0.4,3,-1.5
five,2019,h5,2.9,0.5,4,-1.7


In [12]:
# Derive a new column from element-wise arithmetic on two existing columns.
df['point_penalty'] = df['points']-df['penalty']
df

Unnamed: 0,year,names,points,penalty,abc,bcd,point_penalty
one,2015,h1,1.5,0.1,0,,1.4
two,2016,h2,1.7,0.2,1,-1.2,1.5
three,2017,h3,3.6,0.3,2,,3.3
four,2018,h4,2.4,0.4,3,-1.5,2.0
five,2019,h5,2.9,0.5,4,-1.7,2.4


In [13]:
# A comparison yields a boolean column: True where point_penalty is greater than 2.0.
df['True_False'] = df['point_penalty'] > 2.0
df

Unnamed: 0,year,names,points,penalty,abc,bcd,point_penalty,True_False
one,2015,h1,1.5,0.1,0,,1.4,False
two,2016,h2,1.7,0.2,1,-1.2,1.5,False
three,2017,h3,3.6,0.3,2,,3.3,True
four,2018,h4,2.4,0.4,3,-1.5,2.0,False
five,2019,h5,2.9,0.5,4,-1.7,2.4,True


In [14]:
# Delete columns in place with `del`.
del df['True_False']
del df['point_penalty']
df

Unnamed: 0,year,names,points,penalty,abc,bcd
one,2015,h1,1.5,0.1,0,
two,2016,h2,1.7,0.2,1,-1.2
three,2017,h3,3.6,0.3,2,
four,2018,h4,2.4,0.4,3,-1.5
five,2019,h5,2.9,0.5,4,-1.7


In [16]:
# Column labels remaining after the deletions.
df.columns

Index(['year', 'names', 'points', 'penalty', 'abc', 'bcd'], dtype='object')

In [17]:
# Name the row index ('Order') and the column index ('Info').
df.index.name='Order'
df.columns.name='Info'
df

Info,year,names,points,penalty,abc,bcd
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,2015,h1,1.5,0.1,0,
two,2016,h2,1.7,0.2,1,-1.2
three,2017,h3,3.6,0.3,2,
four,2018,h4,2.4,0.4,3,-1.5
five,2019,h5,2.9,0.5,4,-1.7


In [18]:
# Select and manipulate rows of a DataFrame.
# An integer slice selects rows by position; the end position (3) is excluded.
df[0:3]

Info,year,names,points,penalty,abc,bcd
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,2015,h1,1.5,0.1,0,
two,2016,h2,1.7,0.2,1,-1.2
three,2017,h3,3.6,0.3,2,


In [19]:
# A label slice selects rows by index label; the end label ('four') is included.
df['two':'four']

Info,year,names,points,penalty,abc,bcd
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
two,2016,h2,1.7,0.2,1,-1.2
three,2017,h3,3.6,0.3,2,
four,2018,h4,2.4,0.4,3,-1.5


In [20]:
# .loc with a single row label returns that row as a Series.
df.loc['two']

Info
year       2016
names        h2
points      1.7
penalty     0.2
abc           1
bcd        -1.2
Name: two, dtype: object

In [21]:
# .loc with a label slice; both endpoints are included.
df.loc['two':'four']

Info,year,names,points,penalty,abc,bcd
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
two,2016,h2,1.7,0.2,1,-1.2
three,2017,h3,3.6,0.3,2,
four,2018,h4,2.4,0.4,3,-1.5


In [22]:
# .loc[rows, column]: the 'points' values for rows 'two' through 'four'.
df.loc['two':'four', 'points']

Order
two      1.7
three    3.6
four     2.4
Name: points, dtype: float64

In [23]:
# All rows of the 'year' column.
df.loc[:,'year']

Order
one      2015
two      2016
three    2017
four     2018
five     2019
Name: year, dtype: int64

In [25]:
# Insert a new row (loc): assigning to a label that does not exist yet appends it.
# NOTE: in the recorded output the 'year' and 'abc' columns are displayed as
# floats (2015.0, 0.0, ...) after this assignment.
df.loc['six', :] = [2020,'sep',4.0, 0.1,2.1,3]
df

Info,year,names,points,penalty,abc,bcd
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,2015.0,h1,1.5,0.1,0.0,
two,2016.0,h2,1.7,0.2,1.0,-1.2
three,2017.0,h3,3.6,0.3,2.0,
four,2018.0,h4,2.4,0.4,3.0,-1.5
five,2019.0,h5,2.9,0.5,4.0,-1.7
six,2020.0,sep,4.0,0.1,2.1,3.0


In [26]:
# iloc: index location, i.e. selection by integer position (here the row at position 3).
df.iloc[3]

Info
year       2018
names        h4
points      2.4
penalty     0.4
abc           3
bcd        -1.5
Name: four, dtype: object

In [27]:
# Rows at positions 0, 1, 3 and columns at positions 1, 2.
df.iloc[[0,1,3],[1,2]]

Info,names,points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,h1,1.5
two,h2,1.7
four,h4,2.4


In [28]:
# All rows, columns at positions 1-3 (the end position 4 is excluded).
df.iloc[:, 1:4]

Info,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,h1,1.5,0.1
two,h2,1.7,0.2
three,h3,3.6,0.3
four,h4,2.4,0.4
five,h5,2.9,0.5
six,sep,4.0,0.1


In [30]:
# A single cell: row position 0, column position 1.
df.iloc[0,1]

'h1'

### 4. DataFrame에서 boolean indexing

In [31]:
# Element-wise comparison yields a boolean Series, one True/False per row.
df['year'] > 2016


Order
one      False
two      False
three     True
four      True
five      True
six       True
Name: year, dtype: bool

In [32]:
# All rows whose year is greater than 2016.
df.loc[df['year']>2016, :]

Info,year,names,points,penalty,abc,bcd
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
three,2017.0,h3,3.6,0.3,2.0,
four,2018.0,h4,2.4,0.4,3.0,-1.5
five,2019.0,h5,2.9,0.5,4.0,-1.7
six,2020.0,sep,4.0,0.1,2.1,3.0


In [33]:
# Rows where names equals 'h2', restricted to the 'names' and 'points' columns.
df.loc[df['names']=='h2', ['names','points']]

Info,names,points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
two,h2,1.7


In [35]:
# Conditional assignment: set penalty to 0 on the rows where points is greater than 3.
df.loc[df['points'] > 3, 'penalty'] = 0
df

Info,year,names,points,penalty,abc,bcd
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,2015.0,h1,1.5,0.1,0.0,
two,2016.0,h2,1.7,0.2,1.0,-1.2
three,2017.0,h3,3.6,0.0,2.0,
four,2018.0,h4,2.4,0.4,3.0,-1.5
five,2019.0,h5,2.9,0.5,4.0,-1.7
six,2020.0,sep,4.0,0.0,2.1,3.0


In [None]:
### 5. Data: random sample frame, missing values (NaN), dropping rows/columns

In [36]:
# 6x4 frame of random samples with default integer row and column labels.
# No seed is set in the cells shown here, so the values change on every run.
df = pd.DataFrame(np.random.randn(6,4))
df

Unnamed: 0,0,1,2,3
0,-1.027945,-0.149006,-0.898993,-0.418875
1,-0.498027,-0.441543,-0.274232,2.620338
2,0.523205,-0.804743,-0.599175,0.838656
3,-0.218254,0.193624,-1.068698,-0.617032
4,0.279634,-0.685504,0.411839,-1.450836
5,-0.126218,-0.64916,-0.466509,2.203105


In [37]:
# Rename the columns and replace the index with 6 consecutive daily dates starting 2019-11-14.
df.columns = ['A','B','C','D']
df.index = pd.date_range('20191114', periods=6)
df.index

DatetimeIndex(['2019-11-14', '2019-11-15', '2019-11-16', '2019-11-17',
               '2019-11-18', '2019-11-19'],
              dtype='datetime64[ns]', freq='D')

In [38]:
# Show the frame with its new column names and date index.
df

Unnamed: 0,A,B,C,D
2019-11-14,-1.027945,-0.149006,-0.898993,-0.418875
2019-11-15,-0.498027,-0.441543,-0.274232,2.620338
2019-11-16,0.523205,-0.804743,-0.599175,0.838656
2019-11-17,-0.218254,0.193624,-1.068698,-0.617032
2019-11-18,0.279634,-0.685504,0.411839,-1.450836
2019-11-19,-0.126218,-0.64916,-0.466509,2.203105


In [42]:
# np.nan: enter missing values explicitly.
df['F']=[1.0, np.nan, 3.5, 6.1, np.nan, 7.0]
df

Unnamed: 0,A,B,C,D,F
2019-11-14,-1.027945,-0.149006,-0.898993,-0.418875,1.0
2019-11-15,-0.498027,-0.441543,-0.274232,2.620338,
2019-11-16,0.523205,-0.804743,-0.599175,0.838656,3.5
2019-11-17,-0.218254,0.193624,-1.068698,-0.617032,6.1
2019-11-18,0.279634,-0.685504,0.411839,-1.450836,
2019-11-19,-0.126218,-0.64916,-0.466509,2.203105,7.0


In [43]:
# Remove NaN.
# any: drop a row if at least one of its values is NaN.
# The result is a new frame; df itself still contains the NaN rows in the following cells.
df.dropna(how='any')

Unnamed: 0,A,B,C,D,F
2019-11-14,-1.027945,-0.149006,-0.898993,-0.418875,1.0
2019-11-16,0.523205,-0.804743,-0.599175,0.838656,3.5
2019-11-17,-0.218254,0.193624,-1.068698,-0.617032,6.1
2019-11-19,-0.126218,-0.64916,-0.466509,2.203105,7.0


In [44]:
# all: drop a row only if every value in it is NaN (no such row here, so nothing is dropped).
df.dropna(how='all')

Unnamed: 0,A,B,C,D,F
2019-11-14,-1.027945,-0.149006,-0.898993,-0.418875,1.0
2019-11-15,-0.498027,-0.441543,-0.274232,2.620338,
2019-11-16,0.523205,-0.804743,-0.599175,0.838656,3.5
2019-11-17,-0.218254,0.193624,-1.068698,-0.617032,6.1
2019-11-18,0.279634,-0.685504,0.411839,-1.450836,
2019-11-19,-0.126218,-0.64916,-0.466509,2.203105,7.0


In [45]:
# Fill NaN with a value (0.5 here).
df.fillna(value=0.5)

Unnamed: 0,A,B,C,D,F
2019-11-14,-1.027945,-0.149006,-0.898993,-0.418875,1.0
2019-11-15,-0.498027,-0.441543,-0.274232,2.620338,0.5
2019-11-16,0.523205,-0.804743,-0.599175,0.838656,3.5
2019-11-17,-0.218254,0.193624,-1.068698,-0.617032,6.1
2019-11-18,0.279634,-0.685504,0.411839,-1.450836,0.5
2019-11-19,-0.126218,-0.64916,-0.466509,2.203105,7.0


In [46]:
# Boolean mask of the same shape: True where a value is missing.
df.isnull()

Unnamed: 0,A,B,C,D,F
2019-11-14,False,False,False,False,False
2019-11-15,False,False,False,False,True
2019-11-16,False,False,False,False,False
2019-11-17,False,False,False,False,False
2019-11-18,False,False,False,False,True
2019-11-19,False,False,False,False,False


In [47]:
# Extract only the rows whose value in column F is NaN.
# The mask is computed on column F alone rather than building a null mask
# for the whole frame and then picking one column out of it.
df.loc[df['F'].isnull(), :]

Unnamed: 0,A,B,C,D,F
2019-11-15,-0.498027,-0.441543,-0.274232,2.620338,
2019-11-18,0.279634,-0.685504,0.411839,-1.450836,


In [48]:
# Parse a date string into a Timestamp.
pd.to_datetime('20191114')

Timestamp('2019-11-14 00:00:00')

In [51]:
# Drop specific rows; the labels are given as Timestamps to match the date index.
df.drop([pd.to_datetime('20191116'), pd.to_datetime('20191119')])

Unnamed: 0,A,B,C,D,F
2019-11-14,-1.027945,-0.149006,-0.898993,-0.418875,1.0
2019-11-15,-0.498027,-0.441543,-0.274232,2.620338,
2019-11-17,-0.218254,0.193624,-1.068698,-0.617032,6.1
2019-11-18,0.279634,-0.685504,0.411839,-1.450836,


In [52]:
# Drop specific columns (axis=1).
df.drop(['B','D'], axis=1)

Unnamed: 0,A,C,F
2019-11-14,-1.027945,-0.898993,1.0
2019-11-15,-0.498027,-0.274232,
2019-11-16,0.523205,-0.599175,3.5
2019-11-17,-0.218254,-1.068698,6.1
2019-11-18,0.279634,0.411839,
2019-11-19,-0.126218,-0.466509,7.0


### 5. Data 분석용 함수들

In [15]:
# Small 4x2 frame containing missing values, used by the reduction examples.
data = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
]
df = pd.DataFrame(
    data,
    index=list('abcd'),
    columns=['one', 'two'],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [16]:
# Sum down the rows (i.e. one total per column); NaN values are left out of the calculation.
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [17]:
# Sum across the columns (i.e. one total per row); the all-NaN row 'c' sums to 0.
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [18]:
# Let NaN take part in the calculation: any row containing a NaN sums to NaN.
df.sum(axis=1, skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [19]:
# Compute over one specific row or column (here: the sum of column 'one').
df['one'].sum()

9.25

In [20]:
# Sum of row 'b' (7.1 + -4.5); shown as 2.5999999999999996 because of floating-point rounding.
df.loc['b'].sum()

2.5999999999999996

In [21]:
# Fresh 6x4 frame of random samples: columns A-D, one row per day from 2019-11-14.
df2 = pd.DataFrame(
    np.random.randn(6, 4),
    index=pd.date_range('20191114', periods=6),
    columns=list('ABCD'),
)
df2

Unnamed: 0,A,B,C,D
2019-11-14,-0.157556,-0.161492,0.510613,-0.229169
2019-11-15,0.807816,1.537852,-0.155769,-1.691371
2019-11-16,-0.28696,-0.173129,0.782968,-1.384199
2019-11-17,0.326022,-1.10599,0.053319,1.398398
2019-11-18,-0.352387,-1.038549,-0.544845,0.651002
2019-11-19,0.369203,-1.265867,0.865147,0.986216


In [22]:
# Correlation coefficient between columns A and B.
df2['A'].corr(df2['B'])

0.46497159848408137

In [23]:
# Covariance between columns B and C.
df2['B'].cov(df2['C'])

-0.09796948557013253

## 6. Sort Function & 기타 함수

In [25]:
# permutation: returns a shuffled copy of the array.

dates= df2.index
random_dates=np.random.permutation(dates)

# Reorder the rows to the shuffled dates and put the columns in D, C, B, A order.
df2=df2.reindex(index=random_dates, columns=['D','C','B','A'])
df2

Unnamed: 0,D,C,B,A
2019-11-14,-0.229169,0.510613,-0.161492,-0.157556
2019-11-15,-1.691371,-0.155769,1.537852,0.807816
2019-11-18,0.651002,-0.544845,-1.038549,-0.352387
2019-11-19,0.986216,0.865147,-1.265867,0.369203
2019-11-17,1.398398,0.053319,-1.10599,0.326022
2019-11-16,-1.384199,0.782968,-0.173129,-0.28696


In [27]:
# Sort the rows by index label (axis=0), ascending.
df2.sort_index(axis=0)

Unnamed: 0,D,C,B,A
2019-11-14,-0.229169,0.510613,-0.161492,-0.157556
2019-11-15,-1.691371,-0.155769,1.537852,0.807816
2019-11-16,-1.384199,0.782968,-0.173129,-0.28696
2019-11-17,1.398398,0.053319,-1.10599,0.326022
2019-11-18,0.651002,-0.544845,-1.038549,-0.352387
2019-11-19,0.986216,0.865147,-1.265867,0.369203


In [29]:
# Sort the columns by label (axis=1); the row order is left as it is.
df2.sort_index(axis=1)

Unnamed: 0,A,B,C,D
2019-11-14,-0.157556,-0.161492,0.510613,-0.229169
2019-11-15,0.807816,1.537852,-0.155769,-1.691371
2019-11-18,-0.352387,-1.038549,-0.544845,0.651002
2019-11-19,0.369203,-1.265867,0.865147,0.986216
2019-11-17,0.326022,-1.10599,0.053319,1.398398
2019-11-16,-0.28696,-0.173129,0.782968,-1.384199


In [32]:
# Sort the rows by index label in descending order.
df2.sort_index(axis=0,ascending=False)

Unnamed: 0,D,C,B,A
2019-11-19,0.986216,0.865147,-1.265867,0.369203
2019-11-18,0.651002,-0.544845,-1.038549,-0.352387
2019-11-17,1.398398,0.053319,-1.10599,0.326022
2019-11-16,-1.384199,0.782968,-0.173129,-0.28696
2019-11-15,-1.691371,-0.155769,1.537852,0.807816
2019-11-14,-0.229169,0.510613,-0.161492,-0.157556


In [35]:
# Sort the rows by the values in column D (ascending).
df2.sort_values(by="D")

Unnamed: 0,D,C,B,A
2019-11-15,-1.691371,-0.155769,1.537852,0.807816
2019-11-16,-1.384199,0.782968,-0.173129,-0.28696
2019-11-14,-0.229169,0.510613,-0.161492,-0.157556
2019-11-18,0.651002,-0.544845,-1.038549,-0.352387
2019-11-19,0.986216,0.865147,-1.265867,0.369203
2019-11-17,1.398398,0.053319,-1.10599,0.326022


In [37]:
# Sort so that the values in column B are in descending order.

df2.sort_values(by="B",ascending=False)

Unnamed: 0,D,C,B,A
2019-11-15,-1.691371,-0.155769,1.537852,0.807816
2019-11-14,-0.229169,0.510613,-0.161492,-0.157556
2019-11-16,-1.384199,0.782968,-0.173129,-0.28696
2019-11-18,0.651002,-0.544845,-1.038549,-0.352387
2019-11-17,1.398398,0.053319,-1.10599,0.326022
2019-11-19,0.986216,0.865147,-1.265867,0.369203


In [56]:
# Add an integer column E (random integers from 0 to 5) and a string column F.
df2['E']=np.random.randint(0,6,size=6)
df2['F']=['alpha','beta','gamma','gamma','alpha','delta']
df2

Unnamed: 0,D,C,B,A,E,F
2019-11-14,-0.229169,0.510613,-0.161492,-0.157556,3,alpha
2019-11-15,-1.691371,-0.155769,1.537852,0.807816,4,beta
2019-11-18,0.651002,-0.544845,-1.038549,-0.352387,1,gamma
2019-11-19,0.986216,0.865147,-1.265867,0.369203,0,gamma
2019-11-17,1.398398,0.053319,-1.10599,0.326022,5,alpha
2019-11-16,-1.384199,0.782968,-0.173129,-0.28696,0,delta


In [57]:
# Sort ascending on columns E and F together: by E first, ties broken by F.

df2.sort_values(by=['E','F'])

Unnamed: 0,D,C,B,A,E,F
2019-11-16,-1.384199,0.782968,-0.173129,-0.28696,0,delta
2019-11-19,0.986216,0.865147,-1.265867,0.369203,0,gamma
2019-11-18,0.651002,-0.544845,-1.038549,-0.352387,1,gamma
2019-11-14,-0.229169,0.510613,-0.161492,-0.157556,3,alpha
2019-11-15,-1.691371,-0.155769,1.537852,0.807816,4,beta
2019-11-17,1.398398,0.053319,-1.10599,0.326022,5,alpha


In [58]:
# Values with duplicates removed.

df2['F'].unique()

array(['alpha', 'beta', 'gamma', 'delta'], dtype=object)

In [59]:
# Count how many times each value occurs in the selected row/column.

df2['F'].value_counts()

gamma    2
alpha    2
delta    1
beta     1
Name: F, dtype: int64

In [60]:
# Membership test: True where F is one of the listed values.
df2['F'].isin(["gamma"])

2019-11-14    False
2019-11-15    False
2019-11-18     True
2019-11-19     True
2019-11-17    False
2019-11-16    False
Name: F, dtype: bool

In [61]:
# Use the isin mask to keep only the rows where F is 'alpha' or 'beta'.
df2.loc[df2['F'].isin(["alpha","beta"])]

Unnamed: 0,D,C,B,A,E,F
2019-11-14,-0.229169,0.510613,-0.161492,-0.157556,3,alpha
2019-11-15,-1.691371,-0.155769,1.537852,0.807816,4,beta
2019-11-17,1.398398,0.053319,-1.10599,0.326022,5,alpha


In [63]:
# 4x3 frame of random samples with city names as row labels.
df3 = pd.DataFrame(
    np.random.randn(4, 3),
    index=['Seoul', 'Incheon', 'Busan', 'Daegu'],
    columns=list('bde'),
)
df3

Unnamed: 0,b,d,e
Seoul,-0.116058,1.863299,0.979955
Incheon,2.267778,-0.866841,0.623385
Busan,0.95601,-0.065203,-2.006419
Daegu,-0.245909,-1.269571,-1.174573


In [70]:
def func(x):
    """Return the spread of *x*: its maximum minus its minimum.

    *x* may be any object that provides ``.max()`` and ``.min()``
    (for example a Series, a DataFrame or a NumPy array).
    """
    return x.max() - x.min()
# Apply func to each column (axis=0): one max-min spread per column.
print(df3.apply(func,axis=0))
# Calling func on the whole frame prints the same per-column result.
print(func(df3))

b    2.513687
d    3.132870
e    2.986374
dtype: float64
b    2.513687
d    3.132870
e    2.986374
dtype: float64
