In [2]:
import numpy as np
import pandas as pd

In [0]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [0]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [0]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [0]:
data[1]

0.5

In [0]:
data[1:3]

1    0.50
2    0.75
dtype: float64

In [0]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index = ['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [0]:
data.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [0]:
data['b']

0.5

In [0]:
data[1]

0.5

In [0]:
data['b':'c']

b    0.50
c    0.75
dtype: float64

In [0]:
data = pd.Series( [0.25, 0.5, 0.75, 1.0], index = [2, 5, 3, 7])
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [0]:
data.index

Int64Index([2, 5, 3, 7], dtype='int64')

In [0]:
data[2]   # 기 지정된 인덱스 값으로 인덱싱

0.25

In [0]:
data[5:7] # 기존의 인덱스 개념으로 인덱싱

Series([], dtype: float64)

In [0]:
data[1:4]

5    0.50
3    0.75
7    1.00
dtype: float64

In [0]:
population_dict = {'Seoul':9705000, 'Busan':3400000, 'Daegu':2450000, 'Daejeon':1518000, 'Inchon':2939000}
population = pd.Series(population_dict)
population

Seoul      9705000
Busan      3400000
Daegu      2450000
Daejeon    1518000
Inchon     2939000
dtype: int64

In [0]:
population['Daejeon']

1518000

In [0]:
population['Busan':'Daejeon'] # 끝 인덱스인 Daejeon도 포함이 된다.

Busan      3400000
Daegu      2450000
Daejeon    1518000
dtype: int64

In [0]:
pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [0]:
pd.Series(5, index = [100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [0]:
pd.Series({2:'a', 1:'b', 3:'c'})

2    a
1    b
3    c
dtype: object

In [0]:
pd.Series({2:'a', 1:'b', 3:'c'}, index = [3, 2]) #원하는 인덱스만 추출

3    c
2    a
dtype: object

In [0]:
array = np.arange(5)
data = pd.Series(array)
data

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [0]:
area_dict = {'Seoul':605, 'Busan':770, 'Daegu':884,
             'Daejeon':539, 'Inchon':1063}
area = pd.Series(area_dict)
area

Seoul       605
Busan       770
Daegu       884
Daejeon     539
Inchon     1063
dtype: int64

In [0]:
city = pd.DataFrame({'population':population, 'area':area})
city

Unnamed: 0,population,area
Seoul,9705000,605
Busan,3400000,770
Daegu,2450000,884
Daejeon,1518000,539
Inchon,2939000,1063


In [0]:
city.index

Index(['Seoul', 'Busan', 'Daegu', 'Daejeon', 'Inchon'], dtype='object')

In [0]:
city.columns

Index(['population', 'area'], dtype='object')

In [0]:
city['area']

Seoul       605
Busan       770
Daegu       884
Daejeon     539
Inchon     1063
Name: area, dtype: int64

In [0]:
city.area

Seoul       605
Busan       770
Daegu       884
Daejeon     539
Inchon     1063
Name: area, dtype: int64

In [0]:
pd.DataFrame(population, columns = ['population'])

Unnamed: 0,population
Seoul,9705000
Busan,3400000
Daegu,2450000
Daejeon,1518000
Inchon,2939000


In [0]:
data = [{'a':i, 'b':2*i} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [0]:
pd.DataFrame([{'a':1, 'b':2}, {'b':3, 'c':4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [0]:
pd.DataFrame(np.random.rand(3, 2), columns = ['foo', 'bar'],
             index = ['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.132027,0.743615
b,0.350639,0.713935
c,0.89297,0.969669


In [0]:
A = np.zeros(3, dtype = [ ('A', 'i8'), ('B', 'f8')])
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [0]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


# Index

1.   중복을 허용하는 Set
2.   불변이다.



In [0]:
idx = pd.Index([2,3,5,7,11]) # i가 소문자가 아니라 대문자임 I
idx

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [0]:
idx[1]

3

In [0]:
idx[::2]

Int64Index([2, 5, 11], dtype='int64')

In [0]:
print(idx.size, idx.shape, idx.ndim, idx.dtype)

5 (5,) 1 int64


In [0]:
idxA = pd.Index([1, 3, 5, 7, 9])
idxB = pd.Index([2, 3, 5, 7, 11])

In [0]:
idxA & idxB

Int64Index([3, 5, 7], dtype='int64')

In [0]:
idxA | idxB

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [0]:
idxA & idxB

Int64Index([3, 5, 7], dtype='int64')

In [0]:
idxA ^ idxB

Int64Index([1, 2, 9, 11], dtype='int64')

In [0]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [0]:
'a' in data

True

In [0]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [0]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [0]:
data['e']=1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [0]:
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [0]:
data[0:2]

a    0.25
b    0.50
dtype: float64

In [0]:
data[(data>0.3) & (data<0.8)]

b    0.50
c    0.75
dtype: float64

In [0]:
data = pd.Series(['a', 'b', 'c'], index = [1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [0]:
data[1:3]

3    b
5    c
dtype: object

In [0]:
data.loc[1]  # 명시적 index

'a'

In [0]:
data.loc[1:3]

1    a
3    b
dtype: object

In [0]:
data.iloc[1] # 암시적 index

'b'

In [0]:
data.iloc[1:3]

3    b
5    c
dtype: object

In [0]:
data = pd.DataFrame({'area':area, 'population':population})
data

Unnamed: 0,area,population
Seoul,605,9705000
Busan,770,3400000
Daegu,884,2450000
Daejeon,539,1518000
Inchon,1063,2939000


In [0]:
data['population']

Seoul      9705000
Busan      3400000
Daegu      2450000
Daejeon    1518000
Inchon     2939000
Name: population, dtype: int64

In [0]:
data.population

Seoul      9705000
Busan      3400000
Daegu      2450000
Daejeon    1518000
Inchon     2939000
Name: population, dtype: int64

In [0]:
data.area is data['area']

True

In [0]:
data.population is data['population']

True

In [0]:
data['density'] = data['population'] / data['area']
data

Unnamed: 0,area,population,density
Seoul,605,9705000,16041.322314
Busan,770,3400000,4415.584416
Daegu,884,2450000,2771.493213
Daejeon,539,1518000,2816.326531
Inchon,1063,2939000,2764.816557


In [0]:
data.values

array([[6.05000000e+02, 9.70500000e+06, 1.60413223e+04],
       [7.70000000e+02, 3.40000000e+06, 4.41558442e+03],
       [8.84000000e+02, 2.45000000e+06, 2.77149321e+03],
       [5.39000000e+02, 1.51800000e+06, 2.81632653e+03],
       [1.06300000e+03, 2.93900000e+06, 2.76481656e+03]])

In [0]:
data.T

Unnamed: 0,Seoul,Busan,Daegu,Daejeon,Inchon
area,605.0,770.0,884.0,539.0,1063.0
population,9705000.0,3400000.0,2450000.0,1518000.0,2939000.0
density,16041.32,4415.584,2771.493,2816.327,2764.817


In [0]:
data.iloc[:3,:2]

Unnamed: 0,area,population
Seoul,605,9705000
Busan,770,3400000
Daegu,884,2450000


In [0]:
data.iloc[:3,:2]

Unnamed: 0,area,population
Seoul,605,9705000
Busan,770,3400000
Daegu,884,2450000


In [0]:
data.loc[:'Daejeon', :'population']

Unnamed: 0,area,population
Seoul,605,9705000
Busan,770,3400000
Daegu,884,2450000
Daejeon,539,1518000


In [0]:
data.loc[data.density > 3000, ['population', 'density']]

Unnamed: 0,population,density
Seoul,9705000,16041.322314
Busan,3400000,4415.584416


In [0]:
data['Busan':'Inchon']

Unnamed: 0,area,population,density
Busan,770,3400000,4415.584416
Daegu,884,2450000,2771.493213
Daejeon,539,1518000,2816.326531
Inchon,1063,2939000,2764.816557


In [0]:
data[1:5]

Unnamed: 0,area,population,density
Busan,770,3400000,4415.584416
Daegu,884,2450000,2771.493213
Daejeon,539,1518000,2816.326531
Inchon,1063,2939000,2764.816557


In [0]:
rng = np.random.RandomState(42)
s = pd.Series(rng.randint( 0, 10, 4 ))
s

0    6
1    3
2    7
3    4
dtype: int64

In [0]:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)), columns = ['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [0]:
np.exp(s)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [0]:
np.sin(df*np.pi/4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


In [0]:
pd.isnull(df) # 결측치 확인 함수

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False


In [0]:
pd.notnull(df) # 결측치가 아닌 것 확인

Unnamed: 0,A,B,C,D
0,True,True,True,True
1,True,True,True,True
2,True,True,True,True


In [0]:
df.name = 'popu'

In [0]:
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [0]:
df.index.name = 'idx'

In [0]:
df

Unnamed: 0_level_0,A,B,C,D
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [0]:
''' series 개체 연산 시,
    하나라도 NaN 값이 포함되어 있으면 결과도 NaN,
    인덱스를 추가 해줌
'''

a = pd.Series({"a":1, "b":2}, {"b":3, "c":4})
a

b    2.0
c    NaN
dtype: float64

In [0]:
a.name = 'val'
a.index.name='idx'
a

idx
b    2.0
c    NaN
Name: val, dtype: float64

In [0]:
data = {'cities' : ['Seoul', 'Seoul', 'Seoul', 'Bousan', 'Bousan', 'Bousan'],
        'year' : [2016, 2017, 2018, 2016, 2017, 2018],
        'pop' : [9843, 9766, 9705, 3447, 3424, 3400]}
df = pd.DataFrame(data)
df

Unnamed: 0,cities,year,pop
0,Seoul,2016,9843
1,Seoul,2017,9766
2,Seoul,2018,9705
3,Bousan,2016,3447
4,Bousan,2017,3424
5,Bousan,2018,3400


In [0]:
df.head()

Unnamed: 0,cities,year,pop
0,Seoul,2016,9843
1,Seoul,2017,9766
2,Seoul,2018,9705
3,Bousan,2016,3447
4,Bousan,2017,3424


In [0]:
pd.DataFrame(data, columns = ['year', 'cities', 'pop']) # 열 위치 재배정

Unnamed: 0,year,cities,pop
0,2016,Seoul,9843
1,2017,Seoul,9766
2,2018,Seoul,9705
3,2016,Bousan,3447
4,2017,Bousan,3424
5,2018,Bousan,3400


In [0]:
df2 = pd.DataFrame(data, columns = ['year', 'cities', 'pop', 'debt'],
                   index = ['one', 'two', 'three', 'four', 'five', 'six'])
df2

Unnamed: 0,year,cities,pop,debt
one,2016,Seoul,9843,
two,2017,Seoul,9766,
three,2018,Seoul,9705,
four,2016,Bousan,3447,
five,2017,Bousan,3424,
six,2018,Bousan,3400,


In [0]:
df2.columns

Index(['year', 'cities', 'pop', 'debt'], dtype='object')

In [0]:
df2['cities']

one       Seoul
two       Seoul
three     Seoul
four     Bousan
five     Bousan
six      Bousan
Name: cities, dtype: object

In [0]:
df2.year # 속성표기법. 주의점 : 기존 함수랑 겹치지 않도록 조심.

one      2016
two      2017
three    2018
four     2016
five     2017
six      2018
Name: year, dtype: int64

In [0]:
df2.loc['three']

year       2018
cities    Seoul
pop        9705
debt        NaN
Name: three, dtype: object

In [0]:
df2['debt'] = 16.5
df2

Unnamed: 0,year,cities,pop,debt
one,2016,Seoul,9843,16.5
two,2017,Seoul,9766,16.5
three,2018,Seoul,9705,16.5
four,2016,Bousan,3447,16.5
five,2017,Bousan,3424,16.5
six,2018,Bousan,3400,16.5


In [0]:
df2['debt'] = np.arange(6, )
df2

Unnamed: 0,year,cities,pop,debt
one,2016,Seoul,9843,0
two,2017,Seoul,9766,1
three,2018,Seoul,9705,2
four,2016,Bousan,3447,3
five,2017,Bousan,3424,4
six,2018,Bousan,3400,5


In [0]:
val = pd.Series( [-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])

In [0]:
df2['debt'] = val
df2

Unnamed: 0,year,cities,pop,debt
one,2016,Seoul,9843,
two,2017,Seoul,9766,-1.2
three,2018,Seoul,9705,
four,2016,Bousan,3447,-1.5
five,2017,Bousan,3424,-1.7
six,2018,Bousan,3400,


In [0]:
df2['north'] = df2.cities == 'Seoul'
df2

Unnamed: 0,year,cities,pop,debt,north
one,2016,Seoul,9843,,True
two,2017,Seoul,9766,-1.2,True
three,2018,Seoul,9705,,True
four,2016,Bousan,3447,-1.5,False
five,2017,Bousan,3424,-1.7,False
six,2018,Bousan,3400,,False


In [0]:
del df2['north']

In [0]:
df2.columns

Index(['year', 'cities', 'pop', 'debt'], dtype='object')

In [0]:
pop = {'Daegu':{2016:2461, 2017:2458},
       'Daejeon':{2016:1536, 2017:1528, 2018:1518}}
df3 = pd.DataFrame(pop)
df3

Unnamed: 0,Daegu,Daejeon
2016,2461.0,1536
2017,2458.0,1528
2018,,1518


In [0]:
df3.T

Unnamed: 0,2016,2017,2018
Daegu,2461.0,2458.0,
Daejeon,1536.0,1528.0,1518.0


In [0]:
pd.DataFrame(pop, index = [2016, 2017, 2018, 2019])

Unnamed: 0,Daegu,Daejeon
2016,2461.0,1536.0
2017,2458.0,1528.0
2018,,1518.0
2019,,


In [0]:
pdata = {'Daegu':df3['Daegu'][:-1],
         'Daejeon':df3['Daejeon'][:2]}

In [0]:
pd.DataFrame(pdata)

Unnamed: 0,Daegu,Daejeon
2016,2461.0,1536
2017,2458.0,1528


In [0]:
df3.index.name = 'year'
df3

Unnamed: 0_level_0,Daegu,Daejeon
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2016,2461.0,1536
2017,2458.0,1528
2018,,1518


In [0]:
df3.columns.name = 'cities'
df3

cities,Daegu,Daejeon
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2016,2461.0,1536
2017,2458.0,1528
2018,,1518


In [0]:
df3.values

array([[2461., 1536.],
       [2458., 1528.],
       [  nan, 1518.]])

In [0]:
df2.values

array([[2016, 'Seoul', 9843, nan],
       [2017, 'Seoul', 9766, -1.2],
       [2018, 'Seoul', 9705, nan],
       [2016, 'Bousan', 3447, -1.5],
       [2017, 'Bousan', 3424, -1.7],
       [2018, 'Bousan', 3400, nan]], dtype=object)

In [0]:
obj = pd.Series( range(3), index = ['a', 'b', 'c'])

In [0]:
index=obj.index

In [0]:
index

Index(['a', 'b', 'c'], dtype='object')

In [0]:
index[1:]

Index(['b', 'c'], dtype='object')

In [0]:
index[1]='d' # set 형태, 중복은 가능하지만 변경은 불가능하다.

TypeError: ignored

In [0]:
labels = pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [0]:
obj2 = pd.Series([1.5, -2.5, 0], index = labels)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [0]:
obj2.index is labels

True

In [0]:
df3

cities,Daegu,Daejeon
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2016,2461.0,1536
2017,2458.0,1528
2018,,1518


In [0]:
df3.columns

Index(['Daegu', 'Daejeon'], dtype='object', name='cities')

In [0]:
'Daegu' in df3.columns

True

In [0]:
2017 in df3.index

True

In [0]:
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

In [0]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index = ['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [0]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [0]:
obj3= pd.Series(['blue', 'purple', 'yellow'], index = [0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [0]:
# method = 'ffill' : 추가되는 인덱스에 값이 부족하면 NA가 아닌 전 값으로 채워 넣음.
obj3.reindex(range(6), method = 'ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [0]:
df = pd.DataFrame(np.arange(9).reshape((3,3)),
				  index = ['a', 'c', 'd'],
				  columns = ['Seoul', 'Busan', 'Daejeon'])
df

Unnamed: 0,Seoul,Busan,Daejeon
a,0,1,2
c,3,4,5
d,6,7,8


In [0]:
df2 = df.reindex(['a', 'b', 'c', 'd'])
df2

Unnamed: 0,Seoul,Busan,Daejeon
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [0]:
cities = ['Busan', 'Daegu', 'Kwagju']

In [0]:
df.reindex( columns = cities)

Unnamed: 0,Busan,Daegu,Kwagju
a,1,,
c,4,,
d,7,,


In [0]:
df.loc[['a', 'b', 'c', 'd'], cities] # loc를 넣어서 재인덱싱. 가능하면 지양해라.

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,Busan,Daegu,Kwagju
a,1.0,,
b,,,
c,4.0,,
d,7.0,,


In [0]:
obj = pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [0]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [0]:
obj.drop(['d', 'c'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [0]:
df = pd.DataFrame(np.arange(16).reshape(4, 4),
                  index = ['Seoul', 'Busan', 'Daegu', 'Daejeon'],
                  columns = ['one', 'two', 'three', 'four'])
df

Unnamed: 0,one,two,three,four
Seoul,0,1,2,3
Busan,4,5,6,7
Daegu,8,9,10,11
Daejeon,12,13,14,15


In [0]:
df.drop(['Busan', 'Daejeon'])

Unnamed: 0,one,two,three,four
Seoul,0,1,2,3
Daegu,8,9,10,11


In [0]:
df.drop(['Busan', 'Daejeon'])

Unnamed: 0,one,two,three,four
Seoul,0,1,2,3
Daegu,8,9,10,11


In [0]:
df.drop('two', axis=1) # col에 있는 'two'를 삭제하라.

Unnamed: 0,one,three,four
Seoul,0,2,3
Busan,4,6,7
Daegu,8,10,11
Daejeon,12,14,15


In [0]:
df.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Seoul,0,2
Busan,4,6
Daegu,8,10
Daejeon,12,14


In [0]:
obj.drop(['d', 'c'], inplace = True) #inplace : 객체에 반영을 해줌.
obj

a    0.0
b    1.0
e    4.0
dtype: float64

In [0]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [0]:
obj['b']

1.0

In [0]:
obj[1]

1.0

In [0]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [0]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [0]:
obj[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [0]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [0]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [0]:
obj['b':'c'] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [0]:
df = pd.DataFrame(np.arange(16).reshape((4,4)),
                 index = ['Seoul', 'Busan', 'Daegu', 'Inchon'],
                 columns = ['one', 'two', 'three', 'four'])
df

Unnamed: 0,one,two,three,four
Seoul,0,1,2,3
Busan,4,5,6,7
Daegu,8,9,10,11
Inchon,12,13,14,15


In [0]:
df['two']

Seoul      1
Busan      5
Daegu      9
Inchon    13
Name: two, dtype: int64

In [0]:
df[['three', 'one']]

Unnamed: 0,three,one
Seoul,2,0
Busan,6,4
Daegu,10,8
Inchon,14,12


In [0]:
df[:2]

Unnamed: 0,one,two,three,four
Seoul,0,1,2,3
Busan,4,5,6,7


In [0]:
df[df['three']>5]

Unnamed: 0,one,two,three,four
Busan,4,5,6,7
Daegu,8,9,10,11
Inchon,12,13,14,15


In [0]:
df<5

Unnamed: 0,one,two,three,four
Seoul,True,True,True,True
Busan,True,False,False,False
Daegu,False,False,False,False
Inchon,False,False,False,False


In [0]:
df[df<5]=0
df

Unnamed: 0,one,two,three,four
Seoul,0,0,0,0
Busan,0,5,6,7
Daegu,8,9,10,11
Inchon,12,13,14,15


In [0]:
df.loc['Busan', ['two', 'three']]

two      5
three    6
Name: Busan, dtype: int64

In [0]:
df.iloc[1, [3, 0, 1]]

four    7
one     0
two     5
Name: Busan, dtype: int64

In [0]:
df.iloc[2]

one       8
two       9
three    10
four     11
Name: Daegu, dtype: int64

In [0]:
df.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Busan,7,0,5
Daegu,11,8,9


In [0]:
df.loc[:'Daegu', 'two'] # iloc는 포함 안되고, loc는 포함이 된다.

Seoul    0
Busan    5
Daegu    9
Name: two, dtype: int64

In [0]:
df.iloc[:,:3][df.three>5]

Unnamed: 0,one,two,three
Busan,0,5,6
Daegu,8,9,10
Inchon,12,13,14


In [0]:
s = pd.Series(np.arange(3.))
s

0    0.0
1    1.0
2    2.0
dtype: float64

In [0]:
s[-1] #역순으로는 안됨.

KeyError: ignored

In [0]:
s

0    0.0
1    1.0
2    2.0
dtype: float64

In [0]:
s2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
s2

a    0.0
b    1.0
c    2.0
dtype: float64

In [0]:
s2[-1] # 인덱스를 지정해준 경우 -1 가능

2.0

In [0]:
s[:1]

0    0.0
dtype: float64

In [0]:
s.loc[:1]

0    0.0
1    1.0
dtype: float64

In [0]:
s.iloc[:1]

0    0.0
dtype: float64

In [0]:
s = pd.Series([7.3, -2.5, 3.4, 1.5], index = ['a', 'c', 'd', 'e'])
df = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index = ['a', 'c', 'e', 'f', 'g'])

In [0]:
s

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [0]:
df

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [0]:
s + df

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [0]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)),
                   columns = list('bcd'),
                   index = ['Seoul', 'Busan', 'Daegu'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                   columns = list('bcd'),
                   index = ['Daejeon', 'Seoul', 'Busan', 'Kwangju'])


In [0]:
df1

Unnamed: 0,b,c,d
Seoul,0.0,1.0,2.0
Busan,3.0,4.0,5.0
Daegu,6.0,7.0,8.0


In [0]:
df2

Unnamed: 0,b,c,d
Daejeon,0.0,1.0,2.0
Seoul,3.0,4.0,5.0
Busan,6.0,7.0,8.0
Kwangju,9.0,10.0,11.0


In [0]:
df1+df2

Unnamed: 0,b,c,d
Busan,9.0,11.0,13.0
Daegu,,,
Daejeon,,,
Kwangju,,,
Seoul,3.0,5.0,7.0


In [0]:
df1 = pd.DataFrame(np.arange(12.).reshape((3,4)),
                   columns = list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4,5)),
                   columns = list('abcde'))

In [0]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [0]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [0]:
df2.loc[1, 'b'] = np.nan

In [0]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [0]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [0]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [0]:
#중요
df1.add(df2, fill_value=0) # NaN 값이 아닌 나머지 값으로 넣는다.

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [0]:
1/df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [0]:
#중요
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [0]:
#중요
df1.reindex(columns = df2.columns, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [0]:
arr = np.arange(12.).reshape((3,4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [0]:
arr[0]

array([0., 1., 2., 3.])

In [0]:
#브로드캐스팅
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [0]:
df = pd.DataFrame(np.arange(12.).reshape((4,3)),
                  columns = list('bde'),
                  index = ['Daegu', 'Seoul', 'Busan', 'Daejeon'])
df

Unnamed: 0,b,d,e
Daegu,0.0,1.0,2.0
Seoul,3.0,4.0,5.0
Busan,6.0,7.0,8.0
Daejeon,9.0,10.0,11.0


In [0]:
s = df.iloc[0]
s

b    0.0
d    1.0
e    2.0
Name: Daegu, dtype: float64

In [0]:
df - s # Series의 인덱스를 col에 먼저 맞춤. 그다음에 확장.

Unnamed: 0,b,d,e
Daegu,0.0,0.0,0.0
Seoul,3.0,3.0,3.0
Busan,6.0,6.0,6.0
Daejeon,9.0,9.0,9.0


In [0]:
s2 = pd.Series(range(3), index = ['b', 'e', 'f'])
s2

b    0
e    1
f    2
dtype: int64

In [0]:
df + s2

Unnamed: 0,b,d,e,f
Daegu,0.0,,3.0,
Seoul,3.0,,6.0,
Busan,6.0,,9.0,
Daejeon,9.0,,12.0,


In [0]:
s3 = df['d']
s3

Daegu       1.0
Seoul       4.0
Busan       7.0
Daejeon    10.0
Name: d, dtype: float64

In [0]:
#중요
df.sub(s3, axis = 'index') # 0축. df - s3

Unnamed: 0,b,d,e
Daegu,-1.0,0.0,1.0
Seoul,-1.0,0.0,1.0
Busan,-1.0,0.0,1.0
Daejeon,-1.0,0.0,1.0


In [0]:
df = pd.DataFrame(np.random.rand(4, 3), columns = list('bde'),
                  index = ['Daegu', 'Seoul', 'Busan', 'Daejeon'])
df

Unnamed: 0,b,d,e
Daegu,0.931371,0.436586,0.268926
Seoul,0.648846,0.322648,0.346294
Busan,0.065387,0.752873,0.770456
Daejeon,0.920012,0.65904,0.247355


In [0]:
np.abs(df)

Unnamed: 0,b,d,e
Daegu,0.931371,0.436586,0.268926
Seoul,0.648846,0.322648,0.346294
Busan,0.065387,0.752873,0.770456
Daejeon,0.920012,0.65904,0.247355


In [0]:
f = lambda x : x.max() - x.min()

In [0]:
#중요
df.apply(f) # col이나 row에 적용하는 함수, 결과는 일반적으로 Series 형태.

b    0.865984
d    0.430225
e    0.523101
dtype: float64

In [0]:
df.apply(f, axis = 'columns')

Daegu      0.662445
Seoul      0.326198
Busan      0.705069
Daejeon    0.672657
dtype: float64

In [0]:
def f(x):
  return pd.Series([x.min(), x.max()], index = ['min', 'max'])

In [0]:
df.apply(f)

Unnamed: 0,b,d,e
min,0.065387,0.322648,0.247355
max,0.931371,0.752873,0.770456


In [0]:
fmt = lambda x : '%.2f' % x

In [0]:
#중요
#map() : 집합의 요소에 특정 동작을 자동으로 모든 요소에 적용 시키는 함수(벡터화와 유사)
df.applymap(fmt)

Unnamed: 0,b,d,e
Daegu,0.93,0.44,0.27
Seoul,0.65,0.32,0.35
Busan,0.07,0.75,0.77
Daejeon,0.92,0.66,0.25


In [0]:
df['e'].map(fmt)

Daegu      0.27
Seoul      0.35
Busan      0.77
Daejeon    0.25
Name: e, dtype: object

In [0]:
obj = pd.Series(range(4), index = ['d', 'a', 'b', 'c'])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [0]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [0]:
df = pd.DataFrame(np.arange(8).reshape((2,4)),
                  index = ['three', 'one'],
                  columns = ['d', 'a', 'b', 'c'])
df

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [0]:
df.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [0]:
df.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [0]:
# 1축을 기준으로 거꾸로.
df.sort_index(axis=1, ascending = False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [0]:
obj = pd.Series([4, 7, -3, 2])
obj

0    4
1    7
2   -3
3    2
dtype: int64

In [0]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [0]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [0]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [0]:
df = pd.DataFrame({'b':[4, 7, -3, 2], 'a':[0, 1, 0, 1]})
df

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [0]:
df.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [0]:
df.sort_values( by = ['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [0]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [0]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [0]:
obj.rank(method = 'first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [0]:
obj.rank(ascending = False, method = 'max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [0]:
df = pd.DataFrame({'b':[4.3, 7, -3, 2], 'a':[0, 1, 0, 1],
                   'c':[-2, 5, 8, -2.5]})
df

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [0]:
df.rank(axis = 'columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [0]:
df.rank(ascending=False) # 내림차순

Unnamed: 0,b,a,c
0,2.0,3.5,3.0
1,1.0,1.5,2.0
2,4.0,3.5,1.0
3,3.0,1.5,4.0


In [0]:
obj = pd.Series(range(5), index= ['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [0]:
obj.index.is_unique

False

In [0]:
obj['a']

a    0
a    1
dtype: int64

In [0]:
obj['c']

4

In [0]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df

Unnamed: 0,0,1,2
a,0.559601,0.265778,2.381784
a,0.792561,0.943873,0.702228
b,2.298212,0.05117,-1.211544
b,-0.359779,0.68722,1.338327


In [0]:
df.loc['b']

Unnamed: 0,0,1,2
b,2.298212,0.05117,-1.211544
b,-0.359779,0.68722,1.338327


In [0]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                   index = ['a', 'b', 'c', 'd'],
                   columns = ['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [0]:
df.sum() #결과가 2차원 -> 1차원으로 '축소'되어 나옴.

one    9.25
two   -5.80
dtype: float64

In [0]:
df.sum(axis='columns') #기본적으로 NA를 포함시키지 않고 연산.

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [0]:
df.mean(axis = 'columns', skipna = False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [0]:
df.idxmax()

one    b
two    d
dtype: object

In [0]:
df.cumsum() #누적합

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [0]:
df.describe() #요약통계

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [2]:
obj = pd.Series(['a', 'a', 'b', 'c'] *4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [3]:
obj.describe() #문자열에 대한 describe, count : 전체빈도, unique : 종류, top : 가장 빈도수가 큰 값, freq : top의 빈도

count     16
unique     3
top        a
freq       8
dtype: object

In [4]:
uniques = obj.unique() # 단어 종류 나열
uniques

array(['a', 'b', 'c'], dtype=object)

In [5]:
obj.value_counts() # 단어별 빈도분석(내림차순)

a    8
c    4
b    4
dtype: int64

In [6]:
pd.value_counts(obj.values, sort = False)

b    4
a    8
c    4
dtype: int64

In [7]:
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [0]:
#중요
mask = obj.isin(['b', 'c']) # 해당 값이 포함되어 있으면 True

In [9]:
mask

0     False
1     False
2      True
3      True
4     False
5     False
6      True
7      True
8     False
9     False
10     True
11     True
12    False
13    False
14     True
15     True
dtype: bool

In [10]:
obj[mask] # 마스크 인덱싱

2     b
3     c
6     b
7     c
10    b
11    c
14    b
15    c
dtype: object

In [15]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a', 'd'])
to_match

0    c
1    a
2    b
3    b
4    c
5    a
6    d
dtype: object

In [12]:
unique_vals = pd.Series(['c', 'b', 'a'])
unique_vals

0    c
1    b
2    a
dtype: object

In [16]:
#중요
pd.Index(unique_vals).get_indexer(to_match) # 값이 없으면 -1.

array([ 0,  2,  1,  1,  0,  2, -1])

In [14]:
df = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                   'Qu2': [2, 3, 1, 2, 3],
                   'Qu3': [1, 5, 2, 4, 4]})
df

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [19]:
#중요
result = df.apply(pd.value_counts).fillna(0) #index = 레이블, value = 도수카운트
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [20]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [22]:
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/datas/ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [23]:
df.dtypes

a           int64
b           int64
c           int64
d           int64
message    object
dtype: object

In [24]:
pd.read_table('/content/drive/My Drive/Colab Notebooks/datas/ex1.csv', sep=',')

  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [27]:
pd.read_csv('/content/drive/My Drive/Colab Notebooks/datas/ex2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [28]:
pd.read_csv('/content/drive/My Drive/Colab Notebooks/datas/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [0]:
names = ['a', 'b', 'c', 'd', 'message']

In [30]:
pd.read_csv('/content/drive/My Drive/Colab Notebooks/datas/ex2.csv', names= names, index_col = 'message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [31]:
pd.read_csv('/content/drive/My Drive/Colab Notebooks/datas/ex2.csv', names= names, index_col = 4)

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [33]:
#중요
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/datas/ex3.csv', index_col=['key1', 'key2'])
df #색인 one two. 계층적이다.

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


#정규표현식(문자표현식)



In [38]:
#중요
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/datas/ex4.txt', sep='\s+')
df

Unnamed: 0,A,B,C
aaa,-0.264436,-1.026059,-0.556644
bbb,-0.443436,-0.826059,-1.256644
ccc,-0.623436,-0.260359,-0.156644
ddd,-0.364436,-1.026059,-1.156644


In [39]:
#원하지 않는 행은 skip.
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/datas/ex5.csv', skiprows=[0, 2, 3])
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [40]:
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/datas/ex6.csv')
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [41]:
df.isnull()

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [43]:
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/datas/ex6.csv', na_values = [0])
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [0]:
sentence = {'message':['foo', 'NA'], 'something':['two']}

In [45]:
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/datas/ex6.csv', na_values = sentence)
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


In [3]:
df = pd.read_csv('강우량+기상정보+(2017년+09월+10일+00_00+_+05_59).csv')
df.head()

Unnamed: 0,CHNL_ID,STN_ID,AWS_DTM,SKY_CODE_ID,RN_DAY,RN_1HR,RNM_10M,RNM_30M,RNM_60M,RNM_6HR,RNM_12HR,RNM_24HR,PA,PS,PTY_CD
0,KMA,92,2017-09-10 0:00,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0,0.0
1,KMA,95,2017-09-10 0:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,996.5,1014.5,0,0.0
2,KMA,96,2017-09-10 0:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1003.7,1014.7,0,0.0
3,KMA,98,2017-09-10 0:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1001.2,1014.3,0,0.0
4,KMA,99,2017-09-10 0:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1010.7,1014.1,0,0.0


In [9]:
df = pd.read_csv('강우량+기상정보+(2017년+09월+10일+00_00+_+05_59).csv',
                 index_col = 'STN_ID')
df

Unnamed: 0_level_0,CHNL_ID,AWS_DTM,SKY_CODE_ID,RN_DAY,RN_1HR,RNM_10M,RNM_30M,RNM_60M,RNM_6HR,RNM_12HR,RNM_24HR,PA,PS,PTY_CD
STN_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
92,KMA,2017-09-10 0:00,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0,0.0
95,KMA,2017-09-10 0:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,996.5,1014.5,0,0.0
96,KMA,2017-09-10 0:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1003.7,1014.7,0,0.0
98,KMA,2017-09-10 0:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1001.2,1014.3,0,0.0
99,KMA,2017-09-10 0:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1010.7,1014.1,0,0.0
100,KMA,2017-09-10 0:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,928.2,1016.6,0,0.0
101,KMA,2017-09-10 0:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1005.5,1014.4,0,0.0
102,KMA,2017-09-10 0:00,,0.0,0.0,0.0,0.0,0.0,0.0,0.5,996.9,1013.7,0,0.0
104,KMA,2017-09-10 0:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1004.8,1014.0,0,0.0
105,KMA,2017-09-10 0:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1010.9,1013.9,0,0.0


In [10]:
df = pd.read_csv('ex6.csv', index_col = 'something' )
df

Unnamed: 0_level_0,a,b,c,d,message
something,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [11]:
df.to_csv('out.csv')

In [12]:
!type out.csv # ! 사용 : OS의 명령 프롬프트 사용.

something,a,b,c,d,message
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo



out.csv


지정된 파일을 찾을 수 없습니다.
다음 내용 진행 중 오류 발생: #.
지정된 파일을 찾을 수 없습니다.
다음 내용 진행 중 오류 발생: !.
지정된 파일을 찾을 수 없습니다.
다음 내용 진행 중 오류 발생: 사용.
파일 이름, 디렉터리 이름 또는 볼륨 레이블 구문이 잘못되었습니다.


In [13]:
df.to_csv('out2.csv')
!type out2.csv

something,a,b,c,d,message
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [4]:
# pip install xlrd, openpyxl
import xlrd, openpyxl

In [5]:
xlsx = pd.ExcelFile('korea_population_stats_100701.xlsx')
xlsx

<pandas.io.excel._base.ExcelFile at 0x276a0f872b0>

In [6]:
df = pd.read_excel(xlsx, 'korea_population_stats_100701') # pd.read_excel(file, sheet이름)
df

Unnamed: 0,"지역별 인구 및 인구밀도 [단위 : 천명, 명/㎢]",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,,2014,,2015,,2016,,2017,,2018,
1,,인구,인구밀도,인구,인구밀도,인구,인구밀도,인구,인구밀도,인구,인구밀도
2,계,50747,506,51015,509,51218,510,51362,512,51607,514
3,서울,9975,16482,9941,16425,9843,16263,9766,16136,9705,16034
4,부산,3452,4485,3452,4484,3447,4477,3424,4447,3400,4416
5,대구,2475,2801,2469,2794,2461,2786,2458,2782,2450,2773
6,인천,2862,2732,2883,2748,2907,2736,2924,2750,2939,2764
7,광주,1505,3002,1506,3005,1502,2997,1495,2984,1493,2980
8,대전,1553,2879,1542,2860,1536,2848,1528,2832,1518,2813
9,울산,1151,1085,1164,1097,1166,1099,1159,1092,1154,1088


In [20]:
#중요
df = pd.read_excel(xlsx, 'korea_population_stats_100701', header = 2,
                   usecols = 'B, D, F, H, J')
df

Unnamed: 0,인구,인구.1,인구.2,인구.3,인구.4
0,50747,51015,51218,51362,51607
1,9975,9941,9843,9766,9705
2,3452,3452,3447,3424,3400
3,2475,2469,2461,2458,2450
4,2862,2883,2907,2924,2939
5,1505,1506,1502,1495,1493
6,1553,1542,1536,1528,1518
7,1151,1164,1166,1159,1154
8,132,187,234,266,304
9,12282,12423,12600,12786,13031


In [21]:
column_names = {'인구':2014, '인구.1':2015, '인구.2':2016,
               '인구.3':2017, '인구.4':2018}

In [22]:
df.rename(columns = column_names, inplace = True)

In [25]:
index_names = {0:'계', 1:'서울', 2:'부산', 3:'대구', 4:'인천', 5:'광주', 6:'대전', 7:'울산', 8:'세종',
               9:'경기', 10:'강원', 11:'충북', 12:'충남', 13:'전북', 14:'전남', 15:'경북',
               16:'경남', 17:'제주', 18:'수도권'}

In [26]:
df.rename(index = index_names, inplace=True)
df.columns.name = '년도'
df.index.name = '지역'
df

년도,2014,2015,2016,2017,2018
지역,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
계,50747,51015,51218,51362,51607
서울,9975,9941,9843,9766,9705
부산,3452,3452,3447,3424,3400
대구,2475,2469,2461,2458,2450
인천,2862,2883,2907,2924,2939
광주,1505,1506,1502,1495,1493
대전,1553,1542,1536,1528,1518
울산,1151,1164,1166,1159,1154
세종,132,187,234,266,304
경기,12282,12423,12600,12786,13031


In [27]:
df.index

Index(['계', '서울', '부산', '대구', '인천', '광주', '대전', '울산', '세종', '경기', '강원', '충북',
       '충남', '전북', '전남', '경북', '경남', '제주', '수도권'],
      dtype='object', name='지역')

In [28]:
df.columns

Int64Index([2014, 2015, 2016, 2017, 2018], dtype='int64', name='년도')

In [30]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [31]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [32]:
string_data[string_data.isnull()]

2    NaN
dtype: object

In [33]:
string_data[string_data.notnull()]

0     aardvark
1    artichoke
3      avocado
dtype: object

In [34]:
string_data[0] = None

In [35]:
string_data[string_data.isnull()]

0    None
2     NaN
dtype: object

In [36]:
from numpy import nan as NA

In [37]:
s = pd.Series([1, NA, 3.5, NA, 7])
s

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [38]:
#중요 #결측값 제거
s.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [39]:
s[s.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [40]:
df = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                   [NA, NA, NA], [NA, 6.5, 3]])
df

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [41]:
#결측값이 포함된 행 모두 삭제
df_c = df.dropna()

In [42]:
df_c

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [43]:
df_c2 = df.dropna(how = 'all')
df_c2

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [44]:
df[4] = NA
df

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [48]:
df.dropna(how = 'all', axis = 1) #행 열 구분 가능

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [49]:
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,0.433669,,
1,-1.769614,,
2,1.46711,,0.387489
3,0.204203,,0.985462
4,0.226962,-0.561545,-0.983411
5,1.21207,-1.173745,1.010498
6,-0.817382,0.422491,-0.974312


In [55]:
df.dropna()

Unnamed: 0,0,1,2
4,0.226962,-0.561545,-0.983411
5,1.21207,-1.173745,1.010498
6,-0.817382,0.422491,-0.974312


In [56]:
df.dropna( thresh = 1)

Unnamed: 0,0,1,2
0,0.433669,,
1,-1.769614,,
2,1.46711,,0.387489
3,0.204203,,0.985462
4,0.226962,-0.561545,-0.983411
5,1.21207,-1.173745,1.010498
6,-0.817382,0.422491,-0.974312


In [54]:
df.dropna( thresh = 2) #남은 데이터 개수를 정하는거. 몇 개 이상.

Unnamed: 0,0,1,2
2,1.46711,,0.387489
3,0.204203,,0.985462
4,0.226962,-0.561545,-0.983411
5,1.21207,-1.173745,1.010498
6,-0.817382,0.422491,-0.974312


In [57]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.433669,0.0,0.0
1,-1.769614,0.0,0.0
2,1.46711,0.0,0.387489
3,0.204203,0.0,0.985462
4,0.226962,-0.561545,-0.983411
5,1.21207,-1.173745,1.010498
6,-0.817382,0.422491,-0.974312


In [58]:
df.fillna(df.mean() )

Unnamed: 0,0,1,2
0,0.433669,-0.4376,0.085145
1,-1.769614,-0.4376,0.085145
2,1.46711,-0.4376,0.387489
3,0.204203,-0.4376,0.985462
4,0.226962,-0.561545,-0.983411
5,1.21207,-1.173745,1.010498
6,-0.817382,0.422491,-0.974312


In [59]:
df.fillna({1:0.5, 2:0})

Unnamed: 0,0,1,2
0,0.433669,0.5,0.0
1,-1.769614,0.5,0.0
2,1.46711,0.5,0.387489
3,0.204203,0.5,0.985462
4,0.226962,-0.561545,-0.983411
5,1.21207,-1.173745,1.010498
6,-0.817382,0.422491,-0.974312


In [61]:
#중요
df.fillna({1:df[1].mean(), 2:df[2].mean()})

Unnamed: 0,0,1,2
0,0.433669,-0.4376,0.085145
1,-1.769614,-0.4376,0.085145
2,1.46711,-0.4376,0.387489
3,0.204203,-0.4376,0.985462
4,0.226962,-0.561545,-0.983411
5,1.21207,-1.173745,1.010498
6,-0.817382,0.422491,-0.974312


In [64]:
_ = df.fillna(0, inplace = True) # df 써도 되고 _써도 되고
df

Unnamed: 0,0,1,2
0,0.433669,0.0,0.0
1,-1.769614,0.0,0.0
2,1.46711,0.0,0.387489
3,0.204203,0.0,0.985462
4,0.226962,-0.561545,-0.983411
5,1.21207,-1.173745,1.010498
6,-0.817382,0.422491,-0.974312


In [65]:
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.532686,-1.034458,-0.690923
1,-2.293714,-0.454747,-2.273939
2,-0.991173,,1.169225
3,-2.162785,,0.145591
4,0.026918,,
5,-0.100344,,


In [66]:
df.fillna(method = 'ffill') #이전 값으로 대체

Unnamed: 0,0,1,2
0,0.532686,-1.034458,-0.690923
1,-2.293714,-0.454747,-2.273939
2,-0.991173,-0.454747,1.169225
3,-2.162785,-0.454747,0.145591
4,0.026918,-0.454747,0.145591
5,-0.100344,-0.454747,0.145591


In [67]:
df.fillna(method = 'ffill', limit = 2) #2개 까지만 대체

Unnamed: 0,0,1,2
0,0.532686,-1.034458,-0.690923
1,-2.293714,-0.454747,-2.273939
2,-0.991173,-0.454747,1.169225
3,-2.162785,-0.454747,0.145591
4,0.026918,,0.145591
5,-0.100344,,0.145591


In [92]:
df = pd.DataFrame({'k1':['one', 'two'] * 3 + ['two'], 
                   'k2':[1, 1, 2, 3, 3, 4, 4]})
df

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [93]:
df.duplicated() #중복데이터 유무 확인 - two 4 가 중복으로 있음.

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [94]:
df.drop_duplicates() #중복데이터 제거
df

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [95]:
df['v1'] = range(7) # 이 상황에서 중복제거하면 아무것도 삭제가 안된다.
df #모든 변수에 있어서 삭제가 중복이 되어야 삭제됨.(인자값을 안주면)

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [89]:
df.drop_duplicates(['k1']) # k1 기준으로 중복 구분한다.

Unnamed: 0,k1,k2
0,one,1
1,two,1


In [88]:
df.drop_duplicates(['k1', 'k2']) # k1, k2 기준으로 중복 구분한다.

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [96]:
df = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                            'Pastrami', 'corned beef', 'Bacon',
                            'pastrami', 'honey ham', 'nova lox'],
                   'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
df

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [97]:
meat_to_animal = {'bacon': 'pig', 'pulled pork': 'pig', 'pastrami': 'cow',
                  'corned beef': 'cow', 'honey ham': 'pig', 'nova lox': 'salmon'}

In [98]:
lowercased = df['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [99]:
# col 추가
# map 일괄적으로, 특정 데이터의 값에 맞는 값을 처리함.
df['animal'] = lowercased.map(meat_to_animal)
df

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [100]:
df['food'].map(lambda x : meat_to_animal[x.lower() ] )

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [101]:
s = pd.Series([1., -999., 2., -999., -1000., 3. ] )
s

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [102]:
s.replace(-999, np.nan) # replace(a, b) : a를 b로 바꿈

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [103]:
s.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [104]:
s.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [105]:
s.replace({-999:np.nan, -1000:0})

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [110]:
df = pd.DataFrame(np.arange(12).reshape((3, 4)),
                  index = ['Seoul', 'Busan', 'Daejeon'],
                  columns = ['one', 'two', 'three', 'four'])
df

Unnamed: 0,one,two,three,four
Seoul,0,1,2,3
Busan,4,5,6,7
Daejeon,8,9,10,11


In [111]:

transform = lambda x : x[:4].upper()

In [112]:
df.index.map(transform)

Index(['SEOU', 'BUSA', 'DAEJ'], dtype='object')

In [113]:
df.index = df.index.map(transform)
df

Unnamed: 0,one,two,three,four
SEOU,0,1,2,3
BUSA,4,5,6,7
DAEJ,8,9,10,11


In [115]:
df.rename(index = {'SEOU':'SEOUL'}, columns = {'three':'sam'})

Unnamed: 0,one,two,sam,four
SEOUL,0,1,2,3
BUSA,4,5,6,7
DAEJ,8,9,10,11


In [116]:
ages = [20, 22, 25, 27, 22, 23, 37, 31, 61, 45, 71, 32]
bins = [18, 25, 35, 60, 100]

In [117]:
#중요 #카테고리
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (60, 100], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [118]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 3, 1], dtype=int8)

In [119]:
cats.categories # ( : 초과, ] : 이하

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [120]:
pd.value_counts(cats)

(18, 25]     5
(25, 35]     3
(60, 100]    2
(35, 60]     2
dtype: int64

In [121]:
pd.cut(ages, [18, 26, 36, 61, 100], right = False) # right = False : [a, b)로 만듬 (a이상 b미만)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [61, 100), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [122]:
group_names = ['Youth', 'YoungAdult', 'MiddleAge', 'Senior']
cats = pd.cut(ages, bins, labels = group_names)
cats

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAge, Senior, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAge < Senior]

In [123]:
pd.value_counts(cats)

Youth         5
YoungAdult    3
Senior        2
MiddleAge     2
dtype: int64

In [124]:
a = np.random.rand(20)
a

array([0.30342877, 0.26682779, 0.62854446, 0.96807552, 0.7176056 ,
       0.92686136, 0.43705279, 0.42074702, 0.3047669 , 0.18243407,
       0.84317811, 0.0561745 , 0.70437257, 0.20111514, 0.38180067,
       0.31761769, 0.58242382, 0.10087472, 0.79510241, 0.00940067])

In [125]:
# 4등분으로 분류하겠다.
cats = pd.cut(a, 4, precision = 2)

In [126]:
pd.value_counts(cats)

(0.25, 0.49]      7
(0.0084, 0.25]    5
(0.73, 0.97]      4
(0.49, 0.73]      4
dtype: int64

In [129]:
a = np.random.randn(1000)
cats = pd.qcut(a, 4) # cut보다 좀더 정밀함
cats

[(0.646, 3.487], (-0.665, 0.0243], (0.646, 3.487], (-0.665, 0.0243], (0.646, 3.487], ..., (0.646, 3.487], (0.0243, 0.646], (0.0243, 0.646], (0.0243, 0.646], (-4.179, -0.665]]
Length: 1000
Categories (4, interval[float64]): [(-4.179, -0.665] < (-0.665, 0.0243] < (0.0243, 0.646] < (0.646, 3.487]]

In [130]:
pd.value_counts(cats)

(0.646, 3.487]      250
(0.0243, 0.646]     250
(-0.665, 0.0243]    250
(-4.179, -0.665]    250
dtype: int64

In [131]:
df = pd.DataFrame(np.random.randn(1000, 4))
df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.027131,0.038826,-0.016259,0.002402
std,1.019403,0.987612,0.985422,0.989272
min,-3.285917,-3.110784,-2.781508,-2.885875
25%,-0.689369,-0.641915,-0.659312,-0.635427
50%,-0.028775,0.073064,-0.007221,-0.007943
75%,0.721029,0.713361,0.649782,0.669739
max,3.345516,2.947308,3.034731,3.736117


In [132]:
col = df[2]

In [133]:
col[np.abs(col)>3]

909    3.034731
Name: 2, dtype: float64

In [135]:
df[(np.abs(df)>3).any(1)]

Unnamed: 0,0,1,2,3
93,0.561699,-0.520562,0.403951,3.736117
204,0.638471,-3.090057,0.162134,0.230788
205,-3.285917,1.423102,-0.594685,-1.504976
288,-0.118111,-1.177806,-0.287979,3.130316
294,3.077731,0.485639,0.646775,2.375342
517,-1.90124,-3.110784,-0.753895,-2.4662
580,1.966621,0.0936,-0.661176,3.128935
696,3.345516,2.448278,0.882817,-0.597505
909,-0.036862,0.204238,3.034731,-0.270036


In [136]:
#sign : 음수인지 양수인지 나타냄. 음수면 -1, 양수면 +1
df[np.abs(df)>3] = np.sign(df) * 3

In [137]:
df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.026993,0.039027,-0.016294,0.001407
std,1.017213,0.986983,0.985315,0.985956
min,-3.0,-3.0,-2.781508,-2.885875
25%,-0.689369,-0.641915,-0.659312,-0.635427
50%,-0.028775,0.073064,-0.007221,-0.007943
75%,0.721029,0.713361,0.649782,0.669739
max,3.0,2.947308,3.0,3.0


# 2019-07-29 월

In [6]:
df = pd.DataFrame(np.arange(5*4).reshape((5,4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [7]:
#중요 #샘플 #난수 #행_난수
sampler = np.random.permutation(5)
sampler

array([3, 0, 2, 4, 1])

In [8]:
#중요 #행섞기 #행_난수 #row의 순서를 임의로 바꿔야할 때가 있을 때.
df.take(sampler)

Unnamed: 0,0,1,2,3
3,12,13,14,15
0,0,1,2,3
2,8,9,10,11
4,16,17,18,19
1,4,5,6,7


In [9]:
df #inplace를 사용하지 않았기에 원 데이터에서는 바뀌지 않았음.

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [13]:
#중요 #임의의 행 3개만 순서 상관없이 추출
df.sample(n=3)

Unnamed: 0,0,1,2,3
4,16,17,18,19
0,0,1,2,3
3,12,13,14,15


In [14]:
s = pd.Series([5, 7, -1, 6, 4])
s

0    5
1    7
2   -1
3    6
4    4
dtype: int64

In [20]:
#중요
#10개를 임의로 생성 단, 복원추출
#원데이터 개수가보다 n이 크면, replace는 무조건 True가 되야함.
draws = s.sample(n = 10, replace = True)
draws

3    6
1    7
4    4
2   -1
2   -1
3    6
4    4
1    7
3    6
1    7
dtype: int64

In [21]:
df = pd.DataFrame({'key':['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [22]:
#중요 #dummy변수 생성 #더미변수
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [23]:
#중요 #prefix 사용시, 앞에 접두사를 붙힘.
dummies = pd.get_dummies(df['key'], prefix = 'key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [27]:
#중요 #열추가 #R_cbind
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [28]:
np.random.seed(12345)

In [29]:
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [30]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [32]:
#중요 #범위별 더미변수 생성
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


# 정규 표현식 : re
## 문자열 처리에 사용하는 표현식
## 단축된 형식으로 문자열 표현식에 사용

In [54]:
import re

In [34]:
text = "foo bar\t bax\tqux"

In [35]:
#  \s+ : 문자열을 구분할 수 있는 모든 구분자를 구분자로 활용.
re.split('\s+', text)

['foo', 'bar', 'bax', 'qux']

In [36]:
#중요, #자주 쓰는 정규 표현식을 미리 컴파일 할 수 있다.
regex = re.compile('\s+')

In [37]:
regex.split(text)

['foo', 'bar', 'bax', 'qux']

In [38]:
# 공백, 탭 공백, 탭이 구분자로 사용되었다.
regex.findall(text)

[' ', '\t ', '\t']

In [40]:
#중요
text = '''Hong Hong@korea.com
Kim Kim@korea.com
Park Park@korea.com
Lee Lee@korea.com'''
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, flags = re.IGNORECASE) #대소문자 구분 않겠다

In [41]:
#첫 행을 가져옴
m = regex.search(text)
m

<re.Match object; span=(5, 19), match='Hong@korea.com'>

In [42]:
text[ m.start():m.end() ]

'Hong@korea.com'

In [45]:
print(regex.sub('PRDACTED', text))

Hong PRDACTED
Kim PRDACTED
Park PRDACTED
Lee PRDACTED


In [58]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags = re.IGNORECASE) 

In [59]:
m = regex.match('wesm@korea.com')

In [60]:
m.groups()

('wesm', 'korea', 'com')

In [61]:
regex.findall(text)

[('Hong', 'korea', 'com'),
 ('Kim', 'korea', 'com'),
 ('Park', 'korea', 'com'),
 ('Lee', 'korea', 'com')]

In [63]:
print(regex.sub(r'Username:\1, Domain:\2, Suffix:\3', text))

Hong Username:Hong, Domain:korea, Suffix:com
Kim Username:Kim, Domain:korea, Suffix:com
Park Username:Park, Domain:korea, Suffix:com
Lee Username:Lee, Domain:korea, Suffix:com


In [64]:
data = {'Hong': 'hong@korea.com', 'Kim': 'kim@naver.com',
        'Park': 'park@gmail.com', 'Lee': 'lee@korea.com', 'lm':np.nan }
s = pd.Series(data)
s

Hong    hong@korea.com
Kim      kim@naver.com
Park    park@gmail.com
Lee      lee@korea.com
lm                 NaN
dtype: object

In [65]:
s.str.contains('korea')

Hong     True
Kim     False
Park    False
Lee      True
lm        NaN
dtype: object

In [66]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [70]:
#중요 #이메일 주소 분리
s.str.findall(pattern, flags = re.IGNORECASE)

Hong    [(hong, korea, com)]
Kim      [(kim, naver, com)]
Park    [(park, gmail, com)]
Lee      [(lee, korea, com)]
lm                       NaN
dtype: object

In [71]:
matches = s.str.match(pattern, flags = re.IGNORECASE)
matches

Hong    True
Kim     True
Park    True
Lee     True
lm       NaN
dtype: object

In [76]:
#계층적 인덱스
s = pd.Series(np.random.randn(9),
               index = [ ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                         [ 1, 2, 3, 1, 3, 1, 2, 2, 3 ] ] )
s

a  1   -0.753887
   2    0.331286
   3    1.349742
b  1    0.069877
   3    0.246674
c  1   -0.011862
   2    1.004812
d  2    1.327195
   3   -0.919262
dtype: float64

In [77]:
s['a']

1   -0.753887
2    0.331286
3    1.349742
dtype: float64

In [78]:
s.b

1    0.069877
3    0.246674
dtype: float64

In [79]:
s.d

2    1.327195
3   -0.919262
dtype: float64

In [80]:
s['b':'c']

b  1    0.069877
   3    0.246674
c  1   -0.011862
   2    1.004812
dtype: float64

In [81]:
s.loc[['b','d']]

b  1    0.069877
   3    0.246674
d  2    1.327195
   3   -0.919262
dtype: float64

In [82]:
s.loc[:,2] #각 a, b, c, d에서 '2' 인덱스만 추출

a    0.331286
c    1.004812
d    1.327195
dtype: float64

In [83]:
#중요 #stack : 쌓다
s.unstack()

Unnamed: 0,1,2,3
a,-0.753887,0.331286,1.349742
b,0.069877,,0.246674
c,-0.011862,1.004812,
d,,1.327195,-0.919262


In [84]:
s.unstack().stack()

a  1   -0.753887
   2    0.331286
   3    1.349742
b  1    0.069877
   3    0.246674
c  1   -0.011862
   2    1.004812
d  2    1.327195
   3   -0.919262
dtype: float64

In [3]:
df = pd.DataFrame(np.arange(12).reshape((4,3)),
                  index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns = [ ['Seoul', 'Seoul', 'Daejeon'],
                              ['Green', 'Red', 'Green']])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Seoul,Seoul,Daejeon
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [4]:
df.index.names = ['key1', 'key2']

In [5]:
df.columns.names = ['city', 'color']

In [6]:
df

Unnamed: 0_level_0,city,Seoul,Seoul,Daejeon
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [7]:
#중요 #중첩 인덱스의 순서를 바꿈
df.swaplevel('key1', 'key2') #키를 바꿈

Unnamed: 0_level_0,city,Seoul,Seoul,Daejeon
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [9]:
#중요 #인덱스를 sort
df.sort_index(level=1)

Unnamed: 0_level_0,city,Seoul,Seoul,Daejeon
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [10]:
#중요 #0축 키의 순서를 바꿈
df.swaplevel(0, 1).sort_index(level = 0)

Unnamed: 0_level_0,city,Seoul,Seoul,Daejeon
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [12]:
#중요 #기준에 대한 합계를 구함.
df.sum(level = 'color', axis = 1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [13]:
df = pd.DataFrame({'a':range(7), 'b':range(7, 0, -1),
                   'c':['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd':[0, 1, 2, 0, 1, 2, 3]})
df

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [14]:
#중요 #인덱스 중첩으로 설정
df2 = df.set_index(['c', 'd'])
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [17]:
#인덱스 중첩으로 설정하지만, 해당 열 drop 안함.
df.set_index(['c', 'd'], drop = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [18]:
#인덱스 원위치
df.reset_index()

Unnamed: 0,index,a,b,c,d
0,0,0,7,one,0
1,1,1,6,one,1
2,2,2,5,one,2
3,3,3,4,two,0
4,4,4,3,two,1
5,5,5,2,two,2
6,6,6,1,two,3


In [24]:
df1 = pd.DataFrame({'key':['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1':range(7)})
df2 = pd.DataFrame({'key':['a', 'b', 'c'],
                    'data2':range(3)})

In [25]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [26]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,c,2


In [28]:
#기준점을 안정해줬으므로 공동 변수로 merge.
pd.merge(df1, df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0
6,c,3,2


In [29]:
pd.merge(df1, df2, on = 'key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0
6,c,3,2


In [34]:
df3 = pd.DataFrame({'lkey':['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df4 = pd.DataFrame({'rkey':['a', 'b', 'd'],
                    'data2': range(3)})

In [35]:
df3

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [36]:
df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [37]:
#교집합 #각 키별, 공통 값만 합침 #내부조인 #join
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [38]:
#합집합 #외부조인 #join #outer
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey', how='left')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1.0
1,b,1,b,1.0
2,a,2,a,0.0
3,c,3,,
4,a,4,a,0.0
5,a,5,a,0.0
6,b,6,b,1.0


In [39]:
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey', how='right')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1
1,b,1.0,b,1
2,b,6.0,b,1
3,a,2.0,a,0
4,a,4.0,a,0
5,a,5.0,a,0
6,,,d,2


In [40]:
#inner #내부조인 #왼쪽 교집합 오른쪽 중, inner는 겹치는것만.
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey', how='inner')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [41]:
df5 = pd.DataFrame({'key':['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1':range(6)})
df6 = pd.DataFrame({'key':['a', 'b', 'a', 'b', 'd'],
                    'data2': range(5)})

In [42]:
df5

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [43]:
df6

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [45]:
# 다vs다 조인
pd.merge(df5, df6, on = 'key', how = 'left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [46]:
pd.merge(df5, df6, on = 'key', how = 'inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,5,1
5,b,5,3
6,a,2,0
7,a,2,2
8,a,4,0
9,a,4,2


In [47]:
left = pd.DataFrame({'key1':['foo', 'foo', 'bar'],
                     'key2':['one', 'two', 'one'],
                     'lval':[1, 2, 3]})
right = pd.DataFrame({'key1':['foo', 'foo', 'bar', 'bar'],
                     'key2':['one', 'one', 'one', 'two'],
                     'lval':[4, 5, 6, 7]})

In [49]:
pd.merge(left, right, on = ['key1', 'key2'], how = 'outer')

Unnamed: 0,key1,key2,lval_x,lval_y
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [51]:
#키 하나만 잡음.
pd.merge(left, right, on = 'key1')

Unnamed: 0,key1,key2_x,lval_x,key2_y,lval_y
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [52]:
#col 이름 추가
pd.merge(left, right, on = 'key1', suffixes = ('_left', '_right'))

Unnamed: 0,key1,key2_left,lval_left,key2_right,lval_right
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [54]:
left1 = pd.DataFrame({'key':['a', 'b', 'a', 'a', 'b', 'c'],
                      'value':range(6)})
right1 = pd.DataFrame({'group_val':[3.5, 7]}, index = ['a', 'b'])

In [55]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [56]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [57]:
#인덱스 조인
pd.merge(left1, right1, left_on = 'key', right_index = True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [58]:
#인덱스 외부 조인
pd.merge(left1, right1, left_on = 'key', right_index = True, how = 'outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [70]:
lefth = pd.DataFrame({'key1':['Seoul', 'Seoul', 'Seoul', 'Daejeon', 'Daejeon'],
                      'key2':[2016, 2017, 1018, 2017, 2018],
                      'data': np.arange(5.)})
righth = pd.DataFrame(np.arange(12).reshape((6, 2)),
                     index = [['Daejeon', 'Daejeon', 'Seoul', 'Seoul',
                               'Seoul', 'Seoul'],
                              [2017, 2016, 2016, 2016, 2017, 2018]],
                     columns = ['event1', 'event2'])

In [72]:
righth

Unnamed: 0,Unnamed: 1,event1,event2
Daejeon,2017,0,1
Daejeon,2016,2,3
Seoul,2016,4,5
Seoul,2016,6,7
Seoul,2017,8,9
Seoul,2018,10,11


In [73]:
pd.merge(lefth, righth, left_on = ['key1', 'key2'], right_index = True)

Unnamed: 0,key1,key2,data,event1,event2
0,Seoul,2016,0.0,4,5
0,Seoul,2016,0.0,6,7
1,Seoul,2017,1.0,8,9
3,Daejeon,2017,3.0,0,1


In [75]:
pd.merge(lefth, righth, left_on = ['key1', 'key2'],
         right_index = True, how = 'outer')

Unnamed: 0,key1,key2,data,event1,event2
0,Seoul,2016,0.0,4.0,5.0
0,Seoul,2016,0.0,6.0,7.0
1,Seoul,2017,1.0,8.0,9.0
2,Seoul,1018,2.0,,
3,Daejeon,2017,3.0,0.0,1.0
4,Daejeon,2018,4.0,,
4,Daejeon,2016,,2.0,3.0
4,Seoul,2018,,10.0,11.0


In [77]:
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index = ['a', 'c', 'e'],
                     columns = ['Seoul', 'Busan'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13., 14]],
                      index = ['b', 'c', 'd', 'e'],
                      columns = ['Daejeon', 'Inchon'])

In [78]:
left2

Unnamed: 0,Seoul,Busan
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [79]:
right2

Unnamed: 0,Daejeon,Inchon
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [81]:
pd.merge(left2, right2, how = 'outer', left_index = True, right_index = True)

Unnamed: 0,Seoul,Busan,Daejeon,Inchon
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [82]:
left2.join(right2, how = 'outer')

Unnamed: 0,Seoul,Busan,Daejeon,Inchon
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [83]:
left1.join(right1, on = 'key')

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [84]:
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                       index = ['a', 'c', 'e', 'f'],
                       columns = ['Daegu', 'Kwangju'])

In [85]:
another

Unnamed: 0,Daegu,Kwangju
a,7.0,8.0
c,9.0,10.0
e,11.0,12.0
f,16.0,17.0


In [87]:
left2.join([right2, another]) # 이렇게 warning이 뜨면 이것보단 concat이 더 적절하다.

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  res = concat(frames, axis=1, join="outer", verify_integrity=True)


Unnamed: 0,Seoul,Busan,Daejeon,Inchon,Daegu,Kwangju
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0


In [88]:
#여기까지 #결합
left2.join([right2, another], how = 'outer')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return concat(frames, axis=1, join=how, verify_integrity=True)


Unnamed: 0,Seoul,Busan,Daejeon,Inchon,Daegu,Kwangju
a,1.0,2.0,,,7.0,8.0
b,,,7.0,8.0,,
c,3.0,4.0,9.0,10.0,9.0,10.0
d,,,11.0,12.0,,
e,5.0,6.0,13.0,14.0,11.0,12.0
f,,,,,16.0,17.0


In [89]:
#병합
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [90]:
np.concatenate([arr, arr], axis = 1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [91]:
s1 = pd.Series([0, 1], index = ['a', 'b'])
s2 = pd.Series([2, 3, 4], index = ['c', 'd', 'e'])
s3 = pd.Series([5, 6], index = ['f', 'g'])

In [92]:
s1

a    0
b    1
dtype: int64

In [93]:
s2

c    2
d    3
e    4
dtype: int64

In [94]:
s3

f    5
g    6
dtype: int64

In [95]:
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [96]:
pd.concat([s1, s2, s3], axis=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [97]:
s4 = pd.concat([s1, s3])
s4

a    0
b    1
f    5
g    6
dtype: int64

In [99]:
pd.concat([s1, s4], axis=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1
a,0.0,0
b,1.0,1
f,,5
g,,6


In [100]:
pd.concat([s1, s4], axis = 1, join = 'inner')

Unnamed: 0,0,1
a,0,0
b,1,1


In [101]:
pd.concat([s1, s4], axis = 1, join_axes = [['a', 'c', 'b', 'e']])

  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1
a,0.0,0.0
c,,
b,1.0,1.0
e,,


In [102]:
#어떤게 어디서 온 지 모르니까, 구분 지어줌.
result = pd.concat([s1, s1, s3], keys = ['one', 'two', 'three'])
result

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

In [103]:
result.unstack()

Unnamed: 0,a,b,f,g
one,0.0,1.0,,
two,0.0,1.0,,
three,,,5.0,6.0


In [104]:
pd.concat([s1, s2, s3], axis = 1, keys = ['one', 'two', 'three'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [106]:
df1 = pd. DataFrame(np.arange(6).reshape(3, 2), index = ['a', 'b', 'c'],
                    columns = ['one', 'two'])
df2 = pd. DataFrame(5 + np.arange(4).reshape(2, 2), index = ['a', 'c'],
                    columns = ['three', 'four'])

In [107]:
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [108]:
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [109]:
pd.concat([df1, df2], axis = 1, keys = ['level1', 'level2'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [110]:
pd.concat([df1, df2], axis = 1, keys = ['level1', 'level2'],
          names = ['upper', 'lower'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [111]:
df1 = pd.DataFrame(np.random.randn(3, 4), columns = ['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns = ['b', 'd', 'a'])

In [112]:
df1

Unnamed: 0,a,b,c,d
0,0.805165,-0.293435,-0.284272,-2.156493
1,0.147058,-0.599293,0.914859,1.885335
2,-0.444283,-0.641434,0.123965,-0.264916


In [113]:
df2

Unnamed: 0,b,d,a
0,-2.034326,-0.420182,0.230164
1,1.63671,-0.127891,0.268855


In [114]:
pd.concat([df1, df2], ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b,c,d
0,0.805165,-0.293435,-0.284272,-2.156493
1,0.147058,-0.599293,0.914859,1.885335
2,-0.444283,-0.641434,0.123965,-0.264916
3,0.230164,-2.034326,,-0.420182
4,0.268855,1.63671,,-0.127891


In [148]:
s1 = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
               index = ['f', 'e', 'd', 'c', 'b', 'a'])
s2 = pd.Series(np.arange(len(s1), dtype = np.float64),
               index = ['f', 'e', 'd', 'c', 'b', 'a'])

In [149]:
s1

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [150]:
s2

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

In [151]:
s2[-1] = np.nan

In [152]:
s2

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64

In [153]:
#널이 아닌경우 s1을 써라
np.where(pd.isnull(s1), s2, s1)

array([0. , 2.5, 2. , 3.5, 4.5, nan])

In [154]:
s2[:-2].combine_first(s1[2:])

a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

In [155]:
df1 = pd.DataFrame({'a':[1., np.nan, 5., np.nan],
                    'b':[np.nan, 2., np.nan, 6.],
                    'c':range(2, 18, 4)})
df2 = pd.DataFrame({'a':[5., 4., np.nan, 3., 7.],
                    'b':[np.nan, 3., 4., 6., 8.]})

In [136]:
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [137]:
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


In [138]:
#중요 #combine #df1을 기준으로 하되, NaN 값 있을 시, df2값으로 대체.
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


# Index 재형상
## stack : column -> row
## unstack : row -> column

# Pivet 연산
## 원하는 행과 열을 이용하여 DF 생성하는 연산

In [139]:
df = pd.DataFrame(np.arange(6).reshape((2, 3)),
                  index = pd.Index(['Seoul', 'Daejeon'], name = 'city'),
                  columns = pd.Index(['one', 'two', 'three'], name = 'number'))

In [140]:
df

number,one,two,three
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Seoul,0,1,2
Daejeon,3,4,5


In [141]:
result = df.stack()
result

city     number
Seoul    one       0
         two       1
         three     2
Daejeon  one       3
         two       4
         three     5
dtype: int32

In [142]:
result.unstack()

number,one,two,three
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Seoul,0,1,2
Daejeon,3,4,5


In [143]:
result.unstack(0)

city,Seoul,Daejeon
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [145]:
s1 = pd.Series([0, 1, 2, 3], index = ['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index = ['c', 'd', 'e'])
df = pd.concat([s1, s2], keys = ['one', 'two'])