# Pandas 함수

## JSON
> 웹 형식 문서 표현 방법 중 하나. 가볍고 속도가 빠름. 딕셔너리 구조로 작성된 문서

In [4]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

In [98]:
obj = """
{
    "name": "Wes",
    "places_lived": ["United States", "Spain", "Germany"],
    "pet": null,
    "siblings": [{"name": "Kim", "age": 25, "pets": ["ba", "ka"]},
                 {"name": "Lee", "age": 22, "pets": ["aa", "bb", "cc"]}]
}
"""

print(obj) # obj 문자열(json 형식 문서)


{
    "name": "Wes",
    "places_lived": ["United States", "Spain", "Germany"],
    "pet": null,
    "siblings": [{"name": "Kim", "age": 25, "pets": ["ba", "ka"]},
                 {"name": "Lee", "age": 22, "pets": ["aa", "bb", "cc"]}]
}



**JSON 문자열 파이썬 형태로 변환**

In [22]:
res = json.loads(obj)
res

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Kim', 'age': 25, 'pets': ['ba', 'ka']},
  {'name': 'Lee', 'age': 22, 'pets': ['aa', 'bb', 'cc']}]}

**파이썬 형태로 읽어진 객체를 JSON형식으로 변환**

In [23]:
asjson=json.dumps(res)
asjson

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Kim", "age": 25, "pets": ["ba", "ka"]}, {"name": "Lee", "age": 22, "pets": ["aa", "bb", "cc"]}]}'

**JSON을 데이터프레임으로**

In [10]:
pd.DataFrame(res['siblings'])

Unnamed: 0,name,age,pets
0,Kim,25,"[ba, ka]"
1,Lee,22,"[aa, bb, cc]"


In [11]:
df = pd.DataFrame(res['siblings'], columns=['name', 'age'])
df

Unnamed: 0,name,age
0,Kim,25
1,Lee,22


**JSON문서 형식으로 변환**

In [12]:
df.to_json()

'{"name":{"0":"Kim","1":"Lee"},"age":{"0":25,"1":22}}'

**JSON문서로 저장**

In [16]:
df.to_json("myjson.json") 

**JSON문서 불러오기**

In [17]:
pd.read_json("myjson.json")

Unnamed: 0,name,age
0,Kim,25
1,Lee,22


## 데이터 정제

### NaN 처리 관련 메서드
* `dropna`: 누락된 데이터가 있는 축(행, 열)을 제외
* `fillna`, `ffill`, `bfill`: 누락 데이터를 대신한 값으로 채움
* `isnull`: 누락 데이터를 추출
* `notnull`: `isnull`에 반대되는 데이터를 추출

In [25]:
from numpy import nan as NA

data = pd.Series([1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [26]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [27]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

### 축을 기준으로 `dropna`
* `axis = 0`: row-wise
* `axis = 1`: column-wise
* `how = 'all'`: 행이나 열이 모두 NaN인 경우만 제외

In [42]:
data = pd.DataFrame([[1, 6, 3],
                     [1, NA, NA],
                     [NA, NA, NA],
                     [NA, 5, 2]])
data

Unnamed: 0,0,1,2
0,1.0,6.0,3.0
1,1.0,,
2,,,
3,,5.0,2.0


In [43]:
data.dropna(axis=0)

Unnamed: 0,0,1,2
0,1.0,6.0,3.0


In [44]:
data.dropna(axis=1)

0
1
2
3


In [45]:
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.0,3.0
1,1.0,,
3,,5.0,2.0


### 중복 제거
* `duplicated`: 불린형태로 중복여부 파악
* `drop_duplcates`: duplicated 함수 결과가 False인 데이터프레임을 리턴

In [58]:
data = pd.DataFrame({'a': ['one', 'two'] * 3 + ['two'],
                     'b': [1, 1, 2, 3, 3, 4, 4],
                     'v1': [0, 1, 2, 3, 4, 5, 6]})
data

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [59]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
dtype: bool

**전체 열에 대한 중복겂을 제외**

In [60]:
data.drop_duplicates()

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


**특정 열에 대한 중복겂을 제외**

In [62]:
data.drop_duplicates(['a', 'b'], keep='last')

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


## 데이터 범주화
* `pd.cut()`

In [95]:
ages = [20, 25, 28, 30, 20, 22, 37, 61, 44, 46, 33, 111]
bins = [0, 10, 20, 30, 40, 60, 100]
res = pd.cut(ages, bins)

**구간에 포함되지 않는 값은 NaN**

In [76]:
res

[(10, 20], (10, 20], (20, 30], (20, 30], (20, 30], ..., (60.0, 100.0], (40.0, 60.0], (40.0, 60.0], (30.0, 40.0], NaN]
Length: 13
Categories (6, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] < (40, 60] < (60, 100]]

**구간마다 포함되는 데이터 값을 수치화**

In [77]:
res.codes

array([ 1,  1,  2,  2,  2,  1,  2,  3,  5,  4,  4,  3, -1], dtype=int8)

In [81]:
res.value_counts()

(0, 10]      0
(10, 20]     3
(20, 30]     4
(30, 40]     2
(40, 60]     2
(60, 100]    1
dtype: int64

**범주 메서드**

In [78]:
res.categories

IntervalIndex([(0, 10], (10, 20], (20, 30], (30, 40], (40, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

**개구간 폐구간 설정**
* `right = True` : (], default
* `right = False` : [)

In [87]:
pd.cut(ages, [15, 26, 36, 61, 100], labels=gn, right=False) # 구간 : 이상 ~ 미만

[youth, youth, youth, youngyouth, youngyouth, ..., senior, middleaged, middleaged, youngyouth, NaN]
Length: 13
Categories (4, object): [youth < youngyouth < middleaged < senior]

**범주에 label 지정**

In [90]:
gn = ['youth', 'youngyouth', 'middleaged', 'senior']
pd.cut(ages, [15, 26, 36, 61, 100], labels=gn)

[NaN, youth, youth, youngyouth, youngyouth, ..., middleaged, middleaged, middleaged, youngyouth, NaN]
Length: 13
Categories (4, object): [youth < youngyouth < middleaged < senior]

**동일 개수로 나누어서 범주 만들기**

In [96]:
res = pd.qcut(ages, 4)
res.value_counts()

(19.999, 24.25]    3
(24.25, 31.5]      3
(31.5, 44.5]       3
(44.5, 111.0]      3
dtype: int64

## 그룹별 집계
* `groupby()`

In [31]:
abalone=pd.read_csv("abalone.txt", sep=",", header=None, names=["sex", 'length', "diameter",
                                                                "height", "whole_weight",
                                                                "shucked_weight", "viscera_weight",
                                                                "shell_weight","rings"])

abalone.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [32]:
(abalone.isnull()).sum()

sex               0
length            0
diameter          0
height            0
whole_weight      0
shucked_weight    0
viscera_weight    0
shell_weight      0
rings             0
dtype: int64

**전복 성별 그룹별 전체 무게 변수에 대해 집계**

In [33]:
grouped = abalone['whole_weight'].groupby(abalone['sex'])

In [34]:
#abalone의 'sex' 그룹화 -> 각 그룹별 whole_weight 컬럼값의 size()호출 결과
grouped.size()
#abalone의 'sex' 그룹화 -> 각 그룹별 whole_weight 컬럼값의 sum()호출 결과
grouped.sum()
#abalone의 'sex' 그룹화 -> 각 그룹별 whole_weight 컬럼값의 mean()호출 결과
grouped.mean()

sex
F    1.046532
I    0.431363
M    0.991459
Name: whole_weight, dtype: float64

In [35]:
abalone.groupby(abalone['sex']).sum()
abalone.groupby(abalone['sex']).mean()

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,0.579093,0.454732,0.158011,1.046532,0.446188,0.230689,0.30201,11.129304
I,0.427746,0.326494,0.107996,0.431363,0.191035,0.09201,0.128182,7.890462
M,0.561391,0.439287,0.151381,0.991459,0.432946,0.215545,0.281969,10.705497


In [36]:
abalone.groupby('sex').sum()
abalone.groupby('sex').mean()

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,0.579093,0.454732,0.158011,1.046532,0.446188,0.230689,0.30201,11.129304
I,0.427746,0.326494,0.107996,0.431363,0.191035,0.09201,0.128182,7.890462
M,0.561391,0.439287,0.151381,0.991459,0.432946,0.215545,0.281969,10.705497


In [37]:
abalone.length
#범주형(2가지) 변수(length_med) 추가: length값이 length열의 중앙값보다 크면 
#length_long, 그렇지 않으면 length_short

# length     length_med
# 0.455     length_short
# 0.350     length_short
# 0.95      length_long
# ...

#np.where(조건, 참, 거짓)

#abalone['length_med'] = (abalone.length>abalone.length.median()).map({True:'length_long',False:'length_short'})

# abalone["length_med"] = np.where(abalone["length"] > abalone["length"].median(), "length_long", "length_short")
# abalone[["length", "length_med"]]

#abalone['Length_label']=pd.cut(abalone.Length, [0,abalone.Length.median(),1], labels=['length_short', 'length_long'])


abalone["length_med"] = np.where(abalone["length"] > abalone["length"].median(), 
                                 "length_long", "length_short")
abalone[["length", "length_med"]]


Unnamed: 0,length,length_med
0,0.455,length_short
1,0.350,length_short
2,0.530,length_short
3,0.440,length_short
4,0.330,length_short
...,...,...
4172,0.565,length_long
4173,0.590,length_long
4174,0.600,length_long
4175,0.625,length_long


In [38]:
#abalone['length_med']

mean_weight=abalone['whole_weight'].groupby([abalone['sex'],abalone['length_med']]).mean()
mean_weight

sex  length_med  
F    length_long     1.261330
     length_short    0.589702
I    length_long     0.923215
     length_short    0.351234
M    length_long     1.255182
     length_short    0.538157
Name: whole_weight, dtype: float64

In [39]:
mean_weight.unstack()

length_med,length_long,length_short
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,1.26133,0.589702
I,0.923215,0.351234
M,1.255182,0.538157


In [40]:
#그룹별로 특정 작업을 반복

In [41]:
#abalone 성별로 그룹화 -> for loop -> 그룹별 데이터셋을 출력

abalone[['sex','length_med', 'whole_weight', 'rings']]

Unnamed: 0,sex,length_med,whole_weight,rings
0,M,length_short,0.5140,15
1,M,length_short,0.2255,7
2,F,length_short,0.6770,9
3,M,length_short,0.5160,10
4,I,length_short,0.2050,7
...,...,...,...,...
4172,F,length_long,0.8870,11
4173,M,length_long,0.9660,10
4174,M,length_long,1.1760,9
4175,F,length_long,1.0945,10


In [42]:
abalone[['sex','length_med', 'whole_weight', 'rings']].groupby('sex')
#그룹화 객체를 for문으로 반복하면, 그룹 이름(M, F, I)과 그룹별 데이터를 리턴할 수 있음

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001CCD55BDAF0>

In [43]:
for sex, group_data in abalone[['sex','length_med', 'whole_weight', 'rings']].groupby('sex'):
    print(sex), print(group_data[:5])

F
   sex    length_med  whole_weight  rings
2    F  length_short        0.6770      9
6    F  length_short        0.7775     20
7    F  length_short        0.7680     16
9    F   length_long        0.8945     19
10   F  length_short        0.6065     14
I
   sex    length_med  whole_weight  rings
4    I  length_short        0.2050      7
5    I  length_short        0.3515      8
16   I  length_short        0.2905      7
21   I  length_short        0.2255     10
42   I  length_short        0.0700      5
M
   sex    length_med  whole_weight  rings
0    M  length_short        0.5140     15
1    M  length_short        0.2255      7
3    M  length_short        0.5160     10
8    M  length_short        0.5095      9
11   M  length_short        0.4060     10


In [44]:
#그룹(sex, length_med 조합)별 데이터셋 출력

for (sex,length_med), group_data in abalone[['sex','length_med', 'whole_weight', 'rings']].groupby(['sex', 'length_med']):
    print(sex, length_med), print(group_data[:5])

F length_long
   sex   length_med  whole_weight  rings
9    F  length_long        0.8945     19
22   F  length_long        0.9395     12
23   F  length_long        0.7635      9
24   F  length_long        1.1615     10
25   F  length_long        0.9285     11
F length_short
   sex    length_med  whole_weight  rings
2    F  length_short        0.6770      9
6    F  length_short        0.7775     20
7    F  length_short        0.7680     16
10   F  length_short        0.6065     14
13   F  length_short        0.6845     10
I length_long
    sex   length_med  whole_weight  rings
509   I  length_long        0.8735     16
510   I  length_long        1.1095     10
549   I  length_long        0.8750     11
550   I  length_long        1.1625     17
551   I  length_long        0.9885     13
I length_short
   sex    length_med  whole_weight  rings
4    I  length_short        0.2050      7
5    I  length_short        0.3515      8
16   I  length_short        0.2905      7
21   I  length_short    

In [45]:
#{키:값, 키:값, 키:값}
#{'F':F그룹 데이터셋, 'M':M그룹 데이터셋, 'I':I그룹 데이터셋}
list(abalone[:10][['sex','length_med', 'whole_weight', 'rings']].groupby('sex'))

#성별로 그룹화 -> 성별 그룹을 key로 설정 -> 데이터셋은 value로 설정
aba_group=dict(list(abalone[:10][['sex','length_med', 'whole_weight', 'rings']].groupby('sex')))

In [46]:
aba_group

#그룹이름을 가지고 데이터셋을 인덱싱
aba_group['M']

Unnamed: 0,sex,length_med,whole_weight,rings
0,M,length_short,0.514,15
1,M,length_short,0.2255,7
3,M,length_short,0.516,10
8,M,length_short,0.5095,9


In [47]:
#abalone의 상위 10개 데이터에 대해 'sex'이 'M'인 자료 추출(불린참조)
#abalone의 상위 10개 데이터에 대해 'sex'이 'M'인 자료 추출(불린참조)
abalone[:10][abalone['sex']=='M'][['sex','length_med', 'whole_weight', 'rings']]

  abalone[:10][abalone['sex']=='M'][['sex','length_med', 'whole_weight', 'rings']]


Unnamed: 0,sex,length_med,whole_weight,rings
0,M,length_short,0.514,15
1,M,length_short,0.2255,7
3,M,length_short,0.516,10
8,M,length_short,0.5095,9


In [48]:
#특정 문자열을 매핑 규칙에 따른 변환 -> dict.get()
# ex) Lee, lee, LEE => lee
# ex) Choi, choi, Cho, CHO,... -> others

df=pd.DataFrame({'name':['kim','KIM','Kim','lee', 'LEE','Lee', 'cho','choi'],
             'value1':[1,2,3,4,5,6,7,8],
             'value2':[100,200,300,100,200,100,300,500]})
df

Unnamed: 0,name,value1,value2
0,kim,1,100
1,KIM,2,200
2,Kim,3,300
3,lee,4,100
4,LEE,5,200
5,Lee,6,100
6,cho,7,300
7,choi,8,500


In [49]:
nameMapping={
    "KIM":"kim",
    "Kim":"kim",
    "LEE":"lee",
    "Lee":"lee",
    "cho":"others",
    "choi":"others"    
}

In [50]:
#np.char.lower("LEE")

#매핑 규칙에 정의되지 않은 키가 전달되면 None이 리턴
#func=lambda x:nameMapping.get(x)

#매핑 규칙에 정의되지 않은 키(kim, lee)가 전달되면 그 값 그대로(kim, lee) 리턴
#func=lambda x:nameMapping.get(x, "etc") 

func=lambda x:nameMapping.get(x, x) 

#get

In [51]:
df['name2']=df.name.map(func)

In [52]:
df
#name2컬럼값을 그룹화 -> 그룹별 합계
df.groupby('name2').sum()

Unnamed: 0_level_0,value1,value2
name2,Unnamed: 1_level_1,Unnamed: 2_level_1
kim,6,600
lee,15,400
others,15,800


In [53]:
df.groupby(['name2', 'name']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
name2,name,Unnamed: 2_level_1,Unnamed: 3_level_1
kim,KIM,2,200
kim,Kim,3,300
kim,kim,1,100
lee,LEE,5,200
lee,Lee,6,100
lee,lee,4,100
others,cho,7,300
others,choi,8,500


In [54]:
df=pd.DataFrame({'id':[1,2,10,20,100,200],
             'name':['aa','aa2','aa3','aa4','aa5','aa6']})
df

Unnamed: 0,id,name
0,1,aa
1,2,aa2
2,10,aa3
3,20,aa4
4,100,aa5
5,200,aa6


In [55]:
df['id2']=df['id'].apply(lambda x: "{:0<5d}".format(x))
#df.info()

#df['id2'] = df['id'].astype(str).apply(lambda x: x.zfill(5))

In [56]:
# df

In [210]:
#abalone데이터셋
abalone=pd.read_csv("abalone.txt", sep=",", header=None, names=["sex", 'length', "diameter",
                                                       "height", "whole_weight",
                                                       "shucked_weight", "viscera_weight",
                                                       "shell_weight","rings"])
abalone

#abalone #4177건
#7:3 비율로 train set / test set 데이터를 분할(random state = 20201005)
#train set -> 모델 -> test set -> 나이를 예측(rings) =>  rmsle적용 => 점수 출력
#random forest regressor 이용

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [None]:
#min max scaling