## 데이터 핸들링(2)

> pandas 라이브러리를 활용한 데이터 핸들링에 이어지는 학습이다.

---
### 데이터 복제 및 피처 삭제
* copy(): 매번 데이터를 불러올 필요 없이 한 번 불러온 데이터를 복제하는 방법이다.
* drop(): 필요 없는 데이터를 데이터프레임에서 제거하는 방법이다.
    * drop(columns=['피처 명']): 하나의 피처와 그에 해당하는 데이터를 삭제한다.
    > drop('피처명', axis=1)과 같은 방법으로도 삭제가 가능하다. axis=1은 열 데이터를 의미하며, 행 데이터는 axis=0이다.
    * drop(columns=['피처 명', '피처 명']): 두 개 이상의 피처와 그에 해당하는 데이터를 삭제한다.
    * drop(행 인덱스, axis=0): 행 인덱스로 행 한 줄을 삭제한다.
    > 인자 값으로 inplace를 True로 설정하면 데이터프레임에 바로 적용된다. (inplace의 default값은 False이다.)  
    조건이나 인덱싱 등의 방법으로 복수 개의 행을 삭제할 수 있다.

In [None]:
import pandas as pd
import numpy as  np

# New_Seonjae_Chicken.csv 파일 불러오기
df = pd.read_csv('csv_dir/New_Seonjae_Chicken.csv')
df

In [163]:
# 데이터프레임 복제
cp_df = df.copy()
cp_df

Unnamed: 0.1,Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,0,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic
1,1,Hot Seasoning,13000,1050g,1400.0,0.2,10400.0,
2,2,Soy Sauce,14000,950g,1600.0,0.2,11200.0,
3,3,Galic Sauce,14000,950g,1800.0,0.2,11200.0,
4,4,Leek,14000,1150g,1300.0,0.2,11200.0,Brazil
5,5,Bonelessness,15000,1250g,1500.0,0.2,12000.0,Brazil
6,6,HaH,13000,1050g,1300.0,0.2,10400.0,
7,new,[popular]bbq,16000,1150g,1200.0,0.5,8000.0,domestic
8,10,[best]oven-baked,9900,1050g,,,,


In [164]:
# Drop the 'Unnamed: 0' column and keep the result (drop returns a new frame).
# Keyword form of the same call:
#   cp_df = cp_df.drop(columns=['Unnamed: 0'])
# Several columns can be dropped at once:
#   cp_df = cp_df.drop(columns=['Unnamed: 0', 'origin'])
cp_df = cp_df.drop('Unnamed: 0', axis=1)

In [165]:
df = cp_df.copy()
df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic
1,Hot Seasoning,13000,1050g,1400.0,0.2,10400.0,
2,Soy Sauce,14000,950g,1600.0,0.2,11200.0,
3,Galic Sauce,14000,950g,1800.0,0.2,11200.0,
4,Leek,14000,1150g,1300.0,0.2,11200.0,Brazil
5,Bonelessness,15000,1250g,1500.0,0.2,12000.0,Brazil
6,HaH,13000,1050g,1300.0,0.2,10400.0,
7,[popular]bbq,16000,1150g,1200.0,0.5,8000.0,domestic
8,[best]oven-baked,9900,1050g,,,,


In [166]:
# Remove the single row whose index label is 4; drop returns a new frame,
# so the result is assigned back to cp_df.
cp_df = cp_df.drop(index=4)
cp_df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic
1,Hot Seasoning,13000,1050g,1400.0,0.2,10400.0,
2,Soy Sauce,14000,950g,1600.0,0.2,11200.0,
3,Galic Sauce,14000,950g,1800.0,0.2,11200.0,
5,Bonelessness,15000,1250g,1500.0,0.2,12000.0,Brazil
6,HaH,13000,1050g,1300.0,0.2,10400.0,
7,[popular]bbq,16000,1150g,1200.0,0.5,8000.0,domestic
8,[best]oven-baked,9900,1050g,,,,


In [167]:
# Remove the row labelled 6 directly on cp_df (inplace=True, so no reassignment).
cp_df.drop(index=6, inplace=True)
cp_df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic
1,Hot Seasoning,13000,1050g,1400.0,0.2,10400.0,
2,Soy Sauce,14000,950g,1600.0,0.2,11200.0,
3,Galic Sauce,14000,950g,1800.0,0.2,11200.0,
5,Bonelessness,15000,1250g,1500.0,0.2,12000.0,Brazil
7,[popular]bbq,16000,1150g,1200.0,0.5,8000.0,domestic
8,[best]oven-baked,9900,1050g,,,,


In [168]:
# Boolean mask: True for rows whose price is 14000 or more.
cond = cp_df['price'] >= 14000
# Index labels of the rows selected by the mask.
cp_df.loc[cond].index

Int64Index([2, 3, 5, 7], dtype='int64')

In [169]:
# Drop every row selected by `cond`, modifying cp_df in place.
expensive_rows = cp_df.loc[cond].index
cp_df.drop(index=expensive_rows, inplace=True)
cp_df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic
1,Hot Seasoning,13000,1050g,1400.0,0.2,10400.0,
8,[best]oven-baked,9900,1050g,,,,


In [170]:
cp_df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic
1,Hot Seasoning,13000,1050g,1400.0,0.2,10400.0,
8,[best]oven-baked,9900,1050g,,,,


In [171]:
data = df.copy()
data

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic
1,Hot Seasoning,13000,1050g,1400.0,0.2,10400.0,
2,Soy Sauce,14000,950g,1600.0,0.2,11200.0,
3,Galic Sauce,14000,950g,1800.0,0.2,11200.0,
4,Leek,14000,1150g,1300.0,0.2,11200.0,Brazil
5,Bonelessness,15000,1250g,1500.0,0.2,12000.0,Brazil
6,HaH,13000,1050g,1300.0,0.2,10400.0,
7,[popular]bbq,16000,1150g,1200.0,0.5,8000.0,domestic
8,[best]oven-baked,9900,1050g,,,,


---

### 결측치 처리
* 정상적인 학습을 위해 데이터 내에 있는 결측치(비어 있는 값)를 핸들링하는 과정이다.
* 결측치 처리 관련 메서드로는 dropna(), isnull(), fillna() 등이 있다.
    * dropna(): 결측치가 들어 있는 데이터의 행 혹은 열을 제거한다. 
    > axis값에 0을 주면 행을, 1을 주면 열을 제거한다. 기본 값은 0이다.
    * isnull(): 전체 데이터 중 결측치 여부를 Boolean 형식으로 조회가 가능하다.
    > sum() 함수와 함께 사용하면 피처 별 결측 데이터 개수를 확인할 수 있다.
    * fillna(): 결측 데이터를 매개변수에 입력하는 값으로 채울 수 있다.

In [172]:
# 결측치가 있는 데이터의 행 제거
# data.dropna(axis=0)
data.dropna()

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic
4,Leek,14000,1150g,1300.0,0.2,11200.0,Brazil
5,Bonelessness,15000,1250g,1500.0,0.2,12000.0,Brazil
7,[popular]bbq,16000,1150g,1200.0,0.5,8000.0,domestic


In [173]:
# 결측치가 있는 데이터의 열 제거
data.dropna(axis=1)

Unnamed: 0,menu,price,size
0,Gold Fried,12000,1050g
1,Hot Seasoning,13000,1050g
2,Soy Sauce,14000,950g
3,Galic Sauce,14000,950g
4,Leek,14000,1150g
5,Bonelessness,15000,1250g
6,HaH,13000,1050g
7,[popular]bbq,16000,1150g
8,[best]oven-baked,9900,1050g


In [174]:
# 전체 데이터의 결측 여부 조회
df.isnull()

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,True
2,False,False,False,False,False,False,True
3,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False
6,False,False,False,False,False,False,True
7,False,False,False,False,False,False,False
8,False,False,False,True,True,True,True


In [175]:
# 각 피처 별 결측 데이터 개수 조회
df.isnull().sum()

menu        0
price       0
size        0
kcal        1
dc rate     1
dc price    1
origin      5
dtype: int64

In [176]:
# 원산지 피처의 결측 데이터를 unknown으로 변경
df['origin'].fillna('unknown')

0    domestic
1     unknown
2     unknown
3     unknown
4      Brazil
5      Brazil
6     unknown
7    domestic
8     unknown
Name: origin, dtype: object

In [177]:
# Fill the missing origin values with 'domestic' and store the result back.
# fillna returns a new Series, so it must be reassigned to take effect.
# The fill value matches the displayed frame, where every filled row reads
# 'domestic'.
df['origin'] = df['origin'].fillna('domestic')

In [178]:
df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic
1,Hot Seasoning,13000,1050g,1400.0,0.2,10400.0,domestic
2,Soy Sauce,14000,950g,1600.0,0.2,11200.0,domestic
3,Galic Sauce,14000,950g,1800.0,0.2,11200.0,domestic
4,Leek,14000,1150g,1300.0,0.2,11200.0,Brazil
5,Bonelessness,15000,1250g,1500.0,0.2,12000.0,Brazil
6,HaH,13000,1050g,1300.0,0.2,10400.0,domestic
7,[popular]bbq,16000,1150g,1200.0,0.5,8000.0,domestic
8,[best]oven-baked,9900,1050g,,,,domestic


In [179]:
# 열량 피처의 결측 데이터를 열량 데이터의 최솟값으로 변경
df['kcal'].fillna(df['kcal'].min())

0    1000.0
1    1400.0
2    1600.0
3    1800.0
4    1300.0
5    1500.0
6    1300.0
7    1200.0
8    1000.0
Name: kcal, dtype: float64

In [180]:
df['kcal'] = df['kcal'].fillna(df['kcal'].min())
df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic
1,Hot Seasoning,13000,1050g,1400.0,0.2,10400.0,domestic
2,Soy Sauce,14000,950g,1600.0,0.2,11200.0,domestic
3,Galic Sauce,14000,950g,1800.0,0.2,11200.0,domestic
4,Leek,14000,1150g,1300.0,0.2,11200.0,Brazil
5,Bonelessness,15000,1250g,1500.0,0.2,12000.0,Brazil
6,HaH,13000,1050g,1300.0,0.2,10400.0,domestic
7,[popular]bbq,16000,1150g,1200.0,0.5,8000.0,domestic
8,[best]oven-baked,9900,1050g,1000.0,,,domestic


---

### 데이터 정렬


In [181]:
df.sort_index(ascending=False)

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
8,[best]oven-baked,9900,1050g,1000.0,,,domestic
7,[popular]bbq,16000,1150g,1200.0,0.5,8000.0,domestic
6,HaH,13000,1050g,1300.0,0.2,10400.0,domestic
5,Bonelessness,15000,1250g,1500.0,0.2,12000.0,Brazil
4,Leek,14000,1150g,1300.0,0.2,11200.0,Brazil
3,Galic Sauce,14000,950g,1800.0,0.2,11200.0,domestic
2,Soy Sauce,14000,950g,1600.0,0.2,11200.0,domestic
1,Hot Seasoning,13000,1050g,1400.0,0.2,10400.0,domestic
0,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic


In [182]:
df.sort_values('price')

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
8,[best]oven-baked,9900,1050g,1000.0,,,domestic
0,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic
1,Hot Seasoning,13000,1050g,1400.0,0.2,10400.0,domestic
6,HaH,13000,1050g,1300.0,0.2,10400.0,domestic
2,Soy Sauce,14000,950g,1600.0,0.2,11200.0,domestic
3,Galic Sauce,14000,950g,1800.0,0.2,11200.0,domestic
4,Leek,14000,1150g,1300.0,0.2,11200.0,Brazil
5,Bonelessness,15000,1250g,1500.0,0.2,12000.0,Brazil
7,[popular]bbq,16000,1150g,1200.0,0.5,8000.0,domestic


In [183]:
df.sort_values('price', ascending=False)

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
7,[popular]bbq,16000,1150g,1200.0,0.5,8000.0,domestic
5,Bonelessness,15000,1250g,1500.0,0.2,12000.0,Brazil
2,Soy Sauce,14000,950g,1600.0,0.2,11200.0,domestic
3,Galic Sauce,14000,950g,1800.0,0.2,11200.0,domestic
4,Leek,14000,1150g,1300.0,0.2,11200.0,Brazil
1,Hot Seasoning,13000,1050g,1400.0,0.2,10400.0,domestic
6,HaH,13000,1050g,1300.0,0.2,10400.0,domestic
0,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic
8,[best]oven-baked,9900,1050g,1000.0,,,domestic


In [184]:
# Sort by price from highest to lowest; rows with equal price are ordered
# by origin in ascending order.
sort_keys = ['price', 'origin']
df = df.sort_values(by=sort_keys, ascending=[False, True])
df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
7,[popular]bbq,16000,1150g,1200.0,0.5,8000.0,domestic
5,Bonelessness,15000,1250g,1500.0,0.2,12000.0,Brazil
4,Leek,14000,1150g,1300.0,0.2,11200.0,Brazil
2,Soy Sauce,14000,950g,1600.0,0.2,11200.0,domestic
3,Galic Sauce,14000,950g,1800.0,0.2,11200.0,domestic
1,Hot Seasoning,13000,1050g,1400.0,0.2,10400.0,domestic
6,HaH,13000,1050g,1300.0,0.2,10400.0,domestic
0,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic
8,[best]oven-baked,9900,1050g,1000.0,,,domestic


In [185]:
# Renumber the rows from 0 after sorting; drop=True discards the old index
# instead of inserting it as a new column.
df = df.reset_index(drop=True)
df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,[popular]bbq,16000,1150g,1200.0,0.5,8000.0,domestic
1,Bonelessness,15000,1250g,1500.0,0.2,12000.0,Brazil
2,Leek,14000,1150g,1300.0,0.2,11200.0,Brazil
3,Soy Sauce,14000,950g,1600.0,0.2,11200.0,domestic
4,Galic Sauce,14000,950g,1800.0,0.2,11200.0,domestic
5,Hot Seasoning,13000,1050g,1400.0,0.2,10400.0,domestic
6,HaH,13000,1050g,1300.0,0.2,10400.0,domestic
7,Gold Fried,12000,1050g,1000.0,0.2,9600.0,domestic
8,[best]oven-baked,9900,1050g,1000.0,,,domestic


In [186]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   menu      9 non-null      object 
 1   price     9 non-null      int64  
 2   size      9 non-null      object 
 3   kcal      9 non-null      float64
 4   dc rate   8 non-null      float64
 5   dc price  8 non-null      float64
 6   origin    9 non-null      object 
dtypes: float64(3), int64(1), object(3)
memory usage: 632.0+ bytes


In [188]:
# Remove the literal unit suffix 'g' from the size strings so the column can
# later be cast to int. regex=False makes the pattern an explicit literal
# instead of relying on the version-dependent default of `regex`.
df['size'] = df['size'].str.replace('g', '', regex=False)
df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,[popular]bbq,16000,1150,1200.0,0.5,8000.0,domestic
1,Bonelessness,15000,1250,1500.0,0.2,12000.0,Brazil
2,Leek,14000,1150,1300.0,0.2,11200.0,Brazil
3,Soy Sauce,14000,950,1600.0,0.2,11200.0,domestic
4,Galic Sauce,14000,950,1800.0,0.2,11200.0,domestic
5,Hot Seasoning,13000,1050,1400.0,0.2,10400.0,domestic
6,HaH,13000,1050,1300.0,0.2,10400.0,domestic
7,Gold Fried,12000,1050,1000.0,0.2,9600.0,domestic
8,[best]oven-baked,9900,1050,1000.0,,,domestic


In [76]:
df['size'] = df['size'].astype(int)
df['size']

0    1150
1    1250
2    1150
3     950
4     950
5    1050
6    1050
7    1050
8    1050
Name: size, dtype: int64

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   menu      9 non-null      object 
 1   price     9 non-null      int64  
 2   size      9 non-null      int64  
 3   kcal      9 non-null      float64
 4   dc rate   8 non-null      float64
 5   dc price  8 non-null      float64
 6   origin    9 non-null      object 
dtypes: float64(3), int64(2), object(2)
memory usage: 632.0+ bytes


In [78]:
df['size'].mean()

1072.2222222222222

In [79]:
df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,[popular]bbq,16000,1150,1200.0,0.5,8000.0,domestic
1,Bonelessness,15000,1250,1500.0,0.2,12000.0,Brazil
2,Leek,14000,1150,1300.0,0.2,11200.0,Brazil
3,Soy Sauce,14000,950,1600.0,0.2,11200.0,domestic
4,Galic Sauce,14000,950,1800.0,0.2,11200.0,domestic
5,Hot Seasoning,13000,1050,1400.0,0.2,10400.0,domestic
6,HaH,13000,1050,1300.0,0.2,10400.0,domestic
7,Gold Fried,12000,1050,1000.0,0.2,9600.0,domestic
8,[best]oven-baked,9900,1050,1000.0,,,domestic


In [80]:
df.shape

(9, 7)

In [81]:
df.head()

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,[popular]bbq,16000,1150,1200.0,0.5,8000.0,domestic
1,Bonelessness,15000,1250,1500.0,0.2,12000.0,Brazil
2,Leek,14000,1150,1300.0,0.2,11200.0,Brazil
3,Soy Sauce,14000,950,1600.0,0.2,11200.0,domestic
4,Galic Sauce,14000,950,1800.0,0.2,11200.0,domestic


In [82]:
df.describe()

Unnamed: 0,price,size,kcal,dc rate,dc price
count,9.0,9.0,9.0,8.0,8.0
mean,13433.333333,1072.222222,1344.444444,0.2375,10500.0
std,1764.936259,97.182532,265.099562,0.106066,1242.118007
min,9900.0,950.0,1000.0,0.2,8000.0
25%,13000.0,1050.0,1200.0,0.2,10200.0
50%,14000.0,1050.0,1300.0,0.2,10800.0
75%,14000.0,1150.0,1500.0,0.2,11200.0
max,16000.0,1250.0,1800.0,0.5,12000.0


In [83]:
df.describe(include='O')

Unnamed: 0,menu,origin
count,9,9
unique,9,2
top,[popular]bbq,domestic
freq,1,7


In [84]:
# Pairwise correlation of the numeric columns only. Text columns such as
# 'menu' and 'origin' cannot be correlated, and pandas >= 2.0 raises an error
# instead of silently skipping them, so they are filtered out explicitly.
df.select_dtypes(include='number').corr()

Unnamed: 0,price,size,kcal,dc rate,dc price
price,1.0,0.374104,0.522744,0.688875,-0.138409
size,0.374104,1.0,-0.285724,0.29277,-0.022222
kcal,0.522744,-0.285724,1.0,-0.306122,0.636659
dc rate,0.688875,0.29277,-0.306122,1.0,-0.81325
dc price,-0.138409,-0.022222,0.636659,-0.81325,1.0


In [85]:
df.nunique()

menu        9
price       6
size        4
kcal        7
dc rate     2
dc price    5
origin      2
dtype: int64

In [87]:
df['size'].unique()

array([1150, 1250,  950, 1050])

In [88]:
df['size'].value_counts()

1050    4
1150    2
950     2
1250    1
Name: size, dtype: int64

In [89]:
df.count()

menu        9
price       9
size        9
kcal        9
dc rate     8
dc price    8
origin      9
dtype: int64

In [90]:
df.count(axis=1)

0    7
1    7
2    7
3    7
4    7
5    7
6    7
7    7
8    5
dtype: int64

In [91]:
len(df)

9

In [92]:
df.shape

(9, 7)

In [93]:
df.shape[0]

9

In [94]:
df.shape[1]

7

In [95]:
df['price'].max()

16000

In [96]:
df['price'].min()

9900

In [97]:
df['price'].mean()

13433.333333333334

In [98]:
df['price'].median()

14000.0

In [99]:
df['price'].sum()

120900

In [100]:
df['price'].std()

1764.936259472279

In [102]:
df['price'].var()

3115000.0

In [103]:
df['price'].describe()

count        9.000000
mean     13433.333333
std       1764.936259
min       9900.000000
25%      13000.000000
50%      14000.000000
75%      14000.000000
max      16000.000000
Name: price, dtype: float64

In [104]:
df['price'].quantile(.25)

13000.0

In [105]:
df['price'].quantile(.75)

14000.0

In [106]:
# Rows priced strictly below the first quartile (25th percentile) of price.
q1 = df['price'].quantile(.25)
cond = df['price'] < q1
df[cond]

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
7,Gold Fried,12000,1050,1000.0,0.2,9600.0,domestic
8,[best]oven-baked,9900,1050,1000.0,,,domestic


In [107]:
# Rows priced strictly above the third quartile (75th percentile) of price.
q3 = df['price'].quantile(.75)
cond = df['price'] > q3
df[cond]

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,[popular]bbq,16000,1150,1200.0,0.5,8000.0,domestic
1,Bonelessness,15000,1250,1500.0,0.2,12000.0,Brazil


In [111]:
df['origin'].mode()

0    domestic
dtype: object

In [113]:
df['origin'].mode()[0]

'domestic'

In [114]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8
menu,[popular]bbq,Bonelessness,Leek,Soy Sauce,Galic Sauce,Hot Seasoning,HaH,Gold Fried,[best]oven-baked
price,16000,15000,14000,14000,14000,13000,13000,12000,9900
size,1150,1250,1150,950,950,1050,1050,1050,1050
kcal,1200.0,1500.0,1300.0,1600.0,1800.0,1400.0,1300.0,1000.0,1000.0
dc rate,0.5,0.2,0.2,0.2,0.2,0.2,0.2,0.2,
dc price,8000.0,12000.0,11200.0,11200.0,11200.0,10400.0,10400.0,9600.0,
origin,domestic,Brazil,Brazil,domestic,domestic,domestic,domestic,domestic,domestic


In [119]:
df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,[popular]bbq,16000,1150,1200.0,0.5,8000.0,domestic
1,Bonelessness,15000,1250,1500.0,0.2,12000.0,Brazil
2,Leek,14000,1150,1300.0,0.2,11200.0,Brazil
3,Soy Sauce,14000,950,1600.0,0.2,11200.0,domestic
4,Galic Sauce,14000,950,1800.0,0.2,11200.0,domestic
5,Hot Seasoning,13000,1050,1400.0,0.2,10400.0,domestic
6,HaH,13000,1050,1300.0,0.2,10400.0,domestic
7,Gold Fried,12000,1050,1000.0,0.2,9600.0,domestic
8,[best]oven-baked,9900,1050,1000.0,,,domestic


In [135]:
# Mean of every numeric (and boolean) column per origin. numeric_only=True
# skips text columns such as 'menu', which pandas >= 2.0 would otherwise
# try to average and fail on with a TypeError.
df.groupby('origin').mean(numeric_only=True)

Unnamed: 0_level_0,price,size,kcal,dc rate,dc price,Will I gain weight?
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Brazil,14500.0,1200.0,1400.0,0.2,11600.0,1.0
domestic,13128.571429,1035.714286,1328.571429,0.25,10133.333333,0.571429


In [121]:
# Mean of the numeric columns for each (origin, dc rate) pair.
# numeric_only=True skips text columns such as 'menu', which pandas >= 2.0
# would otherwise try to average and fail on with a TypeError.
df.groupby(['origin', 'dc rate']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,price,size,kcal,dc price
origin,dc rate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Brazil,0.2,14500.0,1200.0,1400.0,11600.0
domestic,0.2,13200.0,1010.0,1420.0,10560.0
domestic,0.5,16000.0,1150.0,1200.0,8000.0


In [122]:
df.groupby(['origin','dc rate'])['price'].mean()

origin    dc rate
Brazil    0.2        14500.0
domestic  0.2        13200.0
          0.5        16000.0
Name: price, dtype: float64

In [123]:
type(df.groupby(['origin','dc rate'])['price'].mean())

pandas.core.series.Series

In [124]:
# Turn the grouped mean-price Series into a one-column DataFrame.
mean_price = df.groupby(['origin', 'dc rate'])['price'].mean()
mean_price.to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,price
origin,dc rate,Unnamed: 2_level_1
Brazil,0.2,14500.0
domestic,0.2,13200.0
domestic,0.5,16000.0


In [125]:
df.groupby(['origin','dc rate'])[['price']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,price
origin,dc rate,Unnamed: 2_level_1
Brazil,0.2,14500.0
domestic,0.2,13200.0
domestic,0.5,16000.0


In [126]:
df.groupby(['origin','dc rate']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,menu,price,size,kcal,dc price
origin,dc rate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Brazil,0.2,Leek,15000,1250,1500.0,12000.0
domestic,0.2,Soy Sauce,14000,1050,1800.0,11200.0
domestic,0.5,[popular]bbq,16000,1150,1200.0,8000.0


In [127]:
df.groupby(['origin','dc rate']).max().reset_index()

Unnamed: 0,origin,dc rate,menu,price,size,kcal,dc price
0,Brazil,0.2,Leek,15000,1250,1500.0,12000.0
1,domestic,0.2,Soy Sauce,14000,1050,1800.0,11200.0
2,domestic,0.5,[popular]bbq,16000,1150,1200.0,8000.0


In [128]:
df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin
0,[popular]bbq,16000,1150,1200.0,0.5,8000.0,domestic
1,Bonelessness,15000,1250,1500.0,0.2,12000.0,Brazil
2,Leek,14000,1150,1300.0,0.2,11200.0,Brazil
3,Soy Sauce,14000,950,1600.0,0.2,11200.0,domestic
4,Galic Sauce,14000,950,1800.0,0.2,11200.0,domestic
5,Hot Seasoning,13000,1050,1400.0,0.2,10400.0,domestic
6,HaH,13000,1050,1300.0,0.2,10400.0,domestic
7,Gold Fried,12000,1050,1000.0,0.2,9600.0,domestic
8,[best]oven-baked,9900,1050,1000.0,,,domestic


In [130]:
def cal(x, threshold=1300):
    """Return True when ``x`` is at least ``threshold``.

    Args:
        x: Value to check (a kcal value when applied to ``df['kcal']``).
        threshold: Inclusive lower bound; defaults to 1300.

    Returns:
        bool: True if ``x >= threshold``, otherwise False. NaN compares as
        False.
    """
    # bool() keeps the result a plain Python bool even for NumPy scalars.
    return bool(x >= threshold)
df['kcal'].apply(cal)

0    False
1     True
2     True
3     True
4     True
5     True
6     True
7    False
8    False
Name: kcal, dtype: bool

In [131]:
df['Will I gain weight?'] = df['kcal'].apply(cal)
df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin,Will I gain weight?
0,[popular]bbq,16000,1150,1200.0,0.5,8000.0,domestic,False
1,Bonelessness,15000,1250,1500.0,0.2,12000.0,Brazil,True
2,Leek,14000,1150,1300.0,0.2,11200.0,Brazil,True
3,Soy Sauce,14000,950,1600.0,0.2,11200.0,domestic,True
4,Galic Sauce,14000,950,1800.0,0.2,11200.0,domestic,True
5,Hot Seasoning,13000,1050,1400.0,0.2,10400.0,domestic,True
6,HaH,13000,1050,1300.0,0.2,10400.0,domestic,True
7,Gold Fried,12000,1050,1000.0,0.2,9600.0,domestic,False
8,[best]oven-baked,9900,1050,1000.0,,,domestic,False


In [132]:
# Label each menu by calorie level: 1300 kcal or more gets 'do not have',
# everything else gets 'eat it right now'.
is_high_kcal = df['kcal'] >= 1300
df['worry'] = is_high_kcal.map({True: 'do not have', False: 'eat it right now'})
df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin,Will I gain weight?,worry
0,[popular]bbq,16000,1150,1200.0,0.5,8000.0,domestic,False,eat it right now
1,Bonelessness,15000,1250,1500.0,0.2,12000.0,Brazil,True,do not have
2,Leek,14000,1150,1300.0,0.2,11200.0,Brazil,True,do not have
3,Soy Sauce,14000,950,1600.0,0.2,11200.0,domestic,True,do not have
4,Galic Sauce,14000,950,1800.0,0.2,11200.0,domestic,True,do not have
5,Hot Seasoning,13000,1050,1400.0,0.2,10400.0,domestic,True,do not have
6,HaH,13000,1050,1300.0,0.2,10400.0,domestic,True,do not have
7,Gold Fried,12000,1050,1000.0,0.2,9600.0,domestic,False,eat it right now
8,[best]oven-baked,9900,1050,1000.0,,,domestic,False,eat it right now


In [133]:
df.to_csv('final_chiken.csv', index=False)

In [137]:
df = pd.read_csv('final_chiken.csv')
df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin,Will I gain weight?,worry
0,[popular]bbq,16000,1150,1200.0,0.5,8000.0,domestic,False,eat it right now
1,Bonelessness,15000,1250,1500.0,0.2,12000.0,Brazil,True,do not have
2,Leek,14000,1150,1300.0,0.2,11200.0,Brazil,True,do not have
3,Soy Sauce,14000,950,1600.0,0.2,11200.0,domestic,True,do not have
4,Galic Sauce,14000,950,1800.0,0.2,11200.0,domestic,True,do not have
5,Hot Seasoning,13000,1050,1400.0,0.2,10400.0,domestic,True,do not have
6,HaH,13000,1050,1300.0,0.2,10400.0,domestic,True,do not have
7,Gold Fried,12000,1050,1000.0,0.2,9600.0,domestic,False,eat it right now
8,[best]oven-baked,9900,1050,1000.0,,,domestic,False,eat it right now


In [139]:
# Append a heart to every menu name with vectorized string concatenation.
# Unlike a per-row lambda, a missing (NaN) name stays NaN instead of raising
# a TypeError.
df['menu'] = df['menu'] + '❤️'
df

Unnamed: 0,menu,price,size,kcal,dc rate,dc price,origin,Will I gain weight?,worry
0,[popular]bbq❤️,16000,1150,1200.0,0.5,8000.0,domestic,False,eat it right now
1,Bonelessness❤️,15000,1250,1500.0,0.2,12000.0,Brazil,True,do not have
2,Leek❤️,14000,1150,1300.0,0.2,11200.0,Brazil,True,do not have
3,Soy Sauce❤️,14000,950,1600.0,0.2,11200.0,domestic,True,do not have
4,Galic Sauce❤️,14000,950,1800.0,0.2,11200.0,domestic,True,do not have
5,Hot Seasoning❤️,13000,1050,1400.0,0.2,10400.0,domestic,True,do not have
6,HaH❤️,13000,1050,1300.0,0.2,10400.0,domestic,True,do not have
7,Gold Fried❤️,12000,1050,1000.0,0.2,9600.0,domestic,False,eat it right now
8,[best]oven-baked❤️,9900,1050,1000.0,,,domestic,False,eat it right now
