# 데이터 정제 및 준비

In [2]:
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

## 누락 데이터 처리하기 : isnull()

In [3]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull()

# isnull() : null값이 있니?
# np.nan만 null이 있었다.. => True

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### 누락 데이터 골라내기 : dropna()      => 데이터에서 NA를 지워줘!!!

In [7]:
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [10]:
[data.notnull()]     #조건문이다! so, T/F이 나온다... 

[0     True
 1    False
 2     True
 3    False
 4     True
 dtype: bool]

In [11]:

data[data.notnull()]               

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()

In [13]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [14]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [15]:
data.dropna(how='all')   #dropna에 옵션을 줄 수 있다! => how='all'은 '전부 다 NA인 것들을 지워죠!'라는 것

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [16]:
data[4] = NA    #4번이라는 컬럼값을 만들고 NA로 채웠다.
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [17]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.204708,,
1,-0.55573,,
2,0.092908,,0.769023
3,1.246435,,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [19]:
df.dropna()

Unnamed: 0,0,1,2
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [14]:
# 몇 개의 이상값이 들어있는 로우만 볼 경우
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.092908,,0.769023
3,1.246435,,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


### 결측치 채우기: fillna()

In [15]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.204708,0.0,0.0
1,-0.55573,0.0,0.0
2,0.092908,0.0,0.769023
3,1.246435,0.0,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [16]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.204708,0.5,0.0
1,-0.55573,0.5,0.0
2,0.092908,0.5,0.769023
3,1.246435,0.5,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [17]:
_ = df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,-0.204708,0.0,0.0
1,-0.55573,0.0,0.0
2,0.092908,0.0,0.769023
3,1.246435,0.0,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [18]:
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
df.fillna(method='ffill')
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,0.476985,3.248944,-1.021228
1,-0.577087,0.124121,0.302614
2,0.523772,0.124121,1.34381
3,-0.713544,0.124121,-2.370232
4,-1.860761,,-2.370232
5,-1.265934,,-2.370232


In [19]:
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## 데이터 변형

### 중복 제거: drop_duplicates()

In [20]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [21]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [22]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [None]:
data['v1'] = range(7)

In [23]:
#k1 컬럼에 기반해서 중복 제거
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [24]:
# 중복값 중, 마지막으로 발견된 값을 유지
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### 함수나 매핑을 이용해서 데이터 변형하기 : map()

In [27]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [28]:
meat_to_animal = {    # '고기'를 '동물'로 바꿔주는 것 ( 매칭시켜주기~)
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}

In [29]:
lowercased = data['food'].str.lower()     # 1, food에 있는 값은 str인데 1차적으로 '소문자'로 만들어라. ===> lower
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [30]:
data['animal'] = lowercased.map(meat_to_animal)   # 2. '소문자'로 만든 것을 mapping해서 넣어라!  ===> upper
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [32]:
data['food'].map(lambda x: meat_to_animal[x.lower()])   
# lambda 해석 : x를 인자로 받아서, [x.lower()]를  key값으로 했을 때, 이것의 value값을 매핑해!! 

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### 값 치환하기 : replace()

In [34]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [35]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [32]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [36]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [38]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### 컬럼/색인 이름 바꾸기: rename

In [39]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [40]:
transform = lambda x: x[:4].upper()     # 1.x를 인자로 받아서 '대문자'로 바꿔줘!
data.index.map(transform)                   # 2. mapping해서 적용해줘!!!  index에1


Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [41]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [42]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [43]:
data.rename(index={'OHIO': 'INDIANA'},      # 오하이오를 인디아로, three를 피카부로 바꿔죵~~
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [40]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### 개별화(Discretization) and 양자화(Binning)     ====> 카테고리화 한다는 의미!! . 
## 근데 이해안간다ㅠㅠ

 ### [ 가 이상 , ) 이 미만.. 을 나타내는 기호다

In [28]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [42]:
bins = [18, 25, 35, 60, 100]    #1. 35세를 기준으로 나이의 계층을 나누고 싶어요.
cats = pd.cut(ages, bins)             #2. 기준을 세우려면 cut을 사용한다... ages에 있는 것들(깡통)을 짤라줘!
cats




[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [43]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [44]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [45]:
pd.value_counts(cats)   # 각각의 깡통이 몇개가 있니

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [46]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)   # [ 가 이상 , ) 이 미만.. 을 나타내는 기호다

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [47]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

### 특잇값 찾고 제외하기

In [44]:
data = pd.DataFrame(np.random.randn(1000, 4))
data

Unnamed: 0,0,1,2,3
0,0.476985,3.248944,-1.021228,-0.577087
1,0.124121,0.302614,0.523772,0.000940
2,1.343810,-0.713544,-0.831154,-2.370232
3,-1.860761,-0.860757,0.560145,-1.265934
4,0.119827,-1.063512,0.332883,-2.359419
...,...,...,...,...
995,-0.997174,0.046486,-0.610441,-0.394982
996,1.199915,-0.451814,-0.155385,-0.153514
997,0.011194,-0.050555,0.420211,1.190981
998,1.561488,-1.132925,0.097083,-1.031573


In [46]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.066679,0.020188,-0.000651,-0.067915
std,0.992674,1.004784,0.995859,0.995834
min,-3.548824,-3.184377,-3.745356,-3.428254
25%,-0.596286,-0.648915,-0.642609,-0.77489
50%,0.094503,-0.001593,-0.012007,-0.117489
75%,0.780282,0.674685,0.654328,0.616366
max,2.653656,3.260383,3.927528,3.366626


In [47]:
col = data[2]    
col[np.abs(col) > 3]

92     3.927528
300   -3.399312
395   -3.745356
Name: 2, dtype: float64

In [51]:
# 절대값이 3을 초과하는 값이 들어있는 모든 로우 선택( 데이터가 절댓값이 3보다 큰 어떤 것이라도 모두 보여줘)
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
88,3.927528,-0.255126,0.854137,-0.364807
92,0.194788,-0.655054,-0.56523,3.176873
296,-3.399312,-0.974657,-0.685312,-0.645858
314,-0.817649,0.050188,1.951312,3.260383
391,-3.745356,-1.520113,-0.346839,-0.696918
489,-1.341493,-0.293333,-0.242459,-3.05699
513,0.425384,-3.428254,-0.296336,-0.439938
576,-0.08542,0.275144,1.179227,-3.184377
798,-0.150923,-0.362528,-3.548824,1.553205
890,1.397822,3.366626,-2.372214,0.85101


In [53]:
# 절대값이 3을 초과하는 수는 3 또는 -3으로 설정
# sign(data) : 양수(1) or 음수(-1)인지 판단
data[np.abs(data) > 3] = np.sign(data) * 3     
# 그 데이터의 sign(양수인지 음수인지..) => 양수라면 리턴 1, 음수라면 리턴 -1... 그리고 할당하여, 판단해줘
# 결과는... max 3 과 mix -3 사이에 들어잇다는 것이다.
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.001204,-0.060553,0.07068,0.018904
std,0.989376,0.992459,0.990177,1.000171
min,-3.0,-3.0,-3.0,-3.0
25%,-0.636641,-0.771156,-0.591841,-0.641675
50%,-0.010997,-0.115171,0.094503,-1.5e-05
75%,0.659019,0.624615,0.787953,0.676536
max,3.0,3.0,2.653656,3.0


## 연습문제

- 자유롭게 데이터전처리하기

    [예시]
    - 컬럼명 변경하기
    - 날짜 데이터 파싱하기
    - 이상치 처리하기
    - 계절, 날씨를 text로 변경하기(예: 봄, 여름, 가을, 겨울)
    - 자전거 대여횟수를 카테고리화 하기(예: 매우 적음, 적음, 보통, 많음, 매우 많음)
    - atemp 값을 소수점 2자리까지 보여지도록 바꾸기
    
    
- 데이터전처리 과정을 마친 데이터프레임을 csv 형태로 저장

- 코드제출

    https://drive.google.com/drive/folders/149Ryp8xzvMnklI4bY7kF074tSsr5-H0c?usp=sharing
    
    


In [137]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# 읽어드릴 파일 경로 : /dataset/bike_sharing_demand_train.csv
filepath = "./datasets/exercise/bike_sharing_demand_train.csv"
df = pd.read_csv(filepath)
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


# 1. datetime 컬럼 => year, month, time 으로 나누기

In [41]:
df['datetime']=df['datetime'].astype('datetime64')
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['hour']=df['datetime'].dt.hour
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,2
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,3
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,4


# 2. 몇 월에 평균적으로 많이 타는가...

In [68]:
grouped = df['count'].groupby(df['month'])
grouped
grouped.mean()

month
1      90.366516
2     110.003330
3     148.169811
4     184.160616
5     219.459430
6     242.031798
7     235.325658
8     234.118421
9     233.805281
10    227.699232
11    193.677278
12    175.614035
Name: count, dtype: float64

In [69]:
grouped.mean().idxmax() # 결론 : 6월에 가장 많이 탄다.

6

# 3. 몇 시에 평균적으로 많이 타는가..

In [65]:
grouped = df['count'].groupby(df['hour'])
grouped.mean()


hour
0      55.138462
1      33.859031
2      22.899554
3      11.757506
4       6.407240
         ...    
19    315.278509
20    228.517544
21    173.370614
22    133.576754
23     89.508772
Name: count, Length: 24, dtype: float64

In [66]:
grouped.mean().idxmax()    # 결론 : 17시에 가장 많이 탄다.

17

# 4. 어느 월의 몇 시에 가장 많이 타냐

In [99]:
grouped = df.groupby(['month', 'hour'])
grouped
df_1 = grouped['count'].mean().sort_values()
df_1.tail(10)
# 어느 월의 몇 시에 가장 많이 타는지 top10를 추출했다...! 
# 알아낸 것:  퇴근 시간에 가장 많이 탄다... month는 그다지 상관이 없네..

month  hour
7      18      525.578947
9      18      527.657895
6      18      541.473684
7      17      542.368421
5      17      559.315789
9      17      561.552632
8      18      563.973684
6      17      584.000000
8      17      592.236842
10     17      592.736842
Name: count, dtype: float64

In [74]:
grouped['count'].mean().idxmax()       # 결론 :  10월의 17시에 가장 많이 탄다.

(10, 17)

# 5. 어느 계절의 몇시에 가장 많이 타는지

In [97]:
grouped = df.groupby(['season', 'hour'])
grouped
df_2 = grouped['count'].mean().sort_values()
df_2.tail(10)
# 어느 계절의 몇 시에 가장 많이 타는지 top5를 추출했다...! 
# 알아낸 것 : 겨울에도 사람들이 은근히 많이 타는구나..출근 및 퇴근시간이면...! 근데 길안미끄러운가...

season  hour
2       8       390.192982
4       8       403.070175
3       8       403.508772
        19      413.500000
4       18      426.675439
        17      485.903509
2       18      499.535088
3       18      539.070175
2       17      540.315789
3       17      565.385965
Name: count, dtype: float64

In [79]:
grouped['count'].mean().idxmax()    # 결론 : 가을(3)의 17시에 가장 많이 탄다... = 4번의 답과 동일한 추론(10월은 가을!)이다.

(3, 17)

# 6. 체감온도와 온도의 상관관계

In [81]:
df['temp'].corr(df['atemp'])

#상관계수의 가능한 점수 범위는 –1.0에서 +1.0 사이이다. 부호에 상관없이 숫자의 절댓값이 클수록 관련성이 더 크다.
#체감온도와 온도의 상관관계는 당연히 높은줄 알았는데, 당연히 높은 것으로 나와서 신기하다...ㅎㅎ

0.9849481104817067

# 7. 습도와 온도의 상관관계

In [86]:
df['humidity'].corr(df['temp'])    # 습도와 온도는 상관관계가 별로 안높다.

-0.06494877090120942

# 8. 체감온도와 자전거 타는 횟수의 상관관계

In [100]:
df['count'].corr(df['atemp'])   # 상관관계가 그렇게 높은 것은 아니다..체감온도와 자전거는 엄청 큰 상관은 없다.. 

0.3897844366269766

# 9. 어느 체감온도 때, 사람들은 자전거를 많이 타는가.

In [114]:
grouped = df['count'].groupby(df['atemp'])
grouped.mean().sort_values()
grouped.mean().sort_values().tail(10)
# 역시 체감온도가 높을 때, 자전거를 많이 타는구나... 근데 사람의 체감 평균온도는 36.5도 아닌가..? 

atemp
35.605    312.144654
39.395    319.194030
40.910    324.512821
32.575    331.746324
37.120    334.144068
38.635    335.783784
36.365    349.243902
37.880    351.835052
44.695    354.333333
40.150    369.577778
Name: count, dtype: float64

In [127]:
grouped.mean().sort_values().head(10)    
# 사람의 체감 평균온도가 0일수도 있나... 이 데이터가 어떤 의미를 갖고 있는지 아는 것도 중요하겠네..!

atemp
0.760     1.000000
1.515     3.000000
2.275    38.000000
3.790    39.062500
7.575    55.933333
6.820    56.380952
8.335    58.444444
5.305    63.200000
6.060    64.876712
4.545    66.090909
Name: count, dtype: float64

## 10. 데이터 대표 값 확인하기

In [136]:
df.describe()

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,hour
count,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,0.028569,0.680875,1.418427,20.23086,23.655084,61.88646,12.799395,36.021955,155.552177,191.574132,2011.501929,6.521495,11.541613
std,0.166599,0.466159,0.633839,7.79159,8.474601,19.245033,8.164537,49.960477,151.039033,181.144454,0.500019,3.444373,6.915838
min,0.0,0.0,1.0,0.82,0.76,0.0,0.0,0.0,0.0,1.0,2011.0,1.0,0.0
25%,0.0,0.0,1.0,13.94,16.665,47.0,7.0015,4.0,36.0,42.0,2011.0,4.0,6.0
50%,0.0,1.0,1.0,20.5,24.24,62.0,12.998,17.0,118.0,145.0,2012.0,7.0,12.0
75%,0.0,1.0,2.0,26.24,31.06,77.0,16.9979,49.0,222.0,284.0,2012.0,10.0,18.0
max,1.0,1.0,4.0,41.0,45.455,100.0,56.9969,367.0,886.0,977.0,2012.0,12.0,23.0


## 11. 계절 이름 바꾸기

In [140]:
def cs(x):
    if x==1:
        season = 'spring'
    elif x == 2:
        season = 'summer'
    elif x == 3:
        season = 'autumn'
    elif x == 4:
        season = 'winter'
    return season

df.season = df.season.apply(lambda y : cs(y))


In [141]:
df

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,spring,0,0,1,9.84,14.395,81,0.0000,3,13,16
1,2011-01-01 01:00:00,spring,0,0,1,9.02,13.635,80,0.0000,8,32,40
2,2011-01-01 02:00:00,spring,0,0,1,9.02,13.635,80,0.0000,5,27,32
3,2011-01-01 03:00:00,spring,0,0,1,9.84,14.395,75,0.0000,3,10,13
4,2011-01-01 04:00:00,spring,0,0,1,9.84,14.395,75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,winter,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,2012-12-19 20:00:00,winter,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,2012-12-19 21:00:00,winter,0,1,1,13.94,15.910,61,15.0013,4,164,168
10884,2012-12-19 22:00:00,winter,0,1,1,13.94,17.425,61,6.0032,12,117,129


# 12. 체감온도 바꾸기

In [146]:
def at(x):
    if x > 30:
        atemp = 'sohot'
    elif x > 20:
        atemp = 'hot'
    elif x > 10:
        atemp = 'soso'
    else:
        atemp = '???'
    return atemp

df.atemp = df.atemp.apply(lambda y : at(y))

TypeError: '>' not supported between instances of 'str' and 'int'

In [48]:
#### 복습 - 답

In [5]:

df = pd.read_csv('insurance.csv')


df

# bmi를 기준으로 저체중, 정상, 비만.. 이런걸로 연습해보기


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [9]:
df['group'] = df['bmi']

In [16]:
g_group = df.group
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,group
0,19,female,27.9,0,yes,southwest,16884.924,27.9
1,18,male,33.77,1,no,southeast,1725.5523,33.77
2,28,male,33.0,3,no,southeast,4449.462,33.0
3,33,male,22.705,0,no,northwest,21984.47061,22.705
4,32,male,28.88,0,no,northwest,3866.8552,28.88


In [26]:
def f(x):
    if g_group >= 25 :
        return '비만'

    elif 20 <= g_group < 25:
        return '정상'

    else :
        return '저체중'

In [27]:
df['group']=df['group'].apply(f)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
## Cut 이용해서 데이터도 분석해보기