### 자료의 값 변경
- 수치자료의 범주화
- 범주형 자료의 수치화
- 표준화 & 정규화
- 시간자료

### 수치자료를 범주형자료로
https://pandas.pydata.org/docs/reference/api/pandas.cut.html
- 구간으로 조절: pandas.cut
- 개수로 조절: pandas.qcut

기본적으로 순서자료로 생성됨: 명목자료로 설정하려면 pandas.cut에 `ordered=False`를 지정


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Fix the global seed so the 1,000 normal draws below are reproducible.
np.random.seed(316)
정규난수 = np.random.normal(loc=0.0, scale=1.0, size=1000)

In [3]:
# Split the sample into five bins of equal width.
동일구간 = pd.cut(x=정규난수, bins=5)
동일구간

[(-0.525, 0.768], (-0.525, 0.768], (-0.525, 0.768], (-0.525, 0.768], (-0.525, 0.768], ..., (-0.525, 0.768], (-1.818, -0.525], (-1.818, -0.525], (-0.525, 0.768], (0.768, 2.062]]
Length: 1000
Categories (5, interval[float64]): [(-3.118, -1.818] < (-1.818, -0.525] < (-0.525, 0.768] < (0.768, 2.062] < (2.062, 3.355]]

In [4]:
# Summarize the equal-width bins: count and relative frequency per category.
동일구간.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"(-3.118, -1.818]",28,0.028
"(-1.818, -0.525]",284,0.284
"(-0.525, 0.768]",484,0.484
"(0.768, 2.062]",180,0.18
"(2.062, 3.355]",24,0.024


In [6]:
# Cut on explicit bin edges and label the five resulting intervals A to E.
구간지정 = pd.cut(
    정규난수,
    bins=[-3.5, -1.5, -0.5, 0.5, 1.5, 3.5],
    labels=list("ABCDE"),
)
구간지정

['D', 'D', 'C', 'C', 'D', ..., 'C', 'B', 'B', 'C', 'D']
Length: 1000
Categories (5, object): ['A' < 'B' < 'C' < 'D' < 'E']

In [7]:
# Summarize the labelled bins: count and relative frequency per label.
구간지정.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
A,58,0.058
B,266,0.266
C,380,0.38
D,230,0.23
E,66,0.066


In [8]:
# Quantile-based binning: five bins that each hold the same number of values.
동일개수 = pd.qcut(x=정규난수, q=5)
동일개수.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"(-3.112, -0.845]",200,0.2
"(-0.845, -0.295]",200,0.2
"(-0.295, 0.191]",200,0.2
"(0.191, 0.783]",200,0.2
"(0.783, 3.355]",200,0.2


In [9]:
# Quantile-based binning with explicit cumulative proportions (quartiles here).
비율지정 = pd.qcut(x=정규난수, q=[0.0, 0.25, 0.5, 0.75, 1.0])
비율지정.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"(-3.112, -0.686]",250,0.25
"(-0.686, -0.0377]",250,0.25
"(-0.0377, 0.627]",250,0.25
"(0.627, 3.355]",250,0.25


In [10]:
## Labels must be supplied when the result should be nominal (unordered) data.
구간지정 = pd.cut(
    정규난수,
    bins=[-3.5, -1.5, -0.5, 0.5, 1.5, 3.5],
    labels=list("ABCDE"),
    ordered=False,
)
구간지정

['D', 'D', 'C', 'C', 'D', ..., 'C', 'B', 'B', 'C', 'D']
Length: 1000
Categories (5, object): ['A', 'B', 'C', 'D', 'E']

In [12]:
## Access the category labels when they are needed on their own.
구간지정.categories

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

### 범주형 자료의 수치화
- 명목자료를 가변수로: pandas.get_dummies()
- 순서자료를 등간격의 수치로 

In [14]:
# Load the housing data set from the working directory and preview it.
주택 = pd.read_csv(filepath_or_buffer="housing.csv")
주택.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


명목자료를 가변수로 변환

In [15]:
# Store the ocean-proximity labels as a categorical column under a new name.
주택["해변근접성"] = 주택["ocean_proximity"].astype(dtype="category")
주택["해변근접성"]

0        NEAR BAY
1        NEAR BAY
2        NEAR BAY
3        NEAR BAY
4        NEAR BAY
           ...   
20635      INLAND
20636      INLAND
20637      INLAND
20638      INLAND
20639      INLAND
Name: 해변근접성, Length: 20640, dtype: category
Categories (5, object): ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']

In [16]:
# One-hot encode the nominal column: one indicator column per category.
# dtype=int pins the indicators to 0/1 integers; without it pandas >= 2.0
# returns booleans (True/False) instead of the 0/1 values expected here.
주택위치 = pd.get_dummies(주택["해변근접성"], dtype=int)
주택위치

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0
...,...,...,...,...,...
20635,0,1,0,0,0
20636,0,1,0,0,0
20637,0,1,0,0,0
20638,0,1,0,0,0


In [17]:
# Rename by explicit label mapping instead of by position, so a change in the
# category order can never silently attach a name to the wrong column.
# errors="raise" makes a missing source label fail loudly, and inplace=True
# keeps the original behavior of modifying the existing frame.
주택위치.rename(
    columns={
        "<1H OCEAN": "Ocean",
        "INLAND": "Inland",
        "ISLAND": "Island",
        "NEAR BAY": "NearBay",
        "NEAR OCEAN": "NearOcean",
    },
    inplace=True,
    errors="raise",
)
주택위치

Unnamed: 0,Ocean,Inland,Island,NearBay,NearOcean
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0
...,...,...,...,...,...
20635,0,1,0,0,0
20636,0,1,0,0,0
20637,0,1,0,0,0
20638,0,1,0,0,0


순서자료를 동일 간격의 수치로

In [18]:
구간지정
## Inspect a DataFrame's column types: dtypes
## Change a DataFrame's column types: astype

['D', 'D', 'C', 'C', 'D', ..., 'C', 'B', 'B', 'C', 'D']
Length: 1000
Categories (5, object): ['A', 'B', 'C', 'D', 'E']

In [20]:
## Convert nominal (unordered) data into ordinal (ordered) data.
from pandas.api.types import CategoricalDtype

# Category order from lowest to highest.
순서지정 = list("ABCDE")
순서구간 = 구간지정.astype(
    CategoricalDtype(categories=순서지정, ordered=True)
)
순서구간

['D', 'D', 'C', 'C', 'D', ..., 'C', 'B', 'B', 'C', 'D']
Length: 1000
Categories (5, object): ['A' < 'B' < 'C' < 'D' < 'E']

### 변수의 종류(수치자료와 범주형자료) 확인
- _get_numeric_data 메서드
- select_dtypes()

In [21]:
# Split the column names into numeric and non-numeric groups.
변수명 = 주택.columns.tolist()
# NOTE: _get_numeric_data is an underscore-prefixed (non-public) pandas method.
수치변수명 = 주택._get_numeric_data().columns.tolist()
# Filter in the original column order; a set difference would return the
# remaining names in arbitrary order, so the printed list could vary by run.
범주변수명 = [이름 for 이름 in 변수명 if 이름 not in 수치변수명]
print("전체변수명:", 변수명)
print("수치변수명:", 수치변수명)
print("범주변수명:", 범주변수명)

전체변수명: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity', '해변근접성']
수치변수명: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']
범주변수명: ['ocean_proximity', '해변근접성']


In [22]:
# Partition the column names by dtype using select_dtypes:
# numeric columns first, then everything that is not numeric.
수치변수명 = list(주택.select_dtypes(include=[np.number]).columns)
범주변수명 = list(주택.select_dtypes(exclude=[np.number]).columns)
print("수치변수명:", 수치변수명)
print("범주변수명:", 범주변수명)

수치변수명: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']
범주변수명: ['ocean_proximity', '해변근접성']


### 표준화 & 정규화

In [23]:
# Print each column's dtype and non-null count for the housing frame.
주택.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   longitude           20640 non-null  float64 
 1   latitude            20640 non-null  float64 
 2   housing_median_age  20640 non-null  float64 
 3   total_rooms         20640 non-null  float64 
 4   total_bedrooms      20433 non-null  float64 
 5   population          20640 non-null  float64 
 6   households          20640 non-null  float64 
 7   median_income       20640 non-null  float64 
 8   median_house_value  20640 non-null  float64 
 9   ocean_proximity     20640 non-null  object  
 10  해변근접성               20640 non-null  category
dtypes: category(1), float64(9), object(1)
memory usage: 1.6+ MB


In [24]:
# iloc takes [rows, columns]: keep every row, select columns 2 through 7 only.
수치자료 = 주택.iloc[:, list(range(2, 8))]
수치자료.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,41.0,880.0,129.0,322.0,126.0,8.3252
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014
2,52.0,1467.0,190.0,496.0,177.0,7.2574
3,52.0,1274.0,235.0,558.0,219.0,5.6431
4,52.0,1627.0,280.0,565.0,259.0,3.8462


In [25]:
# Center every column by subtracting its own column mean.
수치자료 - 수치자료.mean(axis=0)

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,12.360514,-1755.763081,-408.870553,-1103.476744,-373.53968,4.454529
1,-7.639486,4463.236919,568.129447,975.523256,638.46032,4.430729
2,23.360514,-1168.763081,-347.870553,-929.476744,-322.53968,3.386729
3,23.360514,-1361.763081,-302.870553,-867.476744,-280.53968,1.772429
4,23.360514,-1008.763081,-257.870553,-860.476744,-240.53968,-0.024471
...,...,...,...,...,...,...
20635,-3.639486,-970.763081,-163.870553,-580.476744,-169.53968,-2.310371
20636,-10.639486,-1938.763081,-387.870553,-1069.476744,-385.53968,-1.313871
20637,-11.639486,-381.763081,-52.870553,-418.476744,-66.53968,-2.170671
20638,-10.639486,-775.763081,-128.870553,-684.476744,-150.53968,-2.003471


In [26]:
# Standardization: center each column on its mean, then divide by its
# standard deviation.
표준자료 = 수치자료.sub(수치자료.mean(axis=0)).div(수치자료.std(axis=0))
표준자료.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,0.982119,-0.8048,-0.970301,-0.974405,-0.977009,2.344709
1,-0.607004,2.045841,1.348243,0.861418,1.669921,2.332181
2,1.856137,-0.535733,-0.825541,-0.820757,-0.843616,1.782656
3,1.856137,-0.624199,-0.71875,-0.76601,-0.733764,0.932945
4,1.856137,-0.462393,-0.611959,-0.759828,-0.629142,-0.012881


In [27]:
# Min-max normalization: shift each column by its minimum, then divide by
# its range (max - min).
정규자료 = 수치자료.sub(수치자료.min(axis=0)).div(
    수치자료.max(axis=0).sub(수치자료.min(axis=0))
)
정규자료.tail()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
20635,0.470588,0.042296,0.057883,0.023599,0.054103,0.07313
20636,0.333333,0.017676,0.023122,0.009894,0.018582,0.141853
20637,0.313725,0.057277,0.075109,0.02814,0.071041,0.082764
20638,0.333333,0.047256,0.063315,0.020684,0.057227,0.094295
20639,0.294118,0.070782,0.095438,0.03879,0.086992,0.130253


#### 시간자료 처리

In [28]:
# Generate a daily DatetimeIndex running from the start date to the end date.
pd.date_range("2023-3-2", "2023-6-21")

DatetimeIndex(['2023-03-02', '2023-03-03', '2023-03-04', '2023-03-05',
               '2023-03-06', '2023-03-07', '2023-03-08', '2023-03-09',
               '2023-03-10', '2023-03-11',
               ...
               '2023-06-12', '2023-06-13', '2023-06-14', '2023-06-15',
               '2023-06-16', '2023-06-17', '2023-06-18', '2023-06-19',
               '2023-06-20', '2023-06-21'],
              dtype='datetime64[ns]', length=112, freq='D')

In [29]:
# Generate 100 consecutive timestamps beginning at the start date.
pd.date_range("2023-3-2",periods=100)

DatetimeIndex(['2023-03-02', '2023-03-03', '2023-03-04', '2023-03-05',
               '2023-03-06', '2023-03-07', '2023-03-08', '2023-03-09',
               '2023-03-10', '2023-03-11', '2023-03-12', '2023-03-13',
               '2023-03-14', '2023-03-15', '2023-03-16', '2023-03-17',
               '2023-03-18', '2023-03-19', '2023-03-20', '2023-03-21',
               '2023-03-22', '2023-03-23', '2023-03-24', '2023-03-25',
               '2023-03-26', '2023-03-27', '2023-03-28', '2023-03-29',
               '2023-03-30', '2023-03-31', '2023-04-01', '2023-04-02',
               '2023-04-03', '2023-04-04', '2023-04-05', '2023-04-06',
               '2023-04-07', '2023-04-08', '2023-04-09', '2023-04-10',
               '2023-04-11', '2023-04-12', '2023-04-13', '2023-04-14',
               '2023-04-15', '2023-04-16', '2023-04-17', '2023-04-18',
               '2023-04-19', '2023-04-20', '2023-04-21', '2023-04-22',
               '2023-04-23', '2023-04-24', '2023-04-25', '2023-04-26',
      

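주기 (`freq` 인자에 쓰는 별칭; pandas 2.2 이상에서는 T→min, H→h, M→ME, BM→BME, Q-JAN/Q-DEC→QE-JAN/QE-DEC 표기를 사용하며 기존 표기는 deprecated)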
- s:초
- T:분
- H:시간
- D:일
- B:주말이 아닌 평일
- W:주
- W-MON: 주(월요일)
- M :각 달의 마지막날
- MS: 각 달의 첫날
- BM: 주말이 아닌 평일 중에서 각 달의 마지막 날
- BMS: 주말이 아닌 평일 중에서 각 달의 첫날
- WOM-2THU :각 달의 두번째 목요일
- Q-JAN: 각 분기의 첫달의 마지막 날
- Q-DEC: 각 분기의 마지막 달의 마지막 날

In [30]:
# Generate 5 timestamps spaced one hour apart ("h" is the hourly alias).
pd.date_range("2023-3-2",periods=5, freq="h")

DatetimeIndex(['2023-03-02 00:00:00', '2023-03-02 01:00:00',
               '2023-03-02 02:00:00', '2023-03-02 03:00:00',
               '2023-03-02 04:00:00'],
              dtype='datetime64[ns]', freq='H')

In [35]:
# Sample timestamps stored as plain strings, to be converted to datetimes.
시간자료 = pd.DataFrame(
    data={
        "날짜": [
            "2023-01-01 01:10:00",
            "2023-02-25 03:20:20",
            "2023-03-02 06:30:00",
            "2023-04-19 10:40:30",
            "2023-05-16 12:50:00",
            "2023-06-29 15:00:30",
            "2023-07-01 18:10:00",
            "2023-09-15 21:50:30",
        ]
    }
)
시간자료.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   날짜      8 non-null      object
dtypes: object(1)
memory usage: 192.0+ bytes


In [36]:
# Parse the string column into datetime values using an explicit format;
# errors="raise" makes an unparsable entry raise an exception.
시간자료["날짜"] = pd.to_datetime(
    시간자료["날짜"],
    errors="raise",
    format="%Y-%m-%d %H:%M:%S",
)
시간자료.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   날짜      8 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 192.0 bytes


In [38]:
# Break the datetime column into its components via the .dt accessor.
# Calendar parts:
시간자료["date"] = 시간자료["날짜"].dt.date
시간자료["year"] = 시간자료["날짜"].dt.year
시간자료["month"] = 시간자료["날짜"].dt.month
시간자료["월이름"] = 시간자료["날짜"].dt.month_name()
시간자료["day"] = 시간자료["날짜"].dt.day
# Clock parts:
시간자료["time"] = 시간자료["날짜"].dt.time
시간자료["hour"] = 시간자료["날짜"].dt.hour
시간자료["minute"] = 시간자료["날짜"].dt.minute
시간자료["second"] = 시간자료["날짜"].dt.second
시간자료.head()

Unnamed: 0,날짜,date,year,month,월이름,day,time,hour,minute,second
0,2023-01-01 01:10:00,2023-01-01,2023,1,January,1,01:10:00,1,10,0
1,2023-02-25 03:20:20,2023-02-25,2023,2,February,25,03:20:20,3,20,20
2,2023-03-02 06:30:00,2023-03-02,2023,3,March,2,06:30:00,6,30,0
3,2023-04-19 10:40:30,2023-04-19,2023,4,April,19,10:40:30,10,40,30
4,2023-05-16 12:50:00,2023-05-16,2023,5,May,16,12:50:00,12,50,0


#### 날짜 & 시간 조정

In [39]:
import datetime
# Shift every timestamp forward by five days.
시간자료['날짜']+datetime.timedelta(days=5)

0   2023-01-06 01:10:00
1   2023-03-02 03:20:20
2   2023-03-07 06:30:00
3   2023-04-24 10:40:30
4   2023-05-21 12:50:00
5   2023-07-04 15:00:30
6   2023-07-06 18:10:00
7   2023-09-20 21:50:30
Name: 날짜, dtype: datetime64[ns]

#### 시간간격 조정:resample
- up-sampling:시간간격 좁아짐 -> 데이터량 증가
- down-sampling: 시간 간격 넓어짐 -> 데이터량 감소

In [40]:
# 100 random values indexed by consecutive days starting 2023-01-01.
시계열 = pd.Series(
    data=np.random.randn(100),
    index=pd.date_range(start="2023-1-1", periods=100, freq="D"),
)
시계열.head()

2023-01-01    0.618585
2023-01-02    0.365010
2023-01-03    0.872302
2023-01-04    0.060234
2023-01-05    0.349634
Freq: D, dtype: float64

In [41]:
# Down-sample the daily series into weekly bins and average each bin.
시계열.resample('W').mean()

2023-01-01    0.618585
2023-01-08    0.673604
2023-01-15   -0.188737
2023-01-22   -0.038570
2023-01-29    0.449123
2023-02-05    0.431923
2023-02-12    0.016617
2023-02-19    0.325256
2023-02-26    0.142551
2023-03-05    0.317464
2023-03-12    0.051187
2023-03-19   -0.383341
2023-03-26    0.028789
2023-04-02   -0.288245
2023-04-09   -0.242945
2023-04-16   -0.534956
Freq: W-SUN, dtype: float64

In [42]:
# 100 random values indexed minute by minute starting 2023-01-01 00:00.
# "min" is used instead of the older "T" alias: "T" is deprecated since
# pandas 2.2, while "min" is accepted by both old and new versions.
분당자료 = pd.Series(
    np.random.randn(100),
    index=pd.date_range("2023-1-1", periods=100, freq="min"),
)
분당자료.head()

2023-01-01 00:00:00    0.664212
2023-01-01 00:01:00    0.124144
2023-01-01 00:02:00    0.338395
2023-01-01 00:03:00    0.445345
2023-01-01 00:04:00   -0.159555
Freq: T, dtype: float64

In [44]:
## Open, high, low and close values for each 10-minute window
#분당자료.resample('10min').ohlc()