# Data-preprocessing
## 0. Load-data

In [2]:
import pandas as pd

# Read the three CSV files once; the *_raw frames are kept untouched so that
# later cells can always rebuild their inputs from the original data.
train_raw, test_raw, submission_raw = map(
    pd.read_csv,
    ("data/train.csv", "data/test.csv", "data/submission.csv"),
)

# Working copies: all preprocessing below operates on these, never on *_raw.
train, test, submission = (
    frame.copy() for frame in (train_raw, test_raw, submission_raw)
)

## 1. Feature-engineering
모든 feature가 숫자형(int64/float64)이므로 별도의 타입 변환은 필요하지 않다. 다만 아래 `info()` 결과에서 보이듯 일부 컬럼에는 결측치가 존재한다.

In [5]:
# Print each column's dtype and non-null count to check types and missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      1459 non-null   int64  
 1   hour                    1459 non-null   int64  
 2   hour_bef_temperature    1457 non-null   float64
 3   hour_bef_precipitation  1457 non-null   float64
 4   hour_bef_windspeed      1450 non-null   float64
 5   hour_bef_humidity       1457 non-null   float64
 6   hour_bef_visibility     1457 non-null   float64
 7   hour_bef_ozone          1383 non-null   float64
 8   hour_bef_pm10           1369 non-null   float64
 9   hour_bef_pm2.5          1342 non-null   float64
 10  count                   1459 non-null   float64
dtypes: float64(9), int64(2)
memory usage: 125.5 KB


## 2. Remove-outlier
앞의 EDA 과정에서 극소수의 대여 횟수가 나타났지만, 기존 양상에서 벗어나지 않았기 때문에 우선 모든 값을 포함하여 진행한다.

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
train.describe()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
count,1459.0,1459.0,1457.0,1457.0,1450.0,1457.0,1457.0,1383.0,1369.0,1342.0,1459.0
mean,1105.914325,11.493489,16.717433,0.031572,2.479034,52.231297,1405.216884,0.039149,57.168736,30.327124,108.5634
std,631.338681,6.92279,5.23915,0.174917,1.378265,20.370387,583.131708,0.019509,31.771019,14.713252,82.631733
min,3.0,0.0,3.1,0.0,0.0,7.0,78.0,0.003,9.0,8.0,1.0
25%,555.5,5.5,12.8,0.0,1.4,36.0,879.0,0.0255,36.0,20.0,37.0
50%,1115.0,11.0,16.6,0.0,2.3,51.0,1577.0,0.039,51.0,26.0,96.0
75%,1651.0,17.5,20.1,0.0,3.4,69.0,1994.0,0.052,69.0,37.0,150.0
max,2179.0,23.0,30.0,1.0,8.0,99.0,2000.0,0.125,269.0,90.0,431.0


가시성(`hour_bef_visibility`) 값의 표준편차가 약 583으로 다른 변수들에 비해 매우 크게 나타나므로, 값을 10으로 나누어 단위를 조정해준다.

In [8]:
# Shrink the visibility column by a factor of 10 so that its spread is closer
# to that of the other features.
# The scaled values are computed from the untouched `train_raw` rather than from
# `train` itself: dividing `train` in place would shrink the column by another
# factor of 10 every time this cell is re-run.
# NOTE(review): `test` is not rescaled in this cell — confirm that the same /10
# is applied to test['hour_bef_visibility'] before any prediction is made.
train['hour_bef_visibility'] = train_raw['hour_bef_visibility'] / 10

# Re-display the summary statistics to verify the new scale.
train.describe()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
count,1459.0,1459.0,1457.0,1457.0,1450.0,1457.0,1457.0,1383.0,1369.0,1342.0,1459.0
mean,1105.914325,11.493489,16.717433,0.031572,2.479034,52.231297,140.521688,0.039149,57.168736,30.327124,108.5634
std,631.338681,6.92279,5.23915,0.174917,1.378265,20.370387,58.313171,0.019509,31.771019,14.713252,82.631733
min,3.0,0.0,3.1,0.0,0.0,7.0,7.8,0.003,9.0,8.0,1.0
25%,555.5,5.5,12.8,0.0,1.4,36.0,87.9,0.0255,36.0,20.0,37.0
50%,1115.0,11.0,16.6,0.0,2.3,51.0,157.7,0.039,51.0,26.0,96.0
75%,1651.0,17.5,20.1,0.0,3.4,69.0,199.4,0.052,69.0,37.0,150.0
max,2179.0,23.0,30.0,1.0,8.0,99.0,200.0,0.125,269.0,90.0,431.0


In [13]:
# Frequency of each hour_bef_precipitation value. The counts shown below sum to
# 1457 of the 1459 rows, so rows with a missing value are not included.
train.value_counts('hour_bef_precipitation')

hour_bef_precipitation
0.0    1411
1.0      46
dtype: int64

강수 여부(`hour_bef_precipitation`) 데이터를 확인해 본 결과 0이 1411건, 1이 46건으로 심하게 편향되어 있으므로, 따로 처리를 해주거나 사용하지 않는 것이 좋아 보인다.

## 3. Train -> Train/Validation

In [12]:
from sklearn.model_selection import train_test_split

# Target is the rental count; predictors are every remaining column except the
# row identifier.
y = train['count']
X = train.drop(columns=['id', 'count'])

# Hold out 10% of the rows for validation. With shuffle=False the rows keep
# their original order, so the validation set is the final slice of `train`.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, shuffle=False
)

# Check the shape of each split.
print(
    f"X_train.shape : {X_train.shape}",
    f"y_train.shape : {y_train.shape}",
    f"X_valid.shape : {X_valid.shape}",
    f"y_valid.shape : {y_valid.shape}",
    sep="\n",
)

X_train.shape : (1313, 9)
y_train.shape : (1313,)
X_valid.shape : (146, 9)
y_valid.shape : (146,)
