# Seoul Public Bike rental prediction with DecisionTreeClassifier
> "작성 중"

- toc: true
- branch: master
- badges: true
- comments: true
- author: 최민아
- categories: [python]

### EDA

#### import

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.tree import DecisionTreeClassifier

In [2]:
train = pd.read_csv("seoulbikedata/train.csv")
test = pd.read_csv("seoulbikedata/test.csv")

#### row column 

In [3]:
# 1459행 11열 train set
train.shape

(1459, 11)

In [4]:
# 715행 10열 test set
test.shape

(715, 10)

#### checking data

In [5]:
train.head()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
0,3,20,16.3,1.0,1.5,89.0,576.0,0.027,76.0,33.0,49.0
1,6,13,20.1,0.0,1.4,48.0,916.0,0.042,73.0,40.0,159.0
2,7,6,13.9,0.0,0.7,79.0,1382.0,0.033,32.0,19.0,26.0
3,8,23,8.1,0.0,2.7,54.0,946.0,0.04,75.0,64.0,57.0
4,9,18,29.5,0.0,4.8,7.0,2000.0,0.057,27.0,11.0,431.0


In [6]:
train.tail()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
1454,2174,4,16.8,0.0,1.6,53.0,2000.0,0.031,37.0,27.0,21.0
1455,2175,3,10.8,0.0,3.8,45.0,2000.0,0.039,34.0,19.0,20.0
1456,2176,5,18.3,0.0,1.9,54.0,2000.0,0.009,30.0,21.0,22.0
1457,2178,21,20.7,0.0,3.7,37.0,1395.0,0.082,71.0,36.0,216.0
1458,2179,17,21.1,0.0,3.1,47.0,1973.0,0.046,38.0,17.0,170.0


In [7]:
test.head()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5
0,0,7,20.7,0.0,1.3,62.0,954.0,0.041,44.0,27.0
1,1,17,30.0,0.0,5.4,33.0,1590.0,0.061,49.0,36.0
2,2,13,19.0,1.0,2.1,95.0,193.0,0.02,36.0,28.0
3,4,6,22.5,0.0,2.5,60.0,1185.0,0.027,52.0,38.0
4,5,22,14.6,1.0,3.4,93.0,218.0,0.041,18.0,15.0


In [8]:
test.tail()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5
710,2148,1,24.6,0.0,2.4,60.0,1745.0,,,
711,2149,1,18.1,0.0,1.0,55.0,2000.0,,,
712,2165,9,23.3,0.0,2.3,66.0,1789.0,0.02,17.0,15.0
713,2166,16,27.0,0.0,1.6,46.0,1956.0,0.032,40.0,26.0
714,2177,8,22.3,0.0,1.0,63.0,1277.0,0.007,30.0,24.0


#### checking missing value

In [9]:
train.isnull().sum()

id                          0
hour                        0
hour_bef_temperature        2
hour_bef_precipitation      2
hour_bef_windspeed          9
hour_bef_humidity           2
hour_bef_visibility         2
hour_bef_ozone             76
hour_bef_pm10              90
hour_bef_pm2.5            117
count                       0
dtype: int64

In [10]:
test.isnull().sum()

id                         0
hour                       0
hour_bef_temperature       1
hour_bef_precipitation     1
hour_bef_windspeed         1
hour_bef_humidity          1
hour_bef_visibility        1
hour_bef_ozone            35
hour_bef_pm10             37
hour_bef_pm2.5            36
dtype: int64

### pretreatment

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      1459 non-null   int64  
 1   hour                    1459 non-null   int64  
 2   hour_bef_temperature    1457 non-null   float64
 3   hour_bef_precipitation  1457 non-null   float64
 4   hour_bef_windspeed      1450 non-null   float64
 5   hour_bef_humidity       1457 non-null   float64
 6   hour_bef_visibility     1457 non-null   float64
 7   hour_bef_ozone          1383 non-null   float64
 8   hour_bef_pm10           1369 non-null   float64
 9   hour_bef_pm2.5          1342 non-null   float64
 10  count                   1459 non-null   float64
dtypes: float64(9), int64(2)
memory usage: 125.5 KB


In [12]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 715 entries, 0 to 714
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      715 non-null    int64  
 1   hour                    715 non-null    int64  
 2   hour_bef_temperature    714 non-null    float64
 3   hour_bef_precipitation  714 non-null    float64
 4   hour_bef_windspeed      714 non-null    float64
 5   hour_bef_humidity       714 non-null    float64
 6   hour_bef_visibility     714 non-null    float64
 7   hour_bef_ozone          680 non-null    float64
 8   hour_bef_pm10           678 non-null    float64
 9   hour_bef_pm2.5          679 non-null    float64
dtypes: float64(8), int64(2)
memory usage: 56.0 KB


In [13]:
#train set 결측치 제외 열 평균
train[train.notnull()].mean()

id                        1105.914325
hour                        11.493489
hour_bef_temperature        16.717433
hour_bef_precipitation       0.031572
hour_bef_windspeed           2.479034
hour_bef_humidity           52.231297
hour_bef_visibility       1405.216884
hour_bef_ozone               0.039149
hour_bef_pm10               57.168736
hour_bef_pm2.5              30.327124
count                      108.563400
dtype: float64

In [14]:
#test set 결측치 제외 열 평균
test[test.notnull()].mean()

id                        1056.854545
hour                        11.472727
hour_bef_temperature        23.263305
hour_bef_precipitation       0.051821
hour_bef_windspeed           2.388515
hour_bef_humidity           56.668067
hour_bef_visibility       1359.494398
hour_bef_ozone               0.041196
hour_bef_pm10               36.930678
hour_bef_pm2.5              24.939617
dtype: float64

In [15]:
#결측치를 평균으로 대체
train.fillna(train.mean())

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
0,3,20,16.3,1.0,1.5,89.0,576.0,0.027,76.0,33.0,49.0
1,6,13,20.1,0.0,1.4,48.0,916.0,0.042,73.0,40.0,159.0
2,7,6,13.9,0.0,0.7,79.0,1382.0,0.033,32.0,19.0,26.0
3,8,23,8.1,0.0,2.7,54.0,946.0,0.040,75.0,64.0,57.0
4,9,18,29.5,0.0,4.8,7.0,2000.0,0.057,27.0,11.0,431.0
...,...,...,...,...,...,...,...,...,...,...,...
1454,2174,4,16.8,0.0,1.6,53.0,2000.0,0.031,37.0,27.0,21.0
1455,2175,3,10.8,0.0,3.8,45.0,2000.0,0.039,34.0,19.0,20.0
1456,2176,5,18.3,0.0,1.9,54.0,2000.0,0.009,30.0,21.0,22.0
1457,2178,21,20.7,0.0,3.7,37.0,1395.0,0.082,71.0,36.0,216.0


In [16]:
test.fillna(test.mean())

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5
0,0,7,20.7,0.0,1.3,62.0,954.0,0.041000,44.000000,27.000000
1,1,17,30.0,0.0,5.4,33.0,1590.0,0.061000,49.000000,36.000000
2,2,13,19.0,1.0,2.1,95.0,193.0,0.020000,36.000000,28.000000
3,4,6,22.5,0.0,2.5,60.0,1185.0,0.027000,52.000000,38.000000
4,5,22,14.6,1.0,3.4,93.0,218.0,0.041000,18.000000,15.000000
...,...,...,...,...,...,...,...,...,...,...
710,2148,1,24.6,0.0,2.4,60.0,1745.0,0.041196,36.930678,24.939617
711,2149,1,18.1,0.0,1.0,55.0,2000.0,0.041196,36.930678,24.939617
712,2165,9,23.3,0.0,2.3,66.0,1789.0,0.020000,17.000000,15.000000
713,2166,16,27.0,0.0,1.6,46.0,1956.0,0.032000,40.000000,26.000000


### modeling

In [18]:
X_train = train.drop(['count'], axis=1)
Y_train = train['count']

In [22]:
model = DecisionTreeClassifier()

In [23]:
model.fit(X_train, Y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

# **?**