In [1]:
# 필요한 라이브러리 로딩
import pandas as pd
import numpy as np

# StandardScaler, train_test_split, LogisticRegression 로딩
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# 분류 모델을 위한 성능 지표 함수 로딩
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
# URL 통해서 캐글의 자전거 대여 수요 데이터셋 다운로드
url = 'https://codepresso-online-platform-public.s3.ap-northeast-2.amazonaws.com/learning-resourse/python-machine-learning-20210326/bike-demand.csv'
df_bike = pd.read_csv(url)

In [3]:
print(df_bike)
print(type(df_bike))

                  datetime  season  holiday  ...  casual  registered  count
0      2011-01-01 00:00:00       1        0  ...       3          13     16
1      2011-01-01 01:00:00       1        0  ...       8          32     40
2      2011-01-01 02:00:00       1        0  ...       5          27     32
3      2011-01-01 03:00:00       1        0  ...       3          10     13
4      2011-01-01 04:00:00       1        0  ...       0           1      1
...                    ...     ...      ...  ...     ...         ...    ...
10881  2012-12-19 19:00:00       4        0  ...       7         329    336
10882  2012-12-19 20:00:00       4        0  ...      10         231    241
10883  2012-12-19 21:00:00       4        0  ...       4         164    168
10884  2012-12-19 22:00:00       4        0  ...      12         117    129
10885  2012-12-19 23:00:00       4        0  ...       4          84     88

[10886 rows x 12 columns]
<class 'pandas.core.frame.DataFrame'>


In [4]:
# 독립변수 데이터 생성
# temp, atemp, humidity, windspeed	컬럼 데이터만 저장
X_df_bike = df_bike.iloc[:, 5:9]
print(X_df_bike.head(5))

   temp   atemp  humidity  windspeed
0  9.84  14.395        81        0.0
1  9.02  13.635        80        0.0
2  9.02  13.635        80        0.0
3  9.84  14.395        75        0.0
4  9.84  14.395        75        0.0


In [5]:
# 종속변수 데이터를 위한 파생변수 생성
# 총 대여건수(count) 가 500 이상인 경우 1, 미만인 경우 0
df_bike['y'] = 1
df_bike.loc[df_bike['count'] < 500, 'y'] = 0
y = df_bike['y'] 

In [6]:
# StandardScaler 이용한 스케일링
scaler = StandardScaler()
scaler.fit(X_df_bike)
result=scaler.transform(X_df_bike)

In [7]:
result

array([[-1.33366069, -1.09273697,  0.99321305, -1.56775367],
       [-1.43890721, -1.18242083,  0.94124921, -1.56775367],
       [-1.43890721, -1.18242083,  0.94124921, -1.56775367],
       ...,
       [-0.80742813, -0.91395927, -0.04606385,  0.26970368],
       [-0.80742813, -0.73518157, -0.04606385, -0.83244247],
       [-0.91267464, -0.82486544,  0.21375537, -0.46560752]])

In [8]:
# 스케일된 결과 데이터를 DataFrame 으로 저장
X_scaled_bike = pd.DataFrame(data=result,columns=X_df_bike.columns)

In [9]:
X_scaled_bike

Unnamed: 0,temp,atemp,humidity,windspeed
0,-1.333661,-1.092737,0.993213,-1.567754
1,-1.438907,-1.182421,0.941249,-1.567754
2,-1.438907,-1.182421,0.941249,-1.567754
3,-1.333661,-1.092737,0.681430,-1.567754
4,-1.333661,-1.092737,0.681430,-1.567754
...,...,...,...,...
10881,-0.596935,-0.467310,-0.617666,1.617227
10882,-0.702182,-0.735182,-0.253919,0.269704
10883,-0.807428,-0.913959,-0.046064,0.269704
10884,-0.807428,-0.735182,-0.046064,-0.832442


In [10]:
# 데이터셋 분리
x_train, x_test, y_train, y_test = train_test_split(X_scaled_bike,y,test_size=0.3,random_state=12)


In [11]:
# LogisticRegression 모델 객체 생성
clf = LogisticRegression()

# 훈련 데이터를 통한 학습
clf.fit(x_train,y_train)

# 학습된 모델에 테스트 데이터셋 이용하여 예측값 생성
y_pred = clf.predict(x_test)

In [16]:
# score 메소드를 통한 정확도 측정
train_score = clf.score(x_train,y_train)
test_score = clf.score(x_test,y_test)
print('Training Data Accuracy: {:0.3f}'.format(train_score))
print('Testing Data Accuracy: {:0.3f}'.format(test_score))

Training Data Accuracy: 0.927
Testing Data Accuracy: 0.924


In [17]:
# 오차 행렬 생성
confusion = confusion_matrix(y_test,y_pred)
print('Confusion Matrixs : \n', confusion)

Confusion Matrixs : 
 [[3018    0]
 [ 248    0]]


In [18]:
# 정확도, 정밀도, 재현율 계산 
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
  
print('Accuracy: {0:.4f}, Precision: {1:.4f}, Recall: {2:.4f}'
      .format(accuracy , precision ,recall))

Accuracy: 0.9241, Precision: 0.0000, Recall: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))
