# delivery_data 로지스틱 회귀 분석
### 로지스틱 회귀란(Logistic Regression)?
회귀를 사용하여 데이터가 어떤 범주에 속할 확률을 0에서 1사이의 값으로 예측하고 그 확률에 따라 가능성이 더 높은 범주에 속하는 것으로 분류해주는 지도 학습 알고리즘이다.

### import 정의

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
import warnings 
warnings.filterwarnings('ignore')

### data 로드

In [3]:
delivery_data = pd.read_csv('delivery_data.csv', encoding='utf-8')

 - datetime : 날짜 (일별)
 - Day of the week : 요일
 - city : 도시<br>
 <pre>
    :: 서울
    강남구:11680   강동구:11740    강북구:11305    강서구:11500    관악구:11620    광진구:11215    구로구:11530    
    금천구:11545   동대문구:11230   동작구:11590    마포구:11440   서대문구:11410   서초구:11650    성동구:11200    
    성북구:11290   송파구:11710    양천구:11470    영등포구:11560   용산구:11170    은평구:11380   종로구:11110    
    중구:11140    중랑구:11260
    
    :: 경기도
    가평군: 41820  고양시 덕양구: 41281  고양시 일산동구:41285  고양시 일산서구:41287    과천시:41290
    광명시: 41210  광주시:41610    구리시:41310   군포시:41410   김포시:41570   남양주시:41360      
    동두천시:41250  부천시:41190    성남시 수정구: 41131   성남시 중원구:41133    수원시 권선구:41113     
    수원시 장안구:41111  수원시 팔달구: 41115    시흥시:41390     안산시 단원구:41273   안산시 상록구:41271     
    안성시:41270    안양시 동안구:41173    안양시 만안구:41171    양주시:41630    양평군:41830   여주시:41730     
    연천군:41800    오산시:41370    용인시 수지구:41465    용인시 처인구:41461    의왕시:41430   의정부시:41150     
    이천시:41500    파주시:41480    평택시:41220         포천시:41650          하남시:41450   화성시:41590
 </pre>
 - population : 인구수
 - hour : 시간
 - rain : 날씨 (1: 비,눈,진눈깨비 0 없음)
 - dust : 미세먼지
 - humidity : 습도
 - precipitation : 강수량
 - temp : 기온
 - windspeed : 풍량
 - holiday : 휴일 (1: 공휴일, 주말, 2: 주중)
 - sectors : 최대 이용 업종<br>
 <pre>
 1 (한식), 2 (분식) 3(카페/디저트), 4(돈까스/일식), 5(회), 6(치킨), 7(피자), 8(아시안/양식), 9(중식), 10(족발/보쌈) 
 11(야식), 12(찜탕), 13(도시락), 14(패스트푸트)
 </pre>

### 날짜 년,월,일 나누기 (update)

In [4]:
delivery_data['datetime'] = delivery_data['datetime'].apply(pd.to_datetime)
delivery_data['year'] = delivery_data['datetime'].apply(lambda x : x.year)
delivery_data['month'] = delivery_data['datetime'].apply(lambda x : x.month)
delivery_data['day'] = delivery_data['datetime'].apply(lambda x : x.day)
delivery_data = delivery_data.loc[:,['year','month','day','Day of the week','city','population','hour', 'rain','dust','humidity','precipitation','temp','windspeed','holiday','sectors']]

In [5]:
delivery_data

Unnamed: 0,year,month,day,Day of the week,city,population,hour,rain,dust,humidity,precipitation,temp,windspeed,holiday,sectors
0,2020,1,1,Wednesday,41281,467673,0,0,,75,0.0,-2.7,0.9,1,6
1,2020,1,1,Wednesday,41281,467673,1,0,,67,0.0,-1.8,1.2,1,6
2,2020,1,1,Wednesday,41281,467673,11,0,,72,0.0,0.6,1.0,1,1
3,2020,1,1,Wednesday,41281,467673,12,0,,74,0.0,0.2,0.7,1,1
4,2020,1,1,Wednesday,41281,467673,13,0,,73,0.0,0.0,0.9,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100212,2020,6,14,Sunday,11290,441812,19,0,37.0,81,0.0,18.7,1.0,0,3
100213,2020,6,19,Friday,11290,441812,11,0,25.0,62,0.0,24.2,2.6,0,3
100214,2020,6,20,Saturday,11290,441812,21,0,39.0,65,0.0,22.8,1.3,0,3
100215,2020,6,25,Thursday,11290,441812,11,1,15.0,88,0.0,20.6,1.5,0,3


### 요일 숫자형 전환
예시 Monday:1 ~ Sunday:7

In [6]:
label_delivery_data = delivery_data.copy()
label_length = len(label_delivery_data)

In [14]:
for i in range(label_length):
    if label_delivery_data['Day of the week'][i] == 'Monday':
        label_delivery_data['Day of the week'][i] = 1
    elif label_delivery_data['Day of the week'][i] == 'Tuesday':
        label_delivery_data['Day of the week'][i] = 2
    elif label_delivery_data['Day of the week'][i] == 'Wednesday':
        label_delivery_data['Day of the week'][i] = 3
    elif label_delivery_data['Day of the week'][i] == 'Thursday':
        label_delivery_data['Day of the week'][i] = 4
    elif label_delivery_data['Day of the week'][i] == 'Friday':
        label_delivery_data['Day of the week'][i] = 5
    elif label_delivery_data['Day of the week'][i] == 'Saturday':
        label_delivery_data['Day of the week'][i] = 6
    elif label_delivery_data['Day of the week'][i] == 'Sunday':
        label_delivery_data['Day of the week'][i] = 7

In [18]:
print(label_delivery_data['Day of the week'][0])

3


In [8]:
list(delivery_data.keys())

['year',
 'month',
 'day',
 'Day of the week',
 'city',
 'population',
 'hour',
 'rain',
 'dust',
 'humidity',
 'precipitation',
 'temp',
 'windspeed',
 'holiday',
 'sectors']

In [21]:
label_delivery_data

Unnamed: 0,year,month,day,Day of the week,city,population,hour,rain,dust,humidity,precipitation,temp,windspeed,holiday,sectors
0,2020,1,1,3,41281,467673,0,0,,75,0.0,-2.7,0.9,1,6
1,2020,1,1,3,41281,467673,1,0,,67,0.0,-1.8,1.2,1,6
2,2020,1,1,3,41281,467673,11,0,,72,0.0,0.6,1.0,1,1
3,2020,1,1,3,41281,467673,12,0,,74,0.0,0.2,0.7,1,1
4,2020,1,1,3,41281,467673,13,0,,73,0.0,0.0,0.9,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100212,2020,6,14,7,11290,441812,19,0,37.0,81,0.0,18.7,1.0,0,3
100213,2020,6,19,5,11290,441812,11,0,25.0,62,0.0,24.2,2.6,0,3
100214,2020,6,20,6,11290,441812,21,0,39.0,65,0.0,22.8,1.3,0,3
100215,2020,6,25,4,11290,441812,11,1,15.0,88,0.0,20.6,1.5,0,3


In [22]:
### 습도값 100인 행을 제거(update)
label_delivery_data = label_delivery_data[label_delivery_data.humidity<100]

In [23]:
### 바람 세기를 로그 변환(update)
label_delivery_data['windspeed'] = np.log1p(label_delivery_data['windspeed'])

# 습도 log 변환 시 생기는 inf -inf 값을 nan값으로 변경
label_delivery_data['windspeed'] = label_delivery_data['windspeed'].replace([np.inf, -np.inf], np.nan)

# nan 값이 들어가 있는 행 삭제
label_delivery_data = label_delivery_data.dropna()

In [34]:
label_delivery_data

Unnamed: 0,year,month,day,Day of the week,city,population,hour,rain,dust,humidity,precipitation,temp,windspeed,holiday,sectors
10599,2020,1,1,3,11305,313705,0,0,40.0,84,0.0,-4.3,0.470004,1,6
10600,2020,1,1,3,11305,313705,10,0,40.0,72,0.0,0.2,0.693147,1,2
10601,2020,1,1,3,11305,313705,11,0,40.0,73,0.0,0.0,0.336472,1,2
10602,2020,1,1,3,11305,313705,12,0,40.0,74,0.0,-0.2,0.530628,1,2
10603,2020,1,1,3,11305,313705,13,0,40.0,75,0.0,0.0,0.405465,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100212,2020,6,14,7,11290,441812,19,0,37.0,81,0.0,18.7,0.693147,0,3
100213,2020,6,19,5,11290,441812,11,0,25.0,62,0.0,24.2,1.280934,0,3
100214,2020,6,20,6,11290,441812,21,0,39.0,65,0.0,22.8,0.832909,0,3
100215,2020,6,25,4,11290,441812,11,1,15.0,88,0.0,20.6,0.916291,0,3


### 결정 경계

In [24]:
# 특성 
X = label_delivery_data[['year', 'month', 'day', 'Day of the week', 'city', 'population', 'hour', 'rain', 'precipitation', 'temp', 'windspeed', 'holiday']]
# target
y = label_delivery_data['sectors'] 

In [36]:
print(X)
print(y)

        year  month  day Day of the week   city  population  hour  rain  \
10599   2020      1    1               3  11305      313705     0     0   
10600   2020      1    1               3  11305      313705    10     0   
10601   2020      1    1               3  11305      313705    11     0   
10602   2020      1    1               3  11305      313705    12     0   
10603   2020      1    1               3  11305      313705    13     0   
...      ...    ...  ...             ...    ...         ...   ...   ...   
100212  2020      6   14               7  11290      441812    19     0   
100213  2020      6   19               5  11290      441812    11     0   
100214  2020      6   20               6  11290      441812    21     0   
100215  2020      6   25               4  11290      441812    11     1   
100216  2020      6   25               4  11290      441812    13     0   

        precipitation  temp  windspeed  holiday  
10599             0.0  -4.3   0.470004        1  

### train / test 분리 (update) shuffle=True

In [25]:
# 기존 방식
# 데이터 분할(train, test)
# from sklearn.model_selection import train_test_split
# train, test 70:30 나누기
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,shuffle=True, stratify = y, random_state=42)

## 로지스틱 회귀 모델 학습

In [28]:
from sklearn.linear_model import LogisticRegression

In [43]:
log_leg = LogisticRegression()

In [44]:
log_leg.fit(X_train, y_train)

LogisticRegression()

## 성능 평가 (predefined values)
위에서 학습된 모델을 기준으로 test set 데이터에 대하여 target을 예측해보고
실제 class와 일치하는 비율인 정확도를 통하여 성능 평가 진행

### accuracy

In [50]:
from sklearn.metrics import accuracy_score

# 로지스틱 모델 학습 성능 비교
y_pred = log_leg.predict(X_test)

# 정확도 측정
accuracy_score(y_pred, y_test)

0.3324197968724811

### f1_weighted

In [51]:
from sklearn.metrics import f1_score
y_pred = log_leg.predict(X_test)
f1_score(y_test, y_pred, average='weighted')

0.17712218512624

참고 : 
https://jimmy-ai.tistory.com/97
https://scikit-learn.org/stable/modules/model_evaluation.html