# Softmax

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides pandas and scikit-learn
# diagnostics raised by later cells; consider narrowing it.
warnings.simplefilter('ignore')

### data 로드

In [3]:
# Load the raw delivery records from the UTF-8 CSV in the working directory.
delivery_data = pd.read_csv(filepath_or_buffer='delivery_data.csv', encoding='utf-8')

In [4]:
# Preview the loaded frame; the rendered output elides the middle rows.
delivery_data

Unnamed: 0,datetime,Day of the week,city,population,hour,rain,dust,humidity,precipitation,temp,windspeed,holiday,sectors
0,2020-01-01,Wednesday,41281,467673,0,0,,75,0.0,-2.7,0.9,1,6
1,2020-01-01,Wednesday,41281,467673,1,0,,67,0.0,-1.8,1.2,1,6
2,2020-01-01,Wednesday,41281,467673,11,0,,72,0.0,0.6,1.0,1,1
3,2020-01-01,Wednesday,41281,467673,12,0,,74,0.0,0.2,0.7,1,1
4,2020-01-01,Wednesday,41281,467673,13,0,,73,0.0,0.0,0.9,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
100212,2020-06-14,Sunday,11290,441812,19,0,37.0,81,0.0,18.7,1.0,0,3
100213,2020-06-19,Friday,11290,441812,11,0,25.0,62,0.0,24.2,2.6,0,3
100214,2020-06-20,Saturday,11290,441812,21,0,39.0,65,0.0,22.8,1.3,0,3
100215,2020-06-25,Thursday,11290,441812,11,1,15.0,88,0.0,20.6,1.5,0,3


In [5]:
# (row count, column count) of the loaded frame.
delivery_data.shape

(100217, 13)

In [6]:
# Summarise dtype and non-null count per column; in the output below,
# `dust` is the only column with missing values.
delivery_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100217 entries, 0 to 100216
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   datetime         100217 non-null  object 
 1   Day of the week  100217 non-null  object 
 2   city             100217 non-null  int64  
 3   population       100217 non-null  int64  
 4   hour             100217 non-null  int64  
 5   rain             100217 non-null  int64  
 6   dust             31671 non-null   float64
 7   humidity         100217 non-null  int64  
 8   precipitation    100217 non-null  float64
 9   temp             100217 non-null  float64
 10  windspeed        100217 non-null  float64
 11  holiday          100217 non-null  int64  
 12  sectors          100217 non-null  int64  
dtypes: float64(4), int64(7), object(2)
memory usage: 9.9+ MB


### 날짜 년,월,일 나누기 (update)

In [7]:
# Parse the date strings once with a vectorised call instead of invoking
# pd.to_datetime / a lambda on every row, then expose the calendar parts
# as separate columns via the .dt accessor.
parsed_dates = pd.to_datetime(delivery_data['datetime'])

# assign() builds a new frame, so the loaded frame is not left half-modified
# if this cell is interrupted.
delivery_data = delivery_data.assign(
    year=parsed_dates.dt.year,
    month=parsed_dates.dt.month,
    day=parsed_dates.dt.day,
)

# Keep the columns in a fixed order; the raw 'datetime' column is not retained.
delivery_data = delivery_data.loc[:, [
    'year', 'month', 'day', 'Day of the week', 'city', 'population', 'hour',
    'rain', 'dust', 'humidity', 'precipitation', 'temp', 'windspeed',
    'holiday', 'sectors',
]]

### 요일 숫자형 전환 (update)
예시 Monday:1 ~ Sunday:7

In [8]:
# Work on an independent copy so later transformations do not modify delivery_data.
label_delivery_data = delivery_data.copy()
# Number of rows in the copy.
label_length = label_delivery_data.shape[0]

In [9]:
# Show the weekday value stored at index label 0.
print(label_delivery_data.loc[0, 'Day of the week'])

Wednesday


In [15]:
# Encode weekday names as integers (Monday=1 ... Sunday=7).
# A single Series.map replaces the previous row-by-row loop, whose chained
# assignment (frame[col][i] = value) is slow, depends on the index being
# 0..n-1, and does not write back to the frame under pandas Copy-on-Write.
DAY_OF_WEEK_CODES = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7,
}

# Values that are not a known weekday name (including already-encoded
# integers when the cell is re-run) pass through unchanged, as before.
label_delivery_data['Day of the week'] = label_delivery_data['Day of the week'].map(
    lambda day_name: DAY_OF_WEEK_CODES.get(day_name, day_name)
)

In [16]:
### Remove rows whose humidity is 100 (keeps only humidity < 100) (update)
label_delivery_data = label_delivery_data.loc[label_delivery_data['humidity'].lt(100)]

In [17]:
### Log-transform the wind speed with log1p (update)
label_delivery_data = (
    label_delivery_data
    # Replace any inf / -inf produced by the windspeed log transform with NaN.
    .assign(
        windspeed=lambda frame: np.log1p(frame['windspeed']).replace([np.inf, -np.inf], np.nan)
    )
    # Drop every row that contains a NaN in any column.
    # NOTE(review): with no `subset`, this also removes rows whose only missing
    # value is in another column (e.g. `dust`) -- confirm this is intended.
    .dropna()
)

In [18]:
# Inspect the frame after weekday encoding, the humidity filter, the
# windspeed log transform and the NaN drop.
label_delivery_data

Unnamed: 0,year,month,day,Day of the week,city,population,hour,rain,dust,humidity,precipitation,temp,windspeed,holiday,sectors
10599,2020,1,1,3,11305,313705,0,0,40.0,84,0.0,-4.3,0.470004,1,6
10600,2020,1,1,3,11305,313705,10,0,40.0,72,0.0,0.2,0.693147,1,2
10601,2020,1,1,3,11305,313705,11,0,40.0,73,0.0,0.0,0.336472,1,2
10602,2020,1,1,3,11305,313705,12,0,40.0,74,0.0,-0.2,0.530628,1,2
10603,2020,1,1,3,11305,313705,13,0,40.0,75,0.0,0.0,0.405465,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100212,2020,6,14,7,11290,441812,19,0,37.0,81,0.0,18.7,0.693147,0,3
100213,2020,6,19,5,11290,441812,11,0,25.0,62,0.0,24.2,1.280934,0,3
100214,2020,6,20,6,11290,441812,21,0,39.0,65,0.0,22.8,0.832909,0,3
100215,2020,6,25,4,11290,441812,11,1,15.0,88,0.0,20.6,0.916291,0,3


In [19]:
# Feature matrix: every predictor column passed to the model.
X = label_delivery_data.loc[:, [
    'year', 'month', 'day', 'Day of the week', 'city', 'population',
    'hour', 'rain', 'precipitation', 'temp', 'windspeed', 'holiday',
]]
# Target vector: the class label to predict.
y = label_delivery_data.loc[:, 'sectors']

### train / test 분리 (update) shuffle=True

In [13]:
# Previous approach (kept for reference)
# Split the data into train and test sets
# from sklearn.model_selection import train_test_split
# 80:20 train/test split (test_size = 0.2)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Rows are shuffled, the split is
# stratified on the target, and the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
# 과거_220919
# from sklearn.linear_model import LogisticRegression
# softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=10)
# softmax_reg.fit(X_train, y_train)

In [21]:
from sklearn.linear_model import LogisticRegression

# Fit a multinomial (softmax) logistic regression on the training split.
# NOTE(review): the features are passed unscaled and max_iter is left at its
# default; warnings are silenced earlier in this notebook, so confirm the
# solver actually converges.
softmax_reg = LogisticRegression(multi_class='multinomial', random_state=42).fit(X_train, y_train)
softmax_reg

LogisticRegression(multi_class='multinomial', random_state=42)

## 성능 평가 (predefined values)
위에서 학습된 모델을 기준으로 test set 데이터에 대하여 target을 예측해보고
실제 class와 일치하는 비율인 정확도를 통하여 성능 평가 진행

### accuracy

In [22]:
from sklearn.metrics import accuracy_score

# Predict the class of every held-out row.
y_pred = softmax_reg.predict(X_test)

# Accuracy: fraction of test rows whose predicted class equals the true class.
# Arguments follow the documented (y_true, y_pred) order, matching the F1
# cell; accuracy is symmetric, so the reported value is unchanged.
accuracy_score(y_test, y_pred)

0.3324197968724811

### f1_weighted

In [25]:
from sklearn.metrics import f1_score

# Re-predict so this cell can run on its own, then report the F1 score
# using the 'weighted' averaging mode.
y_pred = softmax_reg.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

0.17712218512624