In [1]:
%pip install lightgbm

Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-win_amd64.whl (1.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.5
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import train_test_split

### 정밀도 재현율 트레이드 오프
- 사이킷런 분류 알고리즘은 (이진 분류)
- 특정 레이블(1,0)에 속하는지 계산하기 위해서 -> 레이블 결정 확률 구하는 것
- 이진 분류 임계값 조절 시에는 binarizer(threshold=값) 조정 가능, 구글링 진행해 보기!

### 필수과제1. 임계값 조절하여 분류 테스트 변화 확인해 보기
- 임계값을 3개 정도(원하는 값으로) 잡고 각각의 평가 지표 변화 확인해 보기!

### 정밀도와 재현율은 어떤 상황에서 중요한 지표로 선택될 것인가?
- 재현율이 중요한 경우는 실제 positive 양성 데이터를 negative 잘못 판단하게 되면 문제가 되는 경우
- 재현율이 가장 중요한 사례는 암환자 판단
- 양성 암 환자를 양성이 아닌 음성으로 판단하면 큰일이다.
- 보험 사기 (대출 사기)
- 1인 사기 건인데, 0으로 판단하면 안 된다.
- 0을 잘못해서 1로 판단하면? -> 코스트 비용이 많이 든다.

- 반대로 정밀도가 중요한 경우?
- 스팸 메일 판단할 때 positive 스팸 메일을 negative 일반 메일로 분류해도 유저 입장에서 한 번 보면 된다.
- negative 일반 메일 스팸 메일로 분류하면 큰 이슈가 된다.

### F1 스코어
- 정밀도와 재현율을 결합한 지표
- 어느 한 쪽으로 치우치치 않는 수치를 나타낼 때 진행
- F1 2 (precision recall)/(precision+recall)

In [7]:
def f_1(pr, re):
    return (2*((pr*re)/(pr+re)))

In [8]:
# 머신러닝 결과
# pr 0.9 re 0.1 무엇이 잘 된 건가?
# pr 0.5 re 0.5 두 개의 결과를 비교했을 때 어떤 것을 성능이 좋다고 말할 수 있는가?

f_1(0.5,0.5)

0.5

In [9]:
f_1(0.9,0.1)

0.18000000000000002

### ROC 곡선과 AUC
- 이진 분류 예측에서 사용하는 것
- AUC 스코어는 이진 분류의 예측 성능 측정에 중요하게 사용되는 지표
- ROC 커브 (Receiver Operation Characteristic Curve)
- ROC 커브 FPR(False Positive Rate)가 변할 때 TPR(True Positive Rate)이 어떻게 변하는지 나타내는 곡선
- FPR x축, TPR y축, FPR 변화에 따른 TPR 변화 곡선을 나타낸 것
- TPR은 재현율, 민감도
- 특이성(specificity): 민감도에 대응되는 특이성 지표 (1-재현율, 민감도)
- Specificity(true negative rate) is the probability of a negative test result, conditioned on the individual truly being negative.

- 민감도 실제값(양성)이 정확히 예측되어야 하는 수준 (질병이 있으면 질병이 있다고 양성 판정)
- 특이성 지표(TNR) 실제값 Negative(음성)이 정확히 예측되어야 하는 수준 (질병이 없는 사람은 질병이 없다고 판단하는 것)

- FPR = FP/(FP+TN) = 1-TNR = 1-특이성

- ROC 곡선 자체는 FPR, TPR 변화의 값을 보는 데 이용하고, 분류 성능 지표로 사용되는 건 ROC 곡선 면적에 기반한 AUC값
- Area Under Curve AUC
- AUC가 커지려면 어떻게 해야하지? FPR 작은 상태에서 가장 높은 TPR이 나와야 한다. FPR 작은 상태에서 TPR을 얼마나 얻을 수 있나?

### 조별과제 제주 교통량 데이터 확인
- MAE 잔차의 절댓값
- 기본 Base LGBM을 사용, 튜닝은 없고

분석쉽죠(10조)의 csv 파일을 이용해서 진행함

### 기존 raw train.csv

In [2]:
train = pd.read_csv("train.csv")

In [10]:
train

Unnamed: 0.1,Unnamed: 0,lane_count,multi_linked,maximum_speed_limit,start_turn_restricted,end_turn_restricted,target,금,목,수,...,quarter_2,quarter_3,quarter_4,rating_103,rating_106,rating_107,norm_start_latitude,norm_start_longitude,norm_end_latitude,norm_end_longitude
0,0,1,0,60.0,0,0,52.0,0,1,0,...,1,0,0,0,1,0,0.589530,0.641427,0.589534,0.641057
1,1,2,0,60.0,1,0,30.0,0,1,0,...,0,1,0,1,0,0,0.822965,0.463021,0.836017,0.459191
2,2,2,0,80.0,0,0,61.0,0,0,0,...,0,0,1,1,0,0,0.114228,0.248530,0.117193,0.239911
3,3,2,0,50.0,0,0,20.0,1,0,0,...,0,0,0,0,0,1,0.008473,0.513932,0.006824,0.512628
4,4,2,0,80.0,0,0,38.0,0,0,0,...,0,0,1,1,0,0,0.699772,0.192342,0.701251,0.197154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4701212,4701212,1,0,50.0,0,0,20.0,0,1,0,...,0,0,1,0,0,1,0.571610,0.127629,0.567803,0.121783
4701213,4701213,2,0,80.0,0,0,65.0,0,1,0,...,0,0,0,0,0,1,0.732686,0.323057,0.732751,0.323755
4701214,4701214,2,0,60.0,0,0,30.0,0,0,0,...,1,0,0,1,0,0,0.651694,0.975462,0.641900,0.975955
4701215,4701215,2,0,80.0,0,0,73.0,0,0,1,...,0,0,1,1,0,0,0.640222,0.333011,0.644699,0.335035


In [3]:
train_sp = train.select_dtypes(exclude="object")

In [11]:
train_sp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4701217 entries, 0 to 4701216
Data columns (total 16 columns):
 #   Column               Dtype  
---  ------               -----  
 0   base_date            int64  
 1   base_hour            int64  
 2   lane_count           int64  
 3   road_rating          int64  
 4   multi_linked         int64  
 5   connect_code         int64  
 6   maximum_speed_limit  float64
 7   vehicle_restricted   float64
 8   weight_restricted    float64
 9   height_restricted    float64
 10  road_type            int64  
 11  start_latitude       float64
 12  start_longitude      float64
 13  end_latitude         float64
 14  end_longitude        float64
 15  target               float64
dtypes: float64(9), int64(7)
memory usage: 573.9 MB


### 조별과제 데이터

In [4]:
train = pd.read_csv('10조_분석쉽조.csv')

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4701217 entries, 0 to 4701216
Data columns (total 57 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Unnamed: 0             int64  
 1   lane_count             int64  
 2   multi_linked           int64  
 3   maximum_speed_limit    float64
 4   start_turn_restricted  int64  
 5   end_turn_restricted    int64  
 6   target                 float64
 7   금                      int64  
 8   목                      int64  
 9   수                      int64  
 10  월                      int64  
 11  일                      int64  
 12  토                      int64  
 13  화                      int64  
 14  code_0                 int64  
 15  code_103               int64  
 16  weight_0               int64  
 17  weight_32400           int64  
 18  weight_43200           int64  
 19  weight_50000           int64  
 20  road_0                 int64  
 21  road_3                 int64  
 22  hour_0            

In [6]:
train.target.head()

0    52.0
1    30.0
2    61.0
3    20.0
4    38.0
Name: target, dtype: float64

In [12]:
train

Unnamed: 0.1,Unnamed: 0,lane_count,multi_linked,maximum_speed_limit,start_turn_restricted,end_turn_restricted,target,금,목,수,...,quarter_2,quarter_3,quarter_4,rating_103,rating_106,rating_107,norm_start_latitude,norm_start_longitude,norm_end_latitude,norm_end_longitude
0,0,1,0,60.0,0,0,52.0,0,1,0,...,1,0,0,0,1,0,0.589530,0.641427,0.589534,0.641057
1,1,2,0,60.0,1,0,30.0,0,1,0,...,0,1,0,1,0,0,0.822965,0.463021,0.836017,0.459191
2,2,2,0,80.0,0,0,61.0,0,0,0,...,0,0,1,1,0,0,0.114228,0.248530,0.117193,0.239911
3,3,2,0,50.0,0,0,20.0,1,0,0,...,0,0,0,0,0,1,0.008473,0.513932,0.006824,0.512628
4,4,2,0,80.0,0,0,38.0,0,0,0,...,0,0,1,1,0,0,0.699772,0.192342,0.701251,0.197154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4701212,4701212,1,0,50.0,0,0,20.0,0,1,0,...,0,0,1,0,0,1,0.571610,0.127629,0.567803,0.121783
4701213,4701213,2,0,80.0,0,0,65.0,0,1,0,...,0,0,0,0,0,1,0.732686,0.323057,0.732751,0.323755
4701214,4701214,2,0,60.0,0,0,30.0,0,0,0,...,1,0,0,1,0,0,0.651694,0.975462,0.641900,0.975955
4701215,4701215,2,0,80.0,0,0,73.0,0,0,1,...,0,0,1,1,0,0,0.640222,0.333011,0.644699,0.335035


In [13]:
train.columns

Index(['Unnamed: 0', 'lane_count', 'multi_linked', 'maximum_speed_limit',
       'start_turn_restricted', 'end_turn_restricted', 'target', '금', '목', '수',
       '월', '일', '토', '화', 'code_0', 'code_103', 'weight_0', 'weight_32400',
       'weight_43200', 'weight_50000', 'road_0', 'road_3', 'hour_0', 'hour_1',
       'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8',
       'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14',
       'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20',
       'hour_21', 'hour_22', 'hour_23', 'quarter_1', 'quarter_2', 'quarter_3',
       'quarter_4', 'rating_103', 'rating_106', 'rating_107',
       'norm_start_latitude', 'norm_start_longitude', 'norm_end_latitude',
       'norm_end_longitude'],
      dtype='object')

In [14]:
train_x = train[['lane_count', 'multi_linked', 'maximum_speed_limit',
       'start_turn_restricted', 'end_turn_restricted', '금', '목', '수',
       '월', '일', '토', '화', 'code_0', 'code_103', 'weight_0', 'weight_32400',
       'weight_43200', 'weight_50000', 'road_0', 'road_3', 'hour_0', 'hour_1',
       'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8',
       'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14',
       'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20',
       'hour_21', 'hour_22', 'hour_23', 'quarter_1', 'quarter_2', 'quarter_3',
       'quarter_4', 'rating_103', 'rating_106', 'rating_107',
       'norm_start_latitude', 'norm_start_longitude', 'norm_end_latitude',
       'norm_end_longitude']]

In [15]:
train_y = train['target']

In [16]:
train

Unnamed: 0.1,Unnamed: 0,lane_count,multi_linked,maximum_speed_limit,start_turn_restricted,end_turn_restricted,target,금,목,수,...,quarter_2,quarter_3,quarter_4,rating_103,rating_106,rating_107,norm_start_latitude,norm_start_longitude,norm_end_latitude,norm_end_longitude
0,0,1,0,60.0,0,0,52.0,0,1,0,...,1,0,0,0,1,0,0.589530,0.641427,0.589534,0.641057
1,1,2,0,60.0,1,0,30.0,0,1,0,...,0,1,0,1,0,0,0.822965,0.463021,0.836017,0.459191
2,2,2,0,80.0,0,0,61.0,0,0,0,...,0,0,1,1,0,0,0.114228,0.248530,0.117193,0.239911
3,3,2,0,50.0,0,0,20.0,1,0,0,...,0,0,0,0,0,1,0.008473,0.513932,0.006824,0.512628
4,4,2,0,80.0,0,0,38.0,0,0,0,...,0,0,1,1,0,0,0.699772,0.192342,0.701251,0.197154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4701212,4701212,1,0,50.0,0,0,20.0,0,1,0,...,0,0,1,0,0,1,0.571610,0.127629,0.567803,0.121783
4701213,4701213,2,0,80.0,0,0,65.0,0,1,0,...,0,0,0,0,0,1,0.732686,0.323057,0.732751,0.323755
4701214,4701214,2,0,60.0,0,0,30.0,0,0,0,...,1,0,0,1,0,0,0.651694,0.975462,0.641900,0.975955
4701215,4701215,2,0,80.0,0,0,73.0,0,0,1,...,0,0,1,1,0,0,0.640222,0.333011,0.644699,0.335035


In [17]:
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.3, random_state=111)

In [18]:
LR = lgb.LGBMRegressor(random_state=111).fit(X_train, y_train)

In [19]:
pred = LR.predict(X_test)

In [20]:
pred

array([38.05260684, 17.34726947, 30.01463445, ..., 35.40401913,
       61.4830966 , 29.02842805])

In [21]:
from sklearn.metrics import mean_absolute_error

In [22]:
mean_absolute_error(y_test, pred)

4.880460015716158

- 분석쉽조 4.88