# 업 샘플링 로지스틱 회귀

## 종합코드

In [None]:
# 라이브러리 로드
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from scipy.special import expit

# 데이터 로딩
df = pd.read_csv('./data/concat.csv', encoding='utf-8')

# 코드
print(pd.unique(df['특보강도']))

# 데이터 셋
df_input = df[['WIND_SPEED', 'GUST_WIND', 'MAX_WAVE', 'SIG_WAVE', 'HPA']].to_numpy()
df_target = df[['특보강도']].to_numpy()

# 데이터 확인
print(df_input[:5])
print(df_target[:5])

# 훈련세트와 데이터 세트로 나누기
train_input, test_input, train_target, test_target = train_test_split(df_input, df_target, random_state=42)

# 표준화 전처리
ss = StandardScaler()
ss.fit(train_input)
train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)

# k - 최근접 이웃 분류기의 확률 예측
kn = KNeighborsClassifier(n_neighbors=3)
kn.fit(train_scaled, train_target)
print(kn.score(train_scaled, train_target))
print(kn.score(test_scaled, test_target))

# 샘플 예측
proba = kn.predict_proba(test_scaled[:5])
print(np.round(proba,decimals=4))

# 계산한 확률이 가까운 이웃과 비율 맞는지
distanced, indexes = kn.kneighbors(test_scaled[3:4])
print(train_target[indexes])

# plot
z= np.arange(0, 20, 10)
phi = 1 / (1 + np.exp(-z))
plt.plot(z, phi)
plt.xlabel('z')
plt.ylabel('phi')
plt.show() 

# 0/1 행 도출
st_indexes = (train_target == '1') | (train_target == '2')
train_st = train_scaled[st_indexes]
target_st = train_target[st_indexes]

# 로지스틱 회귀 훈련
lr = LogisticRegression(C=20, max_iter=1000)
lr.fit(train_scaled, train_target)
print(lr.score(train_scaled, train_target))
print(lr.score(test_scaled, test_target))

# 5개의 샘플 예측
print(lr.predict(test_scaled[:5]))

# 5개의 샘플 예측 확률
proba = lr.predict_proba(test_scaled[:10])
print(np.round(proba, decimals=3))

# 속성 확인
print(lr.classes_)

# 로지스틱 회귀가 학습한 계수
print(lr.coef_, lr.intercept_)

# z값 출력
decisions = lr.decision_function(train_st[:5])
print(decisions)

# decisions 배열의 값을 확률로 변환
print(expit(decisions))

## 업샘플링 로지스틱 회귀

In [32]:
# 라이브러리 로드
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from scipy.special import expit

In [50]:
# 데이터 로딩
df = pd.read_csv('./data/up_sampling.csv', encoding='utf-8')

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138976 entries, 0 to 138975
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   일시          138976 non-null  object 
 1   WIND_SPEED  138976 non-null  float64
 2   GUST_WIND   138976 non-null  float64
 3   SIG_WAVE    138976 non-null  float64
 4   MAX_WAVE    138976 non-null  float64
 5   HPA         138976 non-null  float64
 6   특보강도        138976 non-null  float64
dtypes: float64(6), object(1)
memory usage: 7.4+ MB


In [65]:
# obj -> datetime
df['일시'] = pd.to_datetime(df['일시'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138976 entries, 0 to 138975
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   일시          138976 non-null  datetime64[ns]
 1   WIND_SPEED  138976 non-null  float64       
 2   GUST_WIND   138976 non-null  float64       
 3   SIG_WAVE    138976 non-null  float64       
 4   MAX_WAVE    138976 non-null  float64       
 5   HPA         138976 non-null  float64       
 6   특보강도        138976 non-null  float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 7.4 MB


In [76]:
# 인덱스
df = df.set_index('일시')
df

Unnamed: 0_level_0,WIND_SPEED,GUST_WIND,SIG_WAVE,MAX_WAVE,HPA,특보강도
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-04-21 18:00:00,5.5,12.7,1.1,1.7,1007.0,1.0
2018-08-17 07:00:00,10.6,15.2,0.7,1.1,1012.9,1.0
2018-06-10 12:00:00,6.5,15.2,0.8,1.2,1005.4,1.0
2018-05-20 12:00:00,10.7,18.8,0.8,1.1,1019.9,1.0
2015-11-25 09:00:00,6.4,13.3,0.9,1.4,1019.2,1.0
...,...,...,...,...,...,...
2020-12-29 22:00:00,12.5,15.9,1.0,1.7,1013.4,0.0
2020-12-29 23:00:00,13.3,17.3,1.1,1.6,1013.1,0.0
2020-12-30 00:00:00,13.6,16.8,1.1,1.8,1013.5,0.0
2020-12-30 01:00:00,12.3,15.8,1.1,1.8,1014.1,0.0


In [77]:
# 코드
print(pd.unique(df['특보강도']))

[1. 0.]


In [88]:
# 데이터 셋
df_input = df[['WIND_SPEED', 'GUST_WIND', 'MAX_WAVE', 'SIG_WAVE', 'HPA']].to_numpy()
df_target = df[['특보강도']].to_numpy()

In [89]:
df_input

array([[5.5000e+00, 1.2700e+01, 1.7000e+00, 1.1000e+00, 1.0070e+03],
       [1.0600e+01, 1.5200e+01, 1.1000e+00, 7.0000e-01, 1.0129e+03],
       [6.5000e+00, 1.5200e+01, 1.2000e+00, 8.0000e-01, 1.0054e+03],
       ...,
       [1.3600e+01, 1.6800e+01, 1.8000e+00, 1.1000e+00, 1.0135e+03],
       [1.2300e+01, 1.5800e+01, 1.8000e+00, 1.1000e+00, 1.0141e+03],
       [9.3000e+00, 1.3100e+01, 1.7000e+00, 1.2000e+00, 1.0140e+03]])

In [90]:
df_target

array([[1.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [91]:
# 훈련세트와 데이터 세트로 나누기
train_input, test_input, train_target, test_target = train_test_split(df_input, df_target, random_state=42)

In [92]:
# 표준화 전처리
ss = StandardScaler()
ss.fit(train_input)
train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)


In [93]:
# k - 최근접 이웃 분류기의 확률 예측
kn = KNeighborsClassifier(n_neighbors=3)
kn.fit(train_scaled, train_target)
print(kn.score(train_scaled, train_target))
print(kn.score(test_scaled, test_target))

  return self._fit(X, y)


0.9764563665668893
0.9549850333870596


In [94]:
# 샘플 예측
proba = kn.predict_proba(test_scaled[:5])
print(np.round(proba,decimals=4))


[[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]


In [95]:
# 계산한 확률이 가까운 이웃과 비율 맞는지
distanced, indexes = kn.kneighbors(test_scaled[3:4])
print(train_target[indexes])

[[[1.]
  [1.]
  [1.]]]


In [96]:
# 0/1 행 도출
st_indexes = (train_target == '1') | (train_target == '0')
train_st = train_scaled[st_indexes]
target_st = train_target[st_indexes]


  st_indexes = (train_target == '1') | (train_target == '0')


In [97]:
scaler = StandardScaler()
train_input = scaler.fit_transform(train_input)
test_input = scaler.transform(test_input)

In [98]:
# 로지스틱 회귀 훈련
lr = LogisticRegression(C=20, max_iter=1000)
lr.fit(train_scaled, train_target)
print(lr.score(train_scaled, train_target))
print(lr.score(test_scaled, test_target))


0.8118715941361578
0.8121402256504721


  y = column_or_1d(y, warn=True)


In [46]:
model = LogisticRegression()
model.fit(train_input, train_target)

  y = column_or_1d(y, warn=True)


In [48]:
print(model.coef_)

[[-0.28911932  1.52060898 -2.75405452  4.56638859 -0.0167087 ]]


In [47]:
print(model.score(train_input, train_target))

0.8120922557372017


In [None]:
# 5개의 샘플 예측
print(lr.predict(test_scaled[:5]))

[0. 0. 0. 1. 0.]


In [None]:
# 5개의 샘플 예측 확률
proba = lr.predict_proba(test_scaled[:10])
print(np.round(proba, decimals=3))


[[0.938 0.062]
 [0.563 0.437]
 [0.66  0.34 ]
 [0.008 0.992]
 [0.683 0.317]
 [0.002 0.998]
 [0.319 0.681]
 [0.164 0.836]
 [0.292 0.708]
 [0.648 0.352]]


In [None]:
print(lr.classes_)

[0. 1.]


In [None]:
# 로지스틱 회귀가 학습한 계수
print(lr.coef_, lr.intercept_)

[[-0.28863885  1.52019745 -2.80454525  4.61985531 -0.01684893]] [0.34361384]


In [None]:
sun = np.array([8.1,11.3,1.3,0.9,1032.9])
rain = np.array([4.6,6.6,1.5,1.0,1012.1,1.0])
hurri = np.array([14.8,20.1,5.5,3.6,991.3])

In [None]:
sample_weather = np.array([sun, rain, hurri])

  sample_weather = np.array([sun, rain, hurri])


In [None]:
from sklearn.preprocessing import StandardScaler