In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import os
from functools import reduce
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# 분류모델 import 
# Logistic Regression,SGD,KNN,SVM,Decision Tree, Random Forest, LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report,roc_auc_score,f1_score

lr = LogisticRegression()
knn = KNeighborsClassifier()
svm = SVC()
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
lgbm = LGBMClassifier()

In [21]:
df_coldwave = pd.read_csv('./preprocessed/df_coldwave.csv',encoding='cp949',index_col=0)
df_sea = pd.read_csv('./preprocessed/국내_바다.csv',index_col=0)

In [22]:
df_coldwave.drop(['연도','날짜','지점'],axis=1,inplace=True)

In [23]:
# 중복된 날짜 제거 
df_coldwave = df_coldwave.drop_duplicates()

In [24]:
df_sea.rename(columns={'시간':'일시'},inplace=True)

In [25]:
# 주변국과 바다기후를  merge
cold_sea = pd.merge(df_coldwave,df_sea, how='right',on='일시')

In [26]:
# 재난일어난날 컬럼을 바이너리로 만들기 
cold_sea['재난일어난날'] = cold_sea['재난일어난날'].fillna(0)

In [27]:
cold_sea['재난일어난날'] =cold_sea['재난일어난날'].apply(lambda x: x if str(x) == '0' else '1')

In [29]:
# 전부 뉴메릭으로 만들기 
cold_sea['재난일어난날']=cold_sea['재난일어난날'].astype(int)

In [33]:
cold_sea.drop('일시',axis=1,inplace=True)

In [39]:
cold_sea.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38494 entries, 14 to 38507
Data columns (total 35 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   재난일어난날       38494 non-null  float64
 1   평균 기온        38494 non-null  float64
 2   최저 기온        38494 non-null  float64
 3   최고 기온        38494 non-null  float64
 4   1시간 최다강수량    38494 non-null  float64
 5   일강수량         38494 non-null  float64
 6   최대 순간풍속      38494 non-null  float64
 7   최대 순간 풍속 풍향  38494 non-null  float64
 8   최대 풍속        38494 non-null  float64
 9   최대 풍속 풍향     38494 non-null  float64
 10  평균 풍속        38494 non-null  float64
 11  풍정합          38494 non-null  float64
 12  최대 풍향        38494 non-null  float64
 13  평균 이슬점온도     38494 non-null  float64
 14  평균 상대습도      38494 non-null  float64
 15  평균 증기압       38494 non-null  float64
 16  평균 현지기압      38494 non-null  float64
 17  최고 해면 기압     38494 non-null  float64
 18  최저 해면기압      38494 non-null  float64
 19  평균 

In [35]:
# 위치를 원핫인코딩하기
cold_sea = pd.get_dummies(cold_sea)

In [36]:
# 재난일어난날 2주뒤로 미뤄서 예측값 찾기 
cold_sea['재난일어난날'] = cold_sea['재난일어난날'].shift(periods=14, axis=0)

In [37]:
# 뒤로미룬만큼 생긴 nan값 제거 
cold_sea.dropna(inplace=True)

In [40]:
cold_sea

Unnamed: 0,재난일어난날,평균 기온,최저 기온,최고 기온,1시간 최다강수량,일강수량,최대 순간풍속,최대 순간 풍속 풍향,최대 풍속,최대 풍속 풍향,...,합계 3시간 신적설,평균 전운량,평균 중하층운량,평균 지면온도,합계 대형증발량,합계 소형증발량,안개 계속 시간,관측지점_백령도,관측지점_울릉도,관측지점_제주
14,0.0,5.7,4.3,6.5,0.0,0.0,22.4,320.0,15.0,320.0,...,0.0,8.5,-999.0,3.4,2.614596,4.0,0.0,0,0,1
15,0.0,6.4,4.2,8.9,0.0,0.0,15.5,320.0,11.0,320.0,...,0.0,2.5,-999.0,4.9,2.614596,3.1,0.0,0,0,1
16,0.0,3.6,-0.4,9.6,0.0,0.0,19.0,340.0,13.0,360.0,...,0.0,9.3,-999.0,2.4,2.614596,1.4,0.0,0,0,1
17,0.0,1.4,0.2,2.7,0.0,0.0,14.0,360.0,9.0,360.0,...,0.0,10.0,-999.0,0.9,2.614596,0.7,0.0,0,0,1
18,1.0,4.4,2.2,6.5,0.0,1.2,10.6,340.0,7.5,340.0,...,0.0,9.3,-999.0,3.6,2.614596,1.1,0.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38503,0.0,2.0,-1.4,5.8,0.0,0.0,9.5,140.0,7.2,140.0,...,0.0,5.3,4.5,-2.1,1.100000,1.5,0.0,1,0,0
38504,0.0,2.7,-1.7,6.5,0.0,0.0,19.8,340.0,13.3,340.0,...,0.0,5.3,5.3,1.3,2.000000,2.8,0.0,1,0,0
38505,0.0,-0.8,-2.2,1.4,0.0,0.3,21.1,340.0,14.2,340.0,...,0.0,6.1,6.1,-1.2,1.800000,2.5,0.0,1,0,0
38506,0.0,-3.1,-3.9,-1.5,0.0,0.0,20.6,340.0,13.4,340.0,...,0.0,4.8,4.8,-2.7,1.100000,1.6,0.0,1,0,0


In [None]:
# 한파주변국 데이터 전처리 끝 모델돌리기 

In [42]:
y = cold_sea.재난일어난날
x = cold_sea.drop('재난일어난날',axis=1)

In [43]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [44]:
models = [lr,dtc,rfc,lgbm]
# result = []
for i in models:
    i.fit(x_train,y_train)
    pred = i.predict(x_test)
    proba = i.predict_proba(x_test)[:,1]
    
    print(i.__class__.__name__)
    print(classification_report(y_test,pred))
    print('roc_auc_score :',roc_auc_score(y_test,proba))
    print('-'*50)

LogisticRegression
              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93      6740
         1.0       0.55      0.27      0.36       959

    accuracy                           0.88      7699
   macro avg       0.73      0.62      0.65      7699
weighted avg       0.86      0.88      0.86      7699

roc_auc_score : 0.905595591352268
--------------------------------------------------
DecisionTreeClassifier
              precision    recall  f1-score   support

         0.0       0.92      0.92      0.92      6740
         1.0       0.44      0.43      0.43       959

    accuracy                           0.86      7699
   macro avg       0.68      0.67      0.68      7699
weighted avg       0.86      0.86      0.86      7699

roc_auc_score : 0.6745931097860964
--------------------------------------------------
RandomForestClassifier
              precision    recall  f1-score   support

         0.0       0.91      0.97      0.94      6740
 

In [45]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [46]:
scaler.fit(x_train)

StandardScaler()

In [47]:
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [48]:
y_train.value_counts()

0.0    27039
1.0     3756
Name: 재난일어난날, dtype: int64

In [49]:
import imblearn
from imblearn.over_sampling import SMOTE

In [50]:
smote = SMOTE(random_state=0)
x_train_sm,y_train_sm = smote.fit_resample(x_train_scaled,y_train)

In [51]:
y_train_sm.value_counts()

0.0    27039
1.0    27039
Name: 재난일어난날, dtype: int64

In [52]:
models = [lr,dtc,rfc,lgbm]
# result = []
for i in models:
    i.fit(x_train_sm,y_train_sm)
    pred = i.predict(x_test_scaled)
    proba = i.predict_proba(x_test_scaled)[:,1]
    
    print(i.__class__.__name__)
    print(classification_report(y_test,pred))
    print('roc_auc_score :',roc_auc_score(y_test,proba))
    print('-'*50)
    

LogisticRegression
              precision    recall  f1-score   support

         0.0       0.98      0.81      0.89      6740
         1.0       0.41      0.90      0.56       959

    accuracy                           0.82      7699
   macro avg       0.69      0.86      0.72      7699
weighted avg       0.91      0.82      0.85      7699

roc_auc_score : 0.9192316736957079
--------------------------------------------------
DecisionTreeClassifier
              precision    recall  f1-score   support

         0.0       0.93      0.89      0.91      6740
         1.0       0.42      0.55      0.48       959

    accuracy                           0.85      7699
   macro avg       0.68      0.72      0.69      7699
weighted avg       0.87      0.85      0.86      7699

roc_auc_score : 0.720761534486653
--------------------------------------------------
RandomForestClassifier
              precision    recall  f1-score   support

         0.0       0.96      0.89      0.92      6740
 