In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import OneSidedSelection
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold

from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier
from lightgbm import early_stopping, LGBMClassifier
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
import joblib
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.preprocessing import MinMaxScaler
plt.rc("font", family = "Malgun Gothic")
sns.set(font="Malgun Gothic", 
rc={"axes.unicode_minus":False}, style='darkgrid')
%matplotlib inline
warnings.filterwarnings("ignore")

df = pd.read_csv('./dataset/undersampled_data.csv')

In [2]:
# Remove the 'Unnamed: 0' column (presumably a leftover index column from an
# earlier CSV export). errors='ignore' keeps this cell safe to re-run: the
# original in-place drop raised KeyError on a second execution because the
# column was already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Parse the timestamp column so the .dt accessor is available below.
df['일시'] = pd.to_datetime(df['일시'])

# Calendar features derived from the timestamp: weekday name and month number.
df['day_name'] = df['일시'].dt.day_name()
df['Month'] = df['일시'].dt.month

df.head()

Unnamed: 0,일시,지점명,기온,강수량,풍속,풍향,습도,실효습도,label,day_name,Month
0,2012-01-01 00:00:00,속초,-0.5,0.0,0.9,50.0,57.0,47.44,0,Sunday,1
1,2012-01-01 01:00:00,속초,0.3,0.0,1.2,290.0,54.0,45.95,0,Sunday,1
2,2012-01-01 02:00:00,속초,0.3,0.0,1.4,320.0,53.0,44.15,0,Sunday,1
3,2012-01-01 03:00:00,속초,0.5,0.0,1.5,290.0,52.0,43.85,0,Sunday,1
4,2012-01-01 04:00:00,속초,0.3,0.0,1.5,290.0,51.0,45.0,0,Sunday,1


In [3]:
# Build the model input from a copy of `df`: Month is cast to str so that
# get_dummies one-hot encodes it, and the columns not used as features
# (timestamp, station name, wind direction, precipitation) are removed.
df_copy = (
    df.assign(Month=df['Month'].astype(str))
      .drop(columns=['일시', '지점명', '풍향', '강수량'])
)

# One independent MinMaxScaler per numeric column; each is fitted on a single
# (n_rows, 1) array so it can be saved and reused on its own later.
temp_scaler, ws_scaler, humid_scaler, effhumid_scaler = (
    MinMaxScaler() for _ in range(4)
)

scaled1 = temp_scaler.fit_transform(df_copy[['기온']].to_numpy())
scaled2 = ws_scaler.fit_transform(df_copy[['풍속']].to_numpy())
scaled3 = humid_scaler.fit_transform(df_copy[['습도']].to_numpy())
scaled4 = effhumid_scaler.fit_transform(df_copy[['실효습도']].to_numpy())

# Swap each raw column for its scaled counterpart, placing the scaled columns
# at positions 0..3 in the order listed here.
column_swaps = [
    ('기온', 'scaled1', scaled1),
    ('풍속', 'scaled2', scaled2),
    ('습도', 'scaled3', scaled3),
    ('실효습도', 'scaled4', scaled4),
]
for position, (raw_name, scaled_name, scaled_values) in enumerate(column_swaps):
    df_copy.insert(position, scaled_name, scaled_values)
    df_copy = df_copy.drop(columns=[raw_name])

# One-hot encode the remaining string columns (day_name and Month).
df_copy = pd.get_dummies(df_copy)

df_copy

Unnamed: 0,scaled1,scaled2,scaled3,scaled4,label,day_name_Friday,day_name_Monday,day_name_Saturday,day_name_Sunday,day_name_Thursday,...,Month_11,Month_12,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9
0,0.406107,0.028846,0.561224,0.520842,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0.418321,0.038462,0.530612,0.500871,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0.418321,0.044872,0.520408,0.476746,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0.421374,0.048077,0.510204,0.472725,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0.418321,0.048077,0.500000,0.488138,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1957707,0.787786,0.028846,0.663265,0.612384,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1957708,0.754198,0.028846,0.785714,0.707278,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1957709,0.741985,0.022436,0.775510,0.742796,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1957710,0.741985,0.038462,0.785714,0.745611,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [4]:
# Show the final training column order: the four scaled features, the label,
# then the one-hot day_name_* and Month_* columns.
df_copy.columns

Index(['scaled1', 'scaled2', 'scaled3', 'scaled4', 'label', 'day_name_Friday',
       'day_name_Monday', 'day_name_Saturday', 'day_name_Sunday',
       'day_name_Thursday', 'day_name_Tuesday', 'day_name_Wednesday',
       'Month_1', 'Month_10', 'Month_11', 'Month_12', 'Month_2', 'Month_3',
       'Month_4', 'Month_5', 'Month_6', 'Month_7', 'Month_8', 'Month_9'],
      dtype='object')

In [5]:
# Persist each fitted scaler to its own pickle file in the working directory so
# the same scaling can be applied to new data without refitting.
joblib.dump(value=temp_scaler, filename='./temp_scaler.pkl')
joblib.dump(value=ws_scaler, filename='./ws_scaler.pkl')
joblib.dump(value=humid_scaler, filename='./humid_scaler.pkl')
joblib.dump(value=effhumid_scaler, filename='./effhumid_scaler.pkl')

['./effhumid_scaler.pkl']

In [6]:
# Reload the four persisted scalers (same order as they were saved) and the
# previously saved model used for prediction below.
# NOTE: joblib.load unpickles its input, so only load files from a trusted source.
scaler_paths = (
    './temp_scaler.pkl',
    './ws_scaler.pkl',
    './humid_scaler.pkl',
    './effhumid_scaler.pkl',
)
temp_scaler, ws_scaler, humid_scaler, effhumid_scaler = map(joblib.load, scaler_paths)
model = joblib.load('./비교용/final_model.pkl')

In [7]:
# Read the raw 2020 sample that the saved model will be applied to.
test_csv_path = './dataset/test_sample_2020.csv'
test = pd.read_csv(test_csv_path)
test

Unnamed: 0.1,Unnamed: 0,일시,지점명,기온,강수량,풍속,풍향,습도,실효습도
0,0,2020-04-24 00:00,안동,5.0,,1.9,290,40,42.31
1,1,2020-04-24 01:00,안동,4.9,,0.6,250,45,44.85
2,2,2020-04-24 02:00,안동,4.1,,0.8,290,50,47.1
3,3,2020-04-24 03:00,안동,4.3,,0.7,320,51,47.91
4,4,2020-04-24 04:00,안동,3.7,,1.1,290,55,49.25
5,5,2020-04-24 05:00,안동,3.2,,2.3,290,60,51.81
6,6,2020-04-24 06:00,안동,3.0,,1.4,320,63,54.07
7,7,2020-04-24 07:00,안동,3.7,,1.0,320,62,52.63
8,8,2020-04-24 08:00,안동,6.3,,1.6,250,56,48.01
9,9,2020-04-24 09:00,안동,8.6,,2.3,270,53,42.23


In [8]:
# Replace every missing value in the frame with 0. In the sample displayed
# above, only the 강수량 column is NaN.
# NOTE(review): fillna(0) applies to all columns, so a missing 기온/풍속/습도
# reading would also become 0 — confirm that is intended.
test = test.fillna(0)
test

Unnamed: 0.1,Unnamed: 0,일시,지점명,기온,강수량,풍속,풍향,습도,실효습도
0,0,2020-04-24 00:00,안동,5.0,0.0,1.9,290,40,42.31
1,1,2020-04-24 01:00,안동,4.9,0.0,0.6,250,45,44.85
2,2,2020-04-24 02:00,안동,4.1,0.0,0.8,290,50,47.1
3,3,2020-04-24 03:00,안동,4.3,0.0,0.7,320,51,47.91
4,4,2020-04-24 04:00,안동,3.7,0.0,1.1,290,55,49.25
5,5,2020-04-24 05:00,안동,3.2,0.0,2.3,290,60,51.81
6,6,2020-04-24 06:00,안동,3.0,0.0,1.4,320,63,54.07
7,7,2020-04-24 07:00,안동,3.7,0.0,1.0,320,62,52.63
8,8,2020-04-24 08:00,안동,6.3,0.0,1.6,250,56,48.01
9,9,2020-04-24 09:00,안동,8.6,0.0,2.3,270,53,42.23


In [9]:
# Parse the timestamp strings into datetimes, keeping the column in place,
# so the .dt accessor can be used on it.
parsed_timestamps = pd.to_datetime(test['일시'])
test = test.assign(**{'일시': parsed_timestamps})

In [10]:
# Derive the same calendar features as the training frame: weekday name and
# month number, appended in that order.
timestamp_parts = test['일시'].dt
test = test.assign(
    day_name=timestamp_parts.day_name(),
    Month=timestamp_parts.month,
)
test.head()

Unnamed: 0.1,Unnamed: 0,일시,지점명,기온,강수량,풍속,풍향,습도,실효습도,day_name,Month
0,0,2020-04-24 00:00:00,안동,5.0,0.0,1.9,290,40,42.31,Friday,4
1,1,2020-04-24 01:00:00,안동,4.9,0.0,0.6,250,45,44.85,Friday,4
2,2,2020-04-24 02:00:00,안동,4.1,0.0,0.8,290,50,47.1,Friday,4
3,3,2020-04-24 03:00:00,안동,4.3,0.0,0.7,320,51,47.91,Friday,4
4,4,2020-04-24 04:00:00,안동,3.7,0.0,1.1,290,55,49.25,Friday,4


In [11]:
# Remove the 'Unnamed: 0' column (presumably a leftover index column from an
# earlier CSV export). errors='ignore' keeps this cell safe to re-run: the
# original in-place drop raised KeyError on a second execution because the
# column was already gone.
test = test.drop(columns='Unnamed: 0', errors='ignore')
test.head()

Unnamed: 0,일시,지점명,기온,강수량,풍속,풍향,습도,실효습도,day_name,Month
0,2020-04-24 00:00:00,안동,5.0,0.0,1.9,290,40,42.31,Friday,4
1,2020-04-24 01:00:00,안동,4.9,0.0,0.6,250,45,44.85,Friday,4
2,2020-04-24 02:00:00,안동,4.1,0.0,0.8,290,50,47.1,Friday,4
3,2020-04-24 03:00:00,안동,4.3,0.0,0.7,320,51,47.91,Friday,4
4,2020-04-24 04:00:00,안동,3.7,0.0,1.1,290,55,49.25,Friday,4


In [12]:
# Mirror the training preparation: Month as str (so get_dummies encodes it)
# and the same four non-feature columns removed.
test = (
    test.assign(Month=test['Month'].astype(str))
        .drop(columns=['일시', '지점명', '풍향', '강수량'])
)


In [13]:
# Apply the already-fitted scalers to the test columns (transform only, no
# refit) so test values are scaled with the training min/max.
scaled1 = temp_scaler.transform(test[['기온']].to_numpy())
scaled2 = ws_scaler.transform(test[['풍속']].to_numpy())
scaled3 = humid_scaler.transform(test[['습도']].to_numpy())
scaled4 = effhumid_scaler.transform(test[['실효습도']].to_numpy())

# Swap each raw column for its scaled counterpart at positions 0..3, matching
# the column order produced for the training frame.
column_swaps = [
    ('기온', 'scaled1', scaled1),
    ('풍속', 'scaled2', scaled2),
    ('습도', 'scaled3', scaled3),
    ('실효습도', 'scaled4', scaled4),
]
for position, (raw_name, scaled_name, scaled_values) in enumerate(column_swaps):
    test.insert(position, scaled_name, scaled_values)
    test = test.drop(columns=[raw_name])

# One-hot encode day_name and Month. Only categories present in the test rows
# get a column here, so the result has fewer columns than the training frame.
test = pd.get_dummies(test)
test.head()

Unnamed: 0,scaled1,scaled2,scaled3,scaled4,day_name_Friday,Month_4
0,0.490076,0.060897,0.387755,0.452084,1,1
1,0.48855,0.019231,0.438776,0.486128,1,1
2,0.476336,0.025641,0.489796,0.516285,1,1
3,0.479389,0.022436,0.5,0.527141,1,1
4,0.470229,0.035256,0.540816,0.545101,1,1


In [14]:
# Show which columns the test frame has after encoding; one-hot columns exist
# only for the day/month values that actually occur in the test rows.
test.columns

Index(['scaled1', 'scaled2', 'scaled3', 'scaled4', 'day_name_Friday',
       'Month_4'],
      dtype='object')

In [15]:
# Give the test frame exactly the training columns, in training order, with 0
# for every one-hot column that does not occur in the test rows.
#
# This replaces the earlier approach of concatenating the whole training frame
# with the test rows and recovering the test rows via
# `day_name_Saturday.isnull()`. That filter selects nothing (or the wrong rows)
# as soon as the test data contains a Saturday, and it copies the full training
# frame just to borrow its column list.
#
# Fail loudly if the test data has a column the training frame never saw,
# instead of silently discarding it in the reindex below.
unexpected_columns = test.columns.difference(df_copy.columns)
if len(unexpected_columns) > 0:
    raise ValueError(
        f"test has columns missing from the training frame: {list(unexpected_columns)}"
    )

# 'label' is part of df_copy.columns, so it is added here as a 0-filled column
# exactly as before; it is not a feature and must be dropped before predicting.
test = test.reindex(columns=df_copy.columns, fill_value=0)

In [16]:
# The label is not a model input; remove it so only feature columns remain.
test = test.drop(columns='label')

In [17]:
# Predict on the feature columns only. A later cell stores the results back on
# `test` as 'pred' and 'proba'; excluding those names here keeps this cell
# re-runnable without feeding two extra columns to the model. On a first run
# neither column exists, so the input is identical to `test`.
feature_frame = test.drop(columns=['pred', 'proba'], errors='ignore')

pred = model.predict(feature_frame)
# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (presumably label 1 — confirm against the trained model).
pred_proba = model.predict_proba(feature_frame)[:, 1]


In [18]:
# Store the predicted class and its probability next to the features, then
# display just those two columns.
test = test.assign(pred=pred, proba=pred_proba)
test[['pred', 'proba']]

Unnamed: 0,pred,proba
0,1,0.924172
1,1,0.85481
2,1,0.905786
3,1,0.938258
4,1,0.843726
5,1,0.909614
6,1,0.937524
7,1,0.878671
8,1,0.925622
9,1,0.917702
