## Pipeline 구상

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, SelectFromModel, RFE
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, ComplementNB, CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
import xgboost as xgb

pd.set_option('display.max_columns', None)

# 1. 데이터가져오기
# 2. 데이터 수동 분할
# 3. 스케일링
# 4. 모델
# 5. 3과4 파이프라인 만들기
# 6. 그리드서치CV를 통해서 파이프라인 작동
# 7. 최적의 파라미터값 찾기
# 8. 파이프라인으로 예측하기(cv없이)

  from pandas import MultiIndex, Int64Index


In [2]:
# 데이터 불러오기
df = pd.read_csv('데이터/2차 전처리/전처리part7.1_final_레이블삭제.csv', encoding='CP949')

# 데이터 확인
print('<연결>')
print('행렬 수 : ', df.shape)
print('회사 수 : ', df['회사명'].nunique())

<연결>
행렬 수 :  (12743, 59)
회사 수 :  1992


### 피처, 타겟 설정

In [3]:
# 회계년도 2019년까지 확인-> 11545
df_sort_value = df.sort_values('회계년도', ascending=True)  # 내림차순 정렬
df_sort_value.reset_index(drop=True, inplace=True)

df_sort_value[df_sort_value['회계년도']==201912].tail(1)  # 2019년도 인덱스 확인

Unnamed: 0,회사명,거래소코드,회계년도,상장일,상장폐지일_x,부실기업1,부실기업2,부실기업,업종,경기민감도,총자본증가율,순이익증가율,자기자본증가율,매출액증가율,종업원수증가율,정상영업손익증가율,매출액정상영업이익률,매출액순이익률,총자본순이익률,총자본정상영업이익률,자기자본정상영업이익률,자기자본순이익률,금융비용부담률,수지비율,사내유보 대 자기자본비율,1주당매출액,1주당순이익,1주당 CASH FLOW,1주당순자산,유보율,자기자본구성비율,유동비율,당좌비율,현금비율,매출채권 대 매입채무비율,부채비율,이자보상배율,CASH FLOW 대 부채비율,CASH FLOW 대 차입금비율,CASH FLOW 대 총자본비율,CASH FLOW 대 매출액비율,재고자산대순운전자본비율,총자본투자효율,설비투자효율,기계투자효율,부가가치율,노동소득분배율,종업원1인당부가가치율_계산,노동장비율_계산,기계장비율_계산,자본집약도_계산,총자본회전률,자기자본회전률,타인자본회전률,유동자산회전률,당좌자산회전률,재고자산회전률,매출채권회전률,순운전자본회전률
11545,엔에이치엔한국사이버결제(주),60250,201912,200201,,0,0,0,정보통신업,0,11.514861,45.862453,9.882632,8.607095,19.08,47.407874,6.83,5.21,8.27,10.85,26.39,20.2,-0.16,92.04,73.09,20455.77393,1065.489498,3187.008337,5571.233751,1201.97,40.831138,135.29,133.4,70.57,274.03,144.91,87.14,27.43,335.99,16.23,10.77,5.367437,20.29,216.09,630.96,13.47,28.999387,191.397788,93.592949,11.801253,999.548077,1.59,3.923819,2.7,2.11,2.16,102.3,41.1,1.22


In [4]:
df_1 = df_sort_value.copy()

# 필요 없는 것 제거
df_remove = df_1.drop(columns=['회사명', '거래소코드', '회계년도', '상장일', '상장폐지일_x', '부실기업1', '부실기업2', '부실기업', '업종'])
colnames = df_1.columns.drop(['회사명', '거래소코드', '회계년도', '상장일', '상장폐지일_x', '부실기업1', '부실기업2', '부실기업', '업종'])

# 모델링 데이터프레임 생성
df2 = pd.DataFrame(df_remove, columns=colnames)
df2


Unnamed: 0,경기민감도,총자본증가율,순이익증가율,자기자본증가율,매출액증가율,종업원수증가율,정상영업손익증가율,매출액정상영업이익률,매출액순이익률,총자본순이익률,총자본정상영업이익률,자기자본정상영업이익률,자기자본순이익률,금융비용부담률,수지비율,사내유보 대 자기자본비율,1주당매출액,1주당순이익,1주당 CASH FLOW,1주당순자산,유보율,자기자본구성비율,유동비율,당좌비율,현금비율,매출채권 대 매입채무비율,부채비율,이자보상배율,CASH FLOW 대 부채비율,CASH FLOW 대 차입금비율,CASH FLOW 대 총자본비율,CASH FLOW 대 매출액비율,재고자산대순운전자본비율,총자본투자효율,설비투자효율,기계투자효율,부가가치율,노동소득분배율,종업원1인당부가가치율_계산,노동장비율_계산,기계장비율_계산,자본집약도_계산,총자본회전률,자기자본회전률,타인자본회전률,유동자산회전률,당좌자산회전률,재고자산회전률,매출채권회전률,순운전자본회전률
0,1,13.166482,80.280163,5.219839,56.396905,4.65,1263.950784,3.61,-8.50,-5.58,2.37,3.51,-8.28,0.80,108.35,-245.87,406.280703,-34.519834,-58.600377,418.623493,-17.45,66.046945,200.97,165.03,15.10,602.31,51.41,3.53,-46.08,-172.40,-15.65,-24.07,35.590076,12.72,35.86,128.51,19.56,89.819047,27.179630,71.503704,19.715837,204.066667,0.66,0.984333,2.01,1.25,1.55,6.42,2.45,0.86
1,0,50.993391,17.199150,56.598549,6.109184,-4.47,-15.303586,0.83,3.26,4.38,1.12,2.21,8.63,-0.38,96.00,77.70,340462.101000,11083.203000,14406.853400,129996.600000,1928.80,50.530629,111.66,111.22,21.04,95.38,97.90,11.53,10.70,152.12,5.30,4.10,3.752052,12.60,32.10,124655.81,9.76,37.357803,192.863177,876.851171,0.000000,2159.530100,1.35,0.516553,2.73,3.07,3.08,800.92,4.78,1.62
2,1,22.812578,-94.682454,-2.275177,5.728892,3.57,-93.385211,0.20,0.06,0.09,0.27,0.53,0.22,0.72,101.06,79.09,8450.124026,5.333666,-373.638986,3130.275037,541.55,47.171165,132.48,55.12,8.40,117.11,111.99,0.22,-5.75,-9.71,-3.04,-2.33,238.169526,8.75,24.54,239.52,6.72,85.622501,69.270499,19.535284,4.708845,683.234079,1.37,2.761859,2.77,2.61,5.86,4.71,9.32,1.52
3,1,60.842639,107.374903,71.235982,43.298996,0.00,136.437194,17.64,14.95,13.34,15.75,20.83,17.65,-0.87,81.82,61.01,3659.007143,546.907857,668.925873,3814.206349,662.84,77.461017,366.02,306.01,86.38,105.16,29.10,126.69,41.92,363.98,9.45,12.72,-60.003170,29.48,102.85,210.67,39.67,44.806614,144.016457,141.527559,68.573339,488.527559,0.89,1.826677,3.65,1.38,1.68,7.65,6.40,1.05
4,0,23.881296,733.115107,19.892010,21.570027,7.17,1556.327486,1.73,1.51,1.57,1.79,2.55,2.23,-0.46,97.87,18.40,53538.053110,809.233348,80.582808,37929.117910,3582.63,69.412854,204.74,114.32,16.74,154.47,44.07,5.32,3.85,11.46,1.18,1.17,86.328651,15.63,42.76,175.26,15.49,67.832663,70.519388,161.382409,40.210013,453.665392,1.04,0.629682,3.52,2.29,3.94,5.45,8.85,1.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12738,1,9.384605,-20.477821,11.004260,6.471834,2.10,3.309557,5.63,5.36,3.44,3.62,7.82,7.47,1.21,89.56,59.72,36590.305700,1960.142211,1768.526439,27404.664190,5440.70,46.627975,144.15,107.37,10.41,176.79,114.46,4.20,3.79,7.08,2.02,3.29,83.297145,18.68,98.89,642.47,30.38,54.464256,132.062734,178.734841,24.474493,818.231244,0.64,1.439067,1.20,1.58,2.19,5.64,3.14,0.89
12739,1,35.742890,-2851.087955,42.942004,-9.562032,36.33,-311.151918,-2.67,-3.39,-3.64,-2.86,-8.33,-10.33,0.65,101.62,27.69,6463.647857,-219.315506,203.611282,2618.925137,387.42,35.112741,58.60,43.83,7.44,76.44,184.80,-3.34,5.08,8.57,3.30,3.54,-35.671397,8.74,18.98,47.25,9.38,84.336695,59.229551,173.461741,59.762892,502.598945,1.07,4.218693,1.63,3.05,4.09,11.96,7.01,1.15
12740,1,16.606922,71.752339,32.465290,26.196821,3.70,54.947593,15.42,13.52,18.37,20.95,25.97,22.76,-0.18,84.67,68.48,14917.524810,2016.330482,1711.851870,10094.597900,1985.13,85.412420,440.35,309.50,125.72,239.29,17.08,334.55,87.10,0.00,12.71,10.07,-130.847546,43.78,99.33,314.37,34.68,47.902808,107.985714,112.265306,33.364413,246.683673,1.36,1.551187,7.04,2.54,3.52,9.08,10.55,1.94
12741,0,17.190459,159.745035,29.294075,-17.867093,-8.00,-52.539211,1.20,19.19,13.08,0.82,1.51,22.19,1.81,81.54,0.00,10806.858380,2073.693134,-304.650289,5862.330498,1811.62,56.802985,52.05,51.07,9.27,258.79,76.05,0.52,0.87,1.26,0.38,0.60,-2.061014,24.21,247.72,42441.79,38.32,28.976393,87.903661,96.389016,0.009460,709.963387,0.68,1.851563,1.49,2.81,2.86,180.29,6.96,1.10


#### 파이프라인 완성하기 전에 하나씩 집어넣어서 해보는중

In [15]:
# 타깃
target = df_1['부실기업']

# 피쳐
feature_data = df2

# 학습데이터 랜덤x -> 수동 지정
X_train, X_test, y_train, y_test = train_test_split(feature_data[:11545], target[:11545])


model1 = [
    ('로지스틱', LogisticRegression()),
    # ('svc', SVC())
]

model2 = [
    # ('로지스틱', LogisticRegression()),
    ('svc', SVC())
]

######################################################################
####파라미터를 어떻게 모델에 맞게 넣을것인가 에대해 연구해보세요...

for i in [model1, model2]:
    pipe1 = Pipeline(i)
    params = {}
    if i == model2:
        params ={
            'svc__gamma':[0.01, 0.1, 1, 5, 10],
                'svc__C':[0.01, 0.1, 1, 5, 10]}

    grid = GridSearchCV(pipe1, params, scoring = 'accuracy', cv=5)

    grid.fit(X_train, y_train)

    print(i)
    print(f'베스트 파라미터 : {grid.best_params_}')
    print(f'베스트 스코어 : {grid.best_score_}')
    print("="*60)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[('로지스틱', LogisticRegression())]
베스트 파라미터 : {}
베스트 스코어 : 0.8176470588235294
[('svc', SVC())]
베스트 파라미터 : {'svc__C': 0.01, 'svc__gamma': 0.01}
베스트 스코어 : 0.8676470588235293


#### 여기에 있는 모델들 다 사용하는 파이프라인 구축

In [None]:
# 타깃
target = df_1['부실기업']

# 피쳐
feature_data = df2


X_train, X_test, y_train, y_test = train_test_split(feature_data[:11545], target[:11545])


# 스케일러
minmax = ('민맥스', MinMaxScaler())
standard = ('스탠다드', StandardScaler())
robuste = ('로버스트', RobustScaler())


# 사용할 모델
logistic = ('로지스틱', LogisticRegression())
decisiontree = ('의사결정나무', DecisionTreeClassifier())
randomforest = ('랜덤포레스트', RandomForestClassifier())
gbm = ('GBM', GradientBoostingClassifier())
# xgb = ('xgb', xgb())
knn = ('KNN', KNeighborsClassifier())
bernoulli = ('베르누이 ', BernoulliNB())
multinomial = ('멀티', MultinomialNB())
complement = ('컴플리먼트', ComplementNB())
categorical = ('카테고리칼', CategoricalNB())
svc = ('svc', SVC())


##다중포문돌리기 위해 후처리와 모델 리스트작성
after_work = [minmax,standard,robuste]
ml_models = [logistic, decisiontree, randomforest, gbm, knn, bernoulli, multinomial, complement, categorical, svc]


# 파이프라인
pipe1 = Pipeline([logistic, decisiontree, randomforest, gbm, knn, bernoulli, multinomial, complement, categorical, svc])



# 파라미터 값
penalty = ['l2', 'l1']
C = [0.001, 0.01, 0.1, 1, 10, 100]
values = [0.01, 0.1, 1, 5, 10]
max_depth = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
min_samples_split  = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
min_samples_leaf = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n_estimators = [10, 50, 100, 150, 300]
learning_rate = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
gamma = [0, 1, 2, 3]
subsample = [0.5, 0.7, 0.9, 1]
colsample_bytree = [0.5, 0.7, 0.9, 1]
n_neighbor = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
p = [1, 2]
degree = [1, 2, 3, 4, 5]


# 모델에 파라미터 적용
param_grid = [
    {
        'logistic__penalty' : penalty,
        'logistic__C' : C
    },
    {
        'decisiontree__max_depth' : max_depth,
        'decisiontree__min_samples_split': min_samples_split, 
        'decisiontree__min_samples_leaf': min_samples_leaf
    },
   {
        'randomforest__n_estimators' : n_estimators,
        'randomforest__min_samples_split' : min_samples_split,
        'randomforest__min_samples_leaf' : min_samples_leaf,
        'randomforest__max_depth' : max_depth
    },
    {
        'gbm__max_depth' : max_depth,
        'gbm__min_learning_rate': learning_rate, 
        'gbm__min_n_estimators': n_estimators
    },
   {
        'xgb__n_estimators' : n_estimators,
        'xgb__min_learning_rate' : learning_rate,
        'xgb__min_gamma' : gamma,
        'xgb__max_depth' : max_depth,
        'xgb__subsample' : subsample,
        'xgb__colsample_bytree' : colsample_bytree
    },
    {
        'knn__n_neighbor' : n_neighbor
    },
     {
        'svm__n_neighbor' : n_neighbor,
        'svm__C' : C,
        'svm__degree' : degree,
        'svm__gamma' : gamma
    }]



pipe1.fit(X_train, y_train)


grid = GridSearchCV(pipe1, param_grid=params, cv=5)
grid.fit(X_train,y_train)



print(f'optimal train score: {grid.best_score_:.3f}')
# # optimal train score: 0.984

print(f'test score: {grid.score(X_test, y_test):.3f}')
# # test score: 0.951

print(f'optimal parameter: {grid.best_params_}')
# # optimal parameter: {'svc__C': 10.0, 'svc__gamma': 0.1}

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'LogisticRegression()' (type <class 'sklearn.linear_model._logistic.LogisticRegression'>) doesn't

##### 튜터링 파이프라인

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

cancer = load_breast_cancer()

x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, stratify=cancer.target, random_state=0)

고기 = ('민맥스', MinMaxScaler())
된찌 = ('폴리', PolynomialFeatures())

상추 = ('스벡', SVC())
깻잎 = ('로지스틱', LogisticRegression())

# pipe1 = Pipeline([된찌, 깻잎])
pipe1 = Pipeline([('민맥스', MinMaxScaler()), ('로지스틱', LogisticRegression())])
# pipe2 = Pipeline([('민맥스', MinMaxScaler()), ('스벡', SVC())])


pipe1.fit(x_train, y_train)


values = [0.1, 0.2, 0.3]
# include = [True, False]
# params = {'폴리__degree':values, '폴리__include_bias':include}

grid1 = GridSearchCV(pipe1, param_grid={}, cv=5)
grid1.fit(x_train, y_train)

print(f'optimal train score: {grid1.best_score_:.3f}')
print(f'test score: {grid1.score(x_test, y_test):.3f}')
print(f'optimal parameter: {grid1.best_params_}')


##### 의성오빠 파이프라인 수정본

In [None]:
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


# 다중회귀
pipe_test = Pipeline([('poly', PolynomialFeatures()), ('Linear', LinearRegression())])


x_best_train, x_best_test, y_best_train, y_best_test = train_test_split(feature_data_best, target, test_size=0.25, random_state=25)


params = {
        'poly__degree': [1,2,3,4,5,6,7], 
        'poly__include_bias': [False,True]
}


asd = GridSearchCV(pipe_test, params, cv=10, scoring='r2')

asd.fit(x_best_train, y_best_train)

print(f"Best: {asd.best_score_} using {asd.best_params_}")

### 가장 좋은 파라미터를 추출함
### grid에 파이프라인 넣어서 제일 좋은 파라미터값을 set_params에 보내줌
##########################################################################################
### gridCV에서 가장 좋은 파라미터 변수를 가져와서 파이프라인에 활용(gridCV안씀)

# print(asd.best_params_['poly__degree'], asd.best_params_['poly__include_bias'])

pipe_test.set_params(poly__degree=4, poly__include_bias=asd.best_params_['poly__include_bias'])

pipe_test.fit(x_best_train, y_best_train)

pipe_preds = pipe_test.predict(x_best_test)



mse_best_a = mean_squared_error(y_best_test, pipe_preds)
rmse_best_a = np.sqrt(mse_best_a)


print(f'MSE : {mse_best_a} | RMSE : {rmse_best_a}')
print(f'R-Square : {r2_score(y_best_test, pipe_preds)}')