In [2]:
import pandas as pd

import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.api import qqplot, add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.linear_model import Ridge, Lasso, ElasticNet

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

import platform
platform.system()

# 운영체제별 한글 폰트 설정
if platform.system() == 'Darwin': # Mac 환경 폰트 설정
    plt.rc('font', family='AppleGothic')
elif platform.system() == 'Windows': # Windows 환경 폰트 설정
    plt.rc('font', family='Malgun Gothic')

plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정


import warnings
warnings.filterwarnings('ignore')

In [4]:
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'NanumGothic'

matplotlib.rc("axes", unicode_minus = False)

In [5]:
from google.colab import drive
path = drive.mount('/content/drive')

# csv 파일이 저장'된' 경로
csv_path = '/content/drive/MyDrive/개인/포스코/본수업/2주차/수업/실습파일/SCALE불량.csv'

# 파일 읽기
df = pd.read_csv(csv_path, encoding='euc-kr')
df.head(3)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,plate_no,rolling_date,scale,spec_long,spec_country,steel_kind,pt_thick,pt_width,pt_length,hsb,...,fur_heat_temp,fur_heat_time,fur_soak_temp,fur_soak_time,fur_total_time,fur_ex_temp,rolling_method,rolling_temp,descaling_count,work_group
0,PLT_1001,03JAN2023:07:07:53,양품,AB/EH32-TM,미국,T,32,3700,15100,적용,...,1144,116,1133,59,259,1133,TMCP(온도제어),934,8,1조
1,PLT_1002,03JAN2023:07:21:22,양품,AB/EH32-TM,미국,T,32,3700,15100,적용,...,1144,122,1135,53,238,1135,TMCP(온도제어),937,8,1조
2,PLT_1003,03JAN2023:07:31:15,양품,NV-E36-TM,영국,T,33,3600,19200,적용,...,1129,116,1121,55,258,1121,TMCP(온도제어),889,8,1조


In [6]:
df['hsb'].unique()

array(['적용', '미적용'], dtype=object)

In [7]:
# 목표변수를 불량을 1로 변환
df['scale']=df['scale'].replace({'양품' : 0, '불량' : 1})

In [8]:
from sklearn.preprocessing import LabelEncoder

# hsb 인코딩
# LabelEncoder 객체 생성
label_encoder = LabelEncoder()

# 'hsb' 열을 숫자로 인코딩
df['hsb'] = label_encoder.fit_transform(df['hsb'])

# 인코딩된 값 확인
encoded_values = df['hsb'].unique()
print(encoded_values)

[1 0]


In [9]:
# hsb 인코딩
# LabelEncoder 객체 생성
label_encoder = LabelEncoder()

# 'hsb' 열을 숫자로 인코딩
df['hsb'] = label_encoder.fit_transform(df['hsb'])

# 인코딩된 값 확인
encoded_values = df['hsb'].unique()
print(encoded_values)

[1 0]


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   plate_no         1000 non-null   object
 1   rolling_date     1000 non-null   object
 2   scale            1000 non-null   int64 
 3   spec_long        1000 non-null   object
 4   spec_country     1000 non-null   object
 5   steel_kind       1000 non-null   object
 6   pt_thick         1000 non-null   int64 
 7   pt_width         1000 non-null   int64 
 8   pt_length        1000 non-null   int64 
 9   hsb              1000 non-null   int64 
 10  fur_no           1000 non-null   object
 11  fur_input_row    1000 non-null   object
 12  fur_heat_temp    1000 non-null   int64 
 13  fur_heat_time    1000 non-null   int64 
 14  fur_soak_temp    1000 non-null   int64 
 15  fur_soak_time    1000 non-null   int64 
 16  fur_total_time   1000 non-null   int64 
 17  fur_ex_temp      1000 non-null   i

In [11]:
df.isnull().sum()

plate_no           0
rolling_date       0
scale              0
spec_long          0
spec_country       0
steel_kind         0
pt_thick           0
pt_width           0
pt_length          0
hsb                0
fur_no             0
fur_input_row      0
fur_heat_temp      0
fur_heat_time      0
fur_soak_temp      0
fur_soak_time      0
fur_total_time     0
fur_ex_temp        0
rolling_method     0
rolling_temp       0
descaling_count    0
work_group         0
dtype: int64

In [12]:
# 가열로 추출온도 겹치는 column이므로 제거
df.drop('fur_ex_temp', axis=1, inplace=True)

In [13]:
# plate 번호는 수율에 영향을 주지 못한다고 판단하여 제거
df.drop('plate_no', axis=1, inplace=True)

#### rolling_date를 문자를 제거한 후 daytime으로 변환하여 날짜까지 나타내는 Datetime과 시간 값을 가진 time으로 나눔

In [14]:
df['datetime'] = pd.to_datetime(df['rolling_date'], format='%d%b%Y:%H:%M:%S', errors='coerce')
df['time'] = df['datetime'].dt.time

In [15]:
df['datetime']

0     2023-01-03 07:07:53
1     2023-01-03 07:21:22
2     2023-01-03 07:31:15
3     2023-01-03 07:41:01
4     2023-01-03 07:52:40
              ...        
995   2023-01-10 05:32:25
996   2023-01-10 05:39:19
997   2023-01-10 05:52:41
998   2023-01-10 06:01:50
999   2023-01-10 06:16:27
Name: datetime, Length: 1000, dtype: datetime64[ns]

In [16]:
df['time']

0      07:07:53
1      07:21:22
2      07:31:15
3      07:41:01
4      07:52:40
         ...   
995    05:32:25
996    05:39:19
997    05:52:41
998    06:01:50
999    06:16:27
Name: time, Length: 1000, dtype: object

In [17]:
# datetime, time 생성 했으므로 기존 시간데이터 제거
df.drop('rolling_date', axis=1, inplace=True)

In [18]:
# 'datetime' 열에서 시간 정보만 추출하여 새로운 열 생성
df['hour'] = df['datetime'].dt.hour

# 각 시간대별로 'encoded_scale'이 1인 행의 비율 계산
percentage_by_hour = df.groupby('hour')['scale'].mean() * 100
# 시간에 따른 수율 확인
print(percentage_by_hour)

hour
0     34.090909
1     28.571429
2     37.209302
3     20.930233
4     26.190476
5     31.111111
6     30.303030
7     32.500000
8     25.000000
9     43.589744
10    42.500000
11    46.666667
12    51.219512
13    37.500000
14    25.581395
15    25.000000
16    34.146341
17    31.111111
18    27.500000
19    16.666667
20    14.634146
21    13.953488
22    29.268293
23    39.534884
Name: scale, dtype: float64


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   scale            1000 non-null   int64         
 1   spec_long        1000 non-null   object        
 2   spec_country     1000 non-null   object        
 3   steel_kind       1000 non-null   object        
 4   pt_thick         1000 non-null   int64         
 5   pt_width         1000 non-null   int64         
 6   pt_length        1000 non-null   int64         
 7   hsb              1000 non-null   int64         
 8   fur_no           1000 non-null   object        
 9   fur_input_row    1000 non-null   object        
 10  fur_heat_temp    1000 non-null   int64         
 11  fur_heat_time    1000 non-null   int64         
 12  fur_soak_temp    1000 non-null   int64         
 13  fur_soak_time    1000 non-null   int64         
 14  fur_total_time   1000 non-null   int64   

#### 파생변수 생성

In [20]:
# 후판 면적 파생변수 생성
df['pt_area'] = df['pt_length'] * df['pt_width']

In [21]:
# 가열대와 균열대 온도차 파생변수 생성
df['fur_temp_gap'] = df['fur_heat_temp'] - df['fur_soak_temp']

#### 이상치 확인

In [22]:
df.describe()

Unnamed: 0,scale,pt_thick,pt_width,pt_length,hsb,fur_heat_temp,fur_heat_time,fur_soak_temp,fur_soak_time,fur_total_time,rolling_temp,descaling_count,hour,pt_area,fur_temp_gap
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.31,26.782,2831.9,36788.2,0.953,1157.245,85.972,1150.928,71.72,238.589,934.637,8.557,11.493,105196700.0,6.317
std,0.462725,18.13757,494.081478,13912.387116,0.211745,21.245007,26.346297,17.344384,20.602137,38.194828,96.598015,1.604158,6.970979,45097790.0,13.116347
min,0.0,12.0,1800.0,7900.0,0.0,1103.0,55.0,1113.0,35.0,165.0,0.0,5.0,0.0,17640000.0,-45.0
25%,0.0,15.0,2500.0,26650.0,1.0,1140.0,66.0,1135.75,57.75,210.0,893.75,8.0,5.0,69120000.0,-1.0
50%,0.0,19.0,2800.0,40400.0,1.0,1159.0,75.0,1156.0,66.0,230.0,948.0,9.0,11.5,110810000.0,6.0
75%,1.0,34.0,3100.0,49100.0,1.0,1173.0,102.25,1164.0,81.0,263.0,991.0,10.0,17.25,142477500.0,13.0
max,1.0,100.0,4600.0,54900.0,1.0,1206.0,158.0,1185.0,145.0,362.0,1078.0,10.0,23.0,218440000.0,56.0


In [23]:
df['rolling_temp'].unique()

array([ 934,  937,  889,  885,  873,  874,  878,  870,  881,  869,  820,
       1057,  926,  931,  929,  925,  928,  860,  836,  832,  841,  933,
        930,  936,  838,  837,  862,  861,  856,  863,    0,  864,  845,
        808,  850,  852,  847,  849,  844,  853,  851,  840,  846,  834,
        842,  843,  773,  760,  859,  839,  911,  935,  915,  923,  913,
        977,  985,  955,  950,  952,  956,  920,  959,  918,  944,  976,
       1004,  961,  996,  995,  953,  992,  988,  982,  967,  975, 1008,
       1018, 1006,  999, 1005, 1012, 1002, 1023, 1015, 1021, 1011, 1013,
       1019, 1026, 1027, 1020, 1014, 1022, 1007, 1017, 1031, 1030, 1062,
       1078, 1066, 1056, 1054, 1068, 1053,  902,  907, 1037,  901, 1024,
       1035, 1016, 1029, 1010, 1000,  991,  945,  941,  962,  979,  993,
       1038, 1043, 1046, 1034, 1032,  997,  986,  989, 1048, 1003,  998,
        987,  968,  980,  973,  971,  972,  969,  957,  958,  951,  939,
        947,  970,  983,  984,  899, 1009,  981,  9

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   scale            1000 non-null   int64         
 1   spec_long        1000 non-null   object        
 2   spec_country     1000 non-null   object        
 3   steel_kind       1000 non-null   object        
 4   pt_thick         1000 non-null   int64         
 5   pt_width         1000 non-null   int64         
 6   pt_length        1000 non-null   int64         
 7   hsb              1000 non-null   int64         
 8   fur_no           1000 non-null   object        
 9   fur_input_row    1000 non-null   object        
 10  fur_heat_temp    1000 non-null   int64         
 11  fur_heat_time    1000 non-null   int64         
 12  fur_soak_temp    1000 non-null   int64         
 13  fur_soak_time    1000 non-null   int64         
 14  fur_total_time   1000 non-null   int64   

#### rolling_temp = 0인 값은 이상치로 판단하여 제거

In [25]:
df = df[df['rolling_temp'] != 0]
df['rolling_temp'].unique()

array([ 934,  937,  889,  885,  873,  874,  878,  870,  881,  869,  820,
       1057,  926,  931,  929,  925,  928,  860,  836,  832,  841,  933,
        930,  936,  838,  837,  862,  861,  856,  863,  864,  845,  808,
        850,  852,  847,  849,  844,  853,  851,  840,  846,  834,  842,
        843,  773,  760,  859,  839,  911,  935,  915,  923,  913,  977,
        985,  955,  950,  952,  956,  920,  959,  918,  944,  976, 1004,
        961,  996,  995,  953,  992,  988,  982,  967,  975, 1008, 1018,
       1006,  999, 1005, 1012, 1002, 1023, 1015, 1021, 1011, 1013, 1019,
       1026, 1027, 1020, 1014, 1022, 1007, 1017, 1031, 1030, 1062, 1078,
       1066, 1056, 1054, 1068, 1053,  902,  907, 1037,  901, 1024, 1035,
       1016, 1029, 1010, 1000,  991,  945,  941,  962,  979,  993, 1038,
       1043, 1046, 1034, 1032,  997,  986,  989, 1048, 1003,  998,  987,
        968,  980,  973,  971,  972,  969,  957,  958,  951,  939,  947,
        970,  983,  984,  899, 1009,  981,  960,  9

In [26]:
# hour와 겹침, 제거
df.drop('time', axis=1, inplace=True)

In [27]:
# spec_long 제거, 데이터 부족 유의미한 분석 어려움
df.drop('spec_long', axis=1, inplace=True)

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 994 entries, 0 to 999
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   scale            994 non-null    int64         
 1   spec_country     994 non-null    object        
 2   steel_kind       994 non-null    object        
 3   pt_thick         994 non-null    int64         
 4   pt_width         994 non-null    int64         
 5   pt_length        994 non-null    int64         
 6   hsb              994 non-null    int64         
 7   fur_no           994 non-null    object        
 8   fur_input_row    994 non-null    object        
 9   fur_heat_temp    994 non-null    int64         
 10  fur_heat_time    994 non-null    int64         
 11  fur_soak_temp    994 non-null    int64         
 12  fur_soak_time    994 non-null    int64         
 13  fur_total_time   994 non-null    int64         
 14  rolling_method   994 non-null    object   

# 카이제곱 검정 - 범주형과 범주형 간의  관계

In [34]:
from scipy.stats import chi2_contingency

categorical_var = ['spec_country', 'steel_kind', 'fur_no', 'fur_input_row', 'rolling_method',
                    'work_group']

for var in categorical_var:
    contingency_table = pd.crosstab(df[var], df['scale'])
    chi2, p_value, _, _ = chi2_contingency(contingency_table)
    print(f"Variable: {var}")
    print(f"Chi-square statistic: {chi2}")
    print(f"P-value: {p_value}")
    print("\n")


Variable: spec_country
Chi-square statistic: 69.80075036426383
P-value: 4.492255576188483e-13


Variable: steel_kind
Chi-square statistic: 76.25774182995244
P-value: 2.489547428454086e-18


Variable: fur_no
Chi-square statistic: 3.1186222255276252
P-value: 0.2102808811312071


Variable: fur_input_row
Chi-square statistic: 0.9203672191669445
P-value: 0.3373785709791819


Variable: rolling_method
Chi-square statistic: 44.88003167017692
P-value: 2.0948325380804966e-11


Variable: work_group
Chi-square statistic: 13.90057724000546
P-value: 0.003043655675169399




- fur_no, fur_input_row는 유의수준(0.05)보다 값이 크므로 scale 변수와 유의미한 관련성이 없다고 판단된다.



# 로지스틱 회귀분석

In [63]:
# train_test_split(데이터, test_size = test 데이터 비율, random_state: 랜덤)
df_train, df_test = train_test_split(df, # 데이터
                                     test_size = 0.3, # test 데이터의 비율
                                     random_state = 1234)  # random state

print("train data size : {}".format(df_train.shape))
print("test data size : {}".format(df_test.shape))

train data size : (695, 22)
test data size : (299, 22)


In [36]:
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ C(spec_country) + C(steel_kind) + pt_thick + pt_width + pt_length + hsb + C(fur_no) + C(fur_input_row) + fur_heat_temp + fur_heat_time + fur_soak_temp + fur_soak_time + fur_total_time + C(rolling_method) + rolling_temp + descaling_count + C(work_group) + hour + pt_area + fur_temp_gap"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


         Current function value: 0.294127
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      965
Method:                           MLE   Df Model:                           28
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.5261
Time:                        12:32:50   Log-Likelihood:                -292.36
converged:                      False   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                8.692e-119
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                         -19.7977   4.77e+04     -0.000      1.000   -9.35e+04    9.35e+04
C(

- 로지스틱 회귀분석의 설명력은 52.61%가 나왔으며,

In [39]:
# fur_temp_gap(1.000) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ C(spec_country) + C(steel_kind) + pt_thick + pt_width + pt_length + hsb + C(fur_no) + C(fur_input_row) + fur_heat_temp + fur_heat_time + fur_soak_temp + fur_soak_time + fur_total_time + C(rolling_method) + rolling_temp + descaling_count + C(work_group) + hour + pt_area"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


         Current function value: 0.294127
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      966
Method:                           MLE   Df Model:                           27
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.5261
Time:                        12:48:53   Log-Likelihood:                -292.36
converged:                      False   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                1.754e-119
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                         -19.5964   6.32e+04     -0.000      1.000   -1.24e+05    1.24e+05
C(

In [40]:
# hsb(0.999) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ C(spec_country) + C(steel_kind) + pt_thick + pt_width + pt_length + C(fur_no) + C(fur_input_row) + fur_heat_temp + fur_heat_time + fur_soak_temp + fur_soak_time + fur_total_time + C(rolling_method) + rolling_temp + descaling_count + C(work_group) + hour + pt_area"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.403018
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      967
Method:                           MLE   Df Model:                           26
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.3506
Time:                        12:50:02   Log-Likelihood:                -400.60
converged:                       True   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                 2.760e-75
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                         -74.3157     14.689     -5.059  

In [41]:
# pt_area(0.924) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ C(spec_country) + C(steel_kind) + pt_thick + pt_width + pt_length + C(fur_no) + C(fur_input_row) + fur_heat_temp + fur_heat_time + fur_soak_temp + fur_soak_time + fur_total_time + C(rolling_method) + rolling_temp + descaling_count + C(work_group) + hour"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.403023
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      968
Method:                           MLE   Df Model:                           25
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.3506
Time:                        12:50:37   Log-Likelihood:                -400.60
converged:                       True   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                 6.582e-76
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                         -74.4914     14.576     -5.110  

In [42]:
# fur_soak_time(0.924) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ C(spec_country) + C(steel_kind) + pt_thick + pt_width + pt_length + C(fur_no) + C(fur_input_row) + fur_heat_temp + fur_heat_time + fur_soak_temp + fur_total_time + C(rolling_method) + rolling_temp + descaling_count + C(work_group) + hour"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.403027
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      969
Method:                           MLE   Df Model:                           24
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.3506
Time:                        12:52:00   Log-Likelihood:                -400.61
converged:                       True   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                 1.537e-76
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                         -74.7826     14.252     -5.247  

In [43]:
# spec_country(0.753) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ C(steel_kind) + pt_thick + pt_width + pt_length + C(fur_no) + C(fur_input_row) + fur_heat_temp + fur_heat_time + fur_soak_temp + fur_total_time + C(rolling_method) + rolling_temp + descaling_count + C(work_group) + hour"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.404382
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      975
Method:                           MLE   Df Model:                           18
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.3484
Time:                        13:03:22   Log-Likelihood:                -401.96
converged:                       True   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                 5.424e-80
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                         -76.0744     14.088     -5.400  

In [44]:
# work_group(0.675) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ C(steel_kind) + pt_thick + pt_width + pt_length + C(fur_no) + C(fur_input_row) + fur_heat_temp + fur_heat_time + fur_soak_temp + fur_total_time + C(rolling_method) + rolling_temp + descaling_count + hour"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.406037
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      978
Method:                           MLE   Df Model:                           15
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.3457
Time:                        13:04:16   Log-Likelihood:                -403.60
converged:                       True   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                 1.816e-81
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                         -81.0318     13.843     -5.854  

In [45]:
# pt_thick(0.758) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ C(steel_kind) + pt_width + pt_length + C(fur_no) + C(fur_input_row) + fur_heat_temp + fur_heat_time + fur_soak_temp + fur_total_time + C(rolling_method) + rolling_temp + descaling_count + hour"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.406085
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      979
Method:                           MLE   Df Model:                           14
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.3456
Time:                        13:05:01   Log-Likelihood:                -403.65
converged:                       True   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                 3.377e-82
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                         -80.6622     13.756     -5.864  

In [46]:
# fur_no(0.660) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ C(steel_kind) + pt_width + pt_length + C(fur_input_row) + fur_heat_temp + fur_heat_time + fur_soak_temp + fur_total_time + C(rolling_method) + rolling_temp + descaling_count + hour"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.407310
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      981
Method:                           MLE   Df Model:                           12
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.3437
Time:                        13:05:39   Log-Likelihood:                -404.87
converged:                       True   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                 3.108e-83
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                         -81.2045     13.659     -5.945  

In [47]:
# fur_total_time(0.578) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ C(steel_kind) + pt_width + pt_length + C(fur_input_row) + fur_heat_temp + fur_heat_time + fur_soak_temp + C(rolling_method) + rolling_temp + descaling_count + hour"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.407466
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      982
Method:                           MLE   Df Model:                           11
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.3434
Time:                        13:06:46   Log-Likelihood:                -405.02
converged:                       True   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                 5.679e-84
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                         -79.3575     13.285     -5.973  

In [48]:
# fur_input_row(0.512) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ C(steel_kind) + pt_width + pt_length + fur_heat_temp + fur_heat_time + fur_soak_temp + C(rolling_method) + rolling_temp + descaling_count + hour"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.407682
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      983
Method:                           MLE   Df Model:                           10
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.3431
Time:                        13:07:29   Log-Likelihood:                -405.24
converged:                       True   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                 1.049e-84
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                         -79.0042     13.262     -5.957  

In [49]:
# hour(0.353) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ C(steel_kind) + pt_width + pt_length + fur_heat_temp + fur_heat_time + fur_soak_temp + C(rolling_method) + rolling_temp + descaling_count"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.408119
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      984
Method:                           MLE   Df Model:                            9
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.3424
Time:                        13:08:24   Log-Likelihood:                -405.67
converged:                       True   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                 2.273e-85
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                         -78.7854     13.179     -5.978  

In [50]:
# fur_heat_temp(0.062) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ C(steel_kind) + pt_width + pt_length + fur_heat_time + fur_soak_temp + C(rolling_method) + rolling_temp + descaling_count"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.409871
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      985
Method:                           MLE   Df Model:                            8
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.3395
Time:                        13:09:41   Log-Likelihood:                -407.41
converged:                       True   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                 1.684e-85
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                         -89.3515     12.178     -7.337  

In [51]:
# rolling_method(0.164) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ C(steel_kind) + pt_width + pt_length + fur_heat_time + fur_soak_temp + rolling_temp + descaling_count"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.410883
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      986
Method:                           MLE   Df Model:                            7
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.3379
Time:                        13:10:52   Log-Likelihood:                -408.42
converged:                       True   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                 5.666e-86
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept            -89.3095     12.212     -7.313      0.000    -113.245     -65.374
C(ste

In [52]:
# steel_kind(0.266) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ pt_width + pt_length + fur_heat_time + fur_soak_temp + rolling_temp + descaling_count"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.411515
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      987
Method:                           MLE   Df Model:                            6
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.3369
Time:                        13:11:34   Log-Likelihood:                -409.05
converged:                       True   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                 1.212e-86
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept         -96.3392     10.612     -9.078      0.000    -117.138     -75.540
pt_width      

In [53]:
# pt_length(0.106) 변수 제거
import statsmodels.api as sm

# 범주형 변수를 C() 함수를 사용하여 처리
formula = "scale ~ pt_width + fur_heat_time + fur_soak_temp + rolling_temp + descaling_count"

# 모델 적합
model = sm.Logit.from_formula(formula, data=df)
result = model.fit()

# 결과 출력
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.412833
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  scale   No. Observations:                  994
Model:                          Logit   Df Residuals:                      988
Method:                           MLE   Df Model:                            5
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.3348
Time:                        13:12:10   Log-Likelihood:                -410.36
converged:                       True   LL-Null:                       -616.87
Covariance Type:            nonrobust   LLR p-value:                 4.630e-87
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept         -92.4601     10.315     -8.964      0.000    -112.676     -72.244
pt_width      

- pt_width, fur_heat_time, fur_soak_temp, rolling_temp, descaling_count 해당 변수들의 p-value는 유의수준 0.05보다 작으므로 scale에 유의한 영향을 미친다고 할 수 있다.

In [56]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 다중공선성을 평가할 변수 선택
conti_col = ['pt_width', 'fur_heat_time', 'fur_soak_temp','descaling_count', 'rolling_temp']
X = df[conti_col].reset_index(drop=True)

# VIF 계산
vif = pd.DataFrame()
vif["Variable"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# 결과 출력
print(vif)

          Variable         VIF
0         pt_width   44.490690
1    fur_heat_time   11.614419
2    fur_soak_temp  436.132395
3  descaling_count   39.951777
4     rolling_temp  334.004054
