In [1]:
import pandas as pd

import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.api import qqplot, add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.linear_model import Ridge, Lasso, ElasticNet

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

import platform
platform.system()

# 운영체제별 한글 폰트 설정
if platform.system() == 'Darwin': # Mac 환경 폰트 설정
    plt.rc('font', family='AppleGothic')
elif platform.system() == 'Windows': # Windows 환경 폰트 설정
    plt.rc('font', family='Malgun Gothic')

plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정


import warnings
warnings.filterwarnings('ignore')

In [3]:
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'NanumGothic'

matplotlib.rc("axes", unicode_minus = False)

In [4]:
df = pd.read_csv("SCALE불량.csv", encoding = "euc-kr")
df.head(3)

Unnamed: 0,plate_no,rolling_date,scale,spec_long,spec_country,steel_kind,pt_thick,pt_width,pt_length,hsb,...,fur_heat_temp,fur_heat_time,fur_soak_temp,fur_soak_time,fur_total_time,fur_ex_temp,rolling_method,rolling_temp,descaling_count,work_group
0,PLT_1001,03JAN2023:07:07:53,양품,AB/EH32-TM,미국,T,32,3700,15100,적용,...,1144,116,1133,59,259,1133,TMCP(온도제어),934,8,1조
1,PLT_1002,03JAN2023:07:21:22,양품,AB/EH32-TM,미국,T,32,3700,15100,적용,...,1144,122,1135,53,238,1135,TMCP(온도제어),937,8,1조
2,PLT_1003,03JAN2023:07:31:15,양품,NV-E36-TM,영국,T,33,3600,19200,적용,...,1129,116,1121,55,258,1121,TMCP(온도제어),889,8,1조


In [5]:
df['scale'].unique()

array(['양품', '불량'], dtype=object)

In [6]:
df['scale'] = df['scale'].replace({'양품': 0, '불량': 1})

In [7]:
# from sklearn.preprocessing import LabelEncoder

# label_encoder = LabelEncoder()

# # 'hsb' 열을 숫자로 인코딩
# df['hsb'] = label_encoder.fit_transform(df['hsb']) # 1: 적용, 0 : 미적용

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   plate_no         1000 non-null   object
 1   rolling_date     1000 non-null   object
 2   scale            1000 non-null   int64 
 3   spec_long        1000 non-null   object
 4   spec_country     1000 non-null   object
 5   steel_kind       1000 non-null   object
 6   pt_thick         1000 non-null   int64 
 7   pt_width         1000 non-null   int64 
 8   pt_length        1000 non-null   int64 
 9   hsb              1000 non-null   object
 10  fur_no           1000 non-null   object
 11  fur_input_row    1000 non-null   object
 12  fur_heat_temp    1000 non-null   int64 
 13  fur_heat_time    1000 non-null   int64 
 14  fur_soak_temp    1000 non-null   int64 
 15  fur_soak_time    1000 non-null   int64 
 16  fur_total_time   1000 non-null   int64 
 17  fur_ex_temp      1000 non-null   i

In [9]:
df['rolling_date'] = pd.to_datetime(df['rolling_date'], format='%d%b%Y:%H:%M:%S', errors='coerce')
df['hour'] = df['rolling_date'].dt.hour
df.drop(columns=["rolling_date",'fur_ex_temp','plate_no'],inplace=True)

In [10]:
numeric_df = df.select_dtypes(include=['float64', 'int64']) 
numeric_df.corr()  

Unnamed: 0,scale,pt_thick,pt_width,pt_length,fur_heat_temp,fur_heat_time,fur_soak_temp,fur_soak_time,fur_total_time,rolling_temp,descaling_count
scale,1.0,-0.09809,-0.189536,0.052115,0.222493,-0.035826,0.327817,-0.219581,-0.127695,0.375704,-0.040011
pt_thick,-0.09809,1.0,-0.319894,-0.867125,-0.525317,0.206644,-0.665964,0.35726,0.338078,-0.363406,-0.845324
pt_width,-0.189536,-0.319894,1.0,0.147979,0.200737,-0.100876,0.216879,-0.097273,-0.133329,-0.06032,0.350132
pt_length,0.052115,-0.867125,0.147979,1.0,0.470846,-0.137895,0.603303,-0.33373,-0.276402,0.290828,0.818483
fur_heat_temp,0.222493,-0.525317,0.200737,0.470846,1.0,-0.163792,0.787203,-0.373154,-0.313632,0.268803,0.488292
fur_heat_time,-0.035826,0.206644,-0.100876,-0.137895,-0.163792,1.0,-0.349176,0.114804,0.736069,-0.083188,-0.166892
fur_soak_temp,0.327817,-0.665964,0.216879,0.603303,0.787203,-0.349176,1.0,-0.59682,-0.560808,0.45921,0.627843
fur_soak_time,-0.219581,0.35726,-0.097273,-0.33373,-0.373154,0.114804,-0.59682,1.0,0.632995,-0.308021,-0.399353
fur_total_time,-0.127695,0.338078,-0.133329,-0.276402,-0.313632,0.736069,-0.560808,0.632995,1.0,-0.208733,-0.339263
rolling_temp,0.375704,-0.363406,-0.06032,0.290828,0.268803,-0.083188,0.45921,-0.308021,-0.208733,1.0,0.250112


In [11]:
df.loc[df['rolling_temp'] >900, 'rolling_temp_>900'] = 1
df.loc[df['rolling_temp'] <=900, 'rolling_temp_>900'] = 0

df['pt_area'] = df['pt_length'] * df['pt_width']
df['fur_temp_gap'] = df['fur_heat_temp'] - df['fur_soak_temp']

df = df[df['rolling_temp'] != 0]

In [14]:
from scipy.stats import chi2_contingency

# object 타입인 변수들의 열 이름 리스트
object_columns = ['spec_long', 'spec_country', 'steel_kind','hsb' , 'fur_no', 'fur_input_row', 'rolling_method', 'work_group']

for column in object_columns:
    # 카이제곱 검정을 위한 교차표 생성
    cross_tab = pd.crosstab(df['scale'], df[column])
    
    # 카이제곱 검정 수행
    chi2, p_value, _, _ = chi2_contingency(cross_tab)
    
    # 결과 출력
    print(f"Chi-square test for {column}:")
    print("Chi-square test statistic:", chi2)
    print("P-value:", p_value)
    if p_value < 0.05:
        print("유의수준 0.05에서 귀무가설 기각: 'scale'과", column, "사이에는 유의한 관련성이 있다.")
    else:
        print("유의수준 0.05에서 귀무가설 채택: 'scale'과", column, "사이에는 유의한 관련성이 없다.")
    print("\n")

Chi-square test for spec_long:
Chi-square test statistic: 235.94685749378158
P-value: 3.113316475144594e-21
유의수준 0.05에서 귀무가설 기각: 'scale'과 spec_long 사이에는 유의한 관련성이 있다.


Chi-square test for spec_country:
Chi-square test statistic: 69.80075036426382
P-value: 4.4922555761885164e-13
유의수준 0.05에서 귀무가설 기각: 'scale'과 spec_country 사이에는 유의한 관련성이 있다.


Chi-square test for steel_kind:
Chi-square test statistic: 76.25774182995244
P-value: 2.489547428454086e-18
유의수준 0.05에서 귀무가설 기각: 'scale'과 steel_kind 사이에는 유의한 관련성이 있다.


Chi-square test for hsb:
Chi-square test statistic: 105.51048606504317
P-value: 9.439705302426995e-25
유의수준 0.05에서 귀무가설 기각: 'scale'과 hsb 사이에는 유의한 관련성이 있다.


Chi-square test for fur_no:
Chi-square test statistic: 3.1186222255276252
P-value: 0.2102808811312071
유의수준 0.05에서 귀무가설 채택: 'scale'과 fur_no 사이에는 유의한 관련성이 없다.


Chi-square test for fur_input_row:
Chi-square test statistic: 0.9203672191669445
P-value: 0.3373785709791819
유의수준 0.05에서 귀무가설 채택: 'scale'과 fur_input_row 사이에는 유의한 관련성이 없다.


C

In [15]:
ㅁㄴㅇ

NameError: name 'ᄆᄂᄋ' is not defined

#### rolling_date를 문자를 제거한 후 daytime으로 변환하여 날짜까지 나타내는 Datetime과 시간 값을 가진 time으로 나눔

In [None]:
df['datetime'] = pd.to_datetime(df['rolling_date'], format='%d%b%Y:%H:%M:%S', errors='coerce')
df['hour'] = df['datetime'].dt.hour

In [None]:
percentage_by_hour = df.groupby('hour')['scale'].mean() * 100

#### 파생변수 생성

In [None]:
# 후판 면적 파생변수 생성


In [None]:
# 가열대와 균열대 온도차 파생변수 생성


In [None]:
df[(df['hsb']=="적용") & (df['scale']==1)]

In [None]:
df_scale_1 = df[df['scale']==1]
df_scale_0 = df[df['scale']==0]

In [None]:
sns.boxplot(df_scale_0)

#### 이상치 확인

In [None]:
df.describe()

In [None]:
df['rolling_temp'].unique()

In [None]:
df.info()

#### rolling_temp = 0인 값은 이상치로 판단하여 제거

In [None]:
df.info()

### 카이제곱 검정

In [None]:
# chi-square test 실행
chi, pval, dof, expected = stats.chi2_contingency(df)

# chi-square test 결과 출력
print( "chi-square test ")
print( ' chisq: {0:0.3f}' .format(chi))
print( ' p: {0:0.3f}'.format(pval))
print( ' degree pf freedom: {}' .format(dof))
print( ' expected value: \n{}' .format(expected.round(3)))