In [2]:
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm

#### 통합 데이터 불러오기

In [5]:
# Read the merged dataset and discard the 'Unnamed: 0' column in a single
# chained expression (presumably an index saved with the CSV -- TODO confirm).
merged_csv_path = '../data_2/merge_data.csv'
df = pd.read_csv(merged_csv_path).drop(columns=['Unnamed: 0'])
df

Unnamed: 0,개월령,생시체중,근내평균,도체범위근내평균,체고,체장,형매도축수,형매도체평균,형매근내평균,형매근내평균가산,6개월내출산,농가근내평균,농가근내평균가산,근내EPD,근내,점수,농가구분,TARGET1
0,46,644.64,3.34,3.23,131.4,158.9,,,,,N,4.16,-1.05,0.18,6,4,농가소,0
1,46,633.93,4.59,4.52,129.7,148.4,,,,,Y,4.16,-1.05,0.17,4,3,농가소,0
2,32,567.86,5.50,4.89,130.3,158.0,,,,,N,5.58,0.53,0.42,7,5,농가소,1
3,33,564.29,6.13,5.56,130.7,155.9,,,,,Y,3.78,-0.91,0.42,2,2,농가소,0
4,48,692.86,4.79,4.61,131.2,146.1,,,,,Y,3.90,-1.35,0.82,4,3,농가소,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99850,30,668.00,4.18,4.06,137.6,161.4,,,,,Y,5.67,1.57,-0.01,2,2,우시장소,0
99851,30,606.00,5.43,4.38,133.7,156.4,,,,,N,5.14,0.65,0.07,3,2,우시장소,0
99852,30,576.00,3.99,3.65,130.6,148.7,,,,,N,,,0.01,4,3,우시장소,0
99853,30,657.00,4.33,4.69,132.3,154.3,,,,,N,5.00,1.40,0.02,4,3,우시장소,0


In [8]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
df_copy = df.copy(deep=True)
print(df_copy.shape)
# Show just the first row as a quick sanity check of the columns.
df_copy.iloc[:1]

(99855, 18)


Unnamed: 0,개월령,생시체중,근내평균,도체범위근내평균,체고,체장,형매도축수,형매도체평균,형매근내평균,형매근내평균가산,6개월내출산,농가근내평균,농가근내평균가산,근내EPD,근내,점수,농가구분,TARGET1
0,46,644.64,3.34,3.23,131.4,158.9,,,,,N,4.16,-1.05,0.18,6,4,농가소,0


##### 로지스틱 반복문

In [10]:
# Number of columns (second entry of the frame's shape), followed by their labels.
print(df_copy.shape[1])
df_copy.columns

18


Index(['개월령', '생시체중', '근내평균', '도체범위근내평균', '체고', '체장', '형매도축수', '형매도체평균',
       '형매근내평균', '형매근내평균가산', '6개월내출산', '농가근내평균', '농가근내평균가산', '근내EPD', '근내',
       '점수', '농가구분', 'TARGET1'],
      dtype='object')

In [28]:
# Summarise only the object-dtype columns: count, unique, top and freq.
df_copy.describe(include=['object'])

Unnamed: 0,6개월내출산,농가구분
count,99855,99855
unique,2,2
top,Y,농가소
freq,56308,92315


###### train_test_split 7:3

In [55]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the TARGET1 label, then hold out 30% of the
# rows with a fixed seed so the split is reproducible.
X = df_copy.drop(columns=['TARGET1'])
Y = df_copy['TARGET1']
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=2023
)

# Nominal 70% / 30% row counts (unrounded), for comparison with the actual
# partition sizes printed below.
total_rows = df_copy.shape[0]
print(f'데이터 셋 70%:{total_rows*0.7}')
print(f'데이터 셋 30%:{total_rows*0.3}')

# Re-attach the label to each partition so each one is a single frame
# containing both predictors and TARGET1.
train = pd.concat([X_train, Y_train], axis=1)
test = pd.concat([X_test, Y_test], axis=1)

print(f'train set:{len(train)}')
print(f'test set:{len(test)}')

데이터 셋 70%:69898.5
데이터 셋 30%:29956.5
train set:69898
test set:29957


##### train data set, 결측치 처리 전 logistic 결과

In [60]:
# Fit one single-predictor logistic regression per column position 0..13 of
# `train`, printing the model summary and the odds ratios for each.
# NOTE: the formula string is evaluated in this cell's scope and refers to
# `i` and `train` by name, so the loop variable must stay named `i`.
for i in range(14):
    model = sm.Logit.from_formula('TARGET1 ~ train.iloc[:,i]', train).fit()
    print(f'독립변수 이름: {train.columns[i]}')
    print(model.summary())

    # Odds ratios with confidence bounds: exponentiate the coefficients and
    # both columns of the coefficient confidence interval.
    conf_bounds = model.conf_int()
    odds_ratios = np.exp(
        pd.DataFrame(
            {
                "OR": model.params,
                "Lower CI": conf_bounds[0],
                "Upper CI": conf_bounds[1],
            }
        )
    )
    print(odds_ratios)

    banner = '============='*3
    print(banner, f'{i+1}번째 결과 끝', banner)
    print('\n')

Optimization terminated successfully.
         Current function value: 0.413395
         Iterations 6
독립변수 이름: 개월령
                           Logit Regression Results                           
Dep. Variable:                TARGET1   No. Observations:                69898
Model:                          Logit   Df Residuals:                    69896
Method:                           MLE   Df Model:                            1
Date:                Sun, 15 Oct 2023   Pseudo R-squ.:                 0.02440
Time:                        15:20:56   Log-Likelihood:                -28895.
converged:                       True   LL-Null:                       -29618.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.1473      0.051      2.904      0.004       0.048       0.24

In [34]:
# Print the distinct values of the two object-dtype columns.
for level_col in ['6개월내출산', '농가구분']:
    print(df_copy[level_col].unique())

['N' 'Y']
['농가소' '우시장소']


In [63]:
# Logistic regression of TARGET1 on the categorical column '6개월내출산',
# using "N" as the reference level (treatment coding).
# The column is referenced as train["6개월내출산"] because its name starts
# with a digit and so cannot be written as a bare identifier in the formula.
model1 = sm.Logit.from_formula('TARGET1 ~ C(train["6개월내출산"], Treatment("N"))', train).fit()
print(model1.summary())

# Odds ratios with confidence bounds for THIS fit.
# Fix: these were previously built from `model` (the last fit left over from
# the univariate loop cell), so the printed odds ratios did not belong to the
# summary printed above. They now come from `model1`.
conf_bounds = model1.conf_int()
odds_ratios = pd.DataFrame(
    {
        "OR": model1.params,
        "Lower CI": conf_bounds[0],
        "Upper CI": conf_bounds[1],
    }
)
# Coefficients are on the log-odds scale; exponentiate to get odds ratios.
odds_ratios = np.exp(odds_ratios)
print(odds_ratios)

Optimization terminated successfully.
         Current function value: 0.411595
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                TARGET1   No. Observations:                69898
Model:                          Logit   Df Residuals:                    69896
Method:                           MLE   Df Model:                            1
Date:                Sun, 15 Oct 2023   Pseudo R-squ.:                 0.02864
Time:                        15:22:25   Log-Likelihood:                -28770.
converged:                       True   LL-Null:                       -29618.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                              coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------------
Intercept                                  -1.3010

In [64]:
# Logistic regression of TARGET1 on the categorical column '농가구분',
# using "우시장소" as the reference level (treatment coding).
model1 = sm.Logit.from_formula('TARGET1 ~ C(농가구분, Treatment("우시장소"))', train).fit()
print(model1.summary())

# Odds ratios with confidence bounds for THIS fit.
# Fix: these were previously built from `model` (a stale fit from an earlier
# cell) rather than the `model1` fitted just above, so the table described a
# different predictor than the printed summary.
conf_bounds = model1.conf_int()
odds_ratios = pd.DataFrame(
    {
        "OR": model1.params,
        "Lower CI": conf_bounds[0],
        "Upper CI": conf_bounds[1],
    }
)
# Coefficients are on the log-odds scale; exponentiate to get odds ratios.
odds_ratios = np.exp(odds_ratios)
print(odds_ratios)

Optimization terminated successfully.
         Current function value: 0.423709
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                TARGET1   No. Observations:                69898
Model:                          Logit   Df Residuals:                    69896
Method:                           MLE   Df Model:                            1
Date:                Sun, 15 Oct 2023   Pseudo R-squ.:               5.523e-05
Time:                        15:22:37   Log-Likelihood:                -29616.
converged:                       True   LL-Null:                       -29618.
Covariance Type:            nonrobust   LLR p-value:                   0.07049
                                        coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Intercept                            -1.6639      0.038    -44