## 타이타닉 데이터 VIF 예제
* target: Survived
* PassengerId, Name, Ticket, Cabin 컬럼은 삭제
* NaN 데이터는 삭제
* Sex, Embarked는 레이블 인코딩

In [1]:
import numpy as np
import pandas as pd

from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

from library.preprocessing import DataFramePreprocessor
from library.preprocessing import FeatureSelector

In [2]:
# Load the Titanic training set (path is relative to the working directory) and display it.
titanic = pd.read_csv("data/titanic_train.csv")
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# 전처리

## 컬럼 제거

In [3]:
# Remove the identifier-like / free-text columns in place; they are not used as features.
titanic.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], inplace=True)

In [4]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [5]:
titanic.describe(include="all")

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891,714.0,891.0,891.0,891.0,889
unique,,,2,,,,,3
top,,,male,,,,,S
freq,,,577,,,,,644
mean,0.383838,2.308642,,29.699118,0.523008,0.381594,32.204208,
std,0.486592,0.836071,,14.526497,1.102743,0.806057,49.693429,
min,0.0,1.0,,0.42,0.0,0.0,0.0,
25%,0.0,2.0,,20.125,0.0,0.0,7.9104,
50%,0.0,3.0,,28.0,0.0,0.0,14.4542,
75%,1.0,3.0,,38.0,1.0,0.0,31.0,


## LabelEncoding

### Sex, Embarked는 레이블 인코딩

In [6]:
# One LabelEncoder instance plus the project helper that applies it to DataFrame columns.
lbe = LabelEncoder()
dfp = DataFramePreprocessor()

In [7]:
# Label-encode 'Sex' and 'Embarked' through the project helper.
# `df` contains only the two encoded columns (see the info() output);
# `tfs` is the helper's second return value — presumably the fitted transformer(s), TODO confirm.
df, tfs = dfp.fit_transform_single_transformer(titanic, lbe, ['Sex', 'Embarked'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Sex       891 non-null    int32
 1   Embarked  891 non-null    int32
dtypes: int32(2)
memory usage: 7.1 KB


In [8]:
# Write the encoded values back into the matching columns of `titanic` in place.
titanic.update(df)
# Cast explicitly so both columns end up as int32 (see the info() output below).
titanic[['Sex', 'Embarked']] = titanic[['Sex', 'Embarked']].astype("int32")
# NOTE(review): Embarked had 889 non-null values before encoding and 891 afterwards, so the
# two missing entries appear to have been encoded as an ordinary label; the later dropna()
# will therefore not remove those rows — confirm whether that is intended.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    int32  
dtypes: float64(2), int32(2), int64(4)
memory usage: 48.9 KB


In [9]:
titanic

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.2500,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.9250,2
3,1,1,0,35.0,1,0,53.1000,2
4,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,2
887,1,1,0,19.0,0,0,30.0000,2
888,0,3,0,,1,2,23.4500,2
889,1,1,1,26.0,0,0,30.0000,0


## 널 값 처리

### Case1. Null 제거

In [10]:
# Keep only the rows that have no missing value in any column.
no_null_titanic = titanic.dropna(axis="index", how="any")
no_null_titanic

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.2500,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.9250,2
3,1,1,0,35.0,1,0,53.1000,2
4,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
885,0,3,0,39.0,0,5,29.1250,1
886,0,2,1,27.0,0,0,13.0000,2
887,1,1,0,19.0,0,0,30.0000,2
889,1,1,1,26.0,0,0,30.0000,0


In [11]:
no_null_titanic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 714 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  714 non-null    int64  
 1   Pclass    714 non-null    int64  
 2   Sex       714 non-null    int32  
 3   Age       714 non-null    float64
 4   SibSp     714 non-null    int64  
 5   Parch     714 non-null    int64  
 6   Fare      714 non-null    float64
 7   Embarked  714 non-null    int32  
dtypes: float64(2), int32(2), int64(4)
memory usage: 44.6 KB


# VIF 분석

In [12]:
def get_vif(formula, df):
    """Return the variance inflation factor of every design-matrix column.

    Args:
        formula: patsy formula string handed to ``dmatrices``.
        df: data the formula is evaluated against.

    Returns:
        A DataFrame with a ``features`` column (design-matrix column names,
        including any intercept column patsy adds) and a ``VIF Factor``
        column holding one value per design-matrix column.
    """
    # Only the right-hand-side design matrix is needed; the response is discarded.
    _, design = dmatrices(formula, df, return_type="dataframe")
    exog = design.values

    factors = []
    for col_idx in range(design.shape[1]):
        factors.append(variance_inflation_factor(exog, col_idx))

    return pd.DataFrame({"features": design.columns, "VIF Factor": factors})

In [13]:
no_null_titanic.columns.difference(['Survived'])

Index(['Age', 'Embarked', 'Fare', 'Parch', 'Pclass', 'Sex', 'SibSp'], dtype='object')

In [14]:
# Build the patsy formula: target on the left, every other column joined by '+' on the right.
formula = "~".join(("Survived", "+".join(no_null_titanic.columns.difference(['Survived']))))
formula

'Survived~Age+Embarked+Fare+Parch+Pclass+Sex+SibSp'

In [15]:
get_vif(formula, no_null_titanic)

Unnamed: 0,features,VIF Factor
0,Intercept,32.422909
1,Age,1.31465
2,Embarked,1.107727
3,Fare,1.642829
4,Parch,1.287749
5,Pclass,1.756618
6,Sex,1.117924
7,SibSp,1.277192


In [16]:
fs = FeatureSelector()
# Use the explicitly NaN-free frame, as the get_vif call does, instead of relying on
# rows with missing values being dropped implicitly during formula evaluation.
fs.vif_analysis(formula, no_null_titanic)

                            OLS Regression Results                            
Dep. Variable:               Survived   R-squared:                       0.402
Model:                            OLS   Adj. R-squared:                  0.396
Method:                 Least Squares   F-statistic:                     67.92
Date:                Fri, 08 Sep 2023   Prob (F-statistic):           8.65e-75
Time:                        01:52:22   Log-Likelihood:                -321.61
No. Observations:                 714   AIC:                             659.2
Df Residuals:                     706   BIC:                             695.8
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.4012      0.081     17.223      0.0

## Permutation Importance 분석

In [17]:
from sklearn.inspection import permutation_importance

In [18]:
# Build the response / design matrices from the explicitly NaN-free frame so the rows used
# here are the same ones used for the VIF computation, without depending on implicit NA dropping.
y, X = dmatrices(formula, no_null_titanic, return_type="dataframe")

In [19]:
# Fit a decision tree with default settings on the patsy design matrix.
# NOTE(review): X still contains the constant 'Intercept' column, and no random_state is
# set, so repeated runs may not reproduce exactly — confirm whether that is intended.
dtc = DecisionTreeClassifier()
dtc.fit(X, y)

In [20]:
# Compute the permutation importance of each design-matrix column for the fitted tree,
# evaluated on the same X / y the tree was trained on.
perm_importance = permutation_importance(dtc, X, y)

# Print the mean importance per feature (values are in X.columns order).
print("피처 중요도:", perm_importance.importances_mean)

피처 중요도: [0.         0.25686275 0.04621849 0.16638655 0.02521008 0.20336134
 0.2535014  0.05658263]


In [21]:
X.columns

Index(['Intercept', 'Age', 'Embarked', 'Fare', 'Parch', 'Pclass', 'Sex',
       'SibSp'],
      dtype='object')

In [22]:
# Pair each design-matrix column name with its mean permutation importance.
pi = pd.DataFrame(
    {
        "feature": X.columns,
        "perm_importance": perm_importance.importances_mean,
    }
)

In [23]:
pi

Unnamed: 0,feature,perm_importance
0,Intercept,0.0
1,Age,0.256863
2,Embarked,0.046218
3,Fare,0.166387
4,Parch,0.02521
5,Pclass,0.203361
6,Sex,0.253501
7,SibSp,0.056583


In [24]:
# Same analysis through the project helper; it returns a feature / perm_importance table.
# NOTE(review): the values differ slightly from the manual run above — presumably because
# the permutations are not seeded; confirm.
fs.get_permutation_importance(dtc, X, y)

Unnamed: 0,feature,perm_importance
0,Intercept,0.0
1,Age,0.248739
2,Embarked,0.04902
3,Fare,0.163305
4,Parch,0.023529
5,Pclass,0.197479
6,Sex,0.254062
7,SibSp,0.060504
