In [1]:
# 類別型資料通常會以最常出現的類別值或是任意字串增補
import pandas as pd

# to split the data sets:
from sklearn.model_selection import train_test_split

# to impute missing data with sklearn:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# to impute missing data with Feature-engine:
from feature_engine.imputation import CategoricalImputer

  from pandas.core import (


In [2]:
data = pd.read_csv("credit_approval_uci.csv")
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop("target", axis=1),
    data["target"],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [4]:
# 選擇類別型變數
categorical_vars = X_train.select_dtypes(include="O").columns.to_list()
categorical_vars

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [5]:
# Let's inspect the proportion of missing
# values per variable:

data[categorical_vars].isnull().mean()

A1     0.017391
A4     0.008696
A5     0.008696
A6     0.013043
A7     0.013043
A9     0.133333
A10    0.133333
A12    0.000000
A13    0.000000
dtype: float64

In [6]:
# 使用pandas
# Learn the variables' most frequent values:
frequent_values = X_train[categorical_vars].mode().iloc[0].to_dict()
frequent_values

{'A1': 'b',
 'A4': 'u',
 'A5': 'g',
 'A6': 'c',
 'A7': 'v',
 'A9': 't',
 'A10': 'f',
 'A12': 'f',
 'A13': 'g'}

In [7]:
print(type(X_train))

<class 'pandas.core.frame.DataFrame'>


In [7]:
# Replace missing data by the frequent category:
X_train = X_train.fillna(value=frequent_values)
X_test = X_test.fillna(value=frequent_values)

In [8]:
# Corroborate absence of missing values:
X_train[categorical_vars].isnull().sum()

A1     0
A4     0
A5     0
A6     0
A7     0
A9     0
A10    0
A12    0
A13    0
dtype: int64

In [10]:
# 以字串增補
# Corroborate absence of missing values:
X_test[categorical_vars].isnull().sum()

A1     0
A4     0
A5     0
A6     0
A7     0
A9     0
A10    0
A12    0
A13    0
dtype: int64

In [11]:
# Split the data into train and test sets:

X_train, X_test, y_train, y_test = train_test_split(
    data.drop("target", axis=1),
    data["target"],
    test_size=0.3,
    random_state=0,
)

In [12]:
imputation_dict = {var: "no_data" for var in categorical_vars}

imputation_dict

{'A1': 'no_data',
 'A4': 'no_data',
 'A5': 'no_data',
 'A6': 'no_data',
 'A7': 'no_data',
 'A9': 'no_data',
 'A10': 'no_data',
 'A12': 'no_data',
 'A13': 'no_data'}

In [13]:
# Replace missing data with a specific string:

X_train.fillna(value=imputation_dict, inplace=True)
X_test.fillna(value=imputation_dict, inplace=True)

In [14]:
# Check the values of an imputed variable:

X_train["A1"].value_counts()

A1
b          335
a          144
no_data      4
Name: count, dtype: int64

In [15]:
# 使用Scikit-learn
# Split data into train and test set:

X_train, X_test, y_train, y_test = train_test_split(
    data.drop("target", axis=1),
    data["target"],
    test_size=0.3,
    random_state=0,
)

In [16]:
# Make a list with the numerical variables:
remaining_vars = [var for var in X_train.columns if var not in categorical_vars]
remaining_vars

['A2', 'A3', 'A8', 'A11', 'A14', 'A15']

In [17]:
# Set up the imputer to replace missing data with
# the most frequent category:

imputer = SimpleImputer(strategy="most_frequent")

# Indicate which variables to impute:
ct = ColumnTransformer(
    [("imputer", imputer, categorical_vars)], remainder="passthrough"
)

# Find the most frequent value per variable:
ct.fit(X_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [18]:
# Check the most frequent categories:
ct.named_transformers_.imputer.statistics_

array(['b', 'u', 'g', 'c', 'v', 't', 'f', 'f', 'g'], dtype=object)

In [19]:
# Replace missing data:

X_train = ct.transform(X_train)
X_test = ct.transform(X_test)

X_train

array([['a', 'u', 'g', ..., 8.0, 396.0, 4159.0],
       ['a', 'u', 'g', ..., 0.0, 120.0, 0.0],
       ['b', 'y', 'p', ..., 1.0, 50.0, 1187.0],
       ...,
       ['a', 'u', 'g', ..., 0.0, 220.0, 5.0],
       ['a', 'u', 'g', ..., 7.0, 140.0, 2384.0],
       ['b', 'u', 'g', ..., 0.0, 400.0, 0.0]], dtype=object)

In [20]:
# Convert array to a pandas dataframe:

X_train = pd.DataFrame(
    X_train,
    columns=categorical_vars + remaining_vars,
)

X_train.head()

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13,A2,A3,A8,A11,A14,A15
0,a,u,g,c,v,t,t,t,g,46.08,3.0,2.375,8.0,396.0,4159.0
1,a,u,g,q,v,f,f,f,g,15.92,2.875,0.085,0.0,120.0,0.0
2,b,y,p,w,v,t,t,f,g,36.33,2.125,0.085,1.0,50.0,1187.0
3,b,y,p,ff,ff,f,f,f,g,22.17,0.585,0.0,0.0,100.0,0.0
4,b,u,g,m,v,t,t,t,g,57.83,7.04,14.0,6.0,360.0,1332.0


In [21]:
# Corroborate absence of missing values:

X_train[categorical_vars].isnull().sum()

A1     0
A4     0
A5     0
A6     0
A7     0
A9     0
A10    0
A12    0
A13    0
dtype: int64

In [22]:
# Convert array to a pandas dataframe:

X_test = pd.DataFrame(
    X_test,
    columns=categorical_vars + remaining_vars,
)

X_test.head()

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13,A2,A3,A8,A11,A14,A15
0,a,u,g,q,v,t,t,t,g,45.83,10.5,5.0,7.0,0.0,0.0
1,b,u,g,x,h,t,t,t,g,64.08,20.0,17.5,9.0,0.0,1000.0
2,a,u,g,cc,h,t,t,t,g,31.25,3.75,0.625,9.0,181.0,0.0
3,b,u,g,m,v,t,t,f,g,39.25,9.5,6.5,14.0,240.0,4607.0
4,a,u,g,j,j,f,f,t,g,26.17,2.0,0.0,0.0,276.0,1.0


In [23]:
# Corroborate absence of missing values:

X_test[categorical_vars].isnull().sum()

A1     0
A4     0
A5     0
A6     0
A7     0
A9     0
A10    0
A12    0
A13    0
dtype: int64

In [24]:
# 以字串增補
# Split data into train and test set:

X_train, X_test, y_train, y_test = train_test_split(
    data.drop("target", axis=1),
    data["target"],
    test_size=0.3,
    random_state=0,
)

In [25]:
# Set up the imputer to replace missing data with
# the string "missing":

imputer = SimpleImputer(strategy="constant", fill_value="missing")

# Indicate which variables to impute:
ct = ColumnTransformer(
    [("imputer", imputer, categorical_vars)], remainder="passthrough"
)

# Replace missing data:
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

In [26]:
# Convert array to a pandas dataframe:

X_train = pd.DataFrame(
    X_train,
    columns=categorical_vars + remaining_vars,
)

# Inspect the values in an imputed variable:

X_train["A1"].value_counts()

A1
b          335
a          144
missing      4
Name: count, dtype: int64

In [27]:
# 使用Feature-engine
# Split data into train and test set:

X_train, X_test, y_train, y_test = train_test_split(
    data.drop("target", axis=1),
    data["target"],
    test_size=0.3,
    random_state=0,
)

In [28]:
# Set up the imputer to replace missing
# data with the most frequent category:

imputer = CategoricalImputer(
    imputation_method="frequent",
    variables=categorical_vars,
)

imputer.fit(X_train)

In [29]:
# Most frequent category per variable:
imputer.imputer_dict_

{'A1': 'b',
 'A4': 'u',
 'A5': 'g',
 'A6': 'c',
 'A7': 'v',
 'A9': 't',
 'A10': 'f',
 'A12': 'f',
 'A13': 'g'}

In [30]:
# Replace missing data with the most
# frequent category:

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [31]:
# Corroborate absence of missing values:

X_train[categorical_vars].isnull().sum()

A1     0
A4     0
A5     0
A6     0
A7     0
A9     0
A10    0
A12    0
A13    0
dtype: int64

In [32]:
# Corroborate absence of missing values:

X_test[categorical_vars].isnull().sum()

A1     0
A4     0
A5     0
A6     0
A7     0
A9     0
A10    0
A12    0
A13    0
dtype: int64

In [33]:
# 以字串增補
# Split data into train and test set:

X_train, X_test, y_train, y_test = train_test_split(
    data.drop("target", axis=1),
    data["target"],
    test_size=0.3,
    random_state=0,
)

In [34]:
# Set up the imputer to replace missing data
# with the string "other":

imputer = CategoricalImputer(
    imputation_method="missing",
    fill_value="other",
    variables=categorical_vars,
)

# Replace missing data with the string "other":

X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [35]:
# Inspect the values in an imputed variable:

X_train["A1"].value_counts()

A1
b        335
a        144
other      4
Name: count, dtype: int64