In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

import matplotlib.pyplot as plt
%matplotlib inline

import platform

from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus'] = False

# Choose a matplotlib font per host OS (presumably so Korean labels render;
# confirm against the plots this notebook is meant to produce).
system_name = platform.system()

if system_name == 'Darwin':  # macOS
    rc('font', family='AppleGothic')
elif system_name == 'Windows':  # Windows
    # Resolve the family name from the malgun.ttf font file, then register it.
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    # No font is configured on any other platform; only a notice is printed.
    print('Unknown system...  sorry~~~')

# 범주형 데이터의 처리

## patsy 패키지를 사용한 더미변수화

In [32]:
# Two-row frame with a single string-valued categorical column, "sex".
df = pd.DataFrame({"sex": ["male", "female"]})
df

Unnamed: 0,sex
0,male
1,female


In [5]:
from patsy import dmatrix

# "+ 0" removes the intercept, so every level of `sex` gets its own
# indicator column (full-rank dummy coding).
dmatrix("sex + 0", data=df)

NameError: name 'df' is not defined

In [2]:
# Four-row frame with a single string-valued categorical column, "blood".
df2 = pd.DataFrame({"blood": ["a", "b", "o", "ab"]})
df2

Unnamed: 0,blood
0,a
1,b
2,o
3,ab


In [5]:
# Print the column dtypes and non-null counts of df2.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   blood   4 non-null      object
dtypes: object(1)
memory usage: 160.0+ bytes


In [17]:
# Intercept-free coding of `blood`: one indicator column per level.
dmatrix("blood +0", data=df2)

DesignMatrix with shape (4, 4)
  blood[a]  blood[ab]  blood[b]  blood[o]
         1          0         0         0
         0          0         1         0
         0          0         0         1
         0          1         0         0
  Terms:
    'blood' (columns 0:4)

In [18]:
# Integer-valued column "x"; it is coded as a categorical variable below.
df3 = pd.DataFrame({"x": [1, 2, 3, 4]})
df3

Unnamed: 0,x
0,1
1,2
2,3
3,4


In [23]:
# When categorical values are stored as integers, wrap the column in C()
# so that patsy treats it as categorical instead of numeric.
dmatrix("C(x) + 0", data=df3)

DesignMatrix with shape (4, 4)
  C(x)[1]  C(x)[2]  C(x)[3]  C(x)[4]
        1        0        0        0
        0        1        0        0
        0        0        1        0
        0        0        0        1
  Terms:
    'C(x)' (columns 0:4)

In [28]:
# Passing `levels` to C() fixes the column order to a, b, o, ab
# (the plain "blood" coding above lists the levels as a, ab, b, o).
dm = dmatrix("C(blood, levels = ['a','b','o','ab']) + 0", data=df2)
dm

DesignMatrix with shape (4, 4)
  Columns:
    ["C(blood, levels=['a', 'b', 'o', 'ab'])[a]",
     "C(blood, levels=['a', 'b', 'o', 'ab'])[b]",
     "C(blood, levels=['a', 'b', 'o', 'ab'])[o]",
     "C(blood, levels=['a', 'b', 'o', 'ab'])[ab]"]
  Terms:
    "C(blood, levels=['a', 'b', 'o', 'ab'])" (columns 0:4)
  (to view full data, use np.asarray(this_obj))

In [29]:
# The DesignMatrix repr above omits the values; convert to a plain array
# to display the full numeric contents.
np.asarray(dm)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

## 축소랭크 방식

In [33]:
# Without "+ 0" an Intercept column is included and the first level
# ("female") serves as the reference, leaving a single sex[T.male] column.
dmatrix("sex", data=df)

DesignMatrix with shape (2, 2)
  Intercept  sex[T.male]
          1            1
          1            0
  Terms:
    'Intercept' (column 0)
    'sex' (column 1)

In [34]:
# Reduced-rank coding of `blood`: Intercept plus indicators for every
# level except the reference level "a".
dmatrix("blood", data=df2)

DesignMatrix with shape (4, 4)
  Intercept  blood[T.ab]  blood[T.b]  blood[T.o]
          1            0           0           0
          1            0           1           0
          1            0           0           1
          1            1           0           0
  Terms:
    'Intercept' (column 0)
    'blood' (columns 1:4)

In [37]:
# Two categorical columns covering every (x1, x2) combination once.
df4 = pd.DataFrame(
    {
        "x1": ["A", "B", "A", "B"],
        "x2": ["X", "X", "Y", "Y"],
    }
)
df4

Unnamed: 0,x1,x2
0,A,X
1,B,X
2,A,Y
3,B,Y


In [42]:
# Main effects only: Intercept plus one indicator per non-reference level
# of x1 and of x2.
dmatrix("x1 + x2", data=df4)

DesignMatrix with shape (4, 3)
  Intercept  x1[T.B]  x2[T.Y]
          1        0        0
          1        1        0
          1        0        1
          1        1        1
  Terms:
    'Intercept' (column 0)
    'x1' (column 1)
    'x2' (column 2)

In [43]:
# Interaction term without an intercept: one indicator column for each
# (x1, x2) combination.
dmatrix("x1:x2+0", data=df4)

DesignMatrix with shape (4, 4)
  x1[A]:x2[X]  x1[B]:x2[X]  x1[A]:x2[Y]  x1[B]:x2[Y]
            1            0            0            0
            0            1            0            0
            0            0            1            0
            0            0            0            1
  Terms:
    'x1:x2' (columns 0:4)