In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [4]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [5]:
titanic = sns.load_dataset('titanic')
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [6]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [7]:
titanic['sex'].value_counts()

sex
male      577
female    314
Name: count, dtype: int64

In [8]:
# 결측치
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [9]:
# 타킷추출
dfy = titanic['survived'].copy()
dfy

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: survived, Length: 891, dtype: int64

In [10]:
# 피쳐추출
feature_list = ['pclass', 'sex', 'age']
dfX = titanic[feature_list].copy()
dfX

Unnamed: 0,pclass,sex,age
0,3,male,22.0
1,1,female,38.0
2,3,female,26.0
3,1,female,35.0
4,3,male,35.0
...,...,...,...
886,2,male,27.0
887,1,female,19.0
888,3,female,
889,1,male,26.0


In [11]:
dfX['sex'].value_counts()

sex
male      577
female    314
Name: count, dtype: int64

In [12]:
from sklearn.preprocessing import LabelEncoder
dfX['sex'] = LabelEncoder().fit_transform(dfX['sex'])
dfX['sex'].value_counts()

sex
1    577
0    314
Name: count, dtype: int64

In [13]:
dfX['age'].isnull().sum()


np.int64(177)

In [14]:
dfX['age'].fillna( dfX['age'].mean(), inplace=True)
dfX['age'].isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dfX['age'].fillna( dfX['age'].mean(), inplace=True)


np.int64(0)

In [15]:
dfX.tail()

Unnamed: 0,pclass,sex,age
886,2,1,27.0
887,1,0,19.0
888,3,0,29.699118
889,1,1,26.0
890,3,1,32.0


In [16]:
# 원핫 인코딩

dfX['pclass'].value_counts()

pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [17]:
from sklearn.preprocessing import LabelBinarizer

dfX2 = pd.DataFrame(LabelBinarizer().fit_transform(dfX['pclass']),
                    columns=['c1','c2','c3'],
                    index = dfX.index)

dfX2

Unnamed: 0,c1,c2,c3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1
...,...,...,...
886,0,1,0
887,1,0,0
888,0,0,1
889,1,0,0


In [18]:
dfX = pd.concat([dfX, dfX2], axis=1)
dfX


Unnamed: 0,pclass,sex,age,c1,c2,c3
0,3,1,22.000000,0,0,1
1,1,0,38.000000,1,0,0
2,3,0,26.000000,0,0,1
3,1,0,35.000000,1,0,0
4,3,1,35.000000,0,0,1
...,...,...,...,...,...,...
886,2,1,27.000000,0,1,0
887,1,0,19.000000,1,0,0
888,3,0,29.699118,0,0,1
889,1,1,26.000000,1,0,0


In [19]:
dfX.drop(['pclass'], axis=1, inplace=True)
dfX

Unnamed: 0,sex,age,c1,c2,c3
0,1,22.000000,0,0,1
1,0,38.000000,1,0,0
2,0,26.000000,0,0,1
3,0,35.000000,1,0,0
4,1,35.000000,0,0,1
...,...,...,...,...,...
886,1,27.000000,0,1,0
887,0,19.000000,1,0,0
888,0,29.699118,0,0,1
889,1,26.000000,1,0,0


# 데이터 분할

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dfX, dfy,
                                                    test_size=0.25, random_state=0)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((668, 5), (223, 5), (668,), (223,))

# 의사 결정 나무 생성

In [21]:
from sklearn.tree import DecisionTreeClassifier
dtmodel = DecisionTreeClassifier( criterion='gini', max_depth=3, min_samples_leaf=5)

#학습
dtmodel.fit(X_train,y_train)
dtmodel

In [None]:
y_pred = dtmodel.predict(X_train)
y_pred

In [23]:
pd.crosstab(y_train, y_pred, margins=True)

col_0,0,1,All
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,360,50,410
1,73,185,258
All,433,235,668


In [24]:
!pip install graphviz pydotplus

Collecting graphviz
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting pydotplus
  Downloading pydotplus-2.0.2.tar.gz (278 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.7/278.7 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Downloading graphviz-0.20.3-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pydotplus
  Building wheel for pydotplus (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pydotplus: filename=pydotplus-2.0.2-py3-none-any.whl size=24552 sha256=1fb348e67b47df17f2944577a7d5fa69adc9a9bcf7133cba7f537d7baf8e3814
  Stored in directory: /home/codespace/.cache/pip/wheels/69/b2/67/08f0eef649af92df772c09f4515582

In [25]:
from IPython.display import Image# for visualizing the tree
import pydotplus
 
# 의사 결정 나무를 시각화하기 위해서 라이브러리를 임포트합니다.
from sklearn.tree import export_graphviz

In [26]:
# 시각화를 위한 함수를 작성한다.
def tree_graph_to_png(tree, feature_names, class_names, png_file_to_save):
    tree_str = export_graphviz(tree, feature_names=feature_names, class_names=class_names, filled=True, out_file=None)
    graph = pydotplus.graph_from_dot_data(tree_str)
    print(graph)
    graph.write_png(png_file_to_save)
    return Image(graph.create_png())

In [27]:
tree_graph_to_png(tree=dtmodel, feature_names=dfX.columns.values, class_names=['NotSurvived','Survived'],
                  png_file_to_save='decision_tree.png')

<pydotplus.graphviz.Dot object at 0x79125dbd8520>


InvocationException: GraphViz's executables not found