# 뷴류기 만들기
-규칙 : 성별 = 1 생존하지 않은 것으로 분류

In [100]:
from sklearn.base import BaseEstimator
import numpy as np
class MyDummyClassifier(BaseEstimator):
  def fit(self, X, y):
    pass
  
  def predict(self, X):
    pred = np.zeros((X.shape[0],1))
    for i in range(X.shape[0]):
      if X['Sex'].iloc[i] == 1:
        pred[i]=0
      else :
        pred[i]=1
    return pred
    

In [101]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
titanic_df = pd.read_csv('./data/titanic.csv')
#df.head(2)
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis=1)
from sklearn.preprocessing import LabelEncoder

# Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

# 머신러닝 알고리즘에 불필요한 피처 제거
def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

# 레이블 인코딩 수행 함수
def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

# 앞에서 설정한 데이터 전처리 함수 호출
def transform_features(df):
    df = fillna(df) 
    df = drop_features(df)
    df = format_features(df)
    return df

X_titanic_df = transform_features(X_titanic_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin'].fillna('N', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always

In [102]:
#데이터셋 분할
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df,
                                                    y_titanic_df, 
                                                    test_size=0.2, 
                                                    random_state=0 )

myclf = MyDummyClassifier()
myclf.fit(X_train, y_train)
my_pred = myclf.predict(X_test)
accuracy_score(y_test, my_pred)

0.7877094972067039

In [103]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, my_pred)

array([[92, 18],
       [20, 49]])

In [104]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_test, my_pred), recall_score(y_test, my_pred)

(np.float64(0.7313432835820896), np.float64(0.7101449275362319))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine # 예시 데이터셋
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

def get_clf_eval1(y_test, pred):
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred) # 또는 'weighted', 'binary' 등
    recall = recall_score(y_test, pred) # 또는 'weighted', 'binary' 등
    f1 = f1_score(y_test, pred) # 또는 'weighted', 'binary' 등
    confusion = confusion_matrix(y_test, pred)

    print('오차 행렬:\n', confusion)
    print(f'정확도: {accuracy:.4f}')
    print(f'정밀도: {precision:.4f}')
    print(f'재현율: {recall:.4f}')
    print(f'F1 스코어: {f1:.4f}')

In [114]:
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)

    print(confusion)
    print('20')
    print(accuracy, precision, recall)

In [115]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)

#정확도, 정밀도, 재현율
get_clf_eval(y_test, pred)

[[92 18]
 [16 53]]
20
0.8100558659217877 0.7464788732394366 0.7681159420289855


In [133]:
pred_proba =lr_clf.predict_proba(X_test)

pos_proba = pred_proba[:,1]
threshold = 0.4

custom_proba=(pos_proba>threshold).astype(int) # 임계치보다 크면 1
confusion_matrix(y_test, custom_proba)
#정확도, 정밀도, 재현율
get_clf_eval(y_test, custom_proba)

[[86 24]
 [13 56]]
20
0.7932960893854749 0.7 0.8115942028985508


# 정밀도와 재현율의 변화
정밀도와 재현율의 불균형이 심할 떄,
혹은 비즈니스이 요구사항이 있을때
임계치를 조정해야한다.

임계치를 낮추면 정밀도는 낮아지고 재현율을 올라간다.

In [141]:
from sklearn.metrics import f1_score, classification_report
f1_score(y_test, pred) #정밀도와 재현율의 평균

f1_score(y_test, pred)
#classification_report(y_test, pred)

np.float64(0.7571428571428571)

In [142]:
print(classification_report(y_test, pred))  #평가보고서


              precision    recall  f1-score   support

           0       0.85      0.84      0.84       110
           1       0.75      0.77      0.76        69

    accuracy                           0.81       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [143]:
import pandas as pd
pd.Series(lr_clf.coef_[0]).sort_values() # 피처의 중요도는 계수

0   -2.593416
1   -0.901628
2   -0.368137
3   -0.107352
4   -0.059052
5   -0.058762
6   -0.042756
7    0.001286
dtype: float64

In [6]:
#보팅_분류기
import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [177]:
cancer = load_breast_cancer()
data_df=pd.DataFrame(cancer.data, columns=cancer.feature_names)
data_df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


# 데이터 분할


In [178]:
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df,
                                                    y_titanic_df, 
                                                    test_size=0.2, 
                                                    random_state=0 )

In [179]:
lr_clf =LogisticRegression(solver='liblinear')
knn_clf =KNeighborsClassifier(n_neighbors=7)
vo_clf=VotingClassifier(estimators=[('LR',lr_clf), ('KNN',knn_clf)], voting='soft')



In [180]:
#분류기 학습 - 예측 -평가
vo_clf.fit(X_train, y_train)
vo_pred = vo_clf.predict(X_test)
accuracy_score(y_test, vo_pred)


0.8156424581005587

In [181]:
confusion_matrix(y_test, vo_pred)

array([[100,  10],
       [ 23,  46]])

In [165]:
vo_clf

In [4]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import LabelEncoder
import graphviz
import os
import re


df=pd.read_csv('./data/redwine1.csv')

In [7]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import LabelEncoder
import graphviz
import re
import os # os 모듈 임포트

# 1. 데이터 로드
df = pd.read_csv('./data/redwine1.csv')

# 2. 'sweet', 'acidity', 'body', 'tannin' 컬럼에서 숫자만 추출하여 변환
def extract_number(text):
    if pd.isna(text):
        return None
    match = re.search(r'\d+', str(text))
    if match:
        return int(match.group(0))
    return None

df['sweet'] = df['sweet'].apply(extract_number)
df['acidity'] = df['acidity'].apply(extract_number)
df['body'] = df['body'].apply(extract_number)
df['tannin'] = df['tannin'].apply(extract_number)

# 3. 필요한 컬럼만 선택 및 결측치 제거
if 'varieties1' not in df.columns:
    print("오류: 'varieties1' 컬럼을 찾을 수 없습니다. 데이터에 해당 컬럼이 있는지 확인해주세요.")
    exit()

df_processed = df[['sweet', 'acidity', 'body', 'tannin', 'varieties1']].dropna()

# 4. 'varieties1' 컬럼 인코딩
le = LabelEncoder()
df_processed['varieties1_encoded'] = le.fit_transform(df_processed['varieties1'])

# 5. 특성(X) 및 타겟(y) 정의
features = ['sweet', 'acidity', 'body', 'tannin']
X = df_processed[features]
y = df_processed['varieties1_encoded']
class_names = le.classes_

# --- !!! 여기서부터 강력한 특수 문자 처리 !!! ---
# 6. class_names의 특수 문자 처리 (더욱 강력하게)
cleaned_class_names = []
for name in class_names:
    # DOT 문자열 리터럴에서 이스케이프가 필요한 문자 처리 (따옴표, 백슬래시, 줄바꿈)
    cleaned_name = str(name).replace('\\', '\\\\')  # 백슬래시 이스케이프
    cleaned_name = cleaned_name.replace('"', '\\"')  # 큰따옴표 이스케이프
    cleaned_name = cleaned_name.replace('\n', '\\n') # 줄바꿈 이스케이프
    cleaned_name = cleaned_name.replace('\r', '')   # 캐리지 리턴 제거

    # 기타 HTML 특수 문자 (앰퍼샌드, 꺽쇠괄호) 처리
    cleaned_name = cleaned_name.replace('&', '&amp;') # HTML 엔티티로 변환
    cleaned_name = cleaned_name.replace('<', '&lt;')
    cleaned_name = cleaned_name.replace('>', '&gt;')

    # 혹시 모를 HTML 태그 잔여물 제거 (정규식 다시 적용)
    cleaned_name = re.sub(r'<[^>]+>', '', cleaned_name)

    cleaned_name = cleaned_name.strip() # 앞뒤 공백 제거
    cleaned_class_names.append(cleaned_name)

# feature_names도 혹시 모를 문제 방지를 위해 처리 (이름이 고정이지만, 습관화)
cleaned_features = []
for name in features:
    cleaned_name = str(name).replace('\\', '\\\\')
    cleaned_name = cleaned_name.replace('"', '\\"')
    cleaned_name = cleaned_name.replace('\n', '\\n')
    cleaned_name = cleaned_name.replace('\r', '')
    cleaned_name = cleaned_name.replace('&', '&amp;')
    cleaned_name = cleaned_name.replace('<', '&lt;')
    cleaned_name = cleaned_name.replace('>', '&gt;')
    cleaned_name = re.sub(r'<[^>]+>', '', cleaned_name)
    cleaned_features.append(cleaned_name.strip())


# 7. 의사결정 트리 모델 훈련
dt_clf = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_clf.fit(X, y)

# 8. 의사결정 트리 시각화 (요청하신 .dot 파일 형식으로)

# 8-1. 의사결정 트리를 'tree.dot' 파일로 내보내기
# class_names와 feature_names에 정제된 리스트 사용
# special_characters=True는 HTML 라이크 레이블을 사용할 때 유용하지만,
# 일반 문자열에서는 때로 혼란을 줄 수 있으므로 문제가 지속되면 False로 변경해볼 수도 있습니다.
export_graphviz(dt_clf, out_file="tree.dot",
                feature_names=cleaned_features, # 정제된 feature_names 사용
                class_names=cleaned_class_names, # 정제된 class_names 사용
                filled=True, rounded=True,
                special_characters=True) # 문제가 지속되면 이 옵션을 False로 바꿔보세요.


# 8-2. 'tree.dot' 파일을 읽어서 dot_graph 변수에 저장
with open("tree.dot", encoding='utf-8') as f: # 인코딩 명시
    dot_graph = f.read()

# 8-3. dot_graph를 사용하여 graphviz 소스 객체 생성
graph = graphviz.Source(dot_graph)

# 8-4. 그래프를 파일로 출력 (권한 문제 방지를 위한 디렉토리 지정 및 자동 열림)
output_dir = './decision_tree_outputs' # 새로운 출력 폴더
os.makedirs(output_dir, exist_ok=True) # 폴더가 없으면 생성

# PDF 출력
pdf_path = os.path.join(output_dir, "wine_varieties_decision_tree_pruned.pdf")
graph.render(os.path.join(output_dir, "wine_varieties_decision_tree_pruned"), format="pdf")
print(f"PDF 파일이 생성되었습니다: {pdf_path}")

# SVG 출력
svg_path = os.path.join(output_dir, "wine_varieties_decision_tree_pruned.svg")
graph.render(os.path.join(output_dir, "wine_varieties_decision_tree_pruned"), format="svg")
print(f"SVG 파일이 생성되었습니다: {svg_path}")


print("의사결정 트리가 'tree.dot' 파일로 생성되었고, 이를 이용해 이미지 파일이 생성되었습니다.")
print(f"클래스 분류 기준은 'varieties1' 컬럼(품종)입니다. 예측된 품종: {class_names.tolist()}")
print(f"Graphviz 오류 방지를 위해 처리된 품종 이름: {cleaned_class_names}")

PDF 파일이 생성되었습니다: ./decision_tree_outputs\wine_varieties_decision_tree_pruned.pdf
SVG 파일이 생성되었습니다: ./decision_tree_outputs\wine_varieties_decision_tree_pruned.svg
의사결정 트리가 'tree.dot' 파일로 생성되었고, 이를 이용해 이미지 파일이 생성되었습니다.
클래스 분류 기준은 'varieties1' 컬럼(품종)입니다. 예측된 품종: ['Abouriou', 'Agiorgitiko', 'Aglianico', 'Aleatico', 'Alicante Bouschet', 'Aragonez', 'Aramon', 'Auxerrois', 'Baga', 'Barbera', 'Bastardo', 'Black Grenache', 'Black Muscat', 'Blackberry', 'Blaufrankisch', 'Blend', 'Bobal', 'Bonarda', 'Bordeaux Blend Red', 'Bovale', 'Bovale Sardo', 'Brachetto', 'Brunello', 'Cabernet Franc', 'Canaiolo', 'Cannonau', 'Carignan', 'Carignano', 'Carinena', 'Carmenere', 'Carricante', 'Castelao', 'Chardonnay', 'Cienna', 'Ciliegiolo', 'Cinsault', 'Colorino', 'Concord', 'Corvina', 'Corvina Veronese', 'Corvinone', 'Croatina', 'Dolcetto', 'Dornfelder', 'Etc', 'Fer Servadou', 'Feteasca Neagra', 'Franconia', 'Frappato', 'Freisa', 'Furmint', 'Gaglioppo', 'Gamay', 'Garnacha', 'Garnacha Negra', 'Garnacha Tintorera'