In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [4]:
# 1. Load the dataset: read the CSV (path is relative to the working directory) into a DataFrame
df = pd.read_csv('StudentsPerformance.csv')

In [12]:
# Dataset: separate the target column from the feature columns.
y = df['math score']                  # target: the value the model should predict
X = df.drop('math score', axis=1)     # features: every column except the target

In [13]:
# Categorical columns that have to be converted to numeric features
ctgy_data = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

In [14]:
# Preprocessing: one-hot encode the columns listed in ctgy_data (categories
# not seen during fit are ignored at transform time) and pass every other
# column through unchanged.
preprocessor = ColumnTransformer(
    [('cool', OneHotEncoder(handle_unknown='ignore'), ctgy_data)],
    remainder='passthrough',
)

In [15]:
# Two-stage pipeline: the preprocessing step feeds a linear regression.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [16]:
# Split the data 80/20 into train and test sets; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [18]:
# Train: fit the pipeline on the training split.
# As the cell's last expression, the fitted estimator is also what gets displayed.
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cool', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [19]:
# Predict the target for the held-out test features
y_pred = model.predict(X_test)

In [20]:
# Model evaluation: compare the test-set predictions with the true targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)  # mean squared error
r2 = r2_score(y_true=y_test, y_pred=y_pred)             # coefficient of determination (R^2)

In [22]:
# Report the regression metrics computed in the evaluation cell
print('mse:',mse)
print('r2:',r2)

mse: 29.095169866715516
r2: 0.8804332983749564


In [25]:
# Import the required libraries
from sklearn.datasets import load_breast_cancer # loads the breast cancer dataset
from sklearn.model_selection import train_test_split # train/test data split
from sklearn.linear_model import LogisticRegression # logistic regression model
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score, confusion_matrix, classification_report # evaluation metrics


In [None]:
# 1. Prepare the dataset
data = load_breast_cancer() # load the breast cancer classification dataset

In [None]:
# Separate the features from the target labels
X = data.data       # feature matrix (cell measurements, one row per sample)
y = data.target     # labels; per scikit-learn's dataset docs: 0 = malignant, 1 = benign (verify with data.target_names)

In [29]:
# Quick look at the raw feature matrix and label vector
# NOTE(review): this dumps the raw arrays; X.shape / X[:5] would be a more compact sanity check.
print(X)
print(y)

[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
 ...
 [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
 [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
 [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 

In [30]:
# Separate training data from test data (20% held out, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [33]:
# Choose the learning model and create the estimator object (it is not fitted in this cell)
# NOTE(review): the large max_iter presumably compensates for unscaled features; confirm whether scaling was considered.
model = LogisticRegression(max_iter=10000)

In [34]:
# Train the logistic regression model on the training split
model.fit(X_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,10000


In [35]:
# Predict labels for the test data
y_pred = model.predict(X_test)

In [38]:
# Compute each evaluation metric on the test predictions and print it right away.
# NOTE(review): precision/recall/F1 rely on scikit-learn's default pos_label=1;
# confirm that class 1 is the class of interest for this dataset.
acc = accuracy_score(y_test, y_pred)
print('정확도:', acc)

prec = precision_score(y_test, y_pred)
print('정밀도:', prec)

recall = recall_score(y_test, y_pred)
print('재현율:', recall)

f1 = f1_score(y_test, y_pred)
print('f1 스코어:', f1)

정확도: 0.956140350877193
정밀도: 0.9459459459459459
재현율: 0.9859154929577465
f1 스코어: 0.9655172413793104


In [40]:
# Print the confusion matrix for the test predictions
# (scikit-learn convention: rows are true labels, columns are predicted labels)
print('혼동행렬 매트릭스 출력')
print(confusion_matrix(y_test, y_pred))

혼동행렬 매트릭스 출력
[[39  4]
 [ 1 70]]


In [42]:
# Print the classification report (per-class precision, recall, F1 and support)
print('분류 보고서 출력')
print(classification_report(y_test, y_pred))

분류 보고서 출력
              precision    recall  f1-score   support

           0       0.97      0.91      0.94        43
           1       0.95      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [43]:
# 1. Import the required libraries
from sklearn.datasets import load_iris # loads the Iris dataset
from sklearn.model_selection import train_test_split # data splitting
from sklearn.neighbors import KNeighborsClassifier # k-nearest-neighbours (KNN) algorithm
from sklearn.metrics import classification_report, accuracy_score # performance evaluation

In [46]:
# Prepare the data: load the Iris dataset
iris = load_iris()

In [48]:
# Split the dataset object into the feature matrix (X) and the label vector (y)
X, y = iris.data, iris.target

In [50]:
# Quick look at the raw feature matrix and label vector
# NOTE(review): this dumps the raw arrays; X.shape / X[:5] would be a more compact sanity check.
print(X)
print(y)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [52]:
# Split the data into a training set and a test set (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [54]:
# Check the sizes of the training and test feature arrays
print(X_train.shape)
print(X_test.shape)

(120, 4)
(30, 4)


In [55]:
# Use a k-nearest-neighbours (KNN) classifier
k = 3  # number of nearest neighbours consulted for each prediction
knn = KNeighborsClassifier(k)

In [None]:
# Train the model: fit the KNN classifier on the training split
knn.fit(X_train, y_train)

0,1,2
,n_neighbors,3
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [57]:
# Model evaluation: predict on the test split and report the accuracy
y_pred = knn.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print('정확도:', acc)

정확도: 1.0


In [63]:
# Print the classification report, labelling each class with its species name
print('분류 보고서')
print(classification_report(y_test, y_pred, target_names = iris.target_names))

분류 보고서
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [68]:
# Predict the class of a new sample
new_data = [[5.0, 3.5, 1.3, 0.3]]  # one sample with four feature values (assumed to follow the iris.data column order)
new_pred = knn.predict(new_data)              # predicted class index, as an array
new_pred_class = iris.target_names[new_pred]  # look up the species name for that index

print(new_pred)
print(new_pred_class)

[0]
['setosa']
