<a href="https://colab.research.google.com/github/tmdcks1103/Machine-Learning-Programming/blob/main/%EA%B8%B0%EA%B3%84%ED%95%99%EC%8A%B5_%ED%94%84%EB%A1%9C%EA%B7%B8%EB%9E%98%EB%B0%8D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week 2


In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score



# -----------------------------

# 1) 데이터 준비

# -----------------------------

cols = ["sepal_length", "sepal_width", "petal_length", "petal_width", "label"]

df = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/data/iris.data", header=None, names=cols).dropna()



X = df.drop(columns=["label"])

y = df["label"]



X_train, X_test, y_train, y_test = train_test_split(

    X, y, test_size=0.2, stratify=y, random_state=42

)



# -----------------------------

# 2) 모델 구성

# -----------------------------

dt = DecisionTreeClassifier(random_state=42)

rf = RandomForestClassifier(n_estimators=200, random_state=42)

lr = LogisticRegression(max_iter=500)



# -----------------------------

# 3) 모델 학습

# -----------------------------

dt.fit(X_train, y_train)

rf.fit(X_train, y_train)

lr.fit(X_train, y_train)



# -----------------------------

# 4) 모델 평가

# -----------------------------

dt_acc = accuracy_score(y_test, dt.predict(X_test))

rf_acc = accuracy_score(y_test, rf.predict(X_test))

lr_acc = accuracy_score(y_test, lr.predict(X_test))



print("=== Test Accuracy ===")

print(f"Decision Tree : {dt_acc:.4f}")

print(f"Random Forest : {rf_acc:.4f}")

print(f"Logistic Reg. : {lr_acc:.4f}")



=== Test Accuracy ===
Decision Tree : 0.9333
Random Forest : 0.9000
Logistic Reg. : 0.9667


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# -----------------------------
# 1) 데이터 준비
# -----------------------------
# 헤더를 사용하여 데이터 파일을 올바르게 불러옵니다.
df = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/data/diabetes.csv")

# 입력 데이터(X)와 레이블(y)을 올바르게 정의합니다.
X = df.drop(columns=["Outcome"])
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -----------------------------
# 2) 모델 구성
# -----------------------------
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
lr = LogisticRegression(max_iter=500)

# -----------------------------
# 3) 모델 학습
# -----------------------------
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

# -----------------------------
# 4) 모델 평가
# -----------------------------
dt_acc = accuracy_score(y_test, dt.predict(X_test))
rf_acc = accuracy_score(y_test, rf.predict(X_test))
lr_acc = accuracy_score(y_test, lr.predict(X_test))

print("=== Test Accuracy ===")
print(f"Decision Tree : {dt_acc:.4f}")
print(f"Random Forest : {rf_acc:.4f}")
print(f"Logistic Reg. : {lr_acc:.4f}")

=== Test Accuracy ===
Decision Tree : 0.7273
Random Forest : 0.7468
Logistic Reg. : 0.7143


In [None]:
# outcome 제하고 BMI 예측하는 회귀분석 코드

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1) 데이터 로드
df = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/data/diabetes.csv")

# 2) Outcome 제거
if 'Outcome' in df.columns:
    df = df.drop(columns=['Outcome'])

# 3) BMI 타깃의 0/NaN 행 제거
df = df[~((df['BMI'] == 0) | (df['BMI'].isna()))].reset_index(drop=True)

# 4) 입력 변수 결측/0 처리 (예: Glucose 등)
cols_to_fix = [c for c in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin'] if c in df.columns]
for c in cols_to_fix:
    df[c] = df[c].replace(0, np.nan)
    df[c] = df[c].fillna(df[c].median())

# 5) X, y 분리
X = df.drop(columns=['BMI'])
y = df['BMI']

# 6) 학습/테스트 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 7) 스케일링 — **반드시 여기서 생성** (이 줄이 없으면 NameError 발생)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)   # <-- fit_transform: 학습 데이터로 fit + transform
X_test_scaled = scaler.transform(X_test)         # <-- transform만

# (옵션) 변수 존재 확인
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)

# 8) 모델 정의 및 학습
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=200, random_state=42)
}

def regression_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2}

results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)           # X_train_scaled가 반드시 존재해야 함
    y_pred = model.predict(X_test_scaled)
    results[name] = regression_metrics(y_test, y_pred)
    print(f"\n{name}: RMSE={results[name]['rmse']:.4f}, MAE={results[name]['mae']:.4f}, R2={results[name]['r2']:.4f}")


X_train_scaled shape: (605, 7)
X_test_scaled shape: (152, 7)

LinearRegression: RMSE=5.0510, MAE=4.0198, R2=0.3717

DecisionTree: RMSE=7.7547, MAE=5.8862, R2=-0.4811

RandomForest: RMSE=5.0072, MAE=3.8808, R2=0.3825


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# -----------------------------
# 1) 데이터 준비
# -----------------------------
# 헤더를 사용하여 데이터 파일을 올바르게 불러옵니다.
df = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/data/breast_cancer.csv")

# 입력 데이터(X)와 레이블(y)을 올바르게 정의합니다.
# 'breast_cancer.csv'의 타겟 컬럼은 'label'입니다.
X = df.drop(columns=["label"])
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -----------------------------
# 2) 모델 구성
# -----------------------------
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
lr = LogisticRegression(max_iter=500)

# -----------------------------
# 3) 모델 학습
# -----------------------------
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

# -----------------------------
# 4) 모델 평가
# -----------------------------
dt_acc = accuracy_score(y_test, dt.predict(X_test))
rf_acc = accuracy_score(y_test, rf.predict(X_test))
lr_acc = accuracy_score(y_test, lr.predict(X_test))

print("=== Test Accuracy ===")
print(f"Decision Tree : {dt_acc:.4f}")
print(f"Random Forest : {rf_acc:.4f}")
print(f"Logistic Reg. : {lr_acc:.4f}")

=== Test Accuracy ===
Decision Tree : 0.9035
Random Forest : 0.9561
Logistic Reg. : 0.9825


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# -----------------------------
# 1) 데이터 준비
# -----------------------------
# 파일을 불러와서 실제 컬럼명을 확인합니다.
file_path = "/content/drive/MyDrive/ColabNotebooks/data/car_evaluation.csv" # 파일 경로를 지정하세요.
df = pd.read_csv(file_path)

# 마지막 컬럼을 타겟 변수로 자동 설정합니다.
target_column = df.columns[-1]

# 입력 데이터(X)와 레이블(y)을 정의합니다.
X = df.drop(columns=[target_column])
y = df[target_column]

# 만약 데이터에 범주형 변수가 있다면, 원-핫 인코딩으로 전처리합니다.
if X.select_dtypes(include='object').empty:
    pass
else:
    print("범주형 변수를 원-핫 인코딩합니다.")
    X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -----------------------------
# 2) 모델 구성 (3가지)
# -----------------------------
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
lr = LogisticRegression(max_iter=500)

# -----------------------------
# 3) 모델 학습
# -----------------------------
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

# -----------------------------
# 4) 모델 평가
# -----------------------------
dt_acc = accuracy_score(y_test, dt.predict(X_test))
rf_acc = accuracy_score(y_test, rf.predict(X_test))
lr_acc = accuracy_score(y_test, lr.predict(X_test))

print("=== Test Accuracy ===")
print(f"Decision Tree : {dt_acc:.4f}")
print(f"Random Forest : {rf_acc:.4f}")
print(f"Logistic Reg. : {lr_acc:.4f}")

범주형 변수를 원-핫 인코딩합니다.
=== Test Accuracy ===
Decision Tree : 0.9740
Random Forest : 0.9798
Logistic Reg. : 0.9017


# Week 3

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 1) 데이터 로드
df = pd.read_excel("/content/drive/MyDrive/ColabNotebooks/data/Raisin_Dataset.xlsx")

# 결측치 확인 및 제거
print("결측치 확인 (Before dropping NaN):")
print(df.isnull().sum())
df = df.dropna()
print("\n결측치 확인 (After dropping NaN):")
print(df.isnull().sum())

from sklearn.preprocessing import LabelEncoder

# 입력과 출력 분리
X = df.drop(columns=["Class"]) # Assuming 'Class' is the target column in the Excel file
y = df["Class"]

# 만약 범주형 변수가 있다면 Label Encoding 수행 (레이블 인코딩은 X에만 적용)
# Excel 파일의 데이터 유형을 확인해야 합니다. 여기서는 모든 컬럼이 수치형이라고 가정합니다.
# 만약 범주형 컬럼이 있다면, 아래 코드를 사용하여 해당 컬럼만 인코딩해야 합니다.
# for column in X.select_dtypes(include='object').columns:
#     label_encoders[column] = LabelEncoder()
#     X[column] = label_encoders[column].fit_transform(X[column])


# 훈련과 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 2) 모델 구성
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
lr = LogisticRegression(max_iter=500)

# 3) 모델 학습
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

# 4) 예측결과 생성
dt_y_pred = dt.predict(X_test)
rf_y_pred = rf.predict(X_test)
lr_y_pred = lr.predict(X_test)

# 예측값 출력 (optional)
# print("Decision Tree Predictions:", dt_y_pred)


# 5) 예측결과와 정답을 비교한 정확도 평가
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

dt_acc = accuracy_score(y_test, dt_y_pred)
rf_acc = accuracy_score(y_test, rf_y_pred )
lr_acc = accuracy_score(y_test, lr_y_pred)

print("=== DT Test Accuracy ===")
print(f"Decision Tree : {dt_acc:.4f}")
print (confusion_matrix(y_test,dt_y_pred))

print("=== RF Test Accuracy ===")
print(f"Random Forest : {rf_acc:.4f}")
print (confusion_matrix(y_test,rf_y_pred))

print("=== LR Test Accuracy ===")
print(f"Logistic Reg. : {lr_acc:.4f}")
print (confusion_matrix(y_test,lr_y_pred))

결측치 확인 (Before dropping NaN):
Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
Extent             0
Perimeter          0
Class              0
dtype: int64

결측치 확인 (After dropping NaN):
Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
Extent             0
Perimeter          0
Class              0
dtype: int64
=== DT Test Accuracy ===
Decision Tree : 0.8444
[[70 20]
 [ 8 82]]
=== RF Test Accuracy ===
Random Forest : 0.8667
[[71 19]
 [ 5 85]]
=== LR Test Accuracy ===
Logistic Reg. : 0.8889
[[74 16]
 [ 4 86]]
