In [None]:
# ========= Mount and extract =========
from google.colab import files
uploaded = files.upload()  # upload dog-breed-identification.zip

In [None]:
import os
import zipfile

# Archive to unpack and the folder its contents are written to.
zip_path = 'dog-breed-identification.zip'
extract_path = 'dog_data'

# Unpack every member of the archive; the context manager closes the file
# handle once extraction is done.
with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extract_path)

In [None]:
# ========= 安裝套件 =========
!pip install seaborn pillow

In [None]:
# ========= 匯入套件 =========
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
# List what was extracted, to check the folder layout.
# NOTE(review): the second listing only succeeds if the archive unpacks into a
# nested dog-breed-identification/ folder, while the data-loading cell reads
# from dog_data/ directly — confirm which layout the archive actually has.
!ls dog_data
!ls dog_data/dog-breed-identification

In [None]:
# ========= Load and prepare the data =========
base_dir = 'dog_data'
train_dir, test_dir, labels_path = (
    os.path.join(base_dir, leaf) for leaf in ('train', 'test', 'labels.csv')
)

# Read the label table and derive each row's image file name as "<id>.jpg".
df = pd.read_csv(labels_path).assign(
    filename=lambda frame: frame['id'] + '.jpg'
)

In [None]:
# ========= Breed statistics and visualisation =========
# Count images per breed once and reuse the result for the text summary and
# for the chart.
breed_counts = df['breed'].value_counts()

print("✅ 品種總數：", df['breed'].nunique())
print("\n 前 5 名常見品種：\n", breed_counts.head())

# Bar chart of the 20 breeds with the most images.
top_breeds = breed_counts.head(20)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_breeds.index, y=top_breeds.values, ax=ax)
ax.set_title("前 20 名常見的狗品種")
ax.set_xlabel("品種")
ax.set_ylabel("圖片數量")
plt.xticks(rotation=45)  # tilt the breed names so they do not overlap
fig.tight_layout()
plt.show()

In [None]:
# ========= Data generators (MobileNetV2 preprocessing) =========
img_size = 224
batch_size = 32

# One generator definition serves both subsets: images go through
# MobileNetV2's preprocess_input and 20% of the rows are held out for
# validation.
datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=0.2
)

# Training subset: reshuffled so batches differ from epoch to epoch.
train_generator = datagen.flow_from_dataframe(
    dataframe=df,
    directory=train_dir,
    x_col='filename',
    y_col='breed',
    target_size=(img_size, img_size),
    batch_size=batch_size,
    class_mode='categorical',
    subset='training',
    shuffle=True
)

# Validation subset: must NOT be shuffled. Later cells compare
# model.predict(valid_generator) against valid_generator.classes, and the
# latter is always in dataframe order; shuffling the batches would pair each
# prediction with the wrong label and make the classification report and
# confusion matrix meaningless. Validation loss/accuracy during fit() are
# unaffected by batch order.
valid_generator = datagen.flow_from_dataframe(
    dataframe=df,
    directory=train_dir,
    x_col='filename',
    y_col='breed',
    target_size=(img_size, img_size),
    batch_size=batch_size,
    class_mode='categorical',
    subset='validation',
    shuffle=False
)

In [None]:
# ========= Build the MobileNetV2 model =========
base_model = MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_tensor=Input(shape=(img_size, img_size, 3))
)
base_model.trainable = False  # stage 1: keep the whole backbone frozen

# Size the softmax from the generator's label map rather than from the raw
# dataframe, so the output width always matches the one-hot targets the
# generator yields.
num_classes = len(train_generator.class_indices)

# Custom classification head
x = GlobalAveragePooling2D()(base_model.output)
x = Dropout(0.3)(x)
output = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=output)

# Stage 1: train only the classification head
model.compile(optimizer=Adam(learning_rate=1e-3), loss='categorical_crossentropy', metrics=['accuracy'])
history_1 = model.fit(
    train_generator,
    validation_data=valid_generator,
    epochs=5,
    callbacks=[EarlyStopping(patience=2, restore_best_weights=True)]
)

# Stage 2: fine-tune — unfreeze the backbone, then re-freeze everything except
# its last 50 layers.
# NOTE(review): this also makes any BatchNormalization layers among the last 50
# trainable; Keras' fine-tuning guidance suggests keeping BN layers frozen —
# consider whether that applies here.
base_model.trainable = True
for layer in base_model.layers[:-50]:
    layer.trainable = False

# Recompile so the new trainable flags take effect, with a smaller learning rate
model.compile(optimizer=Adam(learning_rate=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])
history_2 = model.fit(
    train_generator,
    validation_data=valid_generator,
    epochs=5,
    callbacks=[
        EarlyStopping(patience=3, restore_best_weights=True),
        # Imported alongside EarlyStopping in the notebook's import cell.
        ReduceLROnPlateau(patience=2)
    ]
)

In [None]:
# ========= Plot training curves =========
# Training ran in two stages (history_1: head only, history_2: fine-tuning).
# Stitch the per-epoch metrics together so both stages share one timeline.
metric_keys = ('loss', 'val_loss', 'accuracy', 'val_accuracy')
history = {
    key: list(history_1.history[key]) + list(history_2.history[key])
    for key in metric_keys
}
# x-position of the boundary between the last stage-1 epoch and the first
# stage-2 epoch (epochs are plotted at 0, 1, 2, ...).
stage_boundary = len(history_1.history['loss']) - 0.5

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history['loss'], label='Train Loss')
plt.plot(history['val_loss'], label='Val Loss')
plt.axvline(stage_boundary, color='gray', linestyle='--', label='Fine-tuning starts')
plt.title('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history['accuracy'], label='Train Acc')
plt.plot(history['val_accuracy'], label='Val Acc')
plt.axvline(stage_boundary, color='gray', linestyle='--', label='Fine-tuning starts')
plt.title('Accuracy')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import numpy as np

In [None]:
# Predict on the validation set
Y_pred = model.predict(valid_generator)   # one row of class scores per sample
y_pred = Y_pred.argmax(axis=1)            # index of the highest-scoring class
# Ground-truth class indices as stored on the generator. These line up with
# y_pred only when the generator yields samples in its stored order.
y_true = valid_generator.classes
# Iterating the label map yields the breed names in insertion order.
class_names = list(valid_generator.class_indices)

In [None]:
# Classification report
# Pass the full label set explicitly: without `labels`, scikit-learn infers the
# classes from y_true/y_pred and raises if any breed is absent from both,
# because target_names would then have the wrong length.
print("Classification Report:")
print(classification_report(
    y_true,
    y_pred,
    labels=np.arange(len(class_names)),
    target_names=class_names,
    zero_division=0,  # breeds that were never predicted score 0 without a warning each
))

In [None]:
# Confusion matrix (top 20 breeds only)
top_20_classes = df['breed'].value_counts().head(20).index.tolist()
indices = [valid_generator.class_indices[breed] for breed in top_20_classes]

# Build the matrix over the full label set so that row/column i is always
# class index i. Without `labels`, a class missing from both y_true and y_pred
# would shrink the matrix and shift every later row/column, so indexing by
# class index would select the wrong cells.
all_labels = np.arange(len(valid_generator.class_indices))
cm = confusion_matrix(y_true, y_pred, labels=all_labels)

# Keep only the rows/columns belonging to the 20 most common breeds.
cm_20 = cm[np.ix_(indices, indices)]

In [None]:
# Draw the heat map
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm_20,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=top_20_classes,
    yticklabels=top_20_classes,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('混淆矩陣 (Top 20 類別)')
plt.xticks(rotation=45)  # tilt column labels
plt.yticks(rotation=0)   # keep row labels horizontal
fig.tight_layout()
plt.show()

In [None]:
# ========= Predict the test set and write submission.csv =========
# Every entry in the test folder becomes one row of the frame fed to Keras.
test_filenames = os.listdir(test_dir)
test_df = pd.DataFrame({'filename': test_filenames})

# Same MobileNetV2 preprocessing as for training; no labels, no shuffling.
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_flow_options = dict(
    directory=test_dir,
    x_col='filename',
    y_col=None,
    target_size=(img_size, img_size),
    batch_size=batch_size,
    class_mode=None,
    shuffle=False,
)
test_generator = test_datagen.flow_from_dataframe(test_df, **test_flow_options)

In [None]:
# Predict: run the trained model over the test generator
preds = model.predict(test_generator)

In [None]:
# Recover the breed label order: invert the generator's {breed: index} map so
# column i of the prediction matrix gets the name of class index i.
breed_labels = {index: breed for breed, index in train_generator.class_indices.items()}
breed_columns = [breed_labels[i] for i in range(len(breed_labels))]
preds_df = pd.DataFrame(preds, columns=breed_columns)

# Take the ids from the files the generator actually fed to the model, not from
# the raw directory listing: if the generator skipped any entry, the listing
# would no longer match the rows of `preds`. splitext strips the extension
# whatever its length.
test_ids = [
    os.path.splitext(os.path.basename(fname))[0]
    for fname in test_generator.filenames
]
preds_df.insert(0, 'id', test_ids)

In [None]:
# Write the CSV (index=False: the row index is not written as a column)
submission_path = 'submission.csv'
preds_df.to_csv(submission_path, index=False)

In [None]:
# Download the file through the Colab files helper
files.download(submission_path)

In [None]:
# ========= Save the model to .h5, then download it =========
model_path = 'mobilenetv2_dogbreed.h5'

# Write the trained model to disk as an .h5 file.
model.save(model_path)

# Download that file through the Colab files helper.
files.download(model_path)