## 导入库

In [14]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras import regularizers
import warnings



## 数据分析及可视化

In [2]:
# Location of the CSV that maps each image filename to its butterfly label
csv_path = r"C:\Users\admin\Desktop\Training_set.csv"
df = pd.read_csv(csv_path)

# Show the first ten rows as the cell's output
df.head(10)

Unnamed: 0,filename,label
0,Image_1.jpg,SOUTHERN DOGFACE
1,Image_2.jpg,ADONIS
2,Image_3.jpg,BROWN SIPROETA
3,Image_4.jpg,MONARCH
4,Image_5.jpg,GREEN CELLED CATTLEHEART
5,Image_6.jpg,CAIRNS BIRDWING
6,Image_7.jpg,GREEN CELLED CATTLEHEART
7,Image_8.jpg,EASTERN DAPPLE WHITE
8,Image_9.jpg,BROWN SIPROETA
9,Image_10.jpg,RED POSTMAN


In [6]:
# Compute both reports first, then print each one under its heading
summary_stats = df.describe()
null_counts = df.isnull().sum()

print("查看数据信息")
print(summary_stats)
print("查看空值")
print(null_counts)

查看数据信息
              filename           label
count             6499            6499
unique            6499              75
top     Image_4817.jpg  MOURNING CLOAK
freq                 1             131
查看空值
filename    0
label       0
dtype: int64


**查看各个类别包含的数据量**

In [3]:
# Number of rows per label, ordered alphabetically by label name
labelcounts = df['label'].value_counts().sort_index()

# Draw one bar per label on an explicitly created Axes
count_fig, count_ax = plt.subplots(figsize=(14, 8))
sns.barplot(x=labelcounts.index, y=labelcounts.values, palette='viridis', ax=count_ax)
count_ax.set_title('蝴蝶类型数目详细信息')
count_ax.set_xlabel('蝴蝶类型')
count_ax.set_ylabel('类别数量')

# Rotate the label names so they do not overlap on the x axis
plt.xticks(rotation=90)
count_fig.tight_layout()
plt.show()

**随机查看部分图片及其对应的标签**

In [4]:
# Directory that holds the training images.
# This must be a raw string: in a normal literal "\U" begins a Unicode escape
# (a SyntaxError here), and "\b" / "\t" would silently become backspace / tab.
image_dir = r"C:\Users\admin\Desktop\btfl\train"

# Pick 12 rows at random (fixed seed so the same images are shown on re-run)
sample_images = df.sample(12, random_state=43)
fig, axes = plt.subplots(4, 3, figsize=(15, 15))

for i, (index, row) in enumerate(sample_images.iterrows()):
    img_path = os.path.join(image_dir, row['filename'])
    img = load_img(img_path, target_size=(150, 150))
    # Scale pixel values to [0, 1] so imshow treats the array as float RGB
    img_array = img_to_array(img) / 255.0

    # Fill the 4x3 grid row by row
    ax = axes[i // 3, i % 3]
    ax.imshow(img_array)
    ax.set_title(f"类别: {row['label']}")
    ax.axis('off')

plt.tight_layout()
plt.show()

## 数据预处理

In [24]:
# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# per-class proportions approximately equal in both splits, which matters with
# 75 classes and a comparatively small number of images per class.
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label']
)

# A single sorted class list shared by both generators, so the label -> index
# mapping is guaranteed to be identical for training and validation instead of
# being inferred separately from each split.
class_list = sorted(df['label'].unique())

# Training images are augmented on the fly
train_datagen = ImageDataGenerator(
    rescale=1./255,          # normalize pixel values to the [0, 1] range
    rotation_range=40,       # random rotation, up to 40 degrees
    width_shift_range=0.2,   # random horizontal shift, up to 20% of the width
    height_shift_range=0.2,  # random vertical shift, up to 20% of the height
    shear_range=0.2,         # random shear
    zoom_range=0.2,          # random zoom
    horizontal_flip=True,    # random left-right flip
    fill_mode='nearest'      # fill pixels exposed by a transform with the nearest value
)

# Validation images are only rescaled, never augmented
val_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=image_dir,
    x_col='filename',
    y_col='label',
    target_size=(150, 150),
    batch_size=32,
    classes=class_list,
    class_mode='categorical'  # labels are returned one-hot encoded
)

val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    directory=image_dir,
    x_col='filename',
    y_col='label',
    target_size=(150, 150),
    batch_size=32,
    classes=class_list,
    class_mode='categorical',
    shuffle=False  # keep a fixed order so validation batches are reproducible
)

Found 5199 validated image filenames belonging to 75 classes.
Found 1300 validated image filenames belonging to 75 classes.


In [6]:
# Pull one augmented batch from the training generator
images, labels = next(train_generator)

# class_indices maps class name -> index; invert it so a one-hot label can be
# shown as a readable class name instead of a 75-element vector.
index_to_class = {idx: name for name, idx in train_generator.class_indices.items()}

# Figure setup
plt.figure(figsize=(12, 8))

# Show up to the first 10 images of the batch with their class names
for i in range(min(10, len(images))):
    plt.subplot(5, 2, i + 1)
    plt.imshow(images[i])
    class_name = index_to_class[int(np.argmax(labels[i]))]
    plt.title(f'Label: {class_name}')
    plt.axis('off')  # hide the axes

plt.tight_layout()
plt.show()


## 构建模型

In [7]:
# Number of output classes, read from the training generator's label mapping
num_classes = len(train_generator.class_indices)

# Three Conv2D + MaxPooling2D stages followed by a dense classifier head.
# The layers are handed to Sequential as a list, in the order they are applied.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    # One softmax output per class
    Dense(num_classes, activation='softmax'),
])


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [8]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 148, 148, 32)      896       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 74, 74, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 72, 72, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 36, 36, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 34, 34, 128)       73856     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 17, 17, 128)       0         
_________________________________________________________________
flatten (Flatten)            (None, 36992)             0         
__________

In [9]:
# Compile the model: Adam optimizer, categorical cross-entropy loss,
# and accuracy reported as the metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [10]:
# Train the model.
# Steps are the number of whole batches each generator can supply per pass.
train_steps = train_generator.n // train_generator.batch_size
val_steps = val_generator.n // val_generator.batch_size

history = model.fit(
    train_generator,
    epochs=40,
    steps_per_epoch=train_steps,
    validation_data=val_generator,
    validation_steps=val_steps,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


## 模型评估

In [11]:
# Keras records the accuracy metric under 'acc' in older releases and under
# 'accuracy' in newer ones; pick whichever key this run actually produced so
# the cell does not fail with a KeyError.
history_log = history.history
acc_key = 'accuracy' if 'accuracy' in history_log else 'acc'

plt.plot(history_log[acc_key], label='Train Accuracy')
plt.plot(history_log[f'val_{acc_key}'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.show()

In [12]:
# Plot training and validation loss per epoch on one Axes
loss_fig, loss_ax = plt.subplots()
for history_key, curve_label in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    loss_ax.plot(history.history[history_key], label=curve_label)

loss_ax.set_title('Model Loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend()
plt.show()

In [21]:
# Save the model to 'butterfly_classifier.h5' so it can be reloaded later
model.save('butterfly_classifier.h5')


## 使用模型进行预测展示

In [27]:
# Reload the model that was saved earlier
model = load_model('butterfly_classifier.h5')

# Take one batch of images and one-hot labels from the validation generator
val_images, val_labels = next(val_generator)

# Run the model, then reduce both the predicted scores and the one-hot labels
# to integer class indices
predictions = model.predict(val_images)
pred_labels = predictions.argmax(axis=1)
true_labels = val_labels.argmax(axis=1)

# class_indices maps class name -> index; build the reverse index -> name lookup
class_indices = val_generator.class_indices
class_names = dict(zip(class_indices.values(), class_indices.keys()))

# Helper that shows images together with their true and predicted labels
def display_images(images, true_labels, pred_labels, class_names, num_images=9):
    """Show images in a 3-column grid, titled with true and predicted class names.

    Args:
        images: Indexable collection of images accepted by ``plt.imshow``.
        true_labels: Integer class index of each image.
        pred_labels: Predicted integer class index of each image.
        class_names: Mapping from integer class index to class name.
        num_images: Maximum number of images to show. It is capped at the
            number of images/labels actually supplied, so a short batch no
            longer raises an IndexError.
    """
    # Never index past the end of any of the three inputs
    num_images = min(num_images, len(images), len(true_labels), len(pred_labels))

    # Size the grid from the image count instead of assuming exactly 3x3;
    # nine images still give the original 3x3 layout and 15x15 figure.
    n_cols = 3
    n_rows = max(1, -(-num_images // n_cols))  # ceiling division
    plt.figure(figsize=(15, 5 * n_rows))

    for i in range(num_images):
        plt.subplot(n_rows, n_cols, i + 1)
        plt.imshow(images[i])
        true_label = class_names[int(true_labels[i])]
        pred_label = class_names[int(pred_labels[i])]
        plt.title(f"True: {true_label}\nPred: {pred_label}")
        plt.axis('off')
    plt.tight_layout()
    plt.show()

# Call the display helper on the first 9 images of the validation batch
display_images(val_images, true_labels, pred_labels, class_names, num_images=9)

# 总结  
本次基于 CNN 的图像分类模型获得了高于 70% 的准确率。可以加载已保存的模型（`butterfly_classifier.h5`）进行预测；感兴趣的读者还可以继续调参训练，以进一步提升效果。