# 预处理数据

## 预处理数据集

参考文章：https://zhuanlan.zhihu.com/p/34068451

### 异常数据处理

In [None]:
# ImageNet class IDs for dogs and cats. is_cat_or_dog compares these against
# the first field of each decoded prediction.
dogs_categories = [
 'n02085620','n02085782','n02085936','n02086079','n02086240','n02086646','n02086910','n02087046'
,'n02087394','n02088094','n02088238','n02088364','n02088466','n02088632','n02089078','n02089867'
,'n02089973','n02090379','n02090622','n02090721','n02091032','n02091134','n02091244','n02091467'
,'n02091635','n02091831','n02092002','n02092339','n02093256','n02093428','n02093647','n02093754'
,'n02093859','n02093991','n02094114','n02094258','n02094433','n02095314','n02095570','n02095889'
,'n02096051','n02096177','n02096294','n02096437','n02096585','n02097047','n02097130','n02097209'
,'n02097298','n02097474','n02097658','n02098105','n02098286','n02098413','n02099267','n02099429'
,'n02099601','n02099712','n02099849','n02100236','n02100583','n02100735','n02100877','n02101006'
,'n02101388','n02101556','n02102040','n02102177','n02102318','n02102480','n02102973','n02104029'
,'n02104365','n02105056','n02105162','n02105251','n02105412','n02105505','n02105641','n02105855'
,'n02106030','n02106166','n02106382','n02106550','n02106662','n02107142','n02107312','n02107574'
,'n02107683','n02107908','n02108000','n02108089','n02108422','n02108551','n02108915','n02109047'
,'n02109525','n02109961','n02110063','n02110185','n02110341','n02110627','n02110806','n02110958'
,'n02111129','n02111277','n02111500','n02111889','n02112018','n02112137','n02112350','n02112706'
,'n02113023','n02113186','n02113624','n02113712','n02113799','n02113978']

cats_categories = ['n02123045','n02123159','n02123394','n02123597','n02124075','n02125311','n02127052']

# Number of top predictions (top-N) requested from decode_predictions per
# image when the scans below are run.
top_num = 50

# Dataset root path. Image directory names are appended to it by plain string
# concatenation, so the trailing slash is required.
dataset_path = './data/'

In [None]:
import os

import numpy as np
from keras.preprocessing import image

In [None]:
def is_cat_or_dog(preds):
    """Return True if any prediction in *preds* is an ImageNet cat or dog class.

    Args:
        preds: Iterable of prediction entries whose first element is the
            class ID to look up (the callers in this file pass one image's
            ``decode_predictions`` output).

    Returns:
        True when at least one entry's class ID appears in
        ``dogs_categories`` or ``cats_categories``; False otherwise,
        including when *preds* is empty.
    """
    # Build the lookup once per call instead of re-concatenating both lists
    # (and scanning them linearly) for every single prediction.
    cat_dog_ids = set(dogs_categories) | set(cats_categories)
    return any(pred[0] in cat_dog_ids for pred in preds)

def exclude_outliers(model_name='xception', top_num=5, img_dir='train/'):
    """Find images whose top-N ImageNet predictions contain no cat or dog class.

    Args:
        model_name: Pretrained network to use: 'xception',
            'inception_resnet_v2' or 'densenet201'.
        top_num: Number of top predictions (top-N) inspected per image.
        img_dir: Image directory name; it is appended to ``dataset_path``.

    Returns:
        List of file names from the directory for which ``is_cat_or_dog``
        returned False on the decoded top-N predictions.

    Raises:
        ValueError: If *model_name* is not one of the supported names.
    """
    # Keras applications are imported per branch so that only the selected
    # architecture's module is loaded.
    if model_name == 'xception':
        from keras.applications.xception import Xception, preprocess_input, decode_predictions
        model = Xception(weights='imagenet')
        img_size = (299, 299)
    elif model_name == 'inception_resnet_v2':
        from keras.applications.inception_resnet_v2 import InceptionResNetV2, preprocess_input, decode_predictions
        model = InceptionResNetV2(weights='imagenet')
        img_size = (299, 299)
    elif model_name == 'densenet201':
        from keras.applications.densenet import DenseNet201, preprocess_input, decode_predictions
        model = DenseNet201(weights='imagenet')
        img_size = (224, 224)
    else:
        # Fail early with a clear message instead of an UnboundLocalError
        # on `model` / `img_size` further down.
        raise ValueError(
            "unsupported model_name {!r}; expected 'xception', "
            "'inception_resnet_v2' or 'densenet201'".format(model_name)
        )

    img_dir = dataset_path + img_dir
    outliers = []
    for img_name in os.listdir(img_dir):
        if img_name == 'Thumbs.db':
            # Not an image; skip it rather than let load_img fail on it.
            continue
        # os.path.join tolerates an img_dir given without a trailing slash.
        img_path = os.path.join(img_dir, img_name)
        img = image.load_img(img_path, target_size=img_size)
        # Add a leading axis: the model is fed one image per predict call.
        x = np.expand_dims(image.img_to_array(img), axis=0)
        x = preprocess_input(x)

        pred = decode_predictions(model.predict(x), top=top_num)[0]
        if not is_cat_or_dog(pred):
            outliers.append(img_name)
    return outliers


#     通过generator实现
#     img_dir = dataset_path + 'some_samples/'
#     gen = image.ImageDataGenerator()
#     img_generator = gen.flow_from_directory(img_dir, target_size=img_size, shuffle=False)
#     print(img_generator)
#     preds = model.predict_generator(img_generator, verbose=1)
    
#     decode_preds = decode_predictions(preds, top=top_num)
#     outliers = []

#     for pred, filename in zip(decode_preds, img_generator.filenames):
#         print(filename)
#         print(pred)
#         print(is_cat_or_dog(pred))
#         if not is_cat_or_dog(pred):
#             outliers.append(filename)
#     return outliers

### 利用xception网络识别图片

In [None]:
def exclude_outliers_xception(top_num=top_num, img_dir='train/'):
    """Run the outlier scan with the Xception network.

    Convenience wrapper around ``exclude_outliers(model_name='xception')``,
    so the scanning loop lives in one place only.

    Args:
        top_num: Number of top predictions (top-N) inspected per image.
            The default is the value of the module-level ``top_num`` at the
            time this function is defined.
        img_dir: Image directory name; it is appended to ``dataset_path``.

    Returns:
        List of file names whose top-N predictions contain no cat or dog
        class.
    """
    return exclude_outliers(model_name='xception', top_num=top_num, img_dir=img_dir)
    

In [None]:
# Scan the default image directory with Xception and list the files whose
# top-N predictions contain no cat or dog class.
outliers_xception = exclude_outliers(
    model_name='xception',
    top_num=top_num,
)
print(outliers_xception)

### 利用inception_resnet_v2网络识别图片

In [None]:
# Scan the default image directory with InceptionResNetV2 and list the files
# whose top-N predictions contain no cat or dog class.
outliers_inception_resnet_v2 = exclude_outliers(
    model_name='inception_resnet_v2',
    top_num=top_num,
)
print(outliers_inception_resnet_v2)

### 利用densenet201网络识别图片

In [None]:
# Scan the default image directory with DenseNet201 and list the files whose
# top-N predictions contain no cat or dog class.
outliers_densenet201 = exclude_outliers(
    model_name='densenet201',
    top_num=top_num,
)
print(outliers_densenet201)

### 求并集并将文件名称写入outliers.txt文件

In [None]:
# Union of the files flagged by the three networks. Sorted so that both the
# list and outliers.txt have a reproducible order (set iteration order is
# arbitrary).
outliers = sorted(set(outliers_xception + outliers_inception_resnet_v2 + outliers_densenet201))
print(outliers)

outliers_file_path = './outliers.txt'

# Mode 'w' truncates an existing file, so no explicit delete is needed first.
with open(outliers_file_path, 'w') as f:
    f.write('\n'.join(outliers))


### 从outliers.txt文件中读取所识别的异常文件名称

In [None]:
# Load the outlier file names, one per line, from outliers.txt.
outliers_file_path = './outliers.txt'
with open(outliers_file_path, 'r') as f:
    outliers_str = f.read()
# Drop empty entries: an empty or newline-terminated file would otherwise
# yield '' as a file name.
outliers = [name for name in outliers_str.split('\n') if name]
print(outliers)

### 展示异常图片

In [None]:
def align_figures():
    """Show all current matplotlib figures side by side as inline HTML images.

    Each figure registered with ``Gcf`` is rendered to PNG through the
    IPython display formatter and embedded as a base64 ``<img>`` tag. The
    tags are emitted together inside a single ``<div>``, and all figures are
    closed before the HTML is displayed.
    """
    import base64

    import matplotlib
    from IPython.display import display_html
    from ipykernel.pylab.backend_inline import show
    from matplotlib._pylab_helpers import Gcf

    def _img_tag(fig):
        # PNG bytes for the figure, taken from the display formatter output.
        png_bytes = get_ipython().display_formatter.format(fig)[0]['image/png']
        encoded = base64.encodebytes(png_bytes).decode()
        return '<img style="margin:0" align="left" src="data:image/png;base64,{}"/>'.format(encoded)

    tags = [_img_tag(manager.canvas.figure) for manager in Gcf.get_all_fig_managers()]
    markup = "<div>" + "".join(tags) + "</div>"

    # NOTE(review): presumably resets the inline backend's draw flag so it
    # does not render these figures a second time on its own - confirm.
    show._draw_called = False
    matplotlib.pyplot.close('all')
    display_html(markup, raw=True)
    
def show_images(img_list, img_path, col_num=5):
    """Plot the images named in *img_list* on a grid and display them inline.

    Args:
        img_list: File names to show; each name is used as its subplot title.
        img_path: Directory that contains the files.
        col_num: Number of grid columns.

    Returns:
        None. With an empty *img_list* nothing is drawn or displayed.
    """
    if not img_list:
        # Nothing to draw; avoid creating and displaying an empty figure.
        return

    # Imported locally (as align_figures does for its own dependencies)
    # because the file has no module-level pyplot import.
    import matplotlib.pyplot as plt

    # Ceiling division: just enough rows, with no spare empty row when the
    # image count is an exact multiple of col_num.
    row_num = -(-len(img_list) // col_num)

    plt.figure(dpi=150)
    for position, img_name in enumerate(img_list, start=1):
        # os.path.join tolerates an img_path given without a trailing slash.
        img = plt.imread(os.path.join(img_path, img_name))
        plt.subplot(row_num, col_num, position)
        plt.title(img_name, fontsize=8)
        plt.axis('off')
        plt.imshow(img)
    align_figures()

In [None]:
# Display the outliers in batches of show_num images, five per row.
show_num = 10
# Stepping by start index yields exactly the non-empty batches; the previous
# `len(outliers)//show_num+1` count produced an empty trailing batch whenever
# the number of outliers was a multiple of show_num (including zero).
for start in range(0, len(outliers), show_num):
    show_images(outliers[start:start + show_num], dataset_path + 'train/', 5)


> 根据上面的图片，手动剔除其中实际为猫或狗的图片（即被模型误判为异常的图片），将真正的异常图片文件名写入`real_outliers.txt`文件。