# VOC数据集分析

In [1]:
import os
import sys
import numpy as np
import pandas as pd

import xml.etree.ElementTree as ET

from mylibs.ProcessBar import ShowProcess

## 参数

In [2]:
data_path='%s/e/dataset_tiptical/VOCdevkit/VOC2012'%(os.getenv('HOME'))#数据集根目录
image_set_file = '%s/ImageSets/Main/val.txt'%(data_path)               #图片数据集
annopath='%s/Annotations'%(data_path)                                  #标注目录
images_path='%s/JPEGImages'%(data_path)                                #图片目录
classes_file='%s/model_data/voc_classes.txt'%(data_path)               #类别文件

## 基本信息统计
* 图片数据集文件个数
* 标注文件个数
* 原始图片个数
* 分类数目

In [3]:
#加载图片数据集
image_ids=[]
with open(image_set_file,'r') as f:
    image_ids=[x.strip() for x in f.readlines()]
print('图片数据集数目:',len(image_ids))

#统计标记文件个数
items=os.listdir(annopath)
print('标记文件数目:',len(items))

#统计原始文件个数
items=os.listdir(images_path)
print('原始图片数目:',len(items))

#检测类别
if os.path.exists(classes_file):
    with open(classes_file,'r') as f:
        classes=[x.strip() for x in f.readlines()]
        print('检测类别：',classes)

图片数据集数目: 5823
标记文件数目: 17125
原始图片数目: 17125


## 标注目录信息统计：
    类别名称: =============********** 文件数/目标数

In [18]:
#读取标注的xml文件
def parse_rec(filename): 
    """ Parse a PASCAL VOC xml file """
    tree = ET.parse(filename)
    objects = []
    for obj in tree.findall('object'):
        obj_struct = {}
        obj_struct['name'] = obj.find('name').text
        obj_struct['pose'] = obj.find('pose').text
        obj_struct['truncated'] = int(obj.find('truncated').text) if obj.find('truncated') else 0
        obj_struct['difficult'] = int(obj.find('difficult').text)
        bbox = obj.find('bndbox')
        obj_struct['bbox'] = [int(float(bbox.find('xmin').text)),
                              int(float(bbox.find('ymin').text)),
                              int(float(bbox.find('xmax').text)),
                              int(float(bbox.find('ymax').text))]
        objects.append(obj_struct)
 
    return objects


#收集标注信息
def voc_data_gather_annos(annopath,image_ids):
    '''收集标注信息, 以DataFrame格式返回
    @param annopath  [str ]标注目录
    @param image_ids [list]图片ID列表
    @return df
        df=pd.DataFrame(columns=['classname','filename','xmin','ymin','xmax','ymax','pose','truncated','difficult'])
    '''
    pb = ShowProcess(100,'','', '收集标注信息完成') 
    num_images=len(image_ids)
    df=pd.DataFrame(columns=['classname','filename','xmin','ymin','xmax','ymax','pose','truncated','difficult'])
    key_index=0
    for i,image_id in enumerate(image_ids):
        sfile='%s/%s.xml'%(annopath,image_id)
        objs=parse_rec(sfile)
        for obj in objs:
            rd=pd.DataFrame({
                'classname':obj['name'],
                'filename':image_id,
                'xmin':obj['bbox'][0],
                'ymin':obj['bbox'][1],
                'xmax':obj['bbox'][2],
                'ymax':obj['bbox'][3],
                'pose':obj['pose'],
                'truncated':obj['truncated'],
                'difficult':obj['difficult']
                },index=[key_index])
            key_index+=1
            df=df.append(rd)
        pb.show_process(int(100. * i / num_images),'','%d/%d'%(i,num_images)) 
        
    pb.show_process(100,'','%d/%d'%(num_images,num_images)) 
    return df


#分析标注信息
def voc_data_statistics_annos(df):
    '''分析标注信息
    @param df  标注信息DataFrame数据格式
        df=pd.DataFrame(columns=['classname','filename','xmin','ymin','xmax','ymax','pose','truncated','difficult'])
    @return classes_info
        [dict{"classname":[file_num,obj_num]}]
    '''
    pb = ShowProcess(100,'','', '分析标注信息完成') 
    index=df.index
    #统计类别数
    sets_class=set()
    for n in range(df.shape[0]):
        sets_class.add(df.loc[index[n],'classname'])
    #检测类别统计
    classes_info={}
    num_classes=len(sets_class)
    for i,classname in enumerate(sets_class):
        #类别筛选
        df_cls=df[df['classname']==classname]
        #目标数目统计
        obj_num=df_cls.shape[0] 
        #文件数目统计
        sets_file=set()
        index_cls=df_cls.index
        for n in range(df_cls.shape[0]):
            sets_file.add(df_cls.loc[index_cls[n],'filename'])
        file_num=len(sets_file) #文件数目统计
        classes_info[classname]=[file_num,obj_num]
        pb.show_process(int(100. * i / num_classes),'','%d/%d'%(i,num_classes)) 
        
    pb.show_process(100,'','%d/%d'%(num_classes,num_classes)) 
    return classes_info

#标注目录统计
def voc_data_statistics_annopath(annopath):
    '''标注目录统计
    @param annopath 标注目录
    @return classes_info
        [dict{"classname":[file_num,obj_num]}]
    '''
    image_ids=[os.path.splitext(os.path.split(x)[1])[0] for x in os.listdir(annopath)]
    df=voc_data_gather_annos(annopath,image_ids[:1000])
    return voc_data_statistics_annos(df)

#图片集统计
def voc_data_statistics_imagesetfile(annopath,sfile):
    '''图片集统计
    @param sfile 图片集文件，文本文件，每行一个图片id，不含扩展名，如:
        2008_000001
        2008_000002
    @return classes_info
        [dict{"classname":[file_num,obj_num]}]    
    '''
    with open(sfile,'r') as f:
        image_ids=[x.strip() for x in f.readlines()]
    df=voc_data_gather_annos(annopath,image_ids)
    return voc_data_statistics_annos(df) 

In [19]:
classes_info=voc_data_statistics_annopath(annopath)

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>]1000/1000
收集标注信息完成
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>]20/20
分析标注信息完成


In [23]:
#统计文件总数
#统计对象总数
#类别:文件数/对象数

for k,v in classes_info.items():
    print('%s:%s'%(k,v))

train:[34, 41]
aeroplane:[45, 59]
cow:[20, 46]
bird:[38, 48]
cat:[68, 78]
tvmonitor:[34, 40]
bus:[29, 44]
horse:[32, 39]
sofa:[45, 54]
dog:[78, 92]
boat:[29, 56]
chair:[83, 183]
diningtable:[39, 44]
pottedplant:[19, 53]
sheep:[25, 76]
bicycle:[26, 39]
person:[569, 1013]
car:[79, 169]
motorbike:[32, 48]
bottle:[54, 126]


In [None]:
#df=pd.DataFrame(columns=['classname','filename','xmin','ymin','xmax','ymax','pose','truncated','difficult'],
#               dtype={'classname':np.str,'filename':np.str,
#                      'xmin':np.int32,'ymin':np.int32,'xmax':np.int32,'ymax':np.int32,
#                      'pose':np.int32,'truncated':np.int32,'difficult':np.int32})
df=pd.DataFrame(columns=['classname','filename','xmin','ymin','xmax','ymax','pose','truncated','difficult'])
key_index=0
items=os.listdir(annopath)
for image_id in items:
    sfile='%s/%s'%(annopath,image_id)
    objs=parse_rec(sfile)
    for obj in objs:
        rd=pd.DataFrame({
            'classname':obj['name'],
            'filename':image_id,
            'xmin':obj['bbox'][0],
            'ymin':obj['bbox'][1],
            'xmax':obj['bbox'][2],
            'ymax':obj['bbox'][3],
            'pose':obj['pose'],
            'truncated':obj['truncated'],
            'difficult':obj['difficult']
            },index=[key_index])
        key_index+=1
        df=df.append(rd)

In [24]:
classes_info2=voc_data_statistics_imagesetfile(annopath,image_set_file)

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>]5823/5823
收集标注信息完成
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>]20/20
分析标注信息完成


In [25]:
for k,v in classes_info2.items():
    print('%s:%s'%(k,v))

sheep:[155, 485]
pottedplant:[279, 542]
aeroplane:[348, 484]
cow:[154, 347]
bird:[374, 629]
cat:[544, 618]
tvmonitor:[296, 414]
bus:[211, 320]
horse:[245, 373]
sofa:[336, 387]
dog:[661, 773]
boat:[252, 491]
chair:[642, 1449]
diningtable:[323, 374]
train:[275, 329]
motorbike:[262, 376]
car:[608, 1173]
person:[2232, 5110]
bicycle:[290, 380]
bottle:[369, 733]


In [75]:
index=df.index
print('共 {%d} 记录'%(df.shape[0]))
#统计类别数
sets_class=set()
for n in range(df.shape[0]):
    sets_class.add(df.loc[index[n],'classname'])
print('检测类别{%d}:%s'%(len(sets_class),sets_class))
print(sets_class)
#检测类别统计
class_static={}
for classname in sets_class:
    #类别筛选
    df_cls=df[df['classname']==classname]
    #目标数目统计
    obj_num=df_cls.shape[0] 
    #文件数目统计
    sets_file=set()
    index_cls=df_cls.index
    for n in range(df_cls.shape[0]):
        sets_file.add(df_cls.loc[index_cls[n],'filename'])
    file_num=len(sets_file) #文件数目统计
    print('%s:%d/%d'%(classname,file_num,obj_num))


共 {40138} 记录
检测类别{20}:{'chair', 'cat', 'sofa', 'diningtable', 'bicycle', 'cow', 'train', 'car', 'bottle', 'boat', 'person', 'pottedplant', 'bird', 'sheep', 'tvmonitor', 'dog', 'aeroplane', 'horse', 'bus', 'motorbike'}
{'chair', 'cat', 'sofa', 'diningtable', 'bicycle', 'cow', 'train', 'car', 'bottle', 'boat', 'person', 'pottedplant', 'bird', 'sheep', 'tvmonitor', 'dog', 'aeroplane', 'horse', 'bus', 'motorbike'}
chair:1366/3056
cat:1128/1277
sofa:742/841
diningtable:691/800
bicycle:603/837
cow:340/771
train:589/704
car:1284/2492
bottle:812/1561
boat:549/1059
person:9583/17401
pottedplant:613/1202
bird:811/1271
sheep:357/1084
tvmonitor:645/893
dog:1341/1598
aeroplane:716/1002
horse:526/803
bus:467/685
motorbike:575/801
