In [1]:
%reload_ext autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import itertools
import os
import pandas as pd
import shutil
import sqlalchemy as sa

import database as db
from images import get_list_of_images_invalid
from models import County, DataTrustedIdentifier, Image
from unaccent import unaccent

In [3]:
engine, session=db.connect()
engine.echo=False

2023-03-29 19:57:25,556 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2023-03-29 19:57:25,556 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-29 19:57:25,557 INFO sqlalchemy.engine.Engine select current_schema()
2023-03-29 19:57:25,557 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-29 19:57:25,558 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2023-03-29 19:57:25,558 INFO sqlalchemy.engine.Engine [raw sql] {}


In [4]:
query = session.query(County).distinct().all()

list_uf = [unaccent(sa.func.lower(q.uf)) for q in query]
list_state = [unaccent(sa.func.lower(q.state)) for q in query]
list_county = [unaccent(sa.func.lower(q.county)) for q in query]

uf_unaccented_lower = unaccent(sa.func.lower(DataTrustedIdentifier.state_province)).in_(list_uf)
state_unaccented_lower = unaccent(sa.func.lower(DataTrustedIdentifier.state_province)).in_(list_state)
county_unaccented_lower = unaccent(sa.func.lower(DataTrustedIdentifier.county)).in_(list_county)

In [5]:
list_minimum_image=[5, 10, 20]
list_color=['rgb', 'grayscale']
list_image_size=['256', '400', '512']
path_out = '/home/xandao/Imagens/br_dataset'
list_images_invalid=get_list_of_images_invalid()
list_images_invalid=list_images_invalid['barcode']
list_images_invalid

['INPA0248526',
 'INPA0248523',
 'INPA0248528',
 'NY01421926_01',
 'NY01421575_01',
 'HUFSJ001689_v00',
 'HUFSJ001133_v00',
 'HUFSJ002198_v00',
 'HUFSJ003255_v00',
 'HVASF000487_v01',
 'HUFSJ001689_v01',
 'INPA0019084_nd',
 'INPA0022379_nd',
 'INPA0032742_nd',
 'INPA0023115',
 'NL-U1484137',
 'NY01397568_01',
 'INPA0012286',
 'INPA0146998']

In [9]:
def save_metadata(list_count_samples, list_level, list_path_images_final, list_seq_final, out):
    df = pd.DataFrame({
        'levels': list_level,
        'paths': list_path_images_final,
        'count': list_count_samples,
        'seq': list_seq_final,
    })
    print('total of images: %d' % df['count'].sum())
    display(df.head(4))
    filename=os.path.join(out, 'info_dataset.csv')
    df.to_csv(filename, sep=';', index=None, lineterminator='\n', doublequote=True)

    list_seq = list(itertools.chain(*list_seq_final))
    query = session.query(DataTrustedIdentifier)\
                    .filter(DataTrustedIdentifier.seq.in_(list_seq))\
                    .all()

    data = [(q.seq, q.genus, q.specific_epithet, q.genus_trusted, q.specific_epithet, q.list_src) for q in query]
    columns = ['seq', 'genus', 'specific_epithet', 'genus_trusted', 'specific_epithet_trusted', 'urls']
    df = pd.DataFrame(data, columns=columns)
    display(df.head(4))
    filename=os.path.join(out, 'info_samples.csv')
    df.to_csv(filename, sep=';', index=None, lineterminator='\n', doublequote=True)


def copy_images(list_level, list_path_images_final, out):
    if not os.path.exists(out):
        os.makedirs(out)

    for i, ff in enumerate(zip(list_level, list_path_images_final), start=1):
        level = ff[0]
        list_images = ff[1]

        out_level = os.path.join(out, 'f%d' % i)
        if not os.path.exists(out_level):
            os.makedirs(out_level)

        for i, image in enumerate(list_images, start=1):
            shutil.copy(image, out_level)


for color in list_color:
    for image_size in list_image_size:
        for minimum_image in list_minimum_image:
            list_level = []
            list_path_images_final = []
            list_seq_final = []
            list_count_samples = []

            level=DataTrustedIdentifier.specific_epithet_trusted
            out=os.path.join(path_out, color.upper(), level.name, image_size, str(minimum_image))

            condition = sa.and_(DataTrustedIdentifier.country_trusted == 'Brasil',
                                level.is_not(None),
                                sa.or_(uf_unaccented_lower, state_unaccented_lower))

            columns = [level,
                       sa.func.array_agg(DataTrustedIdentifier.seq).label('list_seq')]
            query = session.query(*columns) \
                .filter(condition) \
                .distinct() \
                .group_by(level) \
                .order_by(level) \
                .having(sa.func.count(level) >= minimum_image) \
                .all()

            for i, q in enumerate(query):
                print('color: %s image_size: %s minimum_image: %d (%d of %d)' % (color.upper(), image_size, minimum_image, i, len(query)))
                qzao = session.query(Image.seq_id, sa.func.array_agg(Image.path_segmented).label('list_path_segmented')) \
                    .filter(sa.and_(Image.seq_id.in_(q.list_seq),
                                    Image.color_mode.__eq__(color.upper()),
                                    Image.height.__eq__(image_size),
                                    Image.width.__eq__(image_size),
                                    sa.not_(Image.filename.in_(list_images_invalid)),
                                    sa.not_(Image.filename.ilike('%_v0%')),
                                    sa.not_(Image.filename.like('%_e%')),
                                    sa.not_(Image.filename.like('%_nd%')),
                                    ))\
                    .group_by(Image.seq_id)\
                    .all()

                list_one_image_per_seq = []
                list_seq_one_image = []
                for qzinho in qzao:
                    list_seq_one_image.append(qzinho.seq_id)
                    list_one_image_per_seq.append(sorted(qzinho.list_path_segmented)[0])

                if len(list_one_image_per_seq) >= minimum_image:
                    list_count_samples.append(len(list_one_image_per_seq))
                    list_level.append(q.specific_epithet_trusted)
                    list_path_images_final.append(list_one_image_per_seq)
                    list_seq_final.append(list_seq_one_image)

            # print('total %d' % np.sum([len(p) for p in list_path_images_final]))
            copy_images(list_level, list_path_images_final, out)
            save_metadata(list_count_samples, list_level, list_path_images_final, list_seq_final, out)

color: RGB image_size: 256 minimum_image: 5 (0 of 246)
color: RGB image_size: 256 minimum_image: 5 (1 of 246)
color: RGB image_size: 256 minimum_image: 5 (2 of 246)
color: RGB image_size: 256 minimum_image: 5 (3 of 246)
color: RGB image_size: 256 minimum_image: 5 (4 of 246)
color: RGB image_size: 256 minimum_image: 5 (5 of 246)
color: RGB image_size: 256 minimum_image: 5 (6 of 246)
color: RGB image_size: 256 minimum_image: 5 (7 of 246)
color: RGB image_size: 256 minimum_image: 5 (8 of 246)
color: RGB image_size: 256 minimum_image: 5 (9 of 246)
color: RGB image_size: 256 minimum_image: 5 (10 of 246)
color: RGB image_size: 256 minimum_image: 5 (11 of 246)
color: RGB image_size: 256 minimum_image: 5 (12 of 246)
color: RGB image_size: 256 minimum_image: 5 (13 of 246)
color: RGB image_size: 256 minimum_image: 5 (14 of 246)
color: RGB image_size: 256 minimum_image: 5 (15 of 246)
color: RGB image_size: 256 minimum_image: 5 (16 of 246)
color: RGB image_size: 256 minimum_image: 5 (17 of 246)
co

Unnamed: 0,levels,paths,count,seq
0,abutiloides,[/media/xandao/c2f58d30-ff2c-47f7-95af-91ad6fd...,8,"[9416, 10009, 10651, 12428, 25112, 37910, 3847..."
1,aduncum,[/media/xandao/c2f58d30-ff2c-47f7-95af-91ad6fd...,377,"[7220, 7221, 7222, 7234, 7742, 7757, 8224, 868..."
2,aequale,[/media/xandao/c2f58d30-ff2c-47f7-95af-91ad6fd...,30,"[8727, 9316, 10163, 11517, 12577, 20200, 21125..."
3,alata,[/media/xandao/c2f58d30-ff2c-47f7-95af-91ad6fd...,81,"[6857, 6861, 6862, 6863, 6864, 7565, 7859, 986..."


Unnamed: 0,seq,genus,specific_epithet,genus_trusted,specific_epithet_trusted,urls
0,9735,Piper,arboreum,Piper,arboreum,[https://storage.googleapis.com/cria-zoomify/h...
1,19770,Ottonia,anisum,Piper,anisum,[https://storage.googleapis.com/cria-zoomify/h...
2,40891,Ottonia,taperana,Piper,taperana,[https://storage.googleapis.com/cria-zoomify/h...
3,7333,Piper,corcovadense,Piper,corcovadense,[https://storage.googleapis.com/cria-zoomify/h...


In [7]:
session.close()
engine.dispose()