In [6]:
import collections
import os

import pandas as pd
import sqlalchemy.orm
from sqlalchemy.orm import Session
%reload_ext autoreload
%autoreload 2

from database import *
from file import *
from tables import *

# Database configuration

In [7]:
# Credentials are read from the environment so they never appear in the notebook.
user = os.environ['POSTGRE_USER']
password = os.environ['POSTGRE_PASSWORD']

# Connection settings handed to connect(); only the credentials vary per user.
cfg = dict(
    host='192.168.0.144',
    user=user,
    password=password,
    port='5432',
    database='herbario',
)

# connect() returns the engine together with a session; the bare annotation
# only tells tooling that `session` is a SQLAlchemy Session.
session: Session
engine, session = connect(cfg)

2022-11-20 16:57:07,305 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2022-11-20 16:57:07,306 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-11-20 16:57:07,307 INFO sqlalchemy.engine.Engine select current_schema()
2022-11-20 16:57:07,308 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-11-20 16:57:07,309 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2022-11-20 16:57:07,309 INFO sqlalchemy.engine.Engine [raw sql] {}


In [8]:
def make_operation_a(session):
    """Flush and commit the pending work on ``session``, then close it.

    Args:
        session: object exposing ``flush()``, ``commit()``, ``rollback()``
            and ``close()`` (a SQLAlchemy ``Session`` in this notebook).

    Raises:
        Exception: whatever ``flush()`` or ``commit()`` raised. The session is
            rolled back and the error printed before it is re-raised.

    The session is closed in every case, whether the commit succeeded or not.
    """
    try:
        # Flush before committing: a flush issued after commit() has nothing
        # left to send, so the explicit call only has an effect in this order.
        session.flush()
        session.commit()
    except Exception as e:
        session.rollback()
        print(e)
        raise
    finally:
        session.close()

## Count images identified per identifier

In [4]:
# Every regular file found recursively under the photo directory.
list_images = [
    entry
    for entry in pathlib.Path('/home/xandao/Documentos/dataset-52k-sp-2021/fotos').rglob('*')
    if entry.is_file()
]

# Distinct `identified_by` values among the rows flagged with `george`.
result = (
    session.query(DataSP.identified_by)
    .filter(DataSP.george==True)
    .distinct()
    .all()
)

# Display how many files and how many distinct identifiers were found.
len(list_images), len(result)

2022-11-20 14:29:04,080 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-11-20 14:29:04,084 INFO sqlalchemy.engine.Engine SELECT DISTINCT data.identified_by AS data_identified_by 
FROM data 
WHERE data.george = true
2022-11-20 14:29:04,085 INFO sqlalchemy.engine.Engine [generated in 0.00068s] {}


(52605, 49)

In [None]:
# Column-oriented accumulator: one entry per identifier row in `result`.
data_count_identified = {'identified': [], 'count': []}
for row in result:
    identified = row[0]

    # Distinct (genus, specific_epithet, barcode) rows that have genus and
    # epithet set and whose identifier contains this name (case-insensitive).
    samples_of_identified = (
        session.query(DataSP.genus, DataSP.specific_epithet, DataSP.barcode)
        .filter(
            DataSP.genus!=None,
            DataSP.specific_epithet!=None,
            DataSP.identified_by.ilike(f'%{identified}%'),
        )
        .distinct()
    )
    count_samples_of_identified = samples_of_identified.count()
    # NOTE(review): `make_operation` is not defined in this notebook (only
    # `make_operation_a` is) -- presumably it comes from a star import; confirm.
    make_operation(session)

    data_count_identified['identified'].append(identified)
    data_count_identified['count'].append(count_samples_of_identified)

sheet = pd.DataFrame(data_count_identified)
# Add a 'Total' row carrying the sum of the 'count' column.
sheet.loc['Total'] = pd.Series(sheet['count'].sum(), index=['count'])
sheet.to_excel(
    'count_imgs_especialista_george.xlsx',
    na_rep='',
    engine='xlsxwriter',
    index=None,
)
sheet

## Create a CSV with per-species sample counts for the George and BR datasets

In [None]:
# Number of rows per specific epithet among the rows flagged with `george`,
# ordered by ascending count.
query_george = (
    session.query(
        DataSP.specific_epithet,
        sqlalchemy.func.count(DataSP.specific_epithet).label('total'),
    )
    .filter(DataSP.genus!=None, DataSP.specific_epithet!=None, DataSP.george==True)
    .group_by(DataSP.specific_epithet)
    .order_by('total')
    .having(sqlalchemy.func.count(DataSP.specific_epithet) >= 0)
    .all()
)

list_species_datasetgeorge_datasetbr = {'species': [], 'george': [], 'br': []}
for species, george_total in query_george:
    # Same count for this epithet, restricted to the identifier conditions.
    # NOTE(review): `or_` and `list_identified` are created in the "Create out"
    # cell further down (In [5]); that cell must have been run before this one.
    rs = (
        session.query(
            DataSP.specific_epithet,
            sqlalchemy.func.count(DataSP.specific_epithet).label('total'),
        )
        .filter(DataSP.genus!=None, DataSP.specific_epithet==species, or_(*list_identified))
        .group_by(DataSP.specific_epithet)
        .order_by('total')
        .having(sqlalchemy.func.count(DataSP.specific_epithet) >= 0)
        .all()
    )
    list_species_datasetgeorge_datasetbr['species'].append(species)
    list_species_datasetgeorge_datasetbr['george'].append(george_total)
    # No matching row means the epithet has zero samples on the BR side.
    list_species_datasetgeorge_datasetbr['br'].append(rs[0][1] if rs else 0)

df = pd.DataFrame(list_species_datasetgeorge_datasetbr)
df.to_csv('classes-datasetgeorge-datasetbr.csv')

In [None]:
# Done with the database for this part: close the session and dispose of the
# engine that connect(cfg) returned.
session.close()
engine.dispose()

## Separate images by threshold

In [9]:
import shutil

# Immediate sub-directories of 'out'.
list_dir = [child for child in pathlib.Path('out').glob('*') if child.is_dir()]

# One destination directory per threshold, named after the threshold itself.
thresholds = ['5', '10', '20']
for threshold in thresholds:
    pathlib.Path(threshold).mkdir(exist_ok=True, parents=True)

for class_dir in list_dir:
    # Count the entries once, then copy the directory into every threshold
    # directory whose minimum it reaches.
    n_entries = len(os.listdir(class_dir))
    for threshold in thresholds:
        if n_entries >= int(threshold):
            shutil.copytree(class_dir, os.path.join(threshold, class_dir.name))

## Create file label2.txt

In [10]:
for threshold in ['5', '10', '20']:
    label_lines = []
    # Snapshot the listing before renaming anything: renaming entries while the
    # glob generator is still being consumed may make the iteration unreliable.
    # Only directories are labelled, so a label2.txt left by an earlier run is
    # skipped instead of crashing os.listdir().
    class_dirs = [p for p in pathlib.Path(threshold).glob('*') if p.is_dir()]
    for i, p in enumerate(class_dirs, 1):
        # One line per directory: "<original name>";"f<i>";"<number of entries>"
        label_lines.append('\"%s\";\"f%d\";\"%d\"\n' % (p.name, i, len(os.listdir(p))))
        # Replace only the final path component. A plain str.replace() on the
        # whole path would also rewrite any parent directory that happens to
        # contain the same text.
        dst = p.with_name(f'f{i}')
        os.rename(p, dst)

    # The context manager closes the file even if a write fails.
    with open(f'{threshold}/label2.txt', mode='w') as f:
        f.writelines(label_lines)

## Create the `out` directory (one image per barcode, grouped by specific epithet)

In [5]:
import shutil
from sqlalchemy import or_

# One case-insensitive "contains" condition per identifier row in `result`.
list_identified = [DataSP.identified_by.ilike('%{}%'.format(r[0])) for r in result]

# (specific_epithet, barcode) of every row that has genus and epithet set and
# whose identifier matches at least one of the conditions above.
query = (
    session.query(DataSP.specific_epithet, DataSP.barcode)
    .filter(DataSP.genus!=None, DataSP.specific_epithet!=None, or_(*list_identified))
    .all()
)

path = 'out'
pathlib.Path(path).mkdir(parents=True, exist_ok=True)
for specific_epithet, barcode in query:
    # A NULL barcode would raise in the substring test below and an empty one
    # would match every file name, so such rows are skipped.
    if not barcode:
        continue

    # Keep the matches under their own name: rebinding `result` here would
    # overwrite the identifier rows that other cells (and a re-run of this
    # one) still read from `result`.
    matches = [f for f in list_images if barcode in str(f.stem)]
    if not matches:
        continue

    # Only one image is kept per row: the first path in sorted order.
    src = min(matches)
    species_dir = pathlib.Path(path, specific_epithet)
    # exist_ok=True already covers the "directory is there" case.
    species_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy(src, species_dir / src.name)

2022-11-20 14:29:07,612 INFO sqlalchemy.engine.Engine SELECT data.specific_epithet AS data_specific_epithet, data.barcode AS data_barcode 
FROM data 
WHERE data.genus IS NOT NULL AND data.specific_epithet IS NOT NULL AND (data.identified_by ILIKE %(identified_by_1)s OR data.identified_by ILIKE %(identified_by_2)s OR data.identified_by ILIKE %(identified_by_3)s OR data.identified_by ILIKE %(identified_by_4)s OR data.identified_by ILIKE %(identified_by_5)s OR data.identified_by ILIKE %(identified_by_6)s OR data.identified_by ILIKE %(identified_by_7)s OR data.identified_by ILIKE %(identified_by_8)s OR data.identified_by ILIKE %(identified_by_9)s OR data.identified_by ILIKE %(identified_by_10)s OR data.identified_by ILIKE %(identified_by_11)s OR data.identified_by ILIKE %(identified_by_12)s OR data.identified_by ILIKE %(identified_by_13)s OR data.identified_by ILIKE %(identified_by_14)s OR data.identified_by ILIKE %(identified_by_15)s OR data.identified_by ILIKE %(identified_by_16)s OR dat