In [1]:
%reload_ext autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import os
import pandas as pd
import re
import sqlalchemy as sa
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('main.ipynb'))))

import database as db
from models import DataTrustedIdentifier

## Database connection

In [3]:
engine, session = db.connect()
engine.echo = False

2023-03-27 16:47:50,476 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2023-03-27 16:47:50,476 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-27 16:47:50,478 INFO sqlalchemy.engine.Engine select current_schema()
2023-03-27 16:47:50,478 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-27 16:47:50,479 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2023-03-27 16:47:50,479 INFO sqlalchemy.engine.Engine [raw sql] {}


## CSV with correct identification

In [4]:
df = pd.read_csv('../csv/list_genus_species_correct.csv', sep=';', index_col=None, header=0).astype(str)
df = df.replace('nan', None)
df

Unnamed: 0,kingdom,phylum,order,classe,family,genus,specific_epithet,infraspecific_epithet,scientific_name_authorship,kingdom_trusted,phylum_trusted,order_trusted,classe_trusted,family_trusted,genus_trusted,specific_epithet_trusted,infraspecific_epithet_trusted,scientific_name_authorship_trusted
0,,,,,,Ottonia,anisum,,Spreng.,,,,,,Piper,anisum,,(Spreng.) Angely
1,,,,,,Ottonia,anisum,f. glabrior,Miq.,,,,,,Piper,anisum,,(Spreng.) Angely
2,,,,,,Ottonia,armondii,,Trel.,,,,,,Piper,anisum,,(Spreng.) Angely
3,,,,,,Ottonia,burchellii,,Trel.,,,,,,Piper,anisum,,(Spreng.) Angely
4,,,,,,Ottonia,carpinifolia,f. hirtipedicellata,Yunck.,,,,,,Piper,anisum,,(Spreng.) Angely
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,,,,,,Ottonia,brevistipitata,,(C. DC.) Trel.,,,,,,Piper,ovatum,var. hirtellum,C. DC.
114,,,,,,Ottonia,leptostachya,var. brevistipitata,(C. DC.) E.F. Guim.,,,,,,Piper,ovatum,var. hirtellum,C. DC.
115,,,,,,Piper,frutescens,,C. DC.,,,,,,Piper,ovatum,var. hirtellum,C. DC.
116,,,,,,Ottonia frutescens (C. DC.) Trel.,,,,,,,,,Piper,ovatum,var. hirtellum,C. DC.


## Below genus level

In [5]:
for d in df[['genus', 'specific_epithet', 'infraspecific_epithet', 'scientific_name_authorship', 'genus_trusted', 'specific_epithet_trusted', 'infraspecific_epithet_trusted','scientific_name_authorship_trusted']].iterrows():
    columns=[
        DataTrustedIdentifier.genus,
        DataTrustedIdentifier.specific_epithet,
        sa.func.replace(sa.func.replace(DataTrustedIdentifier.infraspecific_epithet, 'f. ', ''), 'var. ', '').label('infraspecific_epithet'),
        sa.func.regexp_replace(DataTrustedIdentifier.scientific_name_authorship, '!| |(|)|.|&', '').label('scientific_name_authorship'),
    ]
    sub = session.query(*columns).subquery('sub')

    infraspecific_epithet = d[1].infraspecific_epithet
    if infraspecific_epithet:
        infraspecific_epithet = infraspecific_epithet.replace('f. ', '').replace('var. ', '')

    scientific_name_authorship = d[1].scientific_name_authorship
    if scientific_name_authorship:
        scientific_name_authorship = re.sub('\W+', '', scientific_name_authorship)

    print('genus (old): %s - (new): %s' % (d[1].genus, d[1].genus_trusted))
    print('specific_epithet (old): %s - (new): %s' % (d[1].specific_epithet, d[1].specific_epithet_trusted))
    print('infraspecific_epithet (old): %s - (new): %s' % (d[1].infraspecific_epithet, d[1].infraspecific_epithet_trusted))
    print('scientific_name_authorship (old): %s - (new): %s' % (d[1].scientific_name_authorship, d[1].scientific_name_authorship_trusted))

    session.query(DataTrustedIdentifier)\
        .filter(sa.and_(DataTrustedIdentifier.genus.__eq__(d[1].genus),
                        DataTrustedIdentifier.specific_epithet.__eq__(d[1].specific_epithet),
                        sa.or_(sub.c.infraspecific_epithet.__eq__(infraspecific_epithet),
                               sub.c.scientific_name_authorship.__eq__(scientific_name_authorship))))\
        .update(values={DataTrustedIdentifier.genus_trusted: d[1].genus_trusted,
                        DataTrustedIdentifier.specific_epithet_trusted: d[1].specific_epithet_trusted,
                        DataTrustedIdentifier.infraspecific_epithet_trusted: d[1].infraspecific_epithet_trusted,
                        DataTrustedIdentifier.scientific_name_authorship_trusted: d[1].scientific_name_authorship_trusted}, synchronize_session=False)

    session.commit()

genus (old): Ottonia - (new): Piper
specific_epithet (old): anisum - (new): anisum
infraspecific_epithet (old): None - (new): None
scientific_name_authorship (old): Spreng. - (new): (Spreng.) Angely
genus (old): Ottonia - (new): Piper
specific_epithet (old): anisum - (new): anisum
infraspecific_epithet (old): f. glabrior - (new): None
scientific_name_authorship (old): Miq. - (new): (Spreng.) Angely
genus (old): Ottonia - (new): Piper
specific_epithet (old): armondii - (new): anisum
infraspecific_epithet (old): None - (new): None
scientific_name_authorship (old):  Trel. - (new): (Spreng.) Angely
genus (old): Ottonia - (new): Piper
specific_epithet (old): burchellii - (new): anisum
infraspecific_epithet (old): None - (new): None
scientific_name_authorship (old): Trel. - (new): (Spreng.) Angely
genus (old): Ottonia - (new): Piper
specific_epithet (old): carpinifolia - (new): anisum
infraspecific_epithet (old): f. hirtipedicellata - (new): None
scientific_name_authorship (old): Yunck. - (n

In [6]:
query = session.query(DataTrustedIdentifier)\
    .filter(sa.or_(DataTrustedIdentifier.genus_trusted.is_not(None),
                   DataTrustedIdentifier.specific_epithet_trusted.is_not(None),
                   DataTrustedIdentifier.infraspecific_epithet_trusted.is_not(None),
                   DataTrustedIdentifier.scientific_name_authorship_trusted.is_not(None)))\
    .all()

print('records updated in table %s was: %d' % (DataTrustedIdentifier.__tablename__, len(query)))

records updated in table data_trusted_identifier was: 253


In [7]:
df = pd.DataFrame({
    'seq': [q.seq for q in query],
    'genus': [q.genus for q in query],
    'specific_epithet': [q.specific_epithet for q in query],
    'infraspecific_epithet': [q.infraspecific_epithet for q in query],
    'scientific_name_authorship': [q.scientific_name_authorship for q in query],
    'genus_new': [q.genus_trusted for q in query],
    'specific_epithet_new': [q.specific_epithet_trusted for q in query],
    'infraspecific_epithet_new': [q.infraspecific_epithet_trusted for q in query],
    'scientific_name_authorship_new': [q.scientific_name_authorship_trusted for q in query],
})
df.to_csv('records_updated.csv', sep=';')
display(df.head(5))

Unnamed: 0,seq,genus,specific_epithet,infraspecific_epithet,scientific_name_authorship,genus_new,specific_epithet_new,infraspecific_epithet_new,scientific_name_authorship_new
0,7786,Ottonia,propinqua,,Kunth,Piper,grazielae,,M. Carv.-Silva & E.F. Guim.
1,9728,Piper,corcovadensis,,(Miq.) DC.,Piper,grazielae,,M. Carv.-Silva & E.F. Guim.
2,31987,Piper,corcovadensis,blanchetii,(Moric.) Callejas,Piper,grazielae,,M. Carv.-Silva & E.F. Guim.
3,13054,Piper,corcovadensis,,(Miq.) C. DC.,Piper,grazielae,,M. Carv.-Silva & E.F. Guim.
4,8886,Piper,corcovadensis,,(Miq.) DC.,Piper,grazielae,,M. Carv.-Silva & E.F. Guim.


In [8]:
session.query(DataTrustedIdentifier)\
    .filter(sa.and_(DataTrustedIdentifier.genus_trusted.__eq__(None),
                   DataTrustedIdentifier.specific_epithet_trusted.__eq__(None),
                   DataTrustedIdentifier.infraspecific_epithet_trusted.__eq__(None),
                   DataTrustedIdentifier.scientific_name_authorship_trusted.__eq__(None)))\
    .update(values={DataTrustedIdentifier.genus_trusted: DataTrustedIdentifier.genus,
                        DataTrustedIdentifier.specific_epithet_trusted: DataTrustedIdentifier.specific_epithet,
                        DataTrustedIdentifier.infraspecific_epithet_trusted: DataTrustedIdentifier.infraspecific_epithet,
                        DataTrustedIdentifier.scientific_name_authorship_trusted: DataTrustedIdentifier.scientific_name_authorship}, synchronize_session=False)
session.commit()

## Genus level

In [9]:
old_genus = [['Sarcorhachis'], ['Ottonia', 'Pothomorphe'], ['Piperomia', 'Peperonia']]
new_genus = ['Manekia', 'Piper', 'Peperomia']

for g in zip(old_genus, new_genus):
    list_old_genus = g[0]
    new = g[1]
    for old in list_old_genus:
        session.query(DataTrustedIdentifier)\
            .filter(DataTrustedIdentifier.genus.__eq__(old))\
            .update(values={DataTrustedIdentifier.genus_trusted: new}, synchronize_session=False)
        session.commit()

## Close connection

In [10]:
session.close()
engine.dispose()