In [1]:
import sqlite3
import csv

# Correspondance nom scientifique - nom commun

In [61]:
# Lookup table: Latin name -> common (vernacular) name.
nomlatin_csv = 'data/nomlatin_lesarbresfr.csv'

# newline='' leaves line-ending handling to the csv module, as the csv
# documentation requires for file objects passed to csv.reader.
# NOTE(review): no explicit encoding is given, so the platform default is
# used -- confirm the file's encoding and pass it explicitly.
with open(nomlatin_csv, 'r', newline='') as f:
    reader = csv.reader(f, delimiter=';')

    nomslatins = {}
    for row in reader:
        # First column: common name; second column: Latin name(s).
        nomvernaculaire = row[0]

        # Note: a row sometimes lists several Latin names, comma-separated.
        all_latinname = [n.strip() for n in row[1].split(',')]

        # Every Latin name of the row maps to the same common name.
        for latinname in all_latinname:
            nomslatins[latinname] = nomvernaculaire


print(len(nomslatins))

556


In [62]:
# Read the distinct (genre_bota, espece, variete) triplets of the `arbres`
# table, each preceded by the number of rows that carry it.

db_filename = 'data/arbres.db'
db = sqlite3.connect(db_filename)


cursor = db.cursor()

# Order by all three grouping columns: ordering by genre_bota alone leaves
# the relative order of rows that share a genus unspecified.
cursor.execute("""
SELECT count(*) AS c, genre_bota, espece, variete FROM arbres
GROUP BY genre_bota, espece, variete
ORDER BY genre_bota, espece, variete
             """)

# Each element is a (count, genre_bota, espece, variete) tuple.
nametuples = cursor.fetchall()

In [63]:
import unidecode

In [85]:
def filterlist( text, liste, startswith=False ):
    """Return the elements of `liste` (strings) in which `text` is present.

    The comparison ignores case and accents: both sides are lower-cased and
    passed through unidecode before being compared. The elements themselves
    are returned unchanged, so the caller gets back the very strings it
    passed in rather than their normalized form.

    text       -- string to look for; None or an empty string matches nothing
    liste      -- iterable of strings to filter
    startswith -- if True, `text` must be at the beginning of the element

    Returns a new list, in the order of `liste`.
    """

    if not text:
        return []

    def normalize( s ):
        # Comparison form only: case-insensitive, then accent-insensitive.
        return unidecode.unidecode( s.lower() )

    needle = normalize( text )

    if startswith:
        return [ ele for ele in liste if normalize( ele ).startswith( needle ) ]
    return [ ele for ele in liste if needle in normalize( ele ) ]

In [87]:
# Sanity check: a None (or empty) search text yields an empty list.
filterlist( None, ['u', 'ui', 'dfqU', 'r'], startswith=True )

[]

In [88]:
def keep_only_commonpart( liste ):
    """Return the leading part shared by every word of `liste`.

    Words are compared character by character from the start; the result
    stops at the first position where they differ or where the shortest
    word ends. Surrounding spaces are stripped from the result.

    liste -- iterable of strings

    Returns a string; '' when `liste` is empty or the words share no
    leading character.
    """

    common = []
    # zip stops at the shortest word, and yields nothing for an empty list.
    for letters in zip( *liste ):
        if len( set( letters ) ) != 1:
            break
        common.append( letters[0] )

    return ''.join( common ).strip(' ')

In [93]:
def findmatch( triplet ):
    """Look up a (genre, espece, variete) triplet among the keys of `nomslatins`.

    Candidates are the names starting with `genre`; they are then narrowed
    down with `espece` and finally with `variete`.

    Returns:
      - the candidate itself as soon as exactly one remains;
      - False when no name starts with `genre` (including an empty genre);
      - otherwise the part common to the last non-empty candidate set,
        followed by '*' (or '' when that common part is empty).
    """
    genre, espece, variete = triplet

    candidates = filterlist( genre, list( nomslatins ), startswith=True )

    if not candidates:
        return False
    if len( candidates ) == 1:
        return candidates[0]

    # Narrow down by species, then by variety.
    for criterion in ( espece, variete ):
        narrowed = filterlist( criterion, candidates )

        if len( narrowed ) == 1:
            return narrowed[0]
        if not narrowed:
            # Nothing matches this criterion: fall back on the previous set.
            break
        candidates = narrowed

    # Several names remain: report what they have in common, flagged with '*'.
    match = keep_only_commonpart( candidates )
    if match:
        match += '*'
    return match

In [94]:
# Split the query rows into those with a match and those without.
identified, non_identified = [], []

for entry in nametuples:
    # entry is (count, genre, espece, variete); findmatch takes the triplet.
    found = findmatch( entry[1:] )
    if not found:
        non_identified.append( entry )
        continue
    identified.append( ( entry, found ) )

print( len( identified ), 'trouvés' )
print( len( nametuples ), 'total' )

517 trouvés
547 total


Non-unique names...

In [95]:
# Unmatched rows, highest count (first field) first.
sorted(
    non_identified,
    key=lambda entry: entry[0],
    reverse=True,
)

[(783, None, None, None),
 (77, 'Tetradium', 'danielli', None),
 (45, 'Cladastris', 'lutea', None),
 (31, 'Sambuccus', 'nigra', None),
 (24, 'Thuja', None, None),
 (22, 'Trachycarpus', 'fortunei', None),
 (18, 'Eleagnus', 'angustifolia', None),
 (18, 'Thuja', 'plicata', None),
 (9, 'Pteroceltis', 'tatarinowii', None),
 (7, 'Pistacia', 'chinensis', None),
 (7, 'Tetradium', 'danielli', 'Hupehensis'),
 (6, None, 'kobus', None),
 (6, 'Pterostyrax', 'hispida', None),
 (6, 'Thuja', 'occidentalis', None),
 (4, 'Chitalpa', 'tashkentensis', None),
 (3, None, 'fragmantissima', None),
 (3, 'Clerodendron', 'tricotonum', None),
 (3, 'Euodia', 'danielli', None),
 (3, 'Fontanesia', 'phillyreoides', 'Fortunei'),
 (2, 'Clerodendron', None, 'Fargesii'),
 (2, 'Hovenia', 'dulcis', None),
 (2, 'Pteroceltis', 'tatarinowii', 'Maxim'),
 (1, 'Eleagnus', None, None),
 (1, 'Fontanesia', 'phillyreoides', None),
 (1, 'Nerprun', 'alaternus', None),
 (1, 'Philodendron', 'erubescens', None),
 (1, 'Poncinos', None, No

In [70]:
# Display every (count, genre_bota, espece, variete) row returned by the query.
nametuples

[(783, None, None, None),
 (3, None, 'fragmantissima', None),
 (6, None, 'kobus', None),
 (1, 'Abies', None, None),
 (5, 'Abies', 'concolor', None),
 (19, 'Abies', 'nordmaniana', None),
 (2, 'Abies', 'pinsapo', 'Glauca'),
 (1507, 'Acer', None, None),
 (3, 'Acer', None, 'Elsrike'),
 (1, 'Acer', None, 'Fastigiata'),
 (19, 'Acer', 'buergerianum', None),
 (109, 'Acer', 'campestre', None),
 (15, 'Acer', 'capillipes', None),
 (153, 'Acer', 'cappadocicum', None),
 (3, 'Acer', 'cappadocicum', 'Aureum'),
 (38, 'Acer', 'cappadocicum', 'Rubrum'),
 (3, 'Acer', 'davidii', None),
 (21, 'Acer', 'freemanii', None),
 (11, 'Acer', 'freemanii', 'Autumn blaze'),
 (15, 'Acer', 'freemanii', 'Celsam'),
 (46, 'Acer', 'ginnala', None),
 (5, 'Acer', 'griseum', None),
 (3, 'Acer', 'grosseri', 'Hersii'),
 (2, 'Acer', 'heldreichii', None),
 (3, 'Acer', 'japonicum', None),
 (77, 'Acer', 'monspessulanum', None),
 (169, 'Acer', 'negundo', None),
 (5, 'Acer', 'negundo', 'Variegata'),
 (1, 'Acer', 'negundo', 'Wierii'),

In [9]:
# Close the SQLite connection opened on data/arbres.db.
db.close()

In [None]:
# Lookup table: Latin name -> common name (rebuilds `nomslatins`).
nomlatin_csv = 'data/nomlatin_lesarbresfr.csv'

# newline='' leaves line-ending handling to the csv module, as the csv
# documentation requires for file objects passed to csv.reader.
with open(nomlatin_csv, 'r', newline='') as f:
    # The cell that first loads this file reads it with ';' as delimiter;
    # the default ',' would not split rows into (common name, Latin names).
    reader = csv.reader(f, delimiter=';')

    # A row may list several comma-separated Latin names: register each one.
    nomslatins = {
        latinname.strip(): row[0]
        for row in reader
        for latinname in row[1].split(',')
    }

    # Original note: some rows were seen with more than 3 columns
    # (presumably when splitting on ',' -- TODO confirm). Only the first two
    # columns are used here.

In [None]:
# Display the Latin-name -> common-name mapping.
nomslatins

In [None]:
# Print every raw row of the CSV file. The file is reopened here because
# `reader` is bound to a file object that its `with` block has already
# closed, so iterating it raises "I/O operation on closed file".
with open(nomlatin_csv, 'r', newline='') as f:
    for row in csv.reader(f, delimiter=';'):
        print( row )