## Loading the MEE data and the associated metadata

In [1]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mab2rec.utils import print_interaction_stats
from tqdm.auto import tqdm

# Apply seaborn's "whitegrid" style to the figures drawn in this notebook.
sns.set_style("whitegrid")

In [2]:
# File names are appended with `+`, so `dir_data` must keep its trailing slash.
dir_data = "/home/tlefort/Documents/phd/mee_paper/created_data/"  # replace by your path
# the crowdsourced answers are in the files on the Pl@ntNet-CrowdSWE zenodo
# classes are in the zenodo too
# the species_to_families have been committed to this repo
# Species names contain non-ASCII characters (e.g. "×", "é"), so the encoding is
# pinned to UTF-8 instead of relying on the platform default.
with open(dir_data + "species_to_families_genus.json", "r", encoding="utf-8") as f:
    species_to_upper = json.load(f)
with open(dir_data + "classes_1011.json", "r", encoding="utf-8") as f:
    classes = json.load(f)

- the species are the classification classes (be careful, they might not match between two versions / they often need relabeling)
- for the recsys, the obs (not the species) are the arms.
- for phylocrowrec, the genera (pl. of genus) are the arms
- We also need the genus of each species. BE CAREFUL: here the genus is inferred from the species name using the pygbif library, and Pl@ntNet might accept some species that are not recognized by GBIF. Hence the relabeling has to be done manually: observations that only contain "unrecognized" species are removed, as we will never recommend their genus.

In [3]:
# Inspect the loaded mapping from species name to integer class index.
classes

{'Erinus alpinus L.': 0,
 'Buddleja japonica Hemsl.': 1,
 'Sedum sexangulare L.': 2,
 'Sambucus ebulus L.': 3,
 'Mentha × piperita L.': 4,
 'Allium sphaerocephalon L.': 5,
 'Clematis flammula L.': 6,
 'Spiraea chamaedryfolia L.': 7,
 'Prunella vulgaris L.': 8,
 'Ambrosia artemisiifolia L.': 9,
 'Pulmonaria officinalis L.': 10,
 'Calamagrostis arenaria (L.) Roth': 11,
 'Betula pubescens Ehrh.': 12,
 "Pelargonium zonale (L.) L'Hér.": 13,
 'Campanula trachelium L.': 14,
 'Saxifraga paniculata Mill.': 15,
 'Lotus pedunculatus Cav.': 16,
 'Malva arborea (L.) Webb & Berthel.': 17,
 'Tradescantia sillamontana Matuda': 18,
 'Rosa agrestis Savi': 19,
 'Arum cylindraceum Gasp.': 20,
 'Cyperus papyrus L.': 21,
 'Rubus fruticosus L.': 22,
 'Malva olbia (L.) Alef.': 23,
 'Origanum majorana L.': 24,
 'Potentilla alchimilloides Lapeyr.': 25,
 'Citrus × aurantium L.': 26,
 'Erythrostemon gilliesii (Hook.) Klotzsch': 27,
 "Pelargonium inquinans (L.) L'Hér.": 28,
 'Crocosmia × crocosmiiflora (Lemoine) N

We also have the families if needed

In [4]:
# Inspect the loaded lookup from species name to a [family, genus] pair.
species_to_upper

{'Leysera leyseroides (Desf.) Maire': ['Asteraceae', 'Leysera'],
 'Loxostylis alata A. Spreng. ex Rchb.': ['Anacardiaceae', 'Loxostylis'],
 'Eugenia noumeensis Guillaumin': ['Myrtaceae', 'Eugenia'],
 'Sedum amplexicaule DC.': ['Crassulaceae', 'Petrosedum'],
 'Cavendishia grandifolia Herold': ['Ericaceae', 'Cavendishia'],
 'Drymonia alloplectoides Hanst.': ['Gesneriaceae', 'Drymonia'],
 'Microseris lanceolata (Walp.) Sch.Bip.': ['Asteraceae', 'Microseris'],
 'Caldesia parnassifolia (L.) Parl.': ['Alismataceae', 'Caldesia'],
 'Hamamelis × intermedia Rehder': ['Hamamelidaceae', 'Hamamelis'],
 'Oreomyrrhis andicola (Kunth) Endl. ex Hook. f.': ['Apiaceae',
  'Chaerophyllum'],
 'Erinacea anthyllis Link': ['Fabaceae', 'Erinacea'],
 'Chimonobambusa quadrangularis Makino': ['Poaceae', 'Chimonobambusa'],
 'Dryopteris arguta (Kaulf.) Watt': ['Dryopteridaceae', 'Dryopteris'],
 'Oxalis spiralis Ruiz & Pav. ex G.Don': ['Oxalidaceae', 'Oxalis'],
 'Barbarea orthoceras Ledeb.': ['Brassicaceae', 'Barbar

In [7]:
from pygbif import species

# Query GBIF (via pygbif's `name_backbone`) once per class name.
#   lab_to_genus_upd: species name -> genus found in the response
#   rejects:          species name -> GBIF "matchType" when the response has no genus
lab_to_genus_upd = {}
rejects = {}
for k in tqdm(classes.keys()):
    out = species.name_backbone(name=k)
    # Test for the key explicitly instead of using a bare `except:`, which would
    # also swallow KeyboardInterrupt and unrelated errors during this long loop.
    # NOTE(review): assumes `out` is a dict-like response — confirm for the
    # installed pygbif version.
    if "genus" in out:
        lab_to_genus_upd[k] = out["genus"]
    else:
        rejects[k] = out["matchType"]


  0%|          | 0/11425 [00:00<?, ?it/s]

In [8]:
# Show how many names were rejected, then the rejected names with their match type.
print(len(rejects), rejects, sep="\n")

27
{'Jacobaea alpina x Senecio ovatus subsp. alpestris': 'NONE', 'Phalaenopsis spp.': 'NONE', 'Citrus reticulata x Citrus sinensis': 'NONE', 'Dactylorhiza elata subsp. brennensis x Dactylorhiza maculata': 'NONE', 'Anchusa officinalis x Anchusa undulata subsp. hybrida': 'NONE', 'Dahlia spp.': 'NONE', 'Ficus spp.': 'NONE', 'Ophrys scolopax subsp. apiformis x Ophrys speculum': 'NONE', 'Sagina subulata (Sw.) C.Presl': 'HIGHERRANK', 'Fuchsia spp.': 'NONE', 'Rumex cristatus x Rumex pulcher': 'NONE', 'Cymbidium spp.': 'NONE', 'Salix atrocinerea x Salix myrsinifolia': 'NONE', 'Ophrys dyris x Ophrys lupercalis': 'NONE', 'Salix purpurea x Salix daphnoides': 'NONE', 'Orchis mascula x Orchis militaris': 'NONE', 'Rhododendron kaempferi x Rhododendron kiusianum': 'NONE', 'Echinochloa crus-galli x Echinochloa oryzoides': 'NONE', 'Fumaria agraria x Fumaria officinalis': 'NONE', 'Cytisus scoparius x Cytisus striatus': 'NONE', 'Rosa caesia x Rosa rugosa': 'NONE', 'Heliconia spp.': 'NONE', 'Gazania rigen

In [9]:
# Look up one of the rejected names in the family/genus mapping.
species_to_upper["Jacobaea alpina x Senecio ovatus subsp. alpestris"]

[None, None]

Relabeling step

In [14]:
# Drop every class whose name is in `rejects`; the original indices are kept at this stage.
classes_new = {
    name: old_idx for name, old_idx in classes.items() if name not in rejects
}

In [16]:
# Re-index the kept classes contiguously (0 .. len-1) in their current order.
# A fresh dict is built rather than overwriting values while iterating the same
# dict, which also makes the cell safe to re-run.
classes_new = {cl: i for i, cl in enumerate(classes_new)}
# Make sure the output folder exists so the dump does not fail on a fresh checkout.
os.makedirs("./outputs", exist_ok=True)
with open("./outputs/new_classes.json", "w") as f:
    json.dump(classes_new, f, indent=4)

We only lose 27 species (not many)

In [17]:
# Number of classes kept after filtering vs. number of original classes.
len(classes_new), len(classes)

(11398, 11425)

In [13]:
# Read the votes file located under `dir_data`.
answers_path = dir_data + "answers_cleaned_relab.json"
with open(answers_path) as f:
    answers = json.load(f)

In [18]:
# Reverse lookup: original class index -> species name.
inv_old_classes = dict(zip(classes.values(), classes.keys()))

Relabel the obs and workers based on the obtained votes

In [24]:
# Column buffers for the long-format vote table (one row per kept vote).
task_col, worker_col, label_col = [], [], []
# Original task / worker identifiers -> contiguous integer indices.
task_dic, worker_dic = {}, {}

for task, votes in tqdm(answers.items(), total=len(answers)):
    # Every task receives an index, even when all of its votes are dropped below.
    task_number = task_dic.setdefault(task, len(task_dic))
    for worker, old_label in votes.items():
        species_name = inv_old_classes[old_label]
        if species_name in rejects:
            # Votes for a rejected species are discarded.
            continue
        # A worker is only indexed once they have cast at least one kept vote.
        worker_number = worker_dic.setdefault(worker, len(worker_dic))
        task_col.append(task_number)
        worker_col.append(worker_number)
        label_col.append(classes_new[species_name])

# Counters are kept for parity with the number of indexed tasks / workers.
task_counter, worker_counter = len(task_dic), len(worker_dic)
answers_pandas = pd.DataFrame(
    {"task": task_col, "worker": worker_col, "label": label_col}
)

  0%|          | 0/6699593 [00:00<?, ?it/s]

This part is a work in progress (WIP): as we use contextual bandits and the context is the user profile, we need to create user features.
I thought about:
- The user id
- The number of times the user cast a vote
- The number of different species proposed by the user
- The number of observations the user participated in with x-number of other users

In [None]:
# build user features

# One (initially empty) list per feature column. The "votes>k" and
# "n_species>k" names mirror the threshold flags (k in 1, 5, 10, 50, 100)
# computed in the loop below.
# NOTE(review): "n_wothers>k" presumably refers to observations shared with
# other users — nothing visible here computes it yet; confirm.
user_features = {
    "user_id": [],
    "votes>1": [],
    "votes>5": [],
    "votes>10": [],
    "votes>50": [],
    "votes>100": [],
    "n_species>1": [],
    "n_species>5": [],
    "n_species>10": [],
    "n_species>50": [],
    "n_species>100": [],
    "n_wothers>1": [],
    "n_wothers>5": [],
    "n_wothers>10": [],
    "n_wothers>50": [],
    "n_wothers>100": [],
}

# `answers_pandas` has the columns "task", "worker" and "label"; there is no
# "user_id" column, so users are identified through the "worker" column.
# groupby hands over each user's votes in a single pass instead of re-filtering
# the whole frame once per user; sort=False keeps first-appearance order.
for user_id, tmp in answers_pandas.groupby("worker", sort=False):
    # Flags on the number of votes cast by this user.
    n_votes = tmp.shape[0]
    n1, n5, n10, n50, n100 = (int(n_votes > thr) for thr in (1, 5, 10, 50, 100))

    # Flags on the number of distinct labels voted for by this user.
    n_species = tmp["label"].nunique()
    nsp1, nsp5, nsp10, nsp50, nsp100 = (
        int(n_species > thr) for thr in (1, 5, 10, 50, 100)
    )
    # NOTE(review): within the lines visible here, nothing is appended to
    # `user_features` yet and the "n_wothers" flags are not computed.


