In [1]:
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import random
from tqdm.notebook import tqdm, trange

### Create data splits

In [2]:
root = "/data/nicola/WSH/"

In [3]:
## Load occurences
speciesHabitatsRecords = pd.read_json("./processed_data/speciesHabitatsRecords.json", orient="records")
speciesHabitatsRecords.head(10)

Unnamed: 0,zoneID,gridID,TypoCH_NUM,speciesKey,shapeArea,canton
0,1156,1626,625,7960979,162617.325261,VD
1,1156,1626,625,8179794,162617.325261,VD
2,1156,1626,625,8119241,162617.325261,VD
3,1156,1626,625,5674152,162617.325261,VD
4,1156,1626,625,8952395,162617.325261,VD
5,1156,1626,625,2974786,162617.325261,VD
6,1156,1626,625,8998684,162617.325261,VD
7,1156,1626,625,2705975,162617.325261,VD
8,1156,1626,625,7749287,162617.325261,VD
9,1156,1626,625,2927324,162617.325261,VD


In [4]:
## Load habitat types
habitatsData = pd.read_json("./WikiSpeciesHabitats/habitatsData.json", orient="records").set_index("TypoCH_NUM")
habitatsData.head(10)

Unnamed: 0_level_0,TypoCH_DE,TypoCH_FR,TypoCH_IT,TypoCH_Sci,Class,Group_,Type,Source
TypoCH_NUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1 Gewässer,1 Eaux libres,1 Ambienti acquatici,,1,1,1,1
11,1.1 Stehende Gewässer,1.1 Eaux calmes,1.1 Acque ferme,,1,11,11,1
12,1.2 Fliessgewässer,1.2 Eaux courantes,1.2 Acque correnti,,1,12,12,1
2,2 Ufer und der Feuchtgebiete,2 Rivages et lieux humides,2 Rive e luoghi umidi,,2,2,2,1
21,2.1 Ufer mit Vegetation,2.1 Rivages avec végétation,2.1 Rive con copertura vegetale,,2,21,21,1
211,2.1.1 Moortümpelgesellschaft,2.1.1 Dépression inondée à utriculaires,2.1.1 Depressioni allagate con Erba-vescica,Sphagno-Utricularion,2,21,211,1
212,2.1.2 Röhricht,2.1.2 Roselière,2.1.2 Canneti,,2,21,212,2
2121,2.1.2.1 Stillwasser-Röhricht,2.1.2.1 Roselière lacustre,2.1.2.1 Canneti lacustri,Phragmition,2,21,212,2
2122,2.1.2.2 Flussufer- und Landröhricht,2.1.2.2 Roselière terrestre,"2.1.2.2 Canneti terrestri, ripariali",Phalaridion,2,21,212,2
22,2.2 Flachmoore,2.2 Bas-marais,2.2 Paludi (torbiere basse),,2,22,22,2


In [5]:
## Merge both sources
speciesHabitatsRecords = speciesHabitatsRecords.join(habitatsData[["Class","Group_","Type"]], on="TypoCH_NUM", how="left")
speciesHabitatsRecords = speciesHabitatsRecords.rename(columns={"Class": "class", "Group_":"group", "Type": "type"})
speciesHabitatsRecords.head(10)

Unnamed: 0,zoneID,gridID,TypoCH_NUM,speciesKey,shapeArea,canton,class,group,type
0,1156,1626,625,7960979,162617.325261,VD,6,62,625
1,1156,1626,625,8179794,162617.325261,VD,6,62,625
2,1156,1626,625,8119241,162617.325261,VD,6,62,625
3,1156,1626,625,5674152,162617.325261,VD,6,62,625
4,1156,1626,625,8952395,162617.325261,VD,6,62,625
5,1156,1626,625,2974786,162617.325261,VD,6,62,625
6,1156,1626,625,8998684,162617.325261,VD,6,62,625
7,1156,1626,625,2705975,162617.325261,VD,6,62,625
8,1156,1626,625,7749287,162617.325261,VD,6,62,625
9,1156,1626,625,2927324,162617.325261,VD,6,62,625


### Aggregate data using Class as labels

In [6]:
## Grouping among grid cells and class
dataset_unbalanced = speciesHabitatsRecords.groupby(by=["gridID","class"])["speciesKey"].agg("unique").reset_index()
dataset_unbalanced["speciesCount"] = dataset_unbalanced["speciesKey"].apply(lambda x : len(x))
dataset_unbalanced.head()

Unnamed: 0,gridID,class,speciesKey,speciesCount
0,33,4,"[2441055, 2753146, 3114727, 2159474]",4
1,34,1,[5409958],1
2,34,3,"[8207244, 3170040, 3928139]",3
3,34,4,"[8207244, 8322056, 8008282]",3
4,120,4,"[4458814, 1096909, 2926945]",3


In [7]:
dataset_unbalanced.describe(percentiles=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.98])

Unnamed: 0,gridID,class,speciesCount
count,14174.0,14174.0,14174.0
mean,4599.138705,5.124242,21.918936
std,2724.257914,2.253531,110.044694
min,33.0,1.0,1.0
10%,1292.0,3.0,1.0
20%,2016.0,3.0,1.0
30%,2558.0,4.0,2.0
40%,3264.0,4.0,3.0
50%,4235.0,5.0,4.0
60%,5126.4,6.0,6.0


In [8]:
dataset_unbalanced["class"].value_counts()

4    3569
6    2657
3    2127
9    2029
5    1187
7     867
1     841
8     522
2     375
Name: class, dtype: int64

In [9]:
## Remove zones with too much observed species
dataset_unbalanced = dataset_unbalanced[dataset_unbalanced["speciesCount"]<=100].reset_index().drop("index", axis=1)
dataset_unbalanced.describe()

Unnamed: 0,gridID,class,speciesCount
count,13875.0,13875.0,13875.0
mean,4590.395171,5.132396,9.300901
std,2722.399509,2.263107,13.544051
min,33.0,1.0,1.0
25%,2289.0,4.0,2.0
50%,4229.0,5.0,4.0
75%,6748.5,6.0,11.0
max,11406.0,9.0,100.0


In [10]:
def split(list_a, chunk_size):
    """From https://www.programiz.com/python-programming/examples/list-chunks"""
    for i in range(0, len(list_a), chunk_size):
        yield list_a[i:i + chunk_size]

In [11]:
## Divide large samples into smaller ones
allowedSize = 10
dataset = pd.DataFrame()
for i in trange(len(dataset_unbalanced)):
    entry = dataset_unbalanced.loc[i]
    ## If lenght is ok, then just keep the sample
    if len(entry["speciesKey"])<=allowedSize:
        dataset = pd.concat([dataset, pd.DataFrame(entry).T])
    ## Otherwise, shuffle species keys and make chunks of wanted size
    else:
        newline = entry.copy()
        keys = list(entry["speciesKey"])
        random.shuffle(keys)
        for chunk in list(split(keys,chunk_size=allowedSize)):
            newline["speciesKey"] = chunk
            dataset = pd.concat([dataset, pd.DataFrame(newline).T])

dataset["speciesCount"] = dataset["speciesKey"].apply(lambda x : len(x))
dataset = dataset.reset_index().drop("index", axis=1)

  0%|          | 0/13875 [00:00<?, ?it/s]

In [12]:
dataset.dtypes

gridID          object
class           object
speciesKey      object
speciesCount     int64
dtype: object

In [13]:
dataset.head()

Unnamed: 0,gridID,class,speciesKey,speciesCount
0,33,4,"[2441055, 2753146, 3114727, 2159474]",4
1,34,1,[5409958],1
2,34,3,"[8207244, 3170040, 3928139]",3
3,34,4,"[8207244, 8322056, 8008282]",3
4,120,4,"[4458814, 1096909, 2926945]",3


In [14]:
dataset.dtypes

gridID          object
class           object
speciesKey      object
speciesCount     int64
dtype: object

In [15]:
dataset.drop("speciesKey", axis=1).astype(int).describe(percentiles=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

Unnamed: 0,gridID,class,speciesCount
count,21542.0,21542.0,21542.0
mean,4697.232662,5.190326,5.990623
std,2698.544836,2.127561,3.717523
min,33.0,1.0,1.0
10%,1379.0,3.0,1.0
20%,2084.0,4.0,2.0
30%,2644.0,4.0,3.0
40%,3522.4,4.0,4.0
50%,4493.5,5.0,6.0
60%,5305.0,6.0,9.0


In [16]:
## Mean number of articles per class
meanCounts = pd.DataFrame(dataset.groupby("class")["speciesCount"].mean()).rename(columns={"speciesCount":"avgSpeciesCount"})

In [17]:
## Checking classes balance
classesRepresentation = pd.DataFrame(dataset["class"].value_counts()).reset_index().rename(columns={"index":"class","class":"occurenceCount"})
classesRepresentation = classesRepresentation.join(meanCounts, on="class", how="inner")
classesRepresentation

Unnamed: 0,class,occurenceCount,avgSpeciesCount
0,4,6562,7.096617
1,6,4924,7.199431
2,9,2781,5.566703
3,3,2526,4.177751
4,5,1260,2.778571
5,7,1222,5.596563
6,1,987,3.536981
7,8,835,6.639521
8,2,445,3.647191


In [18]:
## All present species
all_species = []
for i in trange(len(dataset)):
    all_species = all_species+ list(dataset["speciesKey"].loc[i])

all_species = pd.DataFrame(all_species)

  0%|          | 0/21542 [00:00<?, ?it/s]

In [19]:
all_species = all_species.drop_duplicates().reset_index(drop=True).reset_index().rename(columns={"index":"ID", 0:"speciesKey"})
all_species.to_json("./WikiSpeciesHabitats/speciesKeys.json", orient="records")

In [20]:
all_species.head()

Unnamed: 0,ID,speciesKey
0,0,2441055
1,1,2753146
2,2,3114727
3,3,2159474
4,4,5409958


In [21]:
## Get classes keys and names
classesdf = pd.DataFrame(dataset["class"].unique()).rename(columns={0:"class"})
classesdf = classesdf.join(habitatsData[["TypoCH_DE","TypoCH_FR","TypoCH_IT"]], on="class", how="left")
classesdf.sort_values(by="class").reset_index().drop("index", axis=1).to_json("./WikiSpeciesHabitats/habitatsKeys.json", orient="records")
classesdf

Unnamed: 0,class,TypoCH_DE,TypoCH_FR,TypoCH_IT
0,4,"4 Grünland (Naturrasen, Wiesen und Weiden)",4 Pelouses et prairies,4 Praterie
1,1,1 Gewässer,1 Eaux libres,1 Ambienti acquatici
2,3,"3 Sand, Kies, Stein, Schotter usw","3 Sable, gravier, pierre, pierre concassée, etc.","3 Sabbia, ghiaia, pietra, pietrisco, ecc"
3,6,6 Wälder,6 Forêts,6 Ambienti boscati
4,9,"9 Bauten, Anlagen",9 Milieux construits,9 Ambienti edificati e infrastrutture
5,5,"5 Krautsäume, Hochstaudenfluren und Gebüsche","5 Landes, lisières et mégaphorbiaies","5 Margini di bosco, radure, aggregati di alte ..."
6,7,7 Pioniervegetation gestörter Plätze (Ruderals...,7 Végétation pionnière des endroits perturbés ...,7 Ambienti ruderali e perturbati dall'uomo
7,8,"8 Pflanzungen, Äcker und Kulturen","8 Plantations, champs et cultures",8 Ambienti coltivati
8,2,2 Ufer und der Feuchtgebiete,2 Rivages et lieux humides,2 Rive e luoghi umidi


In [22]:
## Training data (70% train, 15% val, 15% test)
trainData = dataset.sample(frac=0.7)
remaining = dataset.drop(trainData.index)
valData = remaining.sample(frac=0.5)
testData = remaining.drop(valData.index)


In [23]:
trainData.drop("speciesKey", axis=1).astype(int).describe()

Unnamed: 0,gridID,class,speciesCount
count,15079.0,15079.0,15079.0
mean,4705.687313,5.177664,6.015717
std,2704.276317,2.118975,3.717984
min,34.0,1.0,1.0
25%,2354.5,4.0,2.0
50%,4495.0,5.0,6.0
75%,6833.0,6.0,10.0
max,11406.0,9.0,10.0


In [24]:
trainData.head()

Unnamed: 0,gridID,class,speciesKey,speciesCount
6353,2617,6,"[5131910, 2688495, 5217160, 7749287]",4
15687,6570,6,"[4201709, 5304344, 3033289, 5667864, 2888948, ...",10
2248,1403,4,"[8322056, 5139156, 1933138, 1925221, 7768674, ...",8
14111,5812,4,"[3133692, 2492371, 7931979, 2889299, 7890713, ...",10
18968,8312,6,"[9220780, 2810078, 2925892, 3032837, 7589756, ...",10


In [25]:
valData.drop("speciesKey", axis=1).astype(int).describe()

Unnamed: 0,gridID,class,speciesCount
count,3232.0,3232.0,3232.0
mean,4676.119121,5.216275,5.89078
std,2687.520384,2.165958,3.728252
min,34.0,1.0,1.0
25%,2357.75,4.0,2.0
50%,4483.0,5.0,6.0
75%,6791.0,6.0,10.0
max,11226.0,9.0,10.0


In [26]:
testData.drop("speciesKey", axis=1).astype(int).describe()

Unnamed: 0,gridID,class,speciesCount
count,3231.0,3231.0,3231.0
mean,4678.895079,5.22346,5.973383
std,2683.352464,2.128871,3.703973
min,33.0,1.0,1.0
25%,2355.0,4.0,2.0
50%,4492.0,5.0,6.0
75%,6801.5,6.0,10.0
max,11226.0,9.0,10.0


In [27]:
## Save splits
trainData.reset_index().drop("index", axis=1).to_json("./WikiSpeciesHabitats/trainData.json", orient="records")
valData.reset_index().drop("index", axis=1).to_json("./WikiSpeciesHabitats/valData.json", orient="records")
testData.reset_index().drop("index", axis=1).to_json("./WikiSpeciesHabitats/testData.json", orient="records")

In [28]:
## Divide large samples into smaller ones
allowedSize = 1
dataset2 = pd.DataFrame()
for i in trange(len(dataset)):
    entry = dataset.loc[i]
    ## If lenght is ok, then just keep the sample
    if len(entry["speciesKey"])<=allowedSize:
        dataset2 = pd.concat([dataset2, pd.DataFrame(entry).T])
    ## Otherwise, shuffle species keys and make chunks of wanted size
    else:
        newline = entry.copy()
        keys = list(entry["speciesKey"])
        random.shuffle(keys)
        for chunk in list(split(keys,chunk_size=allowedSize)):
            newline["speciesKey"] = chunk
            dataset2 = pd.concat([dataset2, pd.DataFrame(newline).T])

dataset2["speciesKey"] = dataset2["speciesKey"].apply(lambda x : x[0])
dataset2 = dataset2.reset_index().drop("index", axis=1)

  0%|          | 0/21542 [00:00<?, ?it/s]

In [29]:
dataset2 = pd.DataFrame(dataset2.groupby("class")["speciesKey"].unique())
dataset2["numSpecies"] = dataset2["speciesKey"].apply(lambda x : len(x))
dataset2

Unnamed: 0_level_0,speciesKey,numSpecies
class,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[5409958, 2888948, 5329212, 2865448, 5410907, ...",901
2,"[3033289, 2730127, 2883073, 3032585, 9490132, ...",600
3,"[3170040, 8207244, 3928139, 7270316, 2891147, ...",1539
4,"[3114727, 2753146, 2441055, 2159474, 8322056, ...",2894
5,"[5149438, 5405976, 7270427, 5357013, 5352367, ...",912
6,"[5389017, 5295911, 5334259, 3173338, 5420853, ...",2848
7,"[9515886, 2495414, 5231190, 2489214, 3021496, ...",1575
8,"[3033558, 2481800, 2773942, 3173338, 2889173, ...",1335
9,"[1862719, 5229168, 3173338, 3170040, 5420853, ...",2035
