In [1]:
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import random
from tqdm.notebook import tqdm, trange

### Create data splits

In [2]:
root = "/data/nicola/WSH/"

In [3]:
## Load occurences
speciesHabitatsRecords = pd.read_json("./processed_data/speciesHabitatsRecords.json", orient="records")
speciesHabitatsRecords.head(10)

Unnamed: 0,zoneID,gridID,TypoCH_NUM,speciesKey,shapeArea,canton
0,96603,1537,454,2891147,102800.130038,VD
1,95033,1625,625,3170040,57464.263336,VD
2,94466,1713,452,2891147,69785.151723,VD
3,94462,1713,453,5371685,104539.091134,VD
4,94901,1804,625,1703827,36810.08939,VD
5,94842,2071,662,2987999,308874.666368,VD
6,94842,2071,662,2482613,308874.666368,VD
7,94841,2071,453,8324121,409587.557003,VD
8,94841,2071,453,8324121,409587.557003,VD
9,94841,2071,453,2704395,409587.557003,VD


In [4]:
## Load habitat types
habitatsData = pd.read_json("./WikiSpeciesHabitats/habitatsData.json", orient="records").set_index("TypoCH_NUM")
habitatsData.head(10)

Unnamed: 0_level_0,TypoCH_DE,TypoCH_FR,TypoCH_IT,TypoCH_Sci,Class,Group_,Type,Source
TypoCH_NUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1 Gewässer,1 Eaux libres,1 Ambienti acquatici,,1,1,1,1
11,1.1 Stehende Gewässer,1.1 Eaux calmes,1.1 Acque ferme,,1,11,11,1
12,1.2 Fliessgewässer,1.2 Eaux courantes,1.2 Acque correnti,,1,12,12,1
2,2 Ufer und der Feuchtgebiete,2 Rivages et lieux humides,2 Rive e luoghi umidi,,2,2,2,1
21,2.1 Ufer mit Vegetation,2.1 Rivages avec végétation,2.1 Rive con copertura vegetale,,2,21,21,1
211,2.1.1 Moortümpelgesellschaft,2.1.1 Dépression inondée à utriculaires,2.1.1 Depressioni allagate con Erba-vescica,Sphagno-Utricularion,2,21,211,1
212,2.1.2 Röhricht,2.1.2 Roselière,2.1.2 Canneti,,2,21,212,2
2121,2.1.2.1 Stillwasser-Röhricht,2.1.2.1 Roselière lacustre,2.1.2.1 Canneti lacustri,Phragmition,2,21,212,2
2122,2.1.2.2 Flussufer- und Landröhricht,2.1.2.2 Roselière terrestre,"2.1.2.2 Canneti terrestri, ripariali",Phalaridion,2,21,212,2
22,2.2 Flachmoore,2.2 Bas-marais,2.2 Paludi (torbiere basse),,2,22,22,2


In [5]:
## Merge both sources
speciesHabitatsRecords = speciesHabitatsRecords.join(habitatsData[["Class","Group_","Type"]], on="TypoCH_NUM", how="left")
speciesHabitatsRecords = speciesHabitatsRecords.rename(columns={"Class": "class", "Group_":"group", "Type": "type"})
speciesHabitatsRecords.head(10)

Unnamed: 0,zoneID,gridID,TypoCH_NUM,speciesKey,shapeArea,canton,class,group,type
0,96603,1537,454,2891147,102800.130038,VD,4,45,454
1,95033,1625,625,3170040,57464.263336,VD,6,62,625
2,94466,1713,452,2891147,69785.151723,VD,4,45,452
3,94462,1713,453,5371685,104539.091134,VD,4,45,453
4,94901,1804,625,1703827,36810.08939,VD,6,62,625
5,94842,2071,662,2987999,308874.666368,VD,6,66,662
6,94842,2071,662,2482613,308874.666368,VD,6,66,662
7,94841,2071,453,8324121,409587.557003,VD,4,45,453
8,94841,2071,453,8324121,409587.557003,VD,4,45,453
9,94841,2071,453,2704395,409587.557003,VD,4,45,453


In [34]:
len(speciesHabitatsRecords.zoneID.unique())

22787

In [35]:
len(speciesHabitatsRecords[["zoneID","class"]].drop_duplicates())

22787

### Aggregate data using Class as labels

In [36]:
## Grouping among grid cells and class
dataset_unbalanced = speciesHabitatsRecords.groupby(by=["zoneID","class"])["speciesKey"].agg("unique").reset_index()
dataset_unbalanced["speciesCount"] = dataset_unbalanced["speciesKey"].apply(lambda x : len(x))
dataset_unbalanced.head()

Unnamed: 0,zoneID,class,speciesKey,speciesCount
0,9,4,"[3032837, 3170807, 3105433, 2883073]",4
1,10,4,[4299368],1
2,13,6,[2891147],1
3,26,4,"[5137582, 7799370]",2
4,29,5,"[5352367, 3170807, 8207244]",3


In [37]:
dataset_unbalanced.describe(percentiles=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.98])

Unnamed: 0,zoneID,class,speciesCount
count,22787.0,22787.0,22787.0
mean,48494.03129,5.137886,8.899592
std,28218.470841,1.920325,68.208714
min,9.0,1.0,1.0
10%,9658.6,3.0,1.0
20%,21022.2,4.0,1.0
30%,26733.8,4.0,1.0
40%,35618.2,4.0,1.0
50%,48877.0,5.0,2.0
60%,59375.8,6.0,3.0


In [38]:
dataset_unbalanced["class"].value_counts()

4    7939
6    6133
9    2225
3    2137
5    1441
7     912
8     859
1     656
2     485
Name: class, dtype: int64

In [39]:
## Remove zones with too much observed species
dataset_unbalanced = dataset_unbalanced[dataset_unbalanced["speciesCount"]<=100].reset_index().drop("index", axis=1)
dataset_unbalanced.describe()

Unnamed: 0,zoneID,class,speciesCount
count,22638.0,22638.0,22638.0
mean,48504.465015,5.139412,4.401626
std,28221.356301,1.921322,7.504196
min,9.0,1.0,1.0
25%,23743.75,4.0,1.0
50%,48914.0,5.0,2.0
75%,71796.75,6.0,4.0
max,98118.0,9.0,98.0


In [40]:
def split(list_a, chunk_size):
    """From https://www.programiz.com/python-programming/examples/list-chunks"""
    for i in range(0, len(list_a), chunk_size):
        yield list_a[i:i + chunk_size]

In [41]:
## Divide large samples into smaller ones
allowedSize = 10
dataset = pd.DataFrame()
for i in trange(len(dataset_unbalanced)):
    entry = dataset_unbalanced.loc[i]
    ## If lenght is ok, then just keep the sample
    if len(entry["speciesKey"])<=allowedSize:
        dataset = pd.concat([dataset, pd.DataFrame(entry).T])
    ## Otherwise, shuffle species keys and make chunks of wanted size
    else:
        newline = entry.copy()
        keys = list(entry["speciesKey"])
        random.shuffle(keys)
        for chunk in list(split(keys,chunk_size=allowedSize)):
            newline["speciesKey"] = chunk
            dataset = pd.concat([dataset, pd.DataFrame(newline).T])

dataset["speciesCount"] = dataset["speciesKey"].apply(lambda x : len(x))
dataset = dataset.reset_index().drop("index", axis=1)

  0%|          | 0/22638 [00:00<?, ?it/s]

In [42]:
dataset.dtypes

zoneID          object
class           object
speciesKey      object
speciesCount     int64
dtype: object

In [43]:
dataset.head()

Unnamed: 0,zoneID,class,speciesKey,speciesCount
0,9,4,"[3032837, 3170807, 3105433, 2883073]",4
1,10,4,[4299368],1
2,13,6,[2891147],1
3,26,4,"[5137582, 7799370]",2
4,29,5,"[5352367, 3170807, 8207244]",3


In [44]:
dataset.dtypes

zoneID          object
class           object
speciesKey      object
speciesCount     int64
dtype: object

In [45]:
dataset.drop("speciesKey", axis=1).astype(int).describe(percentiles=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

Unnamed: 0,zoneID,class,speciesCount
count,26319.0,26319.0,26319.0
mean,48589.247768,5.140013,3.78601
std,27962.973723,1.90141,3.295482
min,9.0,1.0,1.0
10%,10242.4,3.0,1.0
20%,21509.6,4.0,1.0
30%,26933.0,4.0,1.0
40%,36008.4,4.0,2.0
50%,49293.0,5.0,2.0
60%,59400.8,6.0,3.0


In [46]:
## Mean number of articles per class
meanCounts = pd.DataFrame(dataset.groupby("class")["speciesCount"].mean()).rename(columns={"speciesCount":"avgSpeciesCount"})

In [47]:
## Checking classes balance
classesRepresentation = pd.DataFrame(dataset["class"].value_counts()).reset_index().rename(columns={"index":"class","class":"occurenceCount"})
classesRepresentation = classesRepresentation.join(meanCounts, on="class", how="inner")
classesRepresentation

Unnamed: 0,class,occurenceCount,avgSpeciesCount
0,4,9598,4.23234
1,6,6962,3.758977
2,9,2455,3.220774
3,3,2284,2.872592
4,5,1488,2.142473
5,7,1232,5.342532
6,8,1038,4.745665
7,1,723,2.832642
8,2,539,3.042672


In [48]:
## All present species
all_species = []
for i in trange(len(dataset)):
    all_species = all_species+ list(dataset["speciesKey"].loc[i])

all_species = pd.DataFrame(all_species)

  0%|          | 0/26319 [00:00<?, ?it/s]

In [49]:
all_species = all_species.drop_duplicates().reset_index(drop=True).reset_index().rename(columns={"index":"ID", 0:"speciesKey"})
all_species.to_json("./WikiSpeciesHabitats/speciesKeys.json", orient="records")

In [50]:
all_species.head()

Unnamed: 0,ID,speciesKey
0,0,3032837
1,1,3170807
2,2,3105433
3,3,2883073
4,4,4299368


In [51]:
## Get classes keys and names
classesdf = pd.DataFrame(dataset["class"].unique()).rename(columns={0:"class"})
classesdf = classesdf.join(habitatsData[["TypoCH_DE","TypoCH_FR","TypoCH_IT"]], on="class", how="left")
classesdf.sort_values(by="class").reset_index().drop("index", axis=1).to_json("./WikiSpeciesHabitats/habitatsKeys.json", orient="records")
classesdf

Unnamed: 0,class,TypoCH_DE,TypoCH_FR,TypoCH_IT
0,4,"4 Grünland (Naturrasen, Wiesen und Weiden)",4 Pelouses et prairies,4 Praterie
1,6,6 Wälder,6 Forêts,6 Ambienti boscati
2,5,"5 Krautsäume, Hochstaudenfluren und Gebüsche","5 Landes, lisières et mégaphorbiaies","5 Margini di bosco, radure, aggregati di alte ..."
3,3,"3 Sand, Kies, Stein, Schotter usw","3 Sable, gravier, pierre, pierre concassée, etc.","3 Sabbia, ghiaia, pietra, pietrisco, ecc"
4,9,"9 Bauten, Anlagen",9 Milieux construits,9 Ambienti edificati e infrastrutture
5,7,7 Pioniervegetation gestörter Plätze (Ruderals...,7 Végétation pionnière des endroits perturbés ...,7 Ambienti ruderali e perturbati dall'uomo
6,8,"8 Pflanzungen, Äcker und Kulturen","8 Plantations, champs et cultures",8 Ambienti coltivati
7,1,1 Gewässer,1 Eaux libres,1 Ambienti acquatici
8,2,2 Ufer und der Feuchtgebiete,2 Rivages et lieux humides,2 Rive e luoghi umidi


In [52]:
## Training data (70% train, 15% val, 15% test)
trainData = dataset.sample(frac=0.7)
remaining = dataset.drop(trainData.index)
valData = remaining.sample(frac=0.5)
testData = remaining.drop(valData.index)


In [53]:
trainData.drop("speciesKey", axis=1).astype(int).describe()

Unnamed: 0,zoneID,class,speciesCount
count,18423.0,18423.0,18423.0
mean,48511.725615,5.141834,3.768604
std,27978.014039,1.896325,3.285567
min,9.0,1.0,1.0
25%,23934.5,4.0,1.0
50%,49269.0,5.0,2.0
75%,71379.5,6.0,6.0
max,98101.0,9.0,10.0


In [54]:
trainData.head()

Unnamed: 0,zoneID,class,speciesKey,speciesCount
9339,31557,9,"[2482553, 3034825]",2
6527,23848,4,"[2927007, 2891147]",2
1252,4259,4,[3149879],1
146,419,5,[2996525],1
6212,23208,7,"[2987999, 2927007]",2


In [55]:
valData.drop("speciesKey", axis=1).astype(int).describe()

Unnamed: 0,zoneID,class,speciesCount
count,3948.0,3948.0,3948.0
mean,49276.245947,5.141084,3.828774
std,27897.238161,1.894642,3.318963
min,67.0,1.0,1.0
25%,24453.0,4.0,1.0
50%,51011.0,5.0,2.0
75%,71810.75,6.0,6.0
max,98085.0,9.0,10.0


In [56]:
testData.drop("speciesKey", axis=1).astype(int).describe()

Unnamed: 0,zoneID,class,speciesCount
count,3948.0,3948.0,3948.0
mean,48264.0,5.130446,3.824468
std,27954.663359,1.932076,3.318166
min,65.0,1.0,1.0
25%,23790.75,4.0,1.0
50%,47439.0,4.0,2.0
75%,71225.75,6.0,6.0
max,98118.0,9.0,10.0


In [57]:
## Save splits
trainData.reset_index().drop("index", axis=1).to_json("./WikiSpeciesHabitats/trainData.json", orient="records")
valData.reset_index().drop("index", axis=1).to_json("./WikiSpeciesHabitats/valData.json", orient="records")
testData.reset_index().drop("index", axis=1).to_json("./WikiSpeciesHabitats/testData.json", orient="records")

In [29]:
## Divide large samples into smaller ones
allowedSize = 1
dataset2 = pd.DataFrame()
for i in trange(len(dataset)):
    entry = dataset.loc[i]
    ## If lenght is ok, then just keep the sample
    if len(entry["speciesKey"])<=allowedSize:
        dataset2 = pd.concat([dataset2, pd.DataFrame(entry).T])
    ## Otherwise, shuffle species keys and make chunks of wanted size
    else:
        newline = entry.copy()
        keys = list(entry["speciesKey"])
        random.shuffle(keys)
        for chunk in list(split(keys,chunk_size=allowedSize)):
            newline["speciesKey"] = chunk
            dataset2 = pd.concat([dataset2, pd.DataFrame(newline).T])

dataset2["speciesKey"] = dataset2["speciesKey"].apply(lambda x : x[0])
dataset2 = dataset2.reset_index().drop("index", axis=1)

  0%|          | 0/16074 [00:00<?, ?it/s]

In [30]:
dataset2 = pd.DataFrame(dataset2.groupby("class")["speciesKey"].unique())
dataset2["numSpecies"] = dataset2["speciesKey"].apply(lambda x : len(x))
dataset2

Unnamed: 0_level_0,speciesKey,numSpecies
class,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[5409958, 2888948, 2490255, 3033339, 3034714, ...",644
2,"[3033289, 2730127, 2883073, 3032585, 9490132, ...",553
3,"[3170040, 8207244, 3928139, 7270316, 2891147, ...",1238
4,"[2753146, 2159474, 2441055, 3114727, 4458814, ...",2601
5,"[5149438, 5405976, 5357013, 5352367, 7270427, ...",820
6,"[5389017, 3173338, 2650669, 2857601, 3170040, ...",2339
7,"[5231190, 2489214, 2889173, 5228676, 6065824, ...",1499
8,"[3033558, 2773942, 2889173, 2481800, 3021496, ...",1100
9,"[1862719, 3170040, 3173338, 3034681, 5420853, ...",1544
