In [1]:
import pandas as pd
import numpy  as np

## Preprocessing

### reading in data

In [2]:
wd_mammals = pd.read_csv('../data/wd_mammals.csv')
species = pd.read_csv('../data/species.csv', low_memory=False)

In [3]:
wd_mammals.head(5)

Unnamed: 0,resource,superClass,super2Class,super3Class,super4Class,parentTaxon,parent2Taxon,parent3Taxon,parent4Taxon,taxonName,...,super4ClassLabel,parentTaxonLabel,parent2TaxonLabel,parent3TaxonLabel,parent4TaxonLabel,taxonNameLabel,taxonCommonNameLabel,differentFromLabel,endemicToLabel,conservationStatusLabel
0,http://www.wikidata.org/entity/Q220213,http://www.wikidata.org/entity/Q7368,http://www.wikidata.org/entity/Q69301705,http://www.wikidata.org/entity/Q842175,http://www.wikidata.org/entity/Q105064135,,,,,,...,Ungulata,,,,,,,,,
1,http://www.wikidata.org/entity/Q220213,http://www.wikidata.org/entity/Q7368,http://www.wikidata.org/entity/Q57814795,http://www.wikidata.org/entity/Q57812611,http://www.wikidata.org/entity/Q57812559,,,,,,...,animal living in captivity,,,,,,,,,
2,http://www.wikidata.org/entity/Q220213,http://www.wikidata.org/entity/Q7368,http://www.wikidata.org/entity/Q57814795,http://www.wikidata.org/entity/Q57812611,http://www.wikidata.org/entity/Q7377,,,,,,...,mammal,,,,,,,,,
3,http://www.wikidata.org/entity/Q220213,http://www.wikidata.org/entity/Q7368,http://www.wikidata.org/entity/Q57814795,http://www.wikidata.org/entity/Q622852,http://www.wikidata.org/entity/Q729,,,,,,...,animal,,,,,,,,,
4,http://www.wikidata.org/entity/Q220213,http://www.wikidata.org/entity/Q7368,http://www.wikidata.org/entity/Q1797813,http://www.wikidata.org/entity/Q57812559,http://www.wikidata.org/entity/Q26401003,,,,,,...,individual animal,,,,,,,,,


### convert multirows to lists

__current situation:__
- one resource can have arbitrarily many rows due to different values for several columns
- e.g. resourceLabel: Cameroon | superClassLabel: sheep | super2ClassLabel: sheep, domesticated mammal, etc... (one line for each of the super2ClassLabel)



__desired situation:__
- convert different super2ClassLabel into a list
- one row per resource

In [4]:
ids_wd_mammal = wd_mammals.resource.unique()
print('Nr. of unique ids: ', len(ids_wd_mammal))

columns_wd = wd_mammals.columns 
wd_mammals_conv = pd.DataFrame(columns = columns_wd)
wd_mammals_conv

# convert each resource
for resource in ids_wd_mammal:
    sub_df = wd_mammals[wd_mammals.resource == resource]
    
    # loop through columns
    for column in columns_wd:
        break
    break
    
sub_df

Nr. of unique ids:  1816


Unnamed: 0,resource,superClass,super2Class,super3Class,super4Class,parentTaxon,parent2Taxon,parent3Taxon,parent4Taxon,taxonName,...,super4ClassLabel,parentTaxonLabel,parent2TaxonLabel,parent3TaxonLabel,parent4TaxonLabel,taxonNameLabel,taxonCommonNameLabel,differentFromLabel,endemicToLabel,conservationStatusLabel
0,http://www.wikidata.org/entity/Q220213,http://www.wikidata.org/entity/Q7368,http://www.wikidata.org/entity/Q69301705,http://www.wikidata.org/entity/Q842175,http://www.wikidata.org/entity/Q105064135,,,,,,...,Ungulata,,,,,,,,,
1,http://www.wikidata.org/entity/Q220213,http://www.wikidata.org/entity/Q7368,http://www.wikidata.org/entity/Q57814795,http://www.wikidata.org/entity/Q57812611,http://www.wikidata.org/entity/Q57812559,,,,,,...,animal living in captivity,,,,,,,,,
2,http://www.wikidata.org/entity/Q220213,http://www.wikidata.org/entity/Q7368,http://www.wikidata.org/entity/Q57814795,http://www.wikidata.org/entity/Q57812611,http://www.wikidata.org/entity/Q7377,,,,,,...,mammal,,,,,,,,,
3,http://www.wikidata.org/entity/Q220213,http://www.wikidata.org/entity/Q7368,http://www.wikidata.org/entity/Q57814795,http://www.wikidata.org/entity/Q622852,http://www.wikidata.org/entity/Q729,,,,,,...,animal,,,,,,,,,
4,http://www.wikidata.org/entity/Q220213,http://www.wikidata.org/entity/Q7368,http://www.wikidata.org/entity/Q1797813,http://www.wikidata.org/entity/Q57812559,http://www.wikidata.org/entity/Q26401003,,,,,,...,individual animal,,,,,,,,,
5,http://www.wikidata.org/entity/Q220213,http://www.wikidata.org/entity/Q7368,http://www.wikidata.org/entity/Q59099,http://www.wikidata.org/entity/Q72638,http://www.wikidata.org/entity/Q159344,,,,,,...,heterotroph,,,,,,,,,
6,http://www.wikidata.org/entity/Q220213,http://www.wikidata.org/entity/Q7368,http://www.wikidata.org/entity/Q59099,http://www.wikidata.org/entity/Q72638,http://www.wikidata.org/entity/Q7239,,,,,,...,organism,,,,,,,,,
7,http://www.wikidata.org/entity/Q220213,http://www.wikidata.org/entity/Q7368,http://www.wikidata.org/entity/Q59099,http://www.wikidata.org/entity/Q729,http://www.wikidata.org/entity/Q7239,,,,,,...,organism,,,,,,,,,


### Filter for "Label"-columns only

In [5]:

columns_wd_labeled = list(columns_wd[[x.find('Label')>0 for x in columns_wd]])
# append column 'resource' as unique identifier
columns_wd_labeled.append('resource')

In [6]:
columns_wd_labeled

['resourceLabel',
 'superClassLabel',
 'super2ClassLabel',
 'super3ClassLabel',
 'super4ClassLabel',
 'parentTaxonLabel',
 'parent2TaxonLabel',
 'parent3TaxonLabel',
 'parent4TaxonLabel',
 'taxonNameLabel',
 'taxonCommonNameLabel',
 'differentFromLabel',
 'endemicToLabel',
 'conservationStatusLabel',
 'resource']

In [7]:
wd_mammals = wd_mammals[columns_wd_labeled]
wd_mammals

Unnamed: 0,resourceLabel,superClassLabel,super2ClassLabel,super3ClassLabel,super4ClassLabel,parentTaxonLabel,parent2TaxonLabel,parent3TaxonLabel,parent4TaxonLabel,taxonNameLabel,taxonCommonNameLabel,differentFromLabel,endemicToLabel,conservationStatusLabel,resource
0,Cameroon,sheep,sheep,ruminant,Ungulata,,,,,,,,,,http://www.wikidata.org/entity/Q220213
1,Cameroon,sheep,domesticated mammal,mammal living in captivity,animal living in captivity,,,,,,,,,,http://www.wikidata.org/entity/Q220213
2,Cameroon,sheep,domesticated mammal,mammal living in captivity,mammal,,,,,,,,,,http://www.wikidata.org/entity/Q220213
3,Cameroon,sheep,domesticated mammal,domesticated animal,animal,,,,,,,,,,http://www.wikidata.org/entity/Q220213
4,Cameroon,sheep,productive animal,animal living in captivity,individual animal,,,,,,,,,,http://www.wikidata.org/entity/Q220213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13182,dog,pet,domesticated animal,animal,organism,,,,,,chyen,hammer,,,http://www.wikidata.org/entity/Q144
13183,dog,pet,animal living in captivity,individual animal,entity whose item has the given name property,,,,,,Kutya,hammer,,,http://www.wikidata.org/entity/Q144
13184,dog,pet,animal living in captivity,individual animal,individual,,,,,,Kutya,hammer,,,http://www.wikidata.org/entity/Q144
13185,dog,pet,animal living in captivity,individual animal,animal,,,,,,Kutya,hammer,,,http://www.wikidata.org/entity/Q144


## Species data set

In [8]:
species
species[species.Category == 'Mammal']

Unnamed: 0,Species ID,Park Name,Category,Order,Family,Scientific Name,Common Names,Record Status,Occurrence,Nativeness,Abundance,Seasonality,Conservation Status,Unnamed: 13
0,ACAD-1000,Acadia National Park,Mammal,Artiodactyla,Cervidae,Alces alces,Moose,Approved,Present,Native,Rare,Resident,,
1,ACAD-1001,Acadia National Park,Mammal,Artiodactyla,Cervidae,Odocoileus virginianus,"Northern White-Tailed Deer, Virginia Deer, Whi...",Approved,Present,Native,Abundant,,,
2,ACAD-1002,Acadia National Park,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Approved,Present,Not Native,Common,,Species of Concern,
3,ACAD-1003,Acadia National Park,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Approved,Not Confirmed,Native,,,Endangered,
4,ACAD-1004,Acadia National Park,Mammal,Carnivora,Canidae,Vulpes vulpes,"Black Fox, Cross Fox, Eastern Red Fox, Fox, Re...",Approved,Present,Unknown,Common,Breeder,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117527,ZION-1075,Zion National Park,Mammal,Rodentia,Sciuridae,Tamiasciurus hudsonicus,Red Squirrel,Approved,Present,Native,Uncommon,Breeder,,
117528,ZION-1076,Zion National Park,Mammal,Soricomorpha,Soricidae,Notiosorex crawfordi,Crawford's Desert Shrew,Approved,Present,Native,Unknown,,,
117529,ZION-1077,Zion National Park,Mammal,Soricomorpha,Soricidae,Sorex merriami,Merriam's Shrew,Approved,Not Confirmed,Native,,,,
117530,ZION-1078,Zion National Park,Mammal,Soricomorpha,Soricidae,Sorex monticolus,Montane Shrew,Approved,Present,Native,Unknown,Breeder,,


In [9]:
# Overlap
inin_overlap = species[species.Category == 'Mammal']['Scientific Name'].isin(wd_mammals.resourceLabel)

print(inin_overlap.describe())
species[species.Category == 'Mammal'][inin_overlap]

count      3867
unique        2
top       False
freq       3814
Name: Scientific Name, dtype: object


Unnamed: 0,Species ID,Park Name,Category,Order,Family,Scientific Name,Common Names,Record Status,Occurrence,Nativeness,Abundance,Seasonality,Conservation Status,Unnamed: 13
19,ACAD-1019,Acadia National Park,Mammal,Carnivora,Ursidae,Ursus americanus,Black Bear,Approved,Present,Native,Occasional,,,
1727,ARCH-1018,Arches National Park,Mammal,Carnivora,Ursidae,Ursus americanus,American Black Bear,Approved,Present,Native,Rare,,,
4172,BIBE-1026,Big Bend National Park,Mammal,Carnivora,Ursidae,Ursus americanus,American Black Bear,Approved,Present,Native,Unknown,Resident,,
6427,BISC-1012,Biscayne National Park,Mammal,Cetacea,Physeteridae,Physeter macrocephalus,Sperm Whale,Approved,Present,Unknown,Unknown,,Endangered,
8158,BLCA-1017,Black Canyon of the Gunnison National Park,Mammal,Carnivora,Ursidae,Ursus americanus,American Black Bear,Approved,Present,Native,Unknown,Breeder,,
9266,BRCA-1019,Bryce Canyon National Park,Mammal,Carnivora,Ursidae,Ursus americanus,American Black Bear,Approved,Present,Native,Unknown,,,
10552,CANY-1019,Canyonlands National Park,Mammal,Carnivora,Ursidae,Ursus americanus,American Black Bear,Approved,Present,Native,Rare,,,
11776,CARE-1020,Capitol Reef National Park,Mammal,Carnivora,Ursidae,Ursus americanus,American Black Bear,Approved,Present,Native,Occasional,,,
13336,CAVE-1014,Carlsbad Caverns National Park,Mammal,Carnivora,Felidae,Panthera onca,Jaguar,Approved,Not Confirmed,Native,,,Endangered,
13345,CAVE-1023,Carlsbad Caverns National Park,Mammal,Carnivora,Ursidae,Ursus americanus,American Black Bear,Approved,Present,Native,Unknown,Resident,,


In [10]:
#species[species.Category == 'Mammal'].merge(right=wd_mammals.resourceLabel, left_on='Scientific Name', right_on='resourceLabel')
print('Species: unique mammals: ', len(species[species.Category == 'Mammal']['Scientific Name'].unique()))
print('WD_mammals unique resource:', len(wd_mammals.resource.unique()))
print('WD_mammals unique label:', len(wd_mammals.resourceLabel.unique()))

print('\nOverlap:')
overlap_df = species[species.Category == 'Mammal'].merge(right=wd_mammals.resourceLabel.drop_duplicates(), left_on='Scientific Name', right_on='resourceLabel')
print('Nr. of overlapping scientific names: ', len(overlap_df['Scientific Name'].unique()))
overlap_df

#species[species.Category == 'Mammal'].apply(lambda: )

Species: unique mammals:  689
WD_mammals unique resource: 1816
WD_mammals unique label: 1806

Overlap:
Nr. of overlapping scientific names:  5


Unnamed: 0,Species ID,Park Name,Category,Order,Family,Scientific Name,Common Names,Record Status,Occurrence,Nativeness,Abundance,Seasonality,Conservation Status,Unnamed: 13,resourceLabel
0,ACAD-1019,Acadia National Park,Mammal,Carnivora,Ursidae,Ursus americanus,Black Bear,Approved,Present,Native,Occasional,,,,Ursus americanus
1,ARCH-1018,Arches National Park,Mammal,Carnivora,Ursidae,Ursus americanus,American Black Bear,Approved,Present,Native,Rare,,,,Ursus americanus
2,BIBE-1026,Big Bend National Park,Mammal,Carnivora,Ursidae,Ursus americanus,American Black Bear,Approved,Present,Native,Unknown,Resident,,,Ursus americanus
3,BLCA-1017,Black Canyon of the Gunnison National Park,Mammal,Carnivora,Ursidae,Ursus americanus,American Black Bear,Approved,Present,Native,Unknown,Breeder,,,Ursus americanus
4,BRCA-1019,Bryce Canyon National Park,Mammal,Carnivora,Ursidae,Ursus americanus,American Black Bear,Approved,Present,Native,Unknown,,,,Ursus americanus
5,CANY-1019,Canyonlands National Park,Mammal,Carnivora,Ursidae,Ursus americanus,American Black Bear,Approved,Present,Native,Rare,,,,Ursus americanus
6,CARE-1020,Capitol Reef National Park,Mammal,Carnivora,Ursidae,Ursus americanus,American Black Bear,Approved,Present,Native,Occasional,,,,Ursus americanus
7,CAVE-1023,Carlsbad Caverns National Park,Mammal,Carnivora,Ursidae,Ursus americanus,American Black Bear,Approved,Present,Native,Unknown,Resident,,,Ursus americanus
8,CONG-1012,Congaree National Park,Mammal,Carnivora,Ursidae,Ursus americanus,"American Black Bear, Black Bear",Approved,Not Present (Historical Report),Native,,,,,Ursus americanus
9,CRLA-1027,Crater Lake National Park,Mammal,Carnivora,Ursidae,Ursus americanus,Black Bear,Approved,Present,Native,Common,Breeder,,,Ursus americanus


## More advanced similarity search

In [11]:
sp_id = species[species.Category == 'Mammal']['Scientific Name'].unique()
wd_id = wd_mammals.resourceLabel.unique()


sp_id = np.array(sp_id)
wd_id = np.array(wd_id)

In [12]:
sp_id = [x.lower() for x in sp_id]
wd_id = [x.lower() for x in wd_id]

In [13]:
#### lower case overlap

In [14]:
# Lower case overlap
np.isin(wd_id, sp_id).sum()

5

In [15]:
# create cartesian product
from sklearn.utils.extmath import cartesian
cart_array = cartesian((wd_id, sp_id))
print(len(cart_array))
cart_array

1244334


array([['cameroon', 'alces alces'],
       ['cameroon', 'odocoileus virginianus'],
       ['cameroon', 'canis latrans'],
       ...,
       ['australian cattle dog', 'urocitellus beldingi'],
       ['australian cattle dog', 'sorex lyelli'],
       ['australian cattle dog', 'meriones unguiculatus']], dtype='<U42')

### normalized_levenshtein

In [16]:
#from similarity.levenshtein import Levenshtein
#levenshtein = Levenshtein()
#[levenshtein.distance(x[0], x[1]) for x in cart_array]

from similarity.normalized_levenshtein import NormalizedLevenshtein

normalized_levenshtein = NormalizedLevenshtein()
norm_leven_result = [normalized_levenshtein.distance(x[0], x[1]) for x in cart_array]


In [17]:
norm_leven_result = np.array(norm_leven_result)

threshold = 0.5
print(len(norm_leven_result[norm_leven_result < threshold]))
print(cart_array[norm_leven_result < threshold])

78
[['canis' 'canis']
 ['africanis' 'canis']
 ['jamora' 'marmota']
 ['dorset horn' 'sorex hoyi']
 ['danish landrace' 'canis latrans']
 ['physeter macrocephalus' 'physeter macrocephalus']
 ['ursus americanus' 'ursus americanus']
 ['ursus americanus' 'lepus americanus']
 ['ursus americanus' 'ursus americanus floridanus']
 ['ursus americanus' 'oreamnos americanus']
 ['ursus americanus' 'ursus arctos']
 ['ursus americanus' 'martes americana']
 ['ursus americanus' 'homo sapiens americanus']
 ['ursus americanus' 'microtus mexicanus']
 ['ursus americanus' 'ursus americana']
 ['ursus americanus' 'ursus horribilis']
 ['corriedale' 'cervidae']
 ['capra hircus kelleri' 'capra hircus']
 ['mukota' 'marmota']
 ['brangus' 'rattus']
 ['ratter' 'rattus']
 ['landais' 'canis']
 ["d'man" 'dama']
 ['charollais' 'ochotona collaris']
 ['damara' 'dama']
 ['bardoka' 'marmota']
 ['cormo' 'homo']
 ['damani' 'dama']
 ['cabra florida' 'neotoma floridana']
 ['tunis' 'canis']
 ['repartida' 'tadarida']
 ['otocyon' 'u

#### Jaro Winkler

In [18]:
from similarity.jarowinkler import JaroWinkler

jarowinkler = JaroWinkler()
jarowinkl_result = [jarowinkler.similarity(x[0], x[1]) for x in cart_array]

In [19]:
jarowinkl_result = np.array(jarowinkl_result)

threshold = 0.9
print(len(jarowinkl_result[jarowinkl_result > threshold]))
print(cart_array[jarowinkl_result > threshold])


35
[['canis' 'canis lupus']
 ['canis' 'canis']
 ['canis' 'canis niger']
 ['canis' 'canis rufus']
 ['physeter macrocephalus' 'physeter macrocephalus']
 ['ursus americanus' 'ursus americanus']
 ['ursus americanus' 'ursus americanus floridanus']
 ['ursus americanus' 'ursus americana']
 ['capra hircus kelleri' 'capra hircus']
 ['damara' 'dama']
 ['damani' 'dama']
 ['spermophilus' 'spermophilus variegatus']
 ['spermophilus' 'spermophilus spilosoma']
 ['spermophilus' 'spermophilus mexicanus']
 ['spermophilus' 'spermophilus lateralis']
 ['spermophilus' 'spermophilus beecheyi']
 ['spermophilus' 'spermophilus beldingi']
 ['spermophilus' 'spermophilus parryii']
 ['spermophilus' 'spermophilus']
 ['spermophilus' 'spermophilus mohavensis']
 ['spermophilus' 'spermophilus tereticaudus']
 ['spermophilus' 'spermophilus columbianus']
 ['spermophilus' 'spermophilus richardsonii']
 ['spermophilus' 'spermophilus armatus']
 ['spermophilus' 'spermophilus elegans']
 ['spermophilus' 'spermophilus mollis']
 ['s

In [20]:
#from similarity.cosine import Cosine
####