# Wikidata csv to xml converter

In [2]:
# Configuration: locations of the input CSV files (relative paths)
# NOTE(review): these inputs live under '../data/...' while later cells write to
# '../Data/...'; the two spellings only agree on case-insensitive filesystems —
# confirm which one is intended.
path_kev = '../data/wikidata/wd_species_df_kev.csv'
path_til = '../data/wikidata/wd_species_df_tillman.csv'
path_scientificNames = '../data/wikidata/wd_scientificNames.csv'

# maximum number of rows kept in the final dataframe (applied after sorting)
length_limit = 100000

# imports
# NOTE(review): getPathDF() and createXML() are used further down but are not
# defined in this notebook; presumably they come from this star import —
# confirm in wd_species.
from wd_species import *
import pandas as pd

## Dataframe Preprocessing 
Combine dataframes and deduplicate them

In [3]:
# load both species exports and stack them into one dataframe
df = pd.concat([pd.read_csv(csv_path) for csv_path in (path_kev, path_til)])

print('Dataframe length before deduplication: ', len(df))

# keep one row per resource, then discard taxonName: it was retrieved
# incorrectly and has to be merged into the table from another source
df = df.drop_duplicates(subset='resource').drop(columns='taxonName')

print('Dataframe length after deduplication: ', len(df))
df.head(5)

Dataframe length before deduplication:  152399
Dataframe length after deduplication:  150724


Unnamed: 0,class,classLabel,conservationStatus,conservationStatusLabel,differentFrom,differentFromLabel,endemicTo,endemicToLabel,family,familyLabel,order,orderLabel,resource,resourceLabel,taxonCommonName
0,"['http://www.wikidata.org/entity/q7377', 'http...","['mammal', 'dipnotetrapodomorpha']",http://www.wikidata.org/entity/q278113,vulnerable,,,,,http://www.wikidata.org/entity/q25265,felidae,http://www.wikidata.org/entity/q25306,carnivora,http://www.wikidata.org/entity/q140,lion,"['lion', 'african lion']"
1,,,,,,,,,http://www.wikidata.org/entity/q146030,rutaceae,http://www.wikidata.org/entity/q26316,sapindales,http://www.wikidata.org/entity/q500,citrus ×limon,
2,"['http://www.wikidata.org/entity/q23809240', '...","['dipnotetrapodomorpha', 'reptilia', 'bird']",http://www.wikidata.org/entity/q719675,near threatened,,,,,"['http://www.wikidata.org/entity/q17190971', '...","['rynchopidae', 'laridae']","['http://www.wikidata.org/entity/q25978', 'htt...","['charadriiformes', 'saurischia', 'dinosaur', ...",http://www.wikidata.org/entity/q18766,african skimmer,african skimmer
3,"['http://www.wikidata.org/entity/q7377', 'http...","['mammal', 'dipnotetrapodomorpha']",http://www.wikidata.org/entity/q3245245,data deficient,,,,,http://www.wikidata.org/entity/q25900,leporidae,http://www.wikidata.org/entity/q25401,lagomorpha,http://www.wikidata.org/entity/q18785,sumatran striped rabbit,"['sumatra short-eared rabbit', 'sumatran strip..."
4,"['http://www.wikidata.org/entity/q7377', 'http...","['mammal', 'dipnotetrapodomorpha']",http://www.wikidata.org/entity/q219127,critically endangered,,,,,http://www.wikidata.org/entity/q182968,old world monkey,http://www.wikidata.org/entity/q7380,primate,http://www.wikidata.org/entity/q18818,rhinopithecus avunculus,tonkin snub-nosed monkey


In [4]:
# merge resource labels into taxonCommonName
# convert stringified lists (e.g. "['a', 'b']") into lists of strings in df
def convertStrToList(s):
    """Turn the string representation of a list into a real list of strings.

    Values that are not strings (e.g. NaN for missing cells) and strings that
    are not wrapped in ``[...]`` are returned unchanged.

    The text is parsed with ``ast.literal_eval`` first, so quoted items keep
    apostrophes and commas that are part of the name (``"grevy's zebra"``,
    ``'boa, common'``). If that fails, or the result is not a flat list of
    strings, the legacy parsing is used: strip all quote characters and split
    on ``', '``.

    Parameters
    ----------
    s : object
        Cell value to convert.

    Returns
    -------
    list of str, or the input unchanged
        ``'[]'`` yields an empty list.
    """
    import ast  # local import: keeps the function usable without touching the import cell

    # startswith/endswith are safe on '' whereas s[0] would raise IndexError
    if not isinstance(s, str) or not (s.startswith('[') and s.endswith(']')):
        return s

    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
        return parsed

    # fallback for text that is not a valid Python list literal, e.g. "[a, b]"
    unquoted = s.replace('\"', '').replace('\'', '')
    return unquoted[1:-1].split(', ')

def extractAndAppendElements(elem, list_to_append):
    """Append ``elem`` (a string) or its items (a list) to ``list_to_append``.

    Values already present in ``list_to_append`` are skipped. Anything that is
    neither a ``str`` nor a ``list`` is ignored. The target list is modified
    in place and also returned.
    """
    kind = type(elem)
    if kind is str:
        candidates = [elem]
    elif kind is list:
        candidates = elem
    else:
        candidates = []

    for candidate in candidates:
        if candidate not in list_to_append:
            list_to_append.append(candidate)
    return list_to_append

def mergeLabeltoTaxonCommonName(commonNames, labels):
    """Combine common names and labels into one de-duplicated value.

    Common names come first, followed by any label not already among them.

    Returns
    -------
    str or list
        ``''`` when nothing was collected, the single name as a plain string
        when there is exactly one, otherwise the list of names.
    """
    merged = extractAndAppendElements(
        labels, extractAndAppendElements(commonNames, [])
    )

    if not merged:
        return ''
    if len(merged) == 1:
        return merged[0]
    return merged

# parse stringified lists first, then fold resourceLabel into taxonCommonName
# row by row; resourceLabel is dropped once it has been merged in
df['taxonCommonName'] = df['taxonCommonName'].apply(convertStrToList)
df = df.assign(
    taxonCommonName=df.apply(
        lambda row: mergeLabeltoTaxonCommonName(row['taxonCommonName'], row['resourceLabel']),
        axis=1,
    )
).drop(columns='resourceLabel')
df.head(5)

Unnamed: 0,class,classLabel,conservationStatus,conservationStatusLabel,differentFrom,differentFromLabel,endemicTo,endemicToLabel,family,familyLabel,order,orderLabel,resource,taxonCommonName
0,"['http://www.wikidata.org/entity/q7377', 'http...","['mammal', 'dipnotetrapodomorpha']",http://www.wikidata.org/entity/q278113,vulnerable,,,,,http://www.wikidata.org/entity/q25265,felidae,http://www.wikidata.org/entity/q25306,carnivora,http://www.wikidata.org/entity/q140,"[lion, african lion]"
1,,,,,,,,,http://www.wikidata.org/entity/q146030,rutaceae,http://www.wikidata.org/entity/q26316,sapindales,http://www.wikidata.org/entity/q500,citrus ×limon
2,"['http://www.wikidata.org/entity/q23809240', '...","['dipnotetrapodomorpha', 'reptilia', 'bird']",http://www.wikidata.org/entity/q719675,near threatened,,,,,"['http://www.wikidata.org/entity/q17190971', '...","['rynchopidae', 'laridae']","['http://www.wikidata.org/entity/q25978', 'htt...","['charadriiformes', 'saurischia', 'dinosaur', ...",http://www.wikidata.org/entity/q18766,african skimmer
3,"['http://www.wikidata.org/entity/q7377', 'http...","['mammal', 'dipnotetrapodomorpha']",http://www.wikidata.org/entity/q3245245,data deficient,,,,,http://www.wikidata.org/entity/q25900,leporidae,http://www.wikidata.org/entity/q25401,lagomorpha,http://www.wikidata.org/entity/q18785,"[sumatra short-eared rabbit, sumatran striped ..."
4,"['http://www.wikidata.org/entity/q7377', 'http...","['mammal', 'dipnotetrapodomorpha']",http://www.wikidata.org/entity/q219127,critically endangered,,,,,http://www.wikidata.org/entity/q182968,old world monkey,http://www.wikidata.org/entity/q7380,primate,http://www.wikidata.org/entity/q18818,"[tonkin snub-nosed monkey, rhinopithecus avunc..."


In [5]:
# lookup table: resource -> scientific name (taxonName), exact duplicate rows removed
# NOTE(review): only fully identical rows are dropped; a resource listed with
# several different taxonName values would still occur more than once — confirm
# whether that can happen in this file.
df_sn = pd.read_csv(path_scientificNames).drop_duplicates()
df_sn.head(3)

Unnamed: 0,resource,taxonName
0,http://www.wikidata.org/entity/q140,panthera leo
1,http://www.wikidata.org/entity/q18851,charadrius dubius
2,http://www.wikidata.org/entity/q18873,cymothoa exigua


In [6]:
# final dataframe: attach the scientific name to every species row
# (left join, so species without a scientific name are kept)
df = df.merge(df_sn, on='resource', how='left')
df.head(5)

Unnamed: 0,class,classLabel,conservationStatus,conservationStatusLabel,differentFrom,differentFromLabel,endemicTo,endemicToLabel,family,familyLabel,order,orderLabel,resource,taxonCommonName,taxonName
0,"['http://www.wikidata.org/entity/q7377', 'http...","['mammal', 'dipnotetrapodomorpha']",http://www.wikidata.org/entity/q278113,vulnerable,,,,,http://www.wikidata.org/entity/q25265,felidae,http://www.wikidata.org/entity/q25306,carnivora,http://www.wikidata.org/entity/q140,"[lion, african lion]",panthera leo
1,,,,,,,,,http://www.wikidata.org/entity/q146030,rutaceae,http://www.wikidata.org/entity/q26316,sapindales,http://www.wikidata.org/entity/q500,citrus ×limon,citrus ×limon
2,"['http://www.wikidata.org/entity/q23809240', '...","['dipnotetrapodomorpha', 'reptilia', 'bird']",http://www.wikidata.org/entity/q719675,near threatened,,,,,"['http://www.wikidata.org/entity/q17190971', '...","['rynchopidae', 'laridae']","['http://www.wikidata.org/entity/q25978', 'htt...","['charadriiformes', 'saurischia', 'dinosaur', ...",http://www.wikidata.org/entity/q18766,african skimmer,rynchops flavirostris
3,"['http://www.wikidata.org/entity/q7377', 'http...","['mammal', 'dipnotetrapodomorpha']",http://www.wikidata.org/entity/q3245245,data deficient,,,,,http://www.wikidata.org/entity/q25900,leporidae,http://www.wikidata.org/entity/q25401,lagomorpha,http://www.wikidata.org/entity/q18785,"[sumatra short-eared rabbit, sumatran striped ...",nesolagus netscheri
4,"['http://www.wikidata.org/entity/q7377', 'http...","['mammal', 'dipnotetrapodomorpha']",http://www.wikidata.org/entity/q219127,critically endangered,,,,,http://www.wikidata.org/entity/q182968,old world monkey,http://www.wikidata.org/entity/q7380,primate,http://www.wikidata.org/entity/q18818,"[tonkin snub-nosed monkey, rhinopithecus avunc...",rhinopithecus avunculus


In [7]:
# order rows by the numeric part of the resource URI (the digits after '/q'),
# using a temporary helper column, then keep at most length_limit rows
df = (
    df.assign(sort_column=df['resource'].apply(lambda uri: int(uri.split('/q')[-1])))
      .sort_values(by='sort_column', ascending=True)
      .drop(columns='sort_column')
      .iloc[:length_limit, :]
)
df.shape

(100000, 15)

In [8]:
# persist the final dataframe to the path returned by getPathDF(), without the index
# NOTE(review): taxonCommonName holds Python lists for some rows; CSV has no list
# type, so presumably they are written as their string representation — confirm
# that downstream readers expect that.
df.to_csv(getPathDF(),  index = False)

In [None]:
# create excel file: a reduced copy of the dataframe for gold-standard preparation
# - the URI columns class / conservationStatus / family / order are left out
# - uri[11:] cuts the first 11 characters of each resource value; for the URIs
#   shown above that is the 'http://www.' prefix
df_excel = (
    df.copy()
      .drop(columns=['class', 'conservationStatus', 'family', 'order'])
      .assign(resource=lambda frame: frame['resource'].apply(lambda uri: uri[11:]))
)
df_excel.to_excel('../Data/goldstandard/intermediates/goldstandard_preparation_wd.xlsx')
df_excel.head(3)

## Data Inspection

In [None]:
# reload the dataframe from the CSV at getPathDF(), replacing the in-memory df
# NOTE(review): values that were lists before saving presumably come back as
# plain strings after this round trip — confirm.
df = pd.read_csv(getPathDF())

In [None]:
# quick sanity check: (rows, columns), then the first rows
print(df.shape)
df.head()

In [None]:
# column dtypes and non-null counts
df.info()

In [None]:
# inspect the scientific-name column
df.taxonName

## XML creation

In [None]:
# WARNING (from the original author): delete the existing output file before
# running this cell.
# NOTE(review): presumably createXML() does not overwrite an existing file —
# confirm in wd_species.
createXML()

### Create subset XML


In [10]:
# gold standard CSV
path_to_gs        = '../JavaProjectUsingWinter/data/goldstandard/gs_biodiversity_wikidata.csv'
# output locations for the subset dataframe and the XML generated from it
# NOTE(review): '../Data/...' differs in case from the '../data/...' input paths
# at the top of the notebook — confirm which spelling is intended.
path_to_subsetDF  = '../Data/wikidata/wd_species_df_subset.csv'
path_to_subsetXML = '../xml/wd_species_subset.xml'

# fraction of the full dataframe that is sampled at random for the subset
subset_fraction   = 0.001 

# names= supplies the column labels, so the first line of the file is read as data
gs                = pd.read_csv(path_to_gs, names=['bio', 'wd', 'match'])
gs.head(3)

Unnamed: 0,bio,wd,match
0,BIO00009,http://www.wikidata.org/entity/q2113084,False
1,BIO00030,http://www.wikidata.org/entity/q978755,True
2,BIO00236,http://www.wikidata.org/entity/q2038905,False


In [11]:
# Create a subset of the large dataframe.
# A fixed seed makes the random sample, and therefore the subset CSV and the
# XML generated from it, identical across re-runs of the notebook.
subset_seed           = 42
subset_df             = df.sample(frac=subset_fraction, random_state=subset_seed)
# rows of the full dataframe whose resource occurs in the gold standard ('wd' column);
# the helper key 'wd' is dropped again after the join
dataset_containing_gs = pd.merge(left = gs['wd'], right = df, how = "inner", left_on = 'wd', right_on = 'resource').drop(columns='wd')
# concatenate the random sample and the gold-standard rows
subset_df             = pd.concat([subset_df, dataset_containing_gs], axis=0)
# a resource can be in both parts (or several times in the gold standard): keep one row each
subset_df             = subset_df.drop_duplicates(subset='resource')
# save
subset_df.to_csv(path_to_subsetDF, index = False)

In [12]:
# build the XML for the subset only, from the subset CSV written in the previous cell
createXML(path_XML = path_to_subsetXML, path_Df = path_to_subsetDF)