In [1]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from openai import OpenAI
import requests
import re
from defines import coordinates_dict
import glob
import difflib
import pickle
from multiprocessing import Pool, cpu_count
import unicodedata

### ToDO 
- All csv files
- All cleaned csv files
- All cleaned csv files with enriched with nee data

In [2]:
# Get all csv files 

folder_path = "./local_data/data/"  # Replace with the path to your folder
csv_files = glob.glob(f"{folder_path}/*.csv")

# Display the list of CSV files
print("CSV files in the folder:")
for file_path in csv_files:
    print(file_path)

CSV files in the folder:
./local_data/data/610.csv
./local_data/data/617.csv
./local_data/data/616.csv
./local_data/data/621.csv
./local_data/data/625.csv
./local_data/data/615.csv
./local_data/data/618.csv
./local_data/data/614.csv
./local_data/data/624.csv
./local_data/data/619.csv
./local_data/data/622.csv
./local_data/data/613.csv
./local_data/data/623.csv
./local_data/data/627.csv
./local_data/data/612.csv
./local_data/data/620.csv
./local_data/data/609.csv


In [3]:
def melt_df(df):
    ## columns rotation rules
     return df.melt(id_vars=["Unnamed: 0"], var_name="location", value_name="organismQuantity") # or treat as locality

def rename_columns(df):
    return df.rename(columns={'Unnamed: 0':'scientificName'})

In [4]:
## replacement values rules
# df = df.replace(to_replace=['+', '‒'], value=['present', 'absent'])
# df.head()

In [5]:
## Get the correct names of scientific species

# loaded dictionary

def get_scientificname_dict():
    with open('./local_data/taxon/taxon_dictionary.pkl', 'rb') as f:
        return pickle.load(f)


# Clean and correct names
def remove_number(text):
    if text !=text:
        return text
    text = re.sub('^[0-9.\*]*', '', text, count=1)
    text = re.sub('^[aA-zZ]\\)', '', text, count=1)
    return text

def replace_commas(text):
    if text !=text:
        return text
    text = re.sub('^, ,', ' ,, ', text)
    text = re.sub('^，，', ',,', text)
    text = re.sub('^,,', ' ,, ', text)
    text = re.sub('^,', ' ,, ', text)
    text = re.sub('^, , , ,', ' ,, ,, ', text)
    return text

def remove_extra_space(text):
    if text !=text:
        return text
    text = re.sub(' +', ' ', text).strip()
    return text

def normalize_to_ascii(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

def remove_roman_numerals(string):
    pattern = r'^((M{0,4})(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3}))\.'
    result = re.sub(pattern, '', string)
    return result.strip()

def complete_species_name(scientificName_list, i):
    prev = scientificName_list[i-1].split()[0]
    # print('0', scientificName_list[i], '\t', scientificName_list[i-1])
    # print(prev)
    scientificName_list[i] = scientificName_list[i].replace(',,', prev)
    # print('1', scientificName_list[i], '\t', scientificName_list[i-1], end='\n\n')
    return scientificName_list[i]

def get_kingdom(text):
    url = "https://api.gbif.org/v1/species/search?q={}&origin=SOURCE&status=ACCEPTED&strict=true".format(text)
    payload = {}
    headers = {'Authorization': 'Basic YWtodnlhczA6VnlAJDEyMzQ='}
    response = requests.request("GET", url, headers=headers, data=payload)
    try:
        if response.status_code==200:
            return response.json()['results'][0]['kingdom']
        else:
            return None
    except:
        return None

def get_close_scname_and_data_gbif(text):
    # url = "https://api.gbif.org/v1/species/search?q={}&origin=SOURCE&status=ACCEPTED&strict=true".format(text)
    url = "https://api.gbif.org/v1/species/match?name={}&status=ACCEPTED&strict=false&verbose=true".format(text)
    payload = {}
    # headers = {'Authorization': 'Basic YWtodnlhczA6VnlAJDEyMzQ='}
    headers = {}
    response = requests.request("GET", url, headers=headers, data=payload)
    # print(text)
    try:
        if response.status_code==200:
            if 'scientificName' in response.json():
                return response.json()['scientificName'],\
                   response.json()['confidence'], response.json()['kingdom'], None, None, None
            elif (len(response.json()['alternatives'])>1):
                if (response.json()['alternatives'][0]['confidence']==response.json()['alternatives'][1]['confidence']):
                    return response.json()['alternatives'][0]['scientificName'],\
                       response.json()['alternatives'][0]['confidence'],\
                       response.json()['alternatives'][0]['kingdom'],\
                       response.json()['alternatives'][1]['scientificName'],\
                       response.json()['alternatives'][1]['confidence'], \
                       response.json()['alternatives'][1]['kingdom']#response.json()['results'][0]['kingdom']
                else:
                    return response.json()['alternatives'][0]['scientificName'],\
                       response.json()['alternatives'][0]['confidence'], \
                       response.json()['alternatives'][0]['kingdom'], None, None, None
            else:
                return response.json()['alternatives'][0]['scientificName'],\
                   response.json()['alternatives'][0]['confidence'], \
                   response.json()['alternatives'][0]['kingdom'], None, None, None
        else:
            return None, None, None, None, None, None
    except Exception as e: 
        print('Except: ', e)
        return None, None, None, None, None, None

def get_close_scname_and_data_from_dict_gbif(sc_name, close_match_sc_dict_gbif):
    if sc_name in close_match_sc_dict_gbif:
        return close_match_sc_dict_gbif[sc_name]
    return (None, None, None, None, None, None)

def get_close_scname_and_data_gbif_list(text):
    # url = "https://api.gbif.org/v1/species/search?q={}&origin=SOURCE&status=ACCEPTED&strict=true".format(text)
    url = "https://api.gbif.org/v1/species/match?name={}&status=ACCEPTED&strict=false&verbose=true".format(text)
    payload = {}
    # headers = {'Authorization': 'Basic YWtodnlhczA6VnlAJDEyMzQ='}
    headers = {}
    response = requests.request("GET", url, headers=headers, data=payload)
    # print(text)
    try:
        if response.status_code==200:
            if 'scientificName' in response.json():
                alt_list = []
                alt_list.append((response.json()['scientificName'],\
                   response.json()['confidence'], response.json()['kingdom']))
                return alt_list, len(alt_list)
            else:
                try:
                    alt_list = []
                    # print('len: ', response.json()['alternatives'])
                    confidence = response.json()['alternatives'][0]['confidence']
                    # print('len: ', response.json()['alternatives'])
                    for i in range(len(response.json()['alternatives'])):
                        if response.json()['alternatives'][i]['confidence']==confidence:
                            alt_list.append((response.json()['alternatives'][i]['scientificName'],\
                                              response.json()['alternatives'][i]['confidence'],\
                                              response.json()['alternatives'][i]['kingdom']))
                        else:
                            break
                    return alt_list, len(alt_list)      
                except Exception as e:
                    print('Except: ', e, response.text, end='\n\n\n\n')
                    return None, None
        else:
            return None, None
    except Exception as e: 
        print('Except: ', e, response.text, end='\n\n\n\n')
        return None, None

def get_close_scname_and_data_from_dict_gbif_list(sc_name, close_match_sc_dict_gbif_list):
    if sc_name in close_match_sc_dict_gbif_list:
        return close_match_sc_dict_gbif_list[sc_name]
    return None, None

def remove_special_characters(input_string):
    # Define a regex pattern to match special characters
    pattern = re.compile(r'[^a-zA-Z0-9\s]')
    
    # Use the sub() method to replace matched patterns with an empty string
    result_string = pattern.sub('', input_string)
    
    return result_string


def get_close_scname_and_data(sc_name):
    closest_match = difflib.get_close_matches(sc_name, scientificname_dict.keys(), n=1, cutoff=0.5)
    if closest_match:
        score = difflib.SequenceMatcher(None, sc_name, closest_match[0]).ratio()
        return (closest_match[0], score, 
                scientificname_dict[closest_match[0]]['taxonID'], 
                scientificname_dict[closest_match[0]]['kingdom'],
                scientificname_dict[closest_match[0]]['class'],
                scientificname_dict[closest_match[0]]['family'])
    return (None, None, None, None, None, None)
    
    
def get_close_scname_and_data_from_dict(sc_name, close_match_sc_dict):
    if sc_name in close_match_sc_dict:
        return close_match_sc_dict[sc_name]
    return (None, None, None, None, None, None)
    

def getlistlength(text_list):
    try:
        return len(list(text_list))
    except:
        return None
## Get coordinates using prefilled dict

def correct_location_name(location):
    closest_match = difflib.get_close_matches(location, coordinates_dict.keys(), n=1, cutoff=0.6)
    if closest_match:
        return closest_match[0]
    return location
    
def get_coordinates(location):
    closest_match = difflib.get_close_matches(location, coordinates_dict.keys(), n=1, cutoff=0.6)
    if closest_match:
        return coordinates_dict[closest_match[0]]['latitude'], coordinates_dict[closest_match[0]]['longitude']
    return None, None

def enriched_df(df, image_or_file_id):
    df_meta = pd.read_csv('./local_data/imageId_metapath_metadata.csv', encoding='Utf')
    df_meta = df_meta[df_meta['Image_Id']==image_or_file_id]
    # print (df_meta.head())
    df['eventDate'] = int(df_meta['eventDate'].values[0])
    df['year'] = int(df_meta['year'].values[0])
    df['publicationTitle']= df_meta['publicationTitle'].values[0]
    df['publicationYear'] =  int(df_meta['publicationYear'].values[0] )
    df['collectionCode'] = df_meta['collectionCode'].values[0]
    df['catalogNumber'] = int(df_meta['catalogNumber'].values[0])
    df['publicationAuthors'] = df_meta['publicationAuthors'].values[0]
    df['authorityURI'] = df_meta['authorityURI'].values[0]
    df['authorityValue'] = int(df_meta['authorityValue'].values[0])
    
    # ToDo --- Fixed Scale
    df['organismQuantityType'] = 'Braun-Blanquet Scale'
    return df

  text = re.sub('^[0-9.\*]*', '', text, count=1)


In [6]:
# get_close_scname_and_data_gbif('Microcystis')

In [7]:
%%time

## Cleaning and enriching data
scientificname_dict = get_scientificname_dict()
# close_match_sc_dict =  dict()
close_match_sc_dict_gbif =  dict()
close_match_sc_dict_gbif_list =  dict()
all_locations = set()


def cleaning_data(csv_file_path):
    # getfile and melt it according to columns
    file_id = int(csv_file_path.split("/")[-1].split(".")[0])
    df = pd.read_csv(filepath_or_buffer=csv_file_path, encoding='utf-8')
    df = melt_df(df)
    df = rename_columns(df)

    # Creating new columns and feeding data
    df['basisOfRecord']='Human Observation'

    # cleaning df
    df = df.map(remove_extra_space, na_action='ignore')
    df['scientificName'] = df['scientificName'].apply(remove_number)
    df = df.map(remove_extra_space, na_action='ignore')
    df['scientificName'] = df['scientificName'].apply(replace_commas)
    df['scientificName'] = df['scientificName'].apply(normalize_to_ascii)
    df['scientificName'] = df['scientificName'].apply(remove_roman_numerals)
    df = df.map(remove_extra_space, na_action='ignore')
    
    scientificName_list = df['scientificName'].tolist()
    df['scientificName'] = [scientificName_list[0]]+ [complete_species_name(scientificName_list, i) for i, j in enumerate(scientificName_list) if i>0]

    # special character treatment
    # df['scientificName'] = df['scientificName'].apply(remove_special_characters)

    # ToDo ---  taking only 50 rows
    # df = df.sample(n=2, random_state=1)
    # this API is very Slow
    # get and add Kingdom
    # df['kingdom'] = df['scientificName'].apply(get_kingdom)

#    close_match_sc_dict.update({sc_name:get_close_scname_and_data(sc_name) for sc_name in df['scientificName'].unique().tolist()})
    close_match_sc_dict_gbif.update({sc_name:get_close_scname_and_data_gbif(sc_name) for sc_name in df['scientificName'].unique().tolist()})
    close_match_sc_dict_gbif_list.update({sc_name:get_close_scname_and_data_gbif_list(sc_name) for sc_name in df['scientificName'].unique().tolist()})
    
    # print(close_match_sc_dict)
    # correcting scientific name according to Taxon data file
    df[['scientificNameCloseGbif1', 'matchingScoreGbif1', 'kingdomGbif1', 'scientificNameCloseGbif2', 'matchingScoreGbif2', 'kingdomGbif2']]\
                    = pd.DataFrame(df['scientificName'].\
                                   apply(get_close_scname_and_data_from_dict_gbif, args=(close_match_sc_dict_gbif,)).\
                                   tolist(), index=df.index)
    df[['scientificName_matchingScore_kingdom_CloseGbiflist', 'scientificName_matchingScore_kingdom_CloseGbiflistLength']]\
                    = pd.DataFrame(df['scientificName'].\
                                   apply(get_close_scname_and_data_from_dict_gbif_list, args=(close_match_sc_dict_gbif_list,)).\
                                   tolist(), index=df.index)

    #df[['scientificNameClose', 'matchingScore', 'taxonID', 'kingdom', 'class', 'family']] = pd.DataFrame(df['scientificName'].\
    #                                                                                          apply(get_close_scname_and_data_from_dict, args=(close_match_sc_dict,)).\
    #                                                                                          tolist(), index=df.index)

    ## correct location name
    df['location'] = df['location'].apply(correct_location_name)
    
    ## get coordinates
    df['coordinates'] = df['location'].apply(get_coordinates)
    df[['latitude', 'longitude']] = pd.DataFrame(df['coordinates'].tolist(), index=df.index)

    ## ToDo -  add data from meta data  
    df = enriched_df(df, file_id)
    df.to_csv('./data/cleaned_data/'+ str(file_id) +'.csv', encoding='utf-8', index=False)
    all_locations.update(df['location'].tolist())
    # Todo - Remove Break
    # break

print("Number of cpu : ", cpu_count())
p = Pool(4)
# Todo - only two csv files
p.map(cleaning_data, csv_files)

# for csv_file_path in csv_files:
#    cleaning_data(csv_file_path)

Number of cpu :  12
Except:  'alternatives'
Except:  'alternatives'
Except:  'alternatives'
Except:  'alternatives'
Except:  'alternatives'
Except:  'alternatives'
Except:  'alternatives'
Except:  'alternatives'
Except:  'alternatives' {"confidence":100,"matchType":"NONE","synonym":false}



Except:  'alternatives' {"confidence":100,"matchType":"NONE","synonym":false}



Except:  'alternatives' {"confidence":100,"matchType":"NONE","synonym":false}



Except:  'alternatives' {"confidence":100,"matchType":"NONE","synonym":false}



Except:  'alternatives' {"confidence":100,"matchType":"NONE","synonym":false}



Except:  'alternatives' {"confidence":100,"matchType":"NONE","synonym":false}



Except:  'alternatives' {"confidence":100,"matchType":"NONE","synonym":false}



Except:  'alternatives' {"confidence":100,"matchType":"NONE","synonym":false}



Except:  'alternatives'
Except:  'alternatives'
Except:  'alternatives'
Except:  'alternatives'
Except:  'alternatives'
Except:  'alternativ

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]