In [1]:
import os
import re

import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from openai import OpenAI

In [2]:
## coordinates
# Lookup table keyed by location name; each value is a dict holding a
# 'latitude' and a 'longitude' float. The keys are spelled exactly like the
# location column headers of the loaded table (including the stray quotes and
# spaces), so they must not be "tidied up" independently of the CSV.
# NOTE(review): the values look like placeholders (digit runs such as
# 51.1234 / 8.5678) rather than surveyed positions -- verify before use.
# NOTE(review): no later cell in the visible notebook reads this dict.
coordinates_dict = {
    'Gebirgsbäche im Sauerland': {'latitude': 51.1234, 'longitude': 8.5678},
    'Plankton der Werse bei Münster': {'latitude': 51.9876, 'longitude': 7.6543},
    'Ruhr': {'latitude': 51.4567, 'longitude': 7.8901},
    'Lippe': {'latitude': 51.2345, 'longitude': 8.9012},
    'Eder': {'latitude': 50.8765, 'longitude': 8.3456},
    'Salinen und Salzgräben im südlichen Gebiet': {'latitude': 50.5432, 'longitude': 8.7654},
    'desgl. im nördlichen Gebiet': {'latitude': 50.7890, 'longitude': 8.1234},
    'Plankton der Talsperren': {'latitude': 51.4321, 'longitude': 7.5432},
    'Plankton des Dortmund-Ems Kanals': {'latitude': 51.8765, 'longitude': 7.2109},
    'Teiche und Moor stellen ,“Kipshagen ”': {'latitude': 51.6543, 'longitude': 8.0987},
    'Seen, Weiher und Moorstellen ,“Heiliges Meer"': {'latitude': 52.3456, 'longitude': 7.8901},
    'Moore im Sauer und Münsterland': {'latitude': 51.2345, 'longitude': 7.5432},}

In [3]:
# Load the raw presence/absence table (one row per taxon, one column per location).
raw_csv_path = './data/609.csv'
df = pd.read_csv(raw_csv_path, encoding='utf-8')
df.head()

Unnamed: 0.1,Unnamed: 0,Gebirgsbäche im Sauerland,Plankton der Werse bei Münster,Ruhr,Lippe,Eder,Salinen und Salzgräben im südlichen Gebiet,desgl. im nördlichen Gebiet,Plankton der Talsperren,Plankton des Dortmund-Ems Kanals,"Teiche und Moor stellen ,“Kipshagen ”","Seen, Weiher und Moorstellen ,“Heiliges Meer""",Moore im Sauer und Münsterland
0,b) Rivulariaceae,,,,,,,,,,,,
1,32.Homoeothrix spec.,+,‒,‒,‒,‒,‒,‒,‒,‒,‒,‒,‒
2,33. Calothrix (fusca?),‒,‒,‒,‒,‒,‒,‒,‒,‒,‒,+,‒
3,34. Gloeotrichia Pisum,‒,‒,‒,‒,‒,‒,‒,‒,‒,‒,+,+
4,"35. ,,intermedia",‒,‒,‒,‒,‒,‒,‒,‒,‒,‒,+,‒


In [4]:
## replacement values rules
# Recode the table's marks into words: '+' -> 'present', '‒' -> 'absent'.
occurrence_labels = {'+': 'present', '‒': 'absent'}
df = df.replace(
    to_replace=list(occurrence_labels.keys()),
    value=list(occurrence_labels.values()),
)
df.head()

Unnamed: 0.1,Unnamed: 0,Gebirgsbäche im Sauerland,Plankton der Werse bei Münster,Ruhr,Lippe,Eder,Salinen und Salzgräben im südlichen Gebiet,desgl. im nördlichen Gebiet,Plankton der Talsperren,Plankton des Dortmund-Ems Kanals,"Teiche und Moor stellen ,“Kipshagen ”","Seen, Weiher und Moorstellen ,“Heiliges Meer""",Moore im Sauer und Münsterland
0,b) Rivulariaceae,,,,,,,,,,,,
1,32.Homoeothrix spec.,present,absent,absent,absent,absent,absent,absent,absent,absent,absent,absent,absent
2,33. Calothrix (fusca?),absent,absent,absent,absent,absent,absent,absent,absent,absent,absent,present,absent
3,34. Gloeotrichia Pisum,absent,absent,absent,absent,absent,absent,absent,absent,absent,absent,present,present
4,"35. ,,intermedia",absent,absent,absent,absent,absent,absent,absent,absent,absent,absent,present,absent


In [5]:
## columns rotation rules
# Wide -> long: every location column becomes a (location, OccurrenceStatus)
# row for the taxon held in "Unnamed: 0". The location could alternatively be
# treated as a locality.
df = pd.melt(
    df,
    id_vars=["Unnamed: 0"],
    var_name="location",
    value_name="OccurrenceStatus",
)

In [6]:
# renaming columns
# The taxon label column arrives as "Unnamed: 0"; give it a meaningful name.
df = df.rename(columns={'Unnamed: 0': 'scientificName'})

In [7]:
# Creating new columns and feeding data
# Every row gets the same constant basisOfRecord value.
# NOTE(review): if this column is meant to follow a controlled vocabulary,
# confirm the exact spelling of the value (with or without the space).
df = df.assign(basisOfRecord='Human Observation')

In [8]:
# Preview the first five rows of the reshaped table.
df.head(n=5)

Unnamed: 0,scientificName,location,OccurrenceStatus,basisOfRecord
0,b) Rivulariaceae,Gebirgsbäche im Sauerland,,Human Observation
1,32.Homoeothrix spec.,Gebirgsbäche im Sauerland,present,Human Observation
2,33. Calothrix (fusca?),Gebirgsbäche im Sauerland,absent,Human Observation
3,34. Gloeotrichia Pisum,Gebirgsbäche im Sauerland,absent,Human Observation
4,"35. ,,intermedia",Gebirgsbäche im Sauerland,absent,Human Observation


In [9]:
## Get the correct names of scientific species

# Clean and correct names
def remove_number(text):
    """Strip a leading list marker from a taxon label.

    Two prefixes are removed, in this order:
      1. a leading run of digits and dots (e.g. ``'32.'`` in ``'32.Homoeothrix'``);
      2. a single ASCII letter followed by ``)`` (e.g. ``'b)'`` in ``'b) Rivulariaceae'``).

    Whitespace left behind after the marker is NOT trimmed here.

    Args:
        text: The label to clean. Non-string values (NaN, None, numbers) are
            returned unchanged.

    Returns:
        The label without its leading marker, or ``text`` itself when it is
        not a string.
    """
    if not isinstance(text, str):
        return text
    text = re.sub(r'^[0-9.]*', '', text, count=1)
    # Use explicit A-Z / a-z ranges: a single A-z range would also match the
    # punctuation that sits between 'Z' and 'a' in ASCII ([ \ ] ^ _ `).
    text = re.sub(r'^[A-Za-z]\)', '', text, count=1)
    return text

def replace_commas(text):
    """Normalise leading comma "ditto" marks to the canonical `` ,, `` form.

    Rules are applied in order, each anchored at the start of the string:
      * ``, , , ,`` -> `` ,, ,, `` (double ditto; tried first, otherwise the
        shorter ``, ,`` rule would consume its prefix and it could never match)
      * ``, ,``     -> `` ,, ``
      * ``，，`` (full-width commas) -> ``,,`` and then, via the next rule, `` ,, ``
      * ``,,``      -> `` ,, ``
      * ``,``       -> `` ,, ``

    Every replacement except the full-width one starts with a space, so once
    such a rule has fired none of the later rules can match again.

    Args:
        text: The label to normalise. Non-string values (e.g. NaN) are
            returned unchanged.

    Returns:
        The label with its leading ditto marks normalised; surrounding
        whitespace is not collapsed here.
    """
    if not isinstance(text, str):
        return text
    rules = (
        ('^, , , ,', ' ,, ,, '),
        ('^, ,', ' ,, '),
        ('^，，', ',,'),
        ('^,,', ' ,, '),
        ('^,', ' ,, '),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text

def remove_extra_space(text):
    """Collapse runs of spaces to a single space and trim both ends.

    A value that is not equal to itself (i.e. NaN) is handed back untouched.
    """
    is_missing = text != text
    if is_missing:
        return text
    single_spaced = re.sub(' +', ' ', text)
    return single_spaced.strip()

def complete_species_name(scientificName_list, i):
    """Expand ``,,`` ditto marks in entry ``i`` using the previous entry.

    Every ``,,`` in ``scientificName_list[i]`` is replaced by the first
    whitespace-separated word of ``scientificName_list[i - 1]``. The list is
    updated in place, so a run of consecutive ditto entries resolves
    correctly when the function is called with increasing ``i``.

    The entry is left unchanged when:
      * ``i`` is 0 (there is no previous entry; ``i - 1`` would otherwise wrap
        around to the last element),
      * the entry is not a string (e.g. NaN) or contains no ``,,``,
      * the previous entry is not a string or has no words.

    Args:
        scientificName_list: Mutable list of labels; modified in place.
        i: Index of the entry to complete.

    Returns:
        The (possibly updated) value of ``scientificName_list[i]``.
    """
    current = scientificName_list[i]
    if i == 0 or not isinstance(current, str) or ',,' not in current:
        return current

    previous = scientificName_list[i - 1]
    if not isinstance(previous, str):
        return current
    previous_words = previous.split()
    if not previous_words:
        return current

    scientificName_list[i] = current.replace(',,', previous_words[0])
    return scientificName_list[i]

def get_kingdom(text):
    """Look up the kingdom of a name via the GBIF species search endpoint.

    Sends ``q=<text>`` together with ``origin=SOURCE``, ``status=ACCEPTED``
    and ``strict=true`` and returns the ``kingdom`` field of the first result.
    This is a best-effort lookup: any failure yields ``None``.

    No Authorization header is sent. A credential must never be embedded in
    the notebook; if the endpoint turns out to need one, load it from the
    environment instead.

    Args:
        text: Name to search for. Non-string or blank values (e.g. NaN) are
            not sent to the service.

    Returns:
        The kingdom string of the first hit, or ``None`` when the input is
        not a usable string, the request fails or times out, the status code
        is not 200, or the response has no first result with a ``kingdom``.
    """
    if not isinstance(text, str) or not text.strip():
        return None

    url = "https://api.gbif.org/v1/species/search"
    # Let requests build the query string so characters such as '?', '&',
    # '(' or spaces inside the name are encoded instead of corrupting the URL.
    params = {'q': text, 'origin': 'SOURCE', 'status': 'ACCEPTED', 'strict': 'true'}
    try:
        # A timeout keeps one stalled connection from hanging a whole .apply().
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException:
        return None

    if response.status_code != 200:
        return None
    try:
        return response.json()['results'][0]['kingdom']
    except (ValueError, KeyError, IndexError, TypeError):
        # Not JSON, no results, or a result without a kingdom.
        return None
    
def correct_species_name(text):
    """Placeholder for name correction; not implemented, always returns None."""
    return None

In [10]:
# Clean the labels step by step. Whitespace is normalised across the whole
# frame before each step and once more at the end, because both steps are
# anchored at the start of the string and can leave padding behind.
for name_cleaner in (remove_number, replace_commas):
    df = df.map(remove_extra_space, na_action='ignore')
    df['scientificName'] = df['scientificName'].apply(name_cleaner)
df = df.map(remove_extra_space, na_action='ignore')

In [None]:
# Resolve ',,' ditto marks top to bottom. complete_species_name() reads the
# entry before index i, so the order of the calls matters.
scientificName_list = df['scientificName'].tolist()
# Slicing with [:1] (instead of indexing [0]) keeps an empty frame from raising.
df['scientificName'] = scientificName_list[:1] + [
    complete_species_name(scientificName_list, i)
    for i in range(1, len(scientificName_list))
]

# After the melt each name is repeated once per location, so query the remote
# service once per distinct name and broadcast the answer, instead of issuing
# one HTTP request per row. Missing names are not looked up.
kingdom_by_name = {
    name: get_kingdom(name)
    for name in df['scientificName'].dropna().unique()
}
df['kingdom'] = df['scientificName'].map(kingdom_by_name)

In [None]:
# Persist the cleaned long-format table without the index column.
cleaned_csv_path = './data/609_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False, encoding='utf-8')

In [None]:
# get coordinates
# Replace 'your_app_name' with a unique name for your application.
nominatim_user_agent = "your_app_name"
geolocator = Nominatim(user_agent=nominatim_user_agent)

# Function to get coordinates for a location
def get_coordinates(location):
    """Geocode ``location`` with the module-level ``geolocator``.

    Returns a ``(latitude, longitude)`` tuple, or ``(None, None)`` when the
    geocoder finds nothing or raises; a raised error is printed, not re-raised.
    """
    try:
        # Ask the geocoder for German-language results.
        match = geolocator.geocode(location, language='de')
        return (match.latitude, match.longitude) if match else (None, None)
    except Exception as e:
        print(f"Error processing location '{location}': {e}")
        return None, None

# Geocode each distinct location once and broadcast the result to all rows.
# The long table repeats every location once per taxon, so geocoding row by
# row would send the same query to the service over and over.
coords_by_location = {
    place: get_coordinates(place)
    for place in df['location'].dropna().unique()
}
# Rows whose location is missing fall back to (None, None).
df['coordinates'] = pd.Series(
    [coords_by_location.get(place, (None, None)) for place in df['location']],
    index=df.index,
    dtype=object,
)
df[['latitude', 'longitude']] = pd.DataFrame(df['coordinates'].tolist(), index=df.index)

# Save the updated DataFrame to a new CSV file
# df.to_csv('./data/609_10_records_location_coordinate.csv', encoding='utf-8', index=False)

# Show one row per location rather than printing the whole frame.
df[['location', 'latitude', 'longitude']].drop_duplicates()

In [None]:
## get coordinates

# The OpenCage Geocoding API key is read from the OPENCAGE_API_KEY environment
# variable so that it never has to be pasted into (and committed with) the
# notebook. It falls back to an empty string when the variable is not set.
api_key = os.environ.get('OPENCAGE_API_KEY', '')
base_url = 'https://api.opencagedata.com/geocode/v1/json'


# Function to get coordinates for a location using the OpenCage Geocoding API
def get_coordinates(location):
    """Geocode ``location`` through the HTTP API at the module-level ``base_url``.

    Sends ``q=location``, ``key=api_key`` and ``language='en'`` and reads the
    ``geometry`` of the first entry in the response's ``results`` list.

    NOTE(review): an earlier cell defines a function with the same name; this
    definition replaces it once this cell has run.

    Args:
        location: Free-text place name to look up.

    Returns:
        ``(latitude, longitude)`` of the first result, or ``(None, None)``
        when there are no results or the lookup fails. Failures (network
        error, timeout, HTTP error status, malformed response) are printed
        and not re-raised.
    """
    try:
        params = {
            'q': location,
            'key': api_key,
            'language': 'en',  # Specify language for results
        }
        # A timeout keeps one stalled connection from hanging a whole .apply().
        response = requests.get(base_url, params=params, timeout=30)
        # Report HTTP errors (bad key, quota exceeded) through the except
        # branch below instead of silently treating them as "no result".
        response.raise_for_status()
        results = response.json()['results']

        if results:
            geometry = results[0]['geometry']
            return geometry['lat'], geometry['lng']
        return None, None
    except Exception as e:
        print(f"Error processing location '{location}': {e}")
        return None, None

# Geocode each distinct location once and broadcast the result to all rows.
# The long table repeats every location once per taxon, so geocoding row by
# row would spend one API request per row on identical queries.
coords_by_location = {
    place: get_coordinates(place)
    for place in df['location'].dropna().unique()
}
# Rows whose location is missing fall back to (None, None).
df['coordinates'] = pd.Series(
    [coords_by_location.get(place, (None, None)) for place in df['location']],
    index=df.index,
    dtype=object,
)
df[['latitude', 'longitude']] = pd.DataFrame(df['coordinates'].tolist(), index=df.index)

# Save the updated DataFrame to a new CSV file
output_file_path = 'output_file.csv'  # Replace 'output_file.csv' with your desired output file path
df.to_csv(output_file_path, index=False)

# Show one row per location rather than printing the whole frame.
df[['location', 'latitude', 'longitude']].drop_duplicates()

In [None]:
## get coordinates - Translation code

# Do not paste an API key into the notebook. With no api_key argument the
# client reads the OPENAI_API_KEY environment variable and raises at
# construction time when it is missing.
client = OpenAI()


def translate_text_with_chatgpt(text, language="en", model="gpt-4o-mini"):
    """Translate German ``text`` with the module-level chat ``client``.

    Builds a one-message chat request from the translation prompt and returns
    the model's reply.

    Args:
        text: German text to translate. Non-string or blank values (e.g. NaN)
            are not sent to the API.
        language: Target language inserted into the prompt (default ``"en"``).
        model: Chat model name passed to the API. The chat completions
            endpoint needs a chat model, not a legacy completion model.

    Returns:
        The translated text with surrounding whitespace stripped, or ``None``
        when ``text`` is not a usable string or the reply has no content.

    Raises:
        Whatever the client raises for API or network errors; nothing is
        caught here.
    """
    if not isinstance(text, str) or not text.strip():
        return None

    # Define the prompt for translation
    prompt = f"Translate the following German text to {language}: {text}"

    # Send the prompt itself and use a non-streaming call so the full reply
    # is available as a return value.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    # Extract the translated text from the response
    translated_text = response.choices[0].message.content
    return translated_text.strip() if translated_text else None

# Translate the text in the specified column using ChatGPT.
# Each name is repeated once per location in the long table, so translate
# every distinct name once and broadcast the result instead of making one
# API call per row. Missing names are not sent and stay missing.
translation_by_name = {
    name: translate_text_with_chatgpt(name)
    for name in df['scientificName'].dropna().unique()
}
df['scientificName_en'] = df['scientificName'].map(translation_by_name)

# Save the translated DataFrame to a new CSV file
# df.to_csv(output_csv, index=False)
