Scripts for transforming Zooniverse classification exports into a human-readable format.

In [None]:
# To run the script, place your classifications CSV file in the same folder as the script.
import pandas as pd

# Path to the classifications export, given relative to the working directory
csv_file = 'mollusques-recueillis-sur-les-cotes-de-la-tunisie-et-de-l-algerie-classifications (1).csv'

# Load the whole export into a DataFrame
df = pd.read_csv(csv_file)

# Keep only the columns the later cells read. They are selected by name
# instead of by position (the original used positions 0, 11 and 13), so an
# export with a different column layout raises a clear KeyError here rather
# than silently picking the wrong columns.
# NOTE(review): assumes positions 0, 11 and 13 hold exactly these three
# columns in this order, which the later cells already rely on by name.
REQUIRED_COLUMNS = ['classification_id', 'annotations', 'subject_ids']
selected_columns = df[REQUIRED_COLUMNS]

# Preview the raw annotations column on its own.
# `display` is not imported in this file; it is expected to be supplied by
# the IPython/Jupyter environment the cell runs in.
display(selected_columns[['annotations']])


# Leave the trimmed DataFrame as the cell's last expression
selected_columns

In [None]:
# Import necessary libraries
import json
import pandas as pd

# 'selected_columns' is a pandas DataFrame (not a list of dictionaries).
# Take an explicit copy: building a DataFrame from another DataFrame does not
# copy the data by default, so without the copy the in-place edit of the
# 'annotations' column made on 'data' further down could also change
# 'selected_columns'.
df2 = selected_columns.copy()
data = df2

# Define a function to flatten the dictionaries in each row
def flatten_dicts(row):
    """Flatten nested sub-task annotations in one JSON-encoded cell.

    The cell is expected to hold a JSON list of annotation items. When an
    item's 'value' is a list made up solely of dictionaries that carry a
    'task' key, each of those dictionaries is promoted to a top-level entry
    reduced to its 'task', 'value' and 'task_label' fields (missing 'value'
    or 'task_label' fields become null). Every other item is kept unchanged,
    including items whose 'value' is a list of plain strings or of
    dictionaries without a 'task' key; those previously raised TypeError or
    KeyError.

    Args:
        row: JSON-formatted string containing a list of annotation items.

    Returns:
        A JSON-formatted string containing the flattened list.

    Raises:
        json.JSONDecodeError: if ``row`` is not valid JSON.
        TypeError: if ``row`` is not a string (or bytes) that json can parse.
    """
    row_data = json.loads(row)
    flattened_data = []

    for item in row_data:
        # Only dictionaries can carry a nested 'value' list
        nested = item.get('value') if isinstance(item, dict) else None

        # Flatten only when every element looks like a sub-task; an empty
        # list satisfies this and therefore contributes no entries, exactly
        # as before.
        is_subtask_list = isinstance(nested, list) and all(
            isinstance(sub_item, dict) and 'task' in sub_item
            for sub_item in nested
        )

        if is_subtask_list:
            flattened_data.extend(
                {
                    'task': sub_item['task'],
                    'value': sub_item.get('value'),
                    'task_label': sub_item.get('task_label'),
                }
                for sub_item in nested
            )
        else:
            # Scalar values and lists that are not sub-task lists are kept
            # exactly as they were
            flattened_data.append(item)

    # Serialise back to a JSON string so the column stays text
    return json.dumps(flattened_data)

# Flatten every annotations cell, then write the results back into the column
flattened_annotations = data['annotations'].apply(flatten_dicts)
data.loc[:, 'annotations'] = flattened_annotations

# Show the updated DataFrame
print(data)

# Save the DataFrame as 'algerie.csv', leaving out the index column
data.to_csv('algerie.csv', index=False)


In [None]:
import pandas as pd
import json

# 'data' is a pandas DataFrame (not a list of dictionaries). Work on an
# explicit copy: building a DataFrame from another DataFrame does not copy
# the data by default, so the column overwrite and the new columns below
# could otherwise show up in 'data' as well. That would break a re-run of
# this cell, because the JSON strings it parses would already have been
# replaced by lists.
df3 = data.copy()

# Strip the '[deletion]' / '[/deletion]' markers from the JSON text, then
# parse each cell into a list of annotation dictionaries
df3['annotations'] = df3['annotations'].apply(
    lambda x: json.loads(x.replace('[deletion]', '').replace('[/deletion]', ''))
)

# Add one list-valued column per annotation field
df3['task'] = df3['annotations'].apply(lambda x: [item['task'] for item in x])
df3['task_label'] = df3['annotations'].apply(lambda x: [item['task_label'] for item in x])
df3['value'] = df3['annotations'].apply(lambda x: [item['value'] for item in x])

# Print the DataFrame with selected columns
print(df3[['classification_id', 'task', 'task_label', 'value', 'subject_ids']])

# Export the DataFrame to a CSV file ('algerie1.csv'), without the index column
df3.to_csv('algerie1.csv', index=False)

# Print the first few rows of the DataFrame
print(df3.head())


In [None]:
import pandas as pd

# Keeps all taxon remarks
# Output accumulator: one flat dictionary per classification
result_rows2 = []

# Task labels whose values are gathered into comma-separated columns, mapped
# to the name of the output column each one feeds
gathered_labels = {
    'Species Name': 'Species_Name',
    'Authorship Name': 'Authorship_Name',
    'LSID': 'LSID',
}

# Walk through every classification in df3
for index, row in df3.iterrows():
    # Start the record with the identifying columns
    task_values = {
        'classification_id': row['classification_id'],
        'subject_ids': row['subject_ids'],
    }

    # Per-classification accumulators: the remark texts plus one list for
    # each gathered label
    remarks = []
    gathered = {label: [] for label in gathered_labels}

    for annotation in row['annotations']:
        task_label = annotation.get('task_label', '')
        value = annotation.get('value', '')

        if task_label == 'Taxon Remarks':
            if isinstance(value, dict):
                # A dictionary value supplies the remark under 'value' and
                # may supply the gathered fields under their own labels
                remarks.append(value.get('value', ''))
                for label, bucket in gathered.items():
                    bucket.append(value.get(label, ''))
            else:
                # Any other value is taken as the remark text itself, with
                # newlines removed; the gathered fields get empty entries
                remarks.append(value.replace('\n', ''))
                for bucket in gathered.values():
                    bucket.append('')
        elif task_label in ('Species Name', 'Authorship Name', 'LSID'):
            gathered[task_label].append(value)
        else:
            # Every other label becomes its own column (spaces turned into
            # underscores); a missing or empty label gets a placeholder name
            column = task_label.replace(' ', '_') if task_label else 'Unknown_Task_Label'
            task_values[column] = value

    # Remarks are joined as they are; gathered fields drop empty entries
    # before joining
    task_values['Taxon_Remarks'] = ', '.join(remarks)
    for label, column in gathered_labels.items():
        task_values[column] = ', '.join(entry for entry in gathered[label] if entry)

    result_rows2.append(task_values)

# Build a DataFrame from the collected records
result_df2 = pd.DataFrame(result_rows2)

# Show the resulting DataFrame
print(result_df2)

# Save the DataFrame as 'algerie2.csv', leaving out the index column
result_df2.to_csv('algerie2.csv', index=False)