In [17]:
import pandas as pd
import os
import requests
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Folder path given the target 
# Can be set according to your repo at local
folder_path_tragets = "Dataset/tbiodiv10-0.01-sample/horizontal/targets/"

# Define folder path containing tables
folder_path_tables = "Dataset/tbiodiv10-0.01-sample/horizontal/tables/"

file_name_cea_target = "cea_targets.csv"
file_name_specific_table = "CHF06100910I265.csv" # AZC06100207I0103, WIT06100910I067

file_path_cea_target = folder_path_tragets + file_name_cea_target
file_path_specific_table = folder_path_tables + file_name_specific_table

In [3]:
#### Check the Target csv for the CEA task
## Check the target 
table_biodiv_cea_target = pd.read_csv(file_path_cea_target,header = None)
print(f"Shape of the target dataframe is: {table_biodiv_cea_target.shape}")
table_biodiv_cea_target.head()

Shape of the target dataframe is: (92439, 3)


Unnamed: 0,0,1,2
0,EGN060702I0010,1,0
1,EGN060702I0010,1,1
2,EGN060702I0010,1,2
3,EGN060702I0010,1,3
4,EGN060702I0010,1,4


In [4]:
# Check total unique tables in it having the first index containing table names.
print(f"Total unique table in given in the target CEA file {table_biodiv_cea_target[0].nunique()}")

Total unique table in given in the target CEA file 1317


In [5]:
# Check the duplicate rows in the csv file
duplicates = table_biodiv_cea_target.duplicated(subset=[table_biodiv_cea_target.columns[0], table_biodiv_cea_target.columns[1], table_biodiv_cea_target.columns[2]])
print(f"Total duplicate row in the given target CEA files are: {duplicates.sum()}")

Total duplicate row in the given target CEA files are: 0


In [6]:
# Function to read CSV files into a dictionary
def load_tables(folder_path_tables):
    tables = {}
    for filename in os.listdir(folder_path_tables):
        if filename.endswith('.csv'):
            table_name = filename[:-4]  # Remove '.csv' extension
            table_path = os.path.join(folder_path_tables, filename)
            tables[table_name] = pd.read_csv(table_path)
    return tables

# Function that fetches the value from preloaded tables
def get_value_from_preloaded_tables(row, tables):
    table_name = row[0]
    column_number = row[1]
    row_number = row[2]

    if table_name in tables:
        df_table = tables[table_name]
        if row_number < len(df_table) and column_number < len(df_table.columns):
            return df_table.iloc[row_number, column_number]
    return None

# Function to fetch values in parallel
def fetch_values_in_parallel(rows, tables):
    with ThreadPoolExecutor(max_workers=10) as executor:
        fetched_values = list(executor.map(lambda row: get_value_from_preloaded_tables(row, tables), rows))
    return fetched_values


In [7]:
%%time
# Load all tables once
tables = load_tables(folder_path_tables)

CPU times: user 23.4 s, sys: 3.51 s, total: 26.9 s
Wall time: 1min 8s


In [8]:
%%time
# Convert dataframe rows to a list of tuples for processing
rows = [tuple(row) for row in table_biodiv_cea_target.itertuples(index=False, name=None)]
# Fetch values in parallel
fetched_values = fetch_values_in_parallel(rows, tables)
# Add the fetched values as a new column in the dataframe
table_biodiv_cea_target['cell_values'] = fetched_values

CPU times: user 4.24 s, sys: 480 ms, total: 4.72 s
Wall time: 4.59 s


In [9]:
table_biodiv_cea_target.head(-1)

Unnamed: 0,0,1,2,cell_values
0,EGN060702I0010,1,0,Marchamp
1,EGN060702I0010,1,1,Saint-Maurice-de-Gourdans
2,EGN060702I0010,1,2,Nivigne et Suran
3,EGN060702I0010,1,3,Drom
4,EGN060702I0010,1,4,Lompnas
...,...,...,...,...
92433,CHF06100910I265,1,17,Category:Leap years
92434,CHF06100910I265,2,4,"Ottův slovník naučný, The Nuttall Encyclopædia..."
92435,CHF06100910I265,2,11,Encyclopædia Britannica 11th edition
92436,CHF06100910I265,2,17,Small Brockhaus and Efron Encyclopedic Diction...


In [10]:
# Function to extract the first value from a cell
def extract_first_element(cell):
    elements = cell.split(',')
    if elements:
        return elements[0].strip()
    return None

In [11]:
# Create the new column 'cell_values_first'
table_biodiv_cea_target['cell_values_first'] = table_biodiv_cea_target['cell_values'].astype(str).apply(extract_first_element)

In [12]:
table_biodiv_cea_target.head()

Unnamed: 0,0,1,2,cell_values,cell_values_first
0,EGN060702I0010,1,0,Marchamp,Marchamp
1,EGN060702I0010,1,1,Saint-Maurice-de-Gourdans,Saint-Maurice-de-Gourdans
2,EGN060702I0010,1,2,Nivigne et Suran,Nivigne et Suran
3,EGN060702I0010,1,3,Drom,Drom
4,EGN060702I0010,1,4,Lompnas,Lompnas


In [18]:
# Rate Limiter class
class RateLimiter:
    def __init__(self, max_calls, period):
        self.max_calls = max_calls
        self.period = period
        self.calls = []
    
    def wait(self):
        if len(self.calls) >= self.max_calls:
            time_to_wait = self.period - (time.time() - self.calls[-self.max_calls])
            if time_to_wait > 0:
                time.sleep(time_to_wait)
        self.calls.append(time.time())

# Initialize cache and rate limiter
cache = defaultdict(lambda: None)
rate_limiter = RateLimiter(max_calls=10, period=1)  # Adjust based on API limits

# Function to get Wikidata ID for a given label
def get_wikidata_id(category_label):
    if category_label in cache:
        return cache[category_label]
    
    rate_limiter.wait()
    
    search_url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "format": "json",
        "language": "en",
        "type": "item",
        "search": category_label
    }
    response = requests.get(search_url, params=params)
    
    if response.status_code == 200:
        try:
            data = response.json()
            if data.get("search"):
                wikidata_id = data["search"][0]["id"]
                cache[category_label] = wikidata_id
                return wikidata_id
            else:
                return None
        except ValueError:
            print(f"Error decoding JSON for label: {category_label}")
            print(response.text)
            return None
    else:
        print(f"Request failed with status code {response.status_code} for label: {category_label}")
        return None

# Function to construct Wikidata entity URI
def construct_entity_uri(wikidata_id):
    return "http://www.wikidata.org/entity/" + wikidata_id

# Simplified function to fetch and assign Wikidata URI for each cell value
def fetch_and_assign_wikidata_uri(category_label):
    if not isinstance(category_label, str):
        category_label = str(category_label)
    category_label = category_label.strip()
    wikidata_id = get_wikidata_id(category_label)
    if wikidata_id:
        return construct_entity_uri(wikidata_id)
    return None

# Function to process each row
def process_row(row):
    row["Wikidata Entity URI"] = fetch_and_assign_wikidata_uri(row["cell_values_first"])
    return row

# Applying the function to the DataFrame using ThreadPoolExecutor
def parallel_apply(df, func, workers=20):  # Increase number of workers
    df['cell_values_first'] = df['cell_values_first'].astype(str)  # Convert the column to strings
    with ThreadPoolExecutor(max_workers=workers) as executor:
        future_to_row = {executor.submit(func, row): row for _, row in df.iterrows()}
        results = []
        for future in as_completed(future_to_row):
            results.append(future.result())
    return pd.DataFrame(results)

# # Example DataFrame
# data = {
#     'cell_values_first': ['aquifer', 'underground lake', 'subglacial lake', 'value1', 'value2', 'value3', '12345']
# }
# df = pd.DataFrame(data)

# # Applying the function to annotate the DataFrame
# annotated_df = parallel_apply(df, process_row)

# # Display the resulting DataFrame
# print(annotated_df)


  cell_values_first                       Wikidata Entity URI
4            value2                                      None
3            value1                                      None
0           aquifer    http://www.wikidata.org/entity/Q208791
2   subglacial lake   http://www.wikidata.org/entity/Q1140477
5            value3                                      None
1  underground lake   http://www.wikidata.org/entity/Q1048337
6             12345  http://www.wikidata.org/entity/Q11185239


In [21]:
annotated_df.sort_index(inplace=True)
annotated_df.head()

Unnamed: 0,cell_values_first,Wikidata Entity URI
0,aquifer,http://www.wikidata.org/entity/Q208791
1,underground lake,http://www.wikidata.org/entity/Q1048337
2,subglacial lake,http://www.wikidata.org/entity/Q1140477
3,value1,
4,value2,


In [22]:
annotated_target_df = parallel_apply(table_biodiv_cea_target, process_row)

In [None]:
### Started at 8:16 on 19.06

In [23]:
from datetime import datetime

# Get the current time
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

current_time

'2024-06-19 06:32:35'

In [34]:
annotated_target_df.sort_index(inplace=True)
annotated_target_df.head()

Unnamed: 0,0,1,2,cell_values,cell_values_first,Wikidata Entity URI
0,EGN060702I0010,1,0,Marchamp,Marchamp,http://www.wikidata.org/entity/Q542133
1,EGN060702I0010,1,1,Saint-Maurice-de-Gourdans,Saint-Maurice-de-Gourdans,http://www.wikidata.org/entity/Q325794
2,EGN060702I0010,1,2,Nivigne et Suran,Nivigne et Suran,http://www.wikidata.org/entity/Q24938088
3,EGN060702I0010,1,3,Drom,Drom,http://www.wikidata.org/entity/Q842859
4,EGN060702I0010,1,4,Lompnas,Lompnas,http://www.wikidata.org/entity/Q840079


In [35]:
annotated_target_df.to_csv("annotated_target_biodiv_cea_R2_df.csv", index=False)

In [27]:
annotated_target_df.head(-1)

Unnamed: 0,0,1,2,cell_values,cell_values_first,Wikidata Entity URI
85,EGN060702I0010,3,28,"zone naturelle d'intérêt écologique, faunistiq...",zone naturelle d'intérêt écologique,http://www.wikidata.org/entity/Q16040909
48,EGN060702I0010,1,48,Loyettes,Loyettes,http://www.wikidata.org/entity/Q648628
114,EGN060702I0031,1,0,Category:Judiciary of Iran,Category:Judiciary of Iran,http://www.wikidata.org/entity/Q15404295
108,EGN060702I0010,3,51,"zone naturelle d'intérêt écologique, faunistiq...",zone naturelle d'intérêt écologique,http://www.wikidata.org/entity/Q16040909
32,EGN060702I0010,1,32,Ordonnaz,Ordonnaz,http://www.wikidata.org/entity/Q988672
...,...,...,...,...,...,...
92437,CHF06100910I265,3,0,"Lent, Christmastide, Eastertide",Lent,http://www.wikidata.org/entity/Q82866
92429,CHF06100910I099,2,1,Category:Local museums in Finland,Category:Local museums in Finland,http://www.wikidata.org/entity/Q90001771
92438,CHF06100910I265,3,8,"leap year, common year",leap year,http://www.wikidata.org/entity/Q19828
92432,CHF06100910I265,1,2,Category:BC,Category:BC,http://www.wikidata.org/entity/Q10148111


In [28]:
null_values_count = annotated_target_df['Wikidata Entity URI'].isnull().sum()

In [29]:
print(null_values_count)

272


In [30]:
rows_with_nulls = annotated_target_df[annotated_target_df['Wikidata Entity URI'].isnull()]

In [32]:
rows_with_nulls.head(-1)

Unnamed: 0,0,1,2,cell_values,cell_values_first,Wikidata Entity URI
3421,EGN060702I1832,15,33,Category:Born in Alaska,Category:Born in Alaska,
5121,EGN060702I2487,7,33,Category:Born in Alaska,Category:Born in Alaska,
4621,EGN060702I2395,8,33,Category:Born in Alaska,Category:Born in Alaska,
5921,EGN060702I2646,5,33,Category:Born in Alaska,Category:Born in Alaska,
8329,EGN060702I3259,27,31,Category:Born in Alaska,Category:Born in Alaska,
...,...,...,...,...,...,...
91703,EAC06100807I1236,3,75,Regional state archive in Pilsen,Regional state archive in Pilsen,
91997,QFM06100907I0307,14,49,people who have died in Narvik,people who have died in Narvik,
92369,QFM06100907I1101,6,48,Category:Opole University of Technology alumni,Category:Opole University of Technology alumni,
92377,QFM06100907I1101,6,85,Category:Alumni of the Białystok University of...,Category:Alumni of the Białystok University of...,
