In [1]:
import pandas as pd
import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from collections import defaultdict

In [2]:
## All such path can be set according to your local repo
# Folder path containing ground truths
folder_path_ground_truth = "Dataset/tbiomedical5-0.01-sample/horizontal/gt/"

# Folder path given the target 
folder_path_tragets = "Dataset/tbiomedical5-0.01-sample/horizontal/targets/"

# Define folder path containing tables
folder_path_tables = "Dataset/tbiomedical5-0.01-sample/horizontal/tables/"

file_name_cea_gt = "cea_gt.csv"
file_name_cea_target = "cea_targets.csv"
file_name_specific_table = "DLZ060702I025.csv" # AZC06100207I0103, WIT06100910I067

file_path_cea_target = folder_path_tragets + file_name_cea_target
file_path_cea_gt = folder_path_ground_truth + file_name_cea_gt
file_path_specific_table = folder_path_tables + file_name_specific_table

In [3]:
## Check the target 
table_biomedical_cea_target = pd.read_csv(file_path_cea_target,header = None)

In [4]:
print(f"Shape of the target dataframe is: {table_biomedical_cea_target.shape}")
table_biomedical_cea_target.head()

Shape of the target dataframe is: (200183, 3)


Unnamed: 0,0,1,2
0,DLZ060702I025,1,0
1,DLZ060702I025,1,1
2,DLZ060702I025,1,2
3,DLZ060702I025,1,3
4,DLZ060702I025,1,4


In [5]:
# Check total unique tables in it having the first index containing table names.
print(f"Total unique table in given in the target CEA file {table_biomedical_cea_target[0].nunique()}")

# Check the duplicate rows in the csv file
duplicates = table_biomedical_cea_target.duplicated(subset=[table_biomedical_cea_target.columns[0], table_biomedical_cea_target.columns[1], table_biomedical_cea_target.columns[2]])
print(f"Total duplicate row in the given target CEA files are: {duplicates.sum()}")

Total unique table in given in the target CEA file 4792
Total duplicate row in the given target CEA files are: 0


In [6]:
# Function to read CSV files into a dictionary
def load_tables(folder_path_tables):
    tables = {}
    for filename in os.listdir(folder_path_tables):
        if filename.endswith('.csv'):
            table_name = filename[:-4]  # Remove '.csv' extension
            table_path = os.path.join(folder_path_tables, filename)
            tables[table_name] = pd.read_csv(table_path)
    return tables

# Function that fetches the value from preloaded tables
def get_value_from_preloaded_tables(row, tables):
    table_name = row[0]
    column_number = row[1]
    row_number = row[2]

    if table_name in tables:
        df_table = tables[table_name]
        if row_number < len(df_table) and column_number < len(df_table.columns):
            return df_table.iloc[row_number, column_number]
    return None

# Function to fetch values in parallel
def fetch_values_in_parallel(rows, tables):
    with ThreadPoolExecutor(max_workers=10) as executor:
        fetched_values = list(executor.map(lambda row: get_value_from_preloaded_tables(row, tables), rows))
    return fetched_values

In [7]:
%%time
# Load all tables once
# folder_path_tables = 'path_to_tables'  # Change this to your actual path
tables = load_tables(folder_path_tables)

CPU times: user 4.67 s, sys: 274 ms, total: 4.95 s
Wall time: 4.97 s


In [8]:
%%time
# Convert dataframe rows to a list of tuples for processing
rows = [tuple(row) for row in table_biomedical_cea_target.itertuples(index=False, name=None)]
# Fetch values in parallel
fetched_values = fetch_values_in_parallel(rows, tables)
# Add the fetched values as a new column in the dataframe
table_biomedical_cea_target['cell_values'] = fetched_values

CPU times: user 7.02 s, sys: 618 ms, total: 7.63 s
Wall time: 8.34 s


In [9]:
table_biomedical_cea_target.head()

Unnamed: 0,0,1,2,cell_values
0,DLZ060702I025,1,0,ATR-mediated checkpoint pathways regulate phos...
1,DLZ060702I025,1,1,Human claspin is a ring-shaped DNA-binding pro...
2,DLZ060702I025,1,2,Loading of the human 9-1-1 checkpoint complex ...
3,DLZ060702I025,1,3,Sensing DNA damage through ATRIP recognition o...
4,DLZ060702I025,1,4,Expression of mammalian paralogues of HRAD9 an...


In [10]:
# Function to extract the first value from a cell
def extract_first_element(cell):
    elements = cell.split(',')
    if elements:
        return elements[0].strip()
    return None

In [11]:
# Create the new column 'cell_values_first'
table_biomedical_cea_target['cell_values_first'] = table_biomedical_cea_target['cell_values'].astype(str).apply(extract_first_element)

In [12]:
table_biomedical_cea_target.head()

Unnamed: 0,0,1,2,cell_values,cell_values_first
0,DLZ060702I025,1,0,ATR-mediated checkpoint pathways regulate phos...,ATR-mediated checkpoint pathways regulate phos...
1,DLZ060702I025,1,1,Human claspin is a ring-shaped DNA-binding pro...,Human claspin is a ring-shaped DNA-binding pro...
2,DLZ060702I025,1,2,Loading of the human 9-1-1 checkpoint complex ...,Loading of the human 9-1-1 checkpoint complex ...
3,DLZ060702I025,1,3,Sensing DNA damage through ATRIP recognition o...,Sensing DNA damage through ATRIP recognition o...
4,DLZ060702I025,1,4,Expression of mammalian paralogues of HRAD9 an...,Expression of mammalian paralogues of HRAD9 an...


In [13]:
# Rate Limiter class
class RateLimiter:
    def __init__(self, max_calls, period):
        self.max_calls = max_calls
        self.period = period
        self.calls = []
    
    def wait(self):
        if len(self.calls) >= self.max_calls:
            time_to_wait = self.period - (time.time() - self.calls[-self.max_calls])
            if time_to_wait > 0:
                time.sleep(time_to_wait)
        self.calls.append(time.time())

# Initialize cache and rate limiter
cache = defaultdict(lambda: None)
rate_limiter = RateLimiter(max_calls=10, period=1)  # Adjust based on API limits

# Function to get Wikidata ID for a given label
def get_wikidata_id(category_label):
    if category_label in cache:
        return cache[category_label]
    
    rate_limiter.wait()
    
    search_url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "format": "json",
        "language": "en",
        "type": "item",
        "search": category_label
    }
    response = requests.get(search_url, params=params)
    
    if response.status_code == 200:
        try:
            data = response.json()
            if data.get("search"):
                wikidata_id = data["search"][0]["id"]
                cache[category_label] = wikidata_id
                return wikidata_id
            else:
                return None
        except ValueError:
            print(f"Error decoding JSON for label: {category_label}")
            print(response.text)
            return None
    else:
        print(f"Request failed with status code {response.status_code} for label: {category_label}")
        return None

# Function to construct Wikidata entity URI
def construct_entity_uri(wikidata_id):
    return "http://www.wikidata.org/entity/" + wikidata_id

# Simplified function to fetch and assign Wikidata URI for each cell value
def fetch_and_assign_wikidata_uri(category_label):
    if not isinstance(category_label, str):
        category_label = str(category_label)
    category_label = category_label.strip()
    wikidata_id = get_wikidata_id(category_label)
    if wikidata_id:
        return construct_entity_uri(wikidata_id)
    return None

# Function to process each row
def process_row(row):
    row["Wikidata Entity URI"] = fetch_and_assign_wikidata_uri(row["cell_values_first"])
    return row

# Applying the function to the DataFrame using ThreadPoolExecutor
def parallel_apply(df, func, workers=20):  # Increase number of workers
    df['cell_values_first'] = df['cell_values_first'].astype(str)  # Convert the column to strings
    with ThreadPoolExecutor(max_workers=workers) as executor:
        future_to_row = {executor.submit(func, row): row for _, row in df.iterrows()}
        results = []
        for future in as_completed(future_to_row):
            results.append(future.result())
    return pd.DataFrame(results)

# Example DataFrame
data = {
    'cell_values_first': ['aquifer', 'underground lake', 'subglacial lake', 'value1', 'value2', 'value3', '12345']
}
df = pd.DataFrame(data)

# Applying the function to annotate the DataFrame
annotated_df = parallel_apply(df, process_row)

# Display the resulting DataFrame
print(annotated_df)

  cell_values_first                       Wikidata Entity URI
3            value1                                      None
0           aquifer    http://www.wikidata.org/entity/Q208791
5            value3                                      None
6             12345  http://www.wikidata.org/entity/Q11185239
1  underground lake   http://www.wikidata.org/entity/Q1048337
4            value2                                      None
2   subglacial lake   http://www.wikidata.org/entity/Q1140477


In [14]:
%%time
annotated_target_df = parallel_apply(table_biomedical_cea_target, process_row)

CPU times: user 11min, sys: 56.5 s, total: 11min 56s
Wall time: 50min 9s


In [17]:
annotated_target_df.sort_index(inplace=True)
annotated_target_df.head()

Unnamed: 0,0,1,2,cell_values,cell_values_first,Wikidata Entity URI
0,DLZ060702I025,1,0,ATR-mediated checkpoint pathways regulate phos...,ATR-mediated checkpoint pathways regulate phos...,http://www.wikidata.org/entity/Q24550952
1,DLZ060702I025,1,1,Human claspin is a ring-shaped DNA-binding pro...,Human claspin is a ring-shaped DNA-binding pro...,http://www.wikidata.org/entity/Q28269457
2,DLZ060702I025,1,2,Loading of the human 9-1-1 checkpoint complex ...,Loading of the human 9-1-1 checkpoint complex ...,http://www.wikidata.org/entity/Q24550745
3,DLZ060702I025,1,3,Sensing DNA damage through ATRIP recognition o...,Sensing DNA damage through ATRIP recognition o...,http://www.wikidata.org/entity/Q27860662
4,DLZ060702I025,1,4,Expression of mammalian paralogues of HRAD9 an...,Expression of mammalian paralogues of HRAD9 an...,http://www.wikidata.org/entity/Q24294838


In [18]:
# Storing values in the local
annotated_target_df.to_csv("annotated_target_biomedical_cea_R2_df.csv", index=False)

In [19]:
annotated_target_df.head(-1)

Unnamed: 0,0,1,2,cell_values,cell_values_first,Wikidata Entity URI
0,DLZ060702I025,1,0,ATR-mediated checkpoint pathways regulate phos...,ATR-mediated checkpoint pathways regulate phos...,http://www.wikidata.org/entity/Q24550952
1,DLZ060702I025,1,1,Human claspin is a ring-shaped DNA-binding pro...,Human claspin is a ring-shaped DNA-binding pro...,http://www.wikidata.org/entity/Q28269457
2,DLZ060702I025,1,2,Loading of the human 9-1-1 checkpoint complex ...,Loading of the human 9-1-1 checkpoint complex ...,http://www.wikidata.org/entity/Q24550745
3,DLZ060702I025,1,3,Sensing DNA damage through ATRIP recognition o...,Sensing DNA damage through ATRIP recognition o...,http://www.wikidata.org/entity/Q27860662
4,DLZ060702I025,1,4,Expression of mammalian paralogues of HRAD9 an...,Expression of mammalian paralogues of HRAD9 an...,http://www.wikidata.org/entity/Q24294838
...,...,...,...,...,...,...
200177,LAM06100910I5394,8,23,intimacy,intimacy,http://www.wikidata.org/entity/Q16358629
200178,LAM06100910I5394,12,22,"instinct, reason",instinct,http://www.wikidata.org/entity/Q18237
200179,LAM06100910I5394,12,25,"music, light, weather, mood, odor, soundscape",music,http://www.wikidata.org/entity/Q638
200180,LAM06100910I5404,2,1,psychology terminology,psychology terminology,http://www.wikidata.org/entity/Q77468620


In [21]:
null_values_count = annotated_target_df['Wikidata Entity URI'].isnull().sum()
print(null_values_count)

622


In [22]:
rows_with_nulls = annotated_target_df[annotated_target_df['Wikidata Entity URI'].isnull()]
rows_with_nulls.head(-1)

Unnamed: 0,0,1,2,cell_values,cell_values_first,Wikidata Entity URI
1852,QLY06100007I1097,3,2,"The Fall of Men:Eve, The Fall of Man: (Adam)",The Fall of Men:Eve,
2059,QLY06100007I1104,7,22,Category:Sakitama-kofungun,Category:Sakitama-kofungun,
3085,QLY06100007I2319,11,17,Category:Muslim Television Ahmadiyya Internati...,Category:Muslim Television Ahmadiyya Internati...,
3450,QLY06100007I3286,3,16,Russo-Turkish War (1806-1812),Russo-Turkish War (1806-1812),
3451,QLY06100007I3286,3,17,Crusader battles between 1110 and 1145,Crusader battles between 1110 and 1145,
...,...,...,...,...,...,...
197895,CMZ06100907I1743,6,46,City Council of Seville,City Council of Seville,
199542,CMZ06100907I6249,3,14,Category:1 star officers,Category:1 star officers,
199543,CMZ06100907I6249,3,19,Category:4 star officers,Category:4 star officers,
199547,CMZ06100907I6249,3,31,Category:3 star officers,Category:3 star officers,
