In [34]:
import os
import json
import pandas as pd

In [35]:
# count total number of JSON files
DATA_DIR = "../data/raw"

json_files = [f for f in os.listdir(DATA_DIR) if f.endswith('.json')]
total_files = len(json_files)

print(f"Total number of JSON files: {total_files}")

Total number of JSON files: 23045


In [36]:
# Categorize and count by type
file_types = {
    'areas': 0,
    'tasks': 0,
    'task': 0,
    'evaluations': 0,
    'evaluation_results': 0,
    'parents': 0,
    'children': 0,
}

for file in json_files:
    if file.startswith("areas"):
        file_types['areas'] += 1
    elif file.startswith("tasks_"):
        file_types['tasks'] += 1
    elif file.startswith("task_") and not file.endswith("_parents.json") and not file.endswith("_children.json") and not file.endswith("_evaluations.json"):
        file_types['task'] += 1
    elif file.startswith("task_") and file.endswith("_parents.json"):
        file_types['parents'] += 1
    elif file.startswith("task_") and file.endswith("_children.json"):
        file_types['children'] += 1
    elif file.startswith("task_") and file.endswith("_evaluations.json"):
        file_types['evaluations'] += 1
    elif file.startswith("evaluation_") and file.endswith("_results.json"):
        file_types['evaluation_results'] += 1
        
print("File counts by type: ")
for file_type, count in file_types.items():
    print(f"{file_type}: {count}")


File counts by type: 
areas: 1
tasks: 16
task: 3371
evaluations: 3371
evaluation_results: 9544
parents: 3371
children: 3371


In [42]:
# how many distinct tasks have evaluation results
# this will be distinct number of strings that start with "evaluation_" and end with "_results.json" but if they have "-on-" in the middle, it still counts as one

# Find distinct tasks with evaluation results
distinct_tasks_with_results = set()

for file in json_files:
    if file.startswith("evaluation_") and file.endswith("_results.json"):
        if '-on-' in file:
            task_name = file[len('evaluation_'):file.index('-on-')]
            distinct_tasks_with_results.add(task_name)

# Count the number of distinct tasks
num_distinct_tasks = len(distinct_tasks_with_results)

print(f"Number of distinct tasks with evaluation results: {num_distinct_tasks}")


# find average number of evaluations per task
total_evaluations = file_types['evaluation_results']

average_evaluations_per_task = total_evaluations / num_distinct_tasks

# format to 2 decimal places
average_evaluations_per_task = "{:.2f}".format(average_evaluations_per_task)

print(f"Average number of evaluations per task: {average_evaluations_per_task}")

Number of distinct tasks with evaluation results: 1681
Average number of evaluations per task: 5.68


## Organized dataframes that will be inserted into the PostgreSQL database:

### Dataframe 1: Tasks by Area

For all 3371 tasks:

Task1,Area,Parents,Children,NumEvaluations

Task2,Area,Parents,Children,NumEvaluations

Task3,Area,Parents,Children,NumEvaluations


### Dataframe 2: Evaluations by Task

For all 1681 tasks with evaluations:

Task1,Eval1 (Dataset1),Metric1,Model1,Paper1,Date1

Task1,Eval1 (Dataset1),Metric2,Model2,Paper2,Date2

Task1,Eval1 (Dataset1),Metric3,Model3,Paper3,Date3

Task1,Eval2 (Dataset2),Metric1,Model1,Paper1,Date1

Task1,Eval2 (Dataset2),Metric2,Model2,Paper2,Date2

Task2,Eval1 (Dataset1),Metric1,Model1,Paper1,Date1

Task3,Eval1 (Dataset1),Metric1,Model1,Paper1,Date1

Task3,Eval2 (Dataset2),Metric1,Model1,Paper1,Date1

In [19]:
# load tasks_layman_descriptions.csv and count number of rows

import pandas as pd

tasks_layman_descriptions = pd.read_csv("../data/processed/tasks_layman_descriptions.csv")

num_tasks = tasks_layman_descriptions.shape[0]

print(f"Number of layman description tasks: {num_tasks}")

# load number of tassk in tasks_by_area.csv and count number of rows

tasks_by_area = pd.read_csv("../data/processed/tasks_by_area.csv")

num_tasks_by_area = tasks_by_area.shape[0]

print(f"Number of tasks in tasks_by_area.csv: {num_tasks_by_area}")

# what percentage of tasks have layman descriptions

percentage_tasks_with_layman_descriptions = (num_tasks / num_tasks_by_area) * 100

# format to 2 decimal places

percentage_tasks_with_layman_descriptions = "{:.2f}".format(percentage_tasks_with_layman_descriptions)

print(f"Percentage of tasks with layman descriptions: {percentage_tasks_with_layman_descriptions}%")


Number of layman description tasks: 3371
Number of tasks in tasks_by_area.csv: 3495
Percentage of tasks with layman descriptions: 96.45%


In [17]:
# find list of 'id' in tasks_by_area.csv that are not in 'task' col of tasks_layman_descriptions.csv'

tasks_by_area_ids = tasks_by_area['id'].tolist()

tasks_layman_descriptions_ids = tasks_layman_descriptions['task'].tolist()

tasks_without_layman_descriptions = [task for task in tasks_by_area_ids if task not in tasks_layman_descriptions_ids]

print(f"Tasks without layman descriptions: {tasks_without_layman_descriptions}")

Tasks without layman descriptions: []


In [20]:
# return list of duplicate 'id' in tasks_by_area.csv 

duplicate_tasks_by_area = tasks_by_area[tasks_by_area.duplicated(subset=['id'], keep=False)]

print(f"Duplicate tasks in tasks_by_area.csv: {duplicate_tasks_by_area}")

Duplicate tasks in tasks_by_area.csv:                              area                                          id  \
2                     methodology                  neural-network-compression   
9                     methodology                                     chatbot   
10                    methodology             multi-label-text-classification   
11                    methodology                          continual-learning   
13                    methodology                      cyber-attack-detection   
...                           ...                                         ...   
3360  natural-language-processing                             deep-clustering   
3459  natural-language-processing              open-domain-question-answering   
3466  natural-language-processing  semeval-2022-task-4-1-binary-pcl-detection   
3481  natural-language-processing                                 code-repair   
3491  natural-language-processing               multimodal-sentiment-an

Well that makes sense. 242 duplicates mean 121 tasks are repeated in the tasks_by_area.csv file.

3371 + 121 = 3492, which is the total number of tasks in the dataset.