In [1]:
from datasets import list_datasets
from requests_cache import CachedSession
import requests
import json
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import time
import warnings
from concurrent.futures import ThreadPoolExecutor
from huggingface_hub import hf_hub_download
from tqdm import tqdm
import os

<h3>Extraction des données des datasets d'HuggingFace</h3>


<h4>1) Extraction des métadonnées</h4>

In [1]:
"""
Fonction pour récupérer les informations (métadonnées) d'un dataset
    name : nom du dataset
    session : session avec cache
"""
def fetch_dataset_info(name, session):
    url = f"https://huggingface.co/api/datasets/{name}"

    while True:
        response = session.get(url, params={"full": "True"})
        if response.status_code == 200:
            return response.json()
        elif response.status_code == 429:
            time.sleep(15)
        else:
            return None

In [3]:

"""
Fonction pour récupérer les informations (métadonnées) de plusieurs datasets en parallèle
    dataset_names : liste des noms des datasets
    session : session avec cache
    max_workers : nombre de threads
"""
def retrieve_dataset_info(dataset_names,session,max_workers=15):
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_dataset_info, name, session) for name in dataset_names]
        for future in tqdm(futures, total=len(dataset_names), desc="Progress"):
            result = future.result()
            results.append(result)
    return results

In [4]:
# Récupération des noms de tous les datasets disponibles sur HuggingFace
dataset_names = list_datasets()
print("nb total de dataset disponible sur huggingface :", len(dataset_names))

  dataset_names = list_datasets()


nb total de dataset disponible sur huggingface : 136631


In [None]:
# Création d'une session avec cache
session = CachedSession()
# Récupération des informations (métadonnées) des datasets
datasets_info = retrieve_dataset_info(dataset_names, session)
print("nb de dataset ayant des métadonnées :", len(datasets_info))

Progress: 100%|██████████| 136631/136631 [1:07:37<00:00, 33.67it/s]   


nb de dataset ayant des métadonnées : 136631


In [7]:
# Création d'un fichier json avec les données des datasets
with open("datasets_info.json", "w", encoding="utf8") as f:
    json.dump(datasets_info, f, ensure_ascii=False, indent=2)

<h4>2) Nettoyage des métadonnées</h4>


In [8]:
#with open("datasets_info.json", "r", encoding="utf8") as f:
#   datasets_info = json.load(f)

# Nettoyage des données
datasets_info_clean = []
for dataset in tqdm(datasets_info):
    arxiv_list = []
    if dataset is None:
        continue
    else:
        # On enlève les données inutiles pour réduire la taille du fichier json
        if "cardData" in dataset and "configs" in dataset["cardData"]:
            dataset["cardData"].pop("configs")
        if "siblings" in dataset:
            dataset.pop("siblings")
        if "tags" in dataset:
            for tag in dataset["tags"]:
                tag = tag.split(':')
                if tag[0] == "arxiv" and len(tag) == 2:
                    arxiv_list.append(tag[1])
        dataset.pop("tags")
        if arxiv_list:
            dataset["arxiv"] = arxiv_list
        datasets_info_clean.append(dataset)

print("dataset_info Nettoyé :")
print(datasets_info_clean[0])
print("Nombre de datasets nettoyés :", len(datasets_info_clean))

100%|██████████| 136631/136631 [00:21<00:00, 6298.19it/s] 

dataset_info Nettoyé :
{'_id': '621ffdd236468d709f181d58', 'id': 'acronym_identification', 'sha': '15ef643450d589d5883e289ffadeb03563e80a9e', 'lastModified': '2024-01-09T11:39:57.000Z', 'private': False, 'gated': False, 'disabled': False, 'description': '\n\t\n\t\t\n\t\n\t\n\t\tDataset Card for Acronym Identification Dataset\n\t\n\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Summary\n\t\n\nThis dataset contains the training, validation, and test data for the Shared Task 1: Acronym Identification of the AAAI-21 Workshop on Scientific Document Understanding.\n\n\t\n\t\t\n\t\n\t\n\t\tSupported Tasks and Leaderboards\n\t\n\nThe dataset supports an acronym-identification task, where the aim is to predic which tokens in a pre-tokenized sentence correspond to acronyms. The dataset was released for a Shared… See the full description on the dataset page: https://huggingface.co/datasets/acronym_identification.', 'paperswithcode_id': 'acronym-identification', 'downloads': 632, 'likes': 18, 'cardData': {'anno




In [9]:
with open("datasets_info.json", "w", encoding="utf8") as f:
    json.dump(datasets_info_clean, f, ensure_ascii=False, indent=2)

<h4>3) Extraction des dataset_cards</h4>

In [2]:
"""
Fonction pour télécharger le fichier README.md(datasetCard) d'un dataset
    repo_id : nom du dataset
"""
def download_dataset_card(repo_id):
    try:
        data_card_path = hf_hub_download(repo_id=repo_id, filename="README.md", repo_type="dataset")
        with open(data_card_path, "r", encoding="utf8") as f:
            dataset_card = f.read()
        return dataset_card
    except Exception as e:
        print(f"Erreur téléchargement datasetCard {repo_id} :", str(e))
        return None

In [3]:
"""
Fonction pour récupérer les dataset cards
    datasets_info : liste contenant les métadonnées des datasets
"""
def retrieve_datasets_card(datasets_info):
    nb_datasets_card = 0

    # Liste pour stocker les futurs résultats
    futures = []
    # Liste pour stocker les datasetCards
    dataset_card_list = []

    # Utilisation de ThreadPoolExecutor pour télécharger en parallèle
    with ThreadPoolExecutor(max_workers=10) as executor:
        for dataset_info in datasets_info:
            if dataset_info and "cardData" in dataset_info:
                # Soumettre une tâche de téléchargement pour chaque dataset
                future = executor.submit(download_dataset_card, dataset_info["id"])
                futures.append((dataset_info["id"], future))

        # Attendre que toutes les tâches soient terminées
        for dataset_id, future in tqdm(futures, total=len(futures), desc="Progress"):
            dataset_card = future.result()
            if dataset_card:
                # Ajouter le dataset_card aux métadonnées du dataset
                dataset_card_list.append({"id":dataset_id, "dataset_card":dataset_card})
                nb_datasets_card += 1
    
    print(f"Nombre de dataset_card récupérés : {nb_datasets_card} / {len(datasets_info)}")
    
    return dataset_card_list

In [None]:
with open("datasets_info.json", "r", encoding="utf8") as f:
    datasets_info = json.load(f)

# Désactivation des warnings
warnings.filterwarnings("ignore")
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Récupération des dataset cards
datasets_card = retrieve_datasets_card(datasets_info)

# Création d'un fichier json avec les métadonnées des datasets + les dataset_card
with open("datasets_card.json", "w", encoding="utf8") as f:
    json.dump(datasets_card, f, ensure_ascii=False, indent=2)

Progress:   4%|▍         | 3937/90543 [01:06<23:23, 61.72it/s]  

README.md:   0%|          | 0.00/4.05k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.06k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.90k [00:00<?, ?B/s]

Progress:   6%|▌         | 5036/90543 [01:23<20:48, 68.48it/s]

README.md:   0%|          | 0.00/404 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/397 [00:00<?, ?B/s]

Progress:   6%|▌         | 5230/90543 [01:27<23:55, 59.42it/s]

README.md:   0%|          | 0.00/202 [00:00<?, ?B/s]

Progress:   6%|▋         | 5683/90543 [01:34<21:12, 66.69it/s]

README.md:   0%|          | 0.00/422 [00:00<?, ?B/s]

Progress:   6%|▋         | 5883/90543 [01:37<22:54, 61.60it/s]

<h3>Traitement des datasetCards</h3>

<h4>1) Extraction de la description des datasetCards</h4>


In [3]:
def get_description_from_card(model_card):
    description_keywords =\
    ["Description", "description", "Summary", "summary", "Detail", "detail", "Dataset", "dataset"]
    
    description = ""
    lines = model_card.split("\n")
    
    for i in range(len(lines)):
        if lines[i].startswith("#"):
            for description_keyword in description_keywords:
                if description_keyword in lines[i]:
                    # On récupère la description du dataset
                    i+=1
                    while i < len(lines) and not lines[i].startswith("#"):
                        description = description + lines[i]
                        i+=1
                    return description
    return None 

In [4]:
# Parcourir le fichier json des datasets pour récupérer les dataset cards
with open("datasets_card.json", "r", encoding="utf8") as f:
    datasets = json.load(f)

descriptions = []
nb_datasets = len(datasets)
nb_descriptions_found = 0

# Récupération des descriptions des dataset cards
for dataset in tqdm(datasets):
    #pq datataset peut être None
    if dataset is None:
        continue
    if "dataset_card" in dataset:
        dataset_card = dataset["dataset_card"]
        description = get_description_from_card(dataset_card)
        if description:
            descriptions.append({"description":description, "id":dataset["id"]})
            nb_descriptions_found += 1

print(nb_descriptions_found)
print(nb_datasets)
print(f"Pourcentage de dataset cards avec description: {(nb_descriptions_found/nb_datasets)*100:.2f}%")
print(descriptions[:20])

100%|██████████| 90540/90540 [00:01<00:00, 55167.19it/s] 

40298
90540
Pourcentage de dataset cards avec description: 44.51%
[{'description': '<div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p><b>Deprecated:</b> Dataset "common_voice" is deprecated and will soon be deleted. Use datasets under <a href="https://huggingface.co/mozilla-foundation">mozilla-foundation</a> organisation instead. For example, you can load <a href="https://huggingface.co/datasets/mozilla-foundation/common_voice_13_0">Common Voice 13</a> dataset via <code>load_dataset("mozilla-foundation/common_voice_13_0", "en")</code></p></div>', 'id': 'common_voice'}, {'description': ' ', 'id': 'freebase_qa'}, {'description': '**BAAD16** is an **Authorship Attribution dataset for Bengali Literature**. It was collected and analyzed by the authors of [this paper](https://arxiv.org/




<h3>Récupération d'information académiques</h3>

<h4>1) Récupération des papiers arxiv</h4>

In [2]:
import urllib.request
import xmltodict
"""
Fonction pour télécharger le Papier de recherche correspondant à un arxiv cité dans les méta-données du dataset
    arxiv : numéro de l'arxiv
"""
def download_arxiv_paper(arxiv):
    try:
        url = f"http://export.arxiv.org/api/query?max_results=1&search_query=all:{arxiv}"
        data = urllib.request.urlopen(url)
        # Convertir les données XML en dictionnaire Python
        xml_data = data.read().decode('utf-8')
        dict_data = xmltodict.parse(xml_data)
        return dict_data
    
    except Exception as e:
        print(f"Erreur téléchargement du papier arxiv:{arxiv} :", str(e))
        return None

In [3]:
"""
Fonction pour récupérer les papiers de recherche arxiv
    datasets_info : liste contenant les métadonnées des datasets (notamment les numéros arxiv)
"""
def retrieve_arxiv_paper(datasets_info):
    nb_arxiv_paper = 0
    nb_arxiv_number = 0

    # Liste pour stocker les futurs résultats
    futures = []
    # Liste pour stocker les datasetCards
    arxiv_paper_list = []

    # Utilisation de ThreadPoolExecutor pour télécharger en parallèle
    with ThreadPoolExecutor(max_workers=10) as executor:
        for dataset_info in datasets_info:
            if "arxiv" in dataset_info:
                for arxiv in dataset_info["arxiv"]:
                    nb_arxiv_number += 1
                    # Soumettre une tâche de téléchargement pour chaque arxiv
                    future = executor.submit(download_arxiv_paper, arxiv)
                    futures.append((dataset_info["id"], arxiv, future))

        # Attendre que toutes les tâches soient terminées
        for dataset_id, arxiv, future in tqdm(futures, total=len(futures), desc="Progress"):
            paper_arxiv = future.result()
            if paper_arxiv:
                arxiv_paper_list.append({"id":dataset_id, "arxiv":arxiv, "paper_arxiv":paper_arxiv})
                nb_arxiv_paper += 1
    
    print(f"nb de paper_arxiv récupérés / nb de numéro arxiv récupérés : {nb_arxiv_paper} / {nb_arxiv_number}")
    
    return arxiv_paper_list

In [4]:
# Lecture du fichier json
with open("datasets_info.json", "r", encoding="utf8") as f:
    datasets_info = json.load(f)

In [5]:
# Compte le nombre de datasets ayant des numéros arxiv
nb_datasets_with_arxiv = 0
for dataset in datasets_info:
    if "arxiv" in dataset:
        nb_datasets_with_arxiv += 1
print(f"Nombre de datasets ayant des numéros arxiv : {nb_datasets_with_arxiv}/{len(datasets_info)}")

Nombre de datasets ayant des numéros arxiv : 3581/133172


In [4]:
# Récupération des papiers de recherche arxiv
arxiv_paper_list = retrieve_arxiv_paper(datasets_info)

# Création d'un fichier json contenant les papiers de recherche arxiv
with open("arxiv_paper.json", "w", encoding="utf8") as f:
    json.dump(arxiv_paper_list, f, ensure_ascii=False, indent=2)

NameError: name 'retrieve_arxiv_paper' is not defined

<h4>2) Nettoyage des papiers arxiv</h4>

In [5]:
with open("arxiv_paper.json", "r", encoding="utf8") as f:
    arxiv_paper_list = json.load(f)

In [9]:
new_arxiv_paper_list = []
for arxiv_paper in arxiv_paper_list:
    if arxiv_paper and arxiv_paper["paper_arxiv"] and "feed" in arxiv_paper["paper_arxiv"] and "entry" in arxiv_paper["paper_arxiv"]["feed"]:
        new_arxiv_paper_list.append({"id": arxiv_paper["id"], "arxiv": arxiv_paper["arxiv"], "paper_arxiv": arxiv_paper["paper_arxiv"]["feed"]["entry"]})

print("Nombre de papiers arxiv nettoyés :", len(new_arxiv_paper_list))
with open("arxiv_paper.json", "w", encoding="utf8") as f:
    json.dump(new_arxiv_paper_list, f, ensure_ascii=False, indent=2)

Nombre de papiers arxiv nettoyés : 2020


<h4>3) Récupération des informations sur les citations (Serpapi)</h4>

In [4]:
#refered by:
# import serpapi
from serpapi import GoogleSearch

params = {
    "engine": "google_scholar",
    "num": "1",
    "q": "arXiv:1909.11942",
    "hl": "en",
    "api_key": "5c031d347fae722e2e6576c726ff739ccf6d55165001bb936c95cfa7e15a5994"
}

search = GoogleSearch(params)
results = search.get_dict()
if results:
    if "inline_links" in results and "cited_by" in results["inline_links"]:
        print(results["inline_links"]["cited_by"])
print(results)

{'search_metadata': {'id': '662b9ad55fc49355de6308ca', 'status': 'Success', 'json_endpoint': 'https://serpapi.com/searches/dba6a72319338321/662b9ad55fc49355de6308ca.json', 'created_at': '2024-04-26 12:15:17 UTC', 'processed_at': '2024-04-26 12:15:17 UTC', 'google_scholar_url': 'https://scholar.google.com/scholar?q=arXiv%3A1909.11942&hl=en&num=1', 'raw_html_file': 'https://serpapi.com/searches/dba6a72319338321/662b9ad55fc49355de6308ca.html', 'total_time_taken': 0.98}, 'search_parameters': {'engine': 'google_scholar', 'q': 'arXiv:1909.11942', 'hl': 'en', 'num': '1'}, 'search_information': {'organic_results_state': 'Results for exact spelling', 'query_displayed': 'arXiv:1909.11942'}, 'profiles': {'link': 'https://scholar.google.com/scholar?lookup=0&q=arXiv:1909.11942&hl=en&num=1&as_sdt=0,11', 'serpapi_link': 'https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=arXiv%3A1909.11942'}, 'organic_results': [{'position': 0, 'title': 'Albert: A lite bert for self-supervi

<h4>4) Récupération des informations sur les citations (Scholarly)</h4>

In [4]:
from scholarly import scholarly

# Définir le terme de recherche
search_query = 'arXiv:1909.11942'
# Effectuer la recherche
search_results = scholarly.search_pubs(search_query)

# Afficher les informations sur les citations pour chaque résultat de la recherche
for i, result in enumerate(search_results):
    print(result)
    
    """
    print(f"Résultat {i+1}:")
    print("Année de publication:", result["bib"]['pub_year'])
    print("url citation:", "https://scholar.google.com"+result["citedby_url"])
    
    if "num_citations" in result:
        print("Nombre de citations:", result["num_citations"])
    else:
        print("Aucune information sur les citations disponible.")
    
    results = scholarly.search_citedby("6606720413006378435")
    for i, res in enumerate(results):
        print(f"Résultat {i+1}:")
        print("Titre:", res["bib"]["title"])
        print("Année de publication:", res["bib"]['pub_year'])
        
        if "num_citations" in res:
            print("Nombre de citations:", res["num_citations"])
        else:
            print("Aucune information sur les citations disponible.")
    """

{'container_type': 'Publication', 'source': <PublicationSource.PUBLICATION_SEARCH_SNIPPET: 'PUBLICATION_SEARCH_SNIPPET'>, 'bib': {'title': 'Albert: A lite bert for self-supervised learning of language representations', 'author': ['Z Lan', 'M Chen', 'S Goodman', 'K Gimpel'], 'pub_year': '2019', 'venue': 'arXiv preprint arXiv …', 'abstract': 'Increasing model size when pretraining natural language representations often results in improved performance on downstream tasks. However, at some point further model increases become harder due to GPU/TPU memory limitations and longer training times. To address these problems, we present two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT. Comprehensive empirical evidence shows that our proposed methods lead to models that scale much better compared to the'}, 'filled': False, 'gsrank': 1, 'pub_url': 'https://arxiv.org/abs/1909.11942', 'author_id': ['tlDABkgAAAAJ', 'aRncxakAAAAJ', 'xgZ6V-sAAAAJ', '

<h4>5) Récupération des papiers de recherche citant le dataset (Serpapi)</h4>

In [5]:
import re

url = "https://scholar.google.com/scholar?cites=6606720413006378435&as_sdt=2005&sciodt=0,5&hl=en"
cites_id_match = re.search(r'cites=(\d+)', url)
results = scholarly.search_citedby(cites_id_match.group(1))

for i, res in enumerate(results):
    print(f"Résultat {i+1}:")
    print("Titre:", res["bib"]["title"])
    print("Année de publication:", res["bib"]['pub_year'])
    
    if "num_citations" in res:
        print("Nombre de citations:", res["num_citations"])
    else:
        print("Aucune information sur les citations disponible.")

6606720413006378435
Résultat 1:
Titre: Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing
Année de publication: 2023
Nombre de citations: 3114
Résultat 2:
Titre: Deep learning--based text classification: a comprehensive review
Année de publication: 2021
Nombre de citations: 1475
Résultat 3:
Titre: Lamda: Language models for dialog applications
Année de publication: 2022
Nombre de citations: 1096
Résultat 4:
Titre: Flashattention: Fast and memory-efficient exact attention with io-awareness
Année de publication: 2022
Nombre de citations: 717
Résultat 5:
Titre: Vivit: A video vision transformer
Année de publication: 2021
Nombre de citations: 1817
Résultat 6:
Titre: On the dangers of stochastic parrots: Can language models be too big?🦜
Année de publication: 2021
Nombre de citations: 3557
Résultat 7:
Titre: Beyond the imitation game: Quantifying and extrapolating the capabilities of language models
Année de publication: 2022
Nombre de cit

KeyboardInterrupt: 