In [1]:
import re
import time
import random
import pandas as pd

from bs4 import BeautifulSoup
from selenium import webdriver
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [2]:
LINKS = [f"https://www.kaggle.com/datasets?page={i}" for i in range(200, 400)]
HREFS = []

In [3]:
def parseHrefs(url):
    driver = webdriver.Chrome()
    driver.get(url)

    time.sleep(random.uniform(5, 10))
    
    html = driver.page_source
    driver.quit()
    
    soup = BeautifulSoup(html, 'html.parser')
    
    links = []
    for a in soup.find_all('a', href=True):
        if '/datasets/' in a['href']:
            links.append('https://www.kaggle.com' + a['href'])
    
    return links

In [4]:
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(parseHrefs, LINKS))

for result in results:
    HREFS.extend(result)

In [5]:
len(HREFS)

4196

In [6]:
def fetch_page(url):
    try:
        driver = webdriver.Chrome()
        driver.get(url)
        time.sleep(random.uniform(5, 10))
        html = driver.page_source
        driver.quit()
        soup = BeautifulSoup(html, 'html.parser')
        
        heading = soup.find('h1')
        medal_part = soup.find('img', class_='sc-ljzhBY eCrtuc')
        if medal_part is None:
            medal = 'None'
        else:
            medal_part = str(medal_part)
            if 'gold' in medal_part:
                medal = 'Gold'
            elif 'silver' in medal_part:
                medal = 'Silver'
            elif 'bronze' in medal_part:
                medal = 'Bronze'
        if heading is None:
                return
        heading = heading.get_text()
        usability = soup.find('span', attrs={'data-testid': 'usability-value'}).get_text() 
        if usability is None:
            return
                
        target = soup.find_all(string=lambda text: text and 'columns' in text)
        if target is not None:
            if len(target) == 2:
                columns = extract_number(target[-1])
            else:
                return       
                
        data = []
        buttons = soup.find_all(attrs={'role': 'checkbox'})
        if len(buttons) > 0:
            for button in buttons:
                data.append(button.find('span').get_text())
                
            used_for_learning = data[0].split()[-1]
            used_for_research = data[1].split()[-1]
            used_for_application = data[2].split()[-1]
            used_for_llm_fine_tuning = data[3].split()[-1]
            well_documented = data[4].split()[-1]
            well_maintained = data[5].split()[-1]
            clean_data = data[6].split()[-1]
            original = data[7].split()[-1]
            high_quality_notebooks = data[8].split()[-1]
        else:
            return
            
        data = soup.find_all('div', class_='sc-cvFxSY fujiuq')
        data = [x.find_all('span') for x in data]
        if len(data) > 0:
            views = extract_number(data[0][1].get_text())
            downloads = extract_number(data[1][1].get_text())
            engagements = data[2][1].get_text()
            comments = data[3][1].get_text()
        else:
            return
        
        weight = soup.find('div', class_='sc-fYLTzp dnCYSt')
        if weight is None:
            return
        weight = convert_to_kilobytes(weight.get_text())
            
        return heading, usability, columns, used_for_learning, used_for_research, used_for_application, used_for_llm_fine_tuning, well_documented, well_maintained, clean_data, original, high_quality_notebooks, views, downloads, engagements, comments, weight, medal
    except Exception as e:
        print(f"Parsing error {url}: {e}")
        return
    


In [7]:
def extract_number(s: str):
    match = re.search(r'(\d+(\.\d+)?)([kKmM]?)', s)

    if match:
        number = float(match.group(1))
        suffix = match.group(3)

        if suffix.lower() == 'k':
            number *= 1000
        elif suffix.lower() == 'm':
            number *= 1000000 

        return int(number)
    
    return None

In [8]:
def convert_to_kilobytes(string):
    pattern = r'(\d+\.\d+|\d+)\s?(B|kB|MB|GB|TB)'
    match = re.search(pattern, string)
    
    if match:
        size = float(match.group(1))
        unit = match.group(2)

        if unit == 'B':
            return size / 1024
        elif unit == 'kB':
            return size
        elif unit == 'MB':
            return size * 1024
        elif unit == 'GB':
            return size * 1024 * 1024
        elif unit == 'TB':
            return size * 1024 * 1024 * 1024
    else:
        raise ValueError("Invalid input format. Please provide a valid size with unit.")
 

In [9]:
def results(href):
    return fetch_page(href)

In [10]:
HEADINGS = []
USABILITIES = []
COLUMNS = []
USED_FOR_LEARNING = []
USED_FOR_RESEARCH = []
USED_FOR_APPLICATION = []
USED_FOR_LLM_FINETUNING = []
WELL_DOCUMENTED = []
WELL_MAINTAINED = []
CLEAN_DATA = []
ORIGINAL_DATA = []
HIGH_QUALITY_NOTEBOOKS = []
VIEWS = []
DOWNLOADS = []
ENGAGEMENTS = []
COMMENTS = []
WEIGHTS = []
MEDALS = []

In [11]:
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(tqdm(executor.map(results, HREFS), total=len(HREFS), desc='Parsing'))
    
    for result in results:
        if result is not None: 
            HEADINGS.append(result[0])
            USABILITIES.append(result[1])
            COLUMNS.append(result[2])
            USED_FOR_LEARNING.append(result[3])
            USED_FOR_RESEARCH.append(result[4])
            USED_FOR_APPLICATION.append(result[5])
            USED_FOR_LLM_FINETUNING.append(result[6])
            WELL_DOCUMENTED.append(result[7])
            WELL_MAINTAINED.append(result[8])
            CLEAN_DATA.append(result[9])
            ORIGINAL_DATA.append(result[10])
            HIGH_QUALITY_NOTEBOOKS.append(result[11])
            VIEWS.append(result[12])
            DOWNLOADS.append(result[13])
            ENGAGEMENTS.append(result[14])
            COMMENTS.append(result[15])
            WEIGHTS.append(result[16])
            MEDALS.append(result[17])

Parsing:   2%|▏         | 100/4196 [05:34<2:38:40,  2.32s/it]

Parsing error https://www.kaggle.com/datasets/alexandrelemercier/internet-traffic-relevagan-playground: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=130.0.6723.70)
Stacktrace:
	GetHandleVerifier [0x00007FF6B57B3AB5+28005]
	(No symbol) [0x00007FF6B57183B0]
	(No symbol) [0x00007FF6B55B580A]
	(No symbol) [0x00007FF6B55B2F5F]
	(No symbol) [0x00007FF6B55A3E49]
	(No symbol) [0x00007FF6B55A5BE6]
	(No symbol) [0x00007FF6B55A410F]
	(No symbol) [0x00007FF6B55A39BD]
	(No symbol) [0x00007FF6B55A38DA]
	(No symbol) [0x00007FF6B55A151E]
	(No symbol) [0x00007FF6B55A1DEC]
	(No symbol) [0x00007FF6B55B88AA]
	(No symbol) [0x00007FF6B564C6DE]
	(No symbol) [0x00007FF6B562BA3A]
	(No symbol) [0x00007FF6B564B8B3]
	(No symbol) [0x00007FF6B562B7E3]
	(No symbol) [0x00007FF6B55F75C8]
	(No symbol) [0x00007FF6B55F8731]
	GetHandleVerifier [0x00007FF6B5AA643D+3118829]
	GetHandleVerifier [0x00007FF6B5AF6C90+3448640]
	GetHandleVerifier [0x00007FF6B5AECF0D+3408317]
	GetHandleVerifier [0x

Parsing:   2%|▏         | 101/4196 [06:19<14:25:41, 12.68s/it]

Parsing error https://www.kaggle.com/datasets/undersc0re/flight-delay-and-causes: 'NoneType' object has no attribute 'get_text'


Parsing:  90%|████████▉ | 3775/4196 [3:54:06<26:56,  3.84s/it]   

Parsing error https://www.kaggle.com/datasets/carlosaguayo/usa-hospitals: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=130.0.6723.70)
Stacktrace:
	GetHandleVerifier [0x00007FF6B57B3AB5+28005]
	(No symbol) [0x00007FF6B57183B0]
	(No symbol) [0x00007FF6B55B580A]
	(No symbol) [0x00007FF6B55B2F5F]
	(No symbol) [0x00007FF6B55A3E49]
	(No symbol) [0x00007FF6B55A5BE6]
	(No symbol) [0x00007FF6B55A410F]
	(No symbol) [0x00007FF6B55A39BD]
	(No symbol) [0x00007FF6B55A38DA]
	(No symbol) [0x00007FF6B55A151E]
	(No symbol) [0x00007FF6B55A1DEC]
	(No symbol) [0x00007FF6B55B88AA]
	(No symbol) [0x00007FF6B564C6DE]
	(No symbol) [0x00007FF6B562BA3A]
	(No symbol) [0x00007FF6B564B8B3]
	(No symbol) [0x00007FF6B562B7E3]
	(No symbol) [0x00007FF6B55F75C8]
	(No symbol) [0x00007FF6B55F8731]
	GetHandleVerifier [0x00007FF6B5AA643D+3118829]
	GetHandleVerifier [0x00007FF6B5AF6C90+3448640]
	GetHandleVerifier [0x00007FF6B5AECF0D+3408317]
	GetHandleVerifier [0x00007FF6B587A40B+841403]
	(No 

Parsing: 100%|██████████| 4196/4196 [4:18:22<00:00,  3.69s/it]


In [12]:
data = {
    'headings': HEADINGS,
    'usabilities': USABILITIES,
    'used_for_learning': USED_FOR_LEARNING,
    'used_for_research': USED_FOR_RESEARCH,
    'used_for_application': USED_FOR_APPLICATION,
    'used_for_llm_fine_tuning': USED_FOR_LLM_FINETUNING,
    'well_documented': WELL_DOCUMENTED,
    'well_maintained': WELL_MAINTAINED,
    'clean_data': CLEAN_DATA,
    'original': ORIGINAL_DATA,
    'high_quality_notebooks': HIGH_QUALITY_NOTEBOOKS,
    'views': VIEWS,
    'downloads': DOWNLOADS,
    'engagements': ENGAGEMENTS,
    'comments': COMMENTS,
    'weight': WEIGHTS,
    'medal': MEDALS
}

In [13]:
df = pd.DataFrame(data)
df.to_csv('data2.csv', index=False, encoding='utf-8')
df = pd.read_csv('data2.csv')

In [14]:
df

Unnamed: 0,headings,usabilities,used_for_learning,used_for_research,used_for_application,used_for_llm_fine_tuning,well_documented,well_maintained,clean_data,original,high_quality_notebooks,views,downloads,engagements,comments,weight,medal
0,Tortilla prices in Mexico,10.00,4,3,0,0,0,0,2,0,0,9936,1922,0.19344,0,15697.92,
1,Spotify 1.2M+ Songs,8.24,8,0,1,0,1,0,3,0,0,54100,7455,0.13777,2,354037.76,
2,Causes of Death in Indonesia,10.00,8,3,0,0,3,0,0,0,0,52000,10400,0.19994,0,410.80,
3,Indicators of Anxiety or Depression,10.00,0,1,0,0,0,0,0,0,0,6891,1058,0.15353,0,2652.16,
4,diamonds,10.00,0,0,0,0,0,0,0,0,0,2006,331,0.16500,0,3266.56,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1276,COVID-19 Lockdown dates by country,8.82,0,0,0,0,0,0,0,0,0,38900,4376,0.11248,3,43.49,
1277,Spam Email Dataset,10.00,0,0,0,0,0,0,0,0,0,1378,323,0.23440,0,485.70,
1278,CO2 Emissions (USA),6.47,0,0,0,0,0,0,0,0,0,1046,226,0.21606,0,4485.12,
1279,Punjab Stubble Burning Crop Fire Data,9.41,0,0,0,0,0,0,0,0,0,20,2,0.10000,0,1648.64,
