In [21]:
import re
import time
import random
import pandas as pd

from bs4 import BeautifulSoup
from selenium import webdriver
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [75]:
LINKS = [f"https://www.kaggle.com/datasets?page={i}" for i in range(2, 100)]
HREFS = []

In [76]:
def parseHrefs(url):
    driver = webdriver.Chrome()
    driver.get(url)

    time.sleep(random.uniform(5, 10))
    
    html = driver.page_source
    driver.quit()
    
    soup = BeautifulSoup(html, 'html.parser')
    
    links = []
    for a in soup.find_all('a', href=True):
        if '/datasets/' in a['href']:
            links.append('https://www.kaggle.com' + a['href'])
    
    return links

In [77]:
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(parseHrefs, LINKS))

for result in results:
    HREFS.extend(result)

In [84]:
def fetch_page(url):
    try:
        driver = webdriver.Chrome()
        driver.get(url)
        time.sleep(random.uniform(5, 10))
        html = driver.page_source
        driver.quit()
        soup = BeautifulSoup(html, 'html.parser')
        
        heading = soup.find('h1')
        medal_part = soup.find('img', class_='sc-ljzhBY eCrtuc')
        if medal_part is None:
            medal = 'None'
        else:
            medal_part = str(medal_part)
            if 'gold' in medal_part:
                medal = 'Gold'
            elif 'silver' in medal_part:
                medal = 'Silver'
            elif 'bronze' in medal_part:
                medal = 'Bronze'
        if heading is None:
                return
        heading = heading.get_text()
        usability = soup.find('span', attrs={'data-testid': 'usability-value'}).get_text() 
        if usability is None:
            return
                
        target = soup.find_all(string=lambda text: text and 'columns' in text)
        if target is not None:
            if len(target) == 2:
                columns = extract_number(target[-1])
            else:
                return       
                
        data = []
        buttons = soup.find_all(attrs={'role': 'checkbox'})
        if len(buttons) > 0:
            for button in buttons:
                data.append(button.find('span').get_text())
                
            used_for_learning = data[0].split()[-1]
            used_for_research = data[1].split()[-1]
            used_for_application = data[2].split()[-1]
            used_for_llm_fine_tuning = data[3].split()[-1]
            well_documented = data[4].split()[-1]
            well_maintained = data[5].split()[-1]
            clean_data = data[6].split()[-1]
            original = data[7].split()[-1]
            high_quality_notebooks = data[8].split()[-1]
        else:
            return
            
        data = soup.find_all('div', class_='sc-cvFxSY fujiuq')
        data = [x.find_all('span') for x in data]
        if len(data) > 0:
            views = extract_number(data[0][1].get_text())
            downloads = extract_number(data[1][1].get_text())
            engagements = data[2][1].get_text()
            comments = data[3][1].get_text()
        else:
            return
        
        weight = soup.find('div', class_='sc-fYLTzp dnCYSt')
        if weight is None:
            return
        weight = convert_to_kilobytes(weight.get_text())
            
        return heading, usability, columns, used_for_learning, used_for_research, used_for_application, used_for_llm_fine_tuning, well_documented, well_maintained, clean_data, original, high_quality_notebooks, views, downloads, engagements, comments, weight, medal
    except Exception as e:
        print(f"Parsing error {url}: {e}")
        return
    


In [85]:
def extract_number(s: str):
    match = re.search(r'(\d+(\.\d+)?)([kKmM]?)', s)

    if match:
        number = float(match.group(1))
        suffix = match.group(3)

        if suffix.lower() == 'k':
            number *= 1000
        elif suffix.lower() == 'm':
            number *= 1000000 

        return int(number)
    
    return None

In [86]:
def convert_to_kilobytes(string):
    pattern = r'(\d+\.\d+|\d+)\s?(B|kB|MB|GB|TB)'
    match = re.search(pattern, string)
    
    if match:
        size = float(match.group(1))
        unit = match.group(2)

        if unit == 'B':
            return size / 1024
        elif unit == 'kB':
            return size
        elif unit == 'MB':
            return size * 1024
        elif unit == 'GB':
            return size * 1024 * 1024
        elif unit == 'TB':
            return size * 1024 * 1024 * 1024
    else:
        raise ValueError("Invalid input format. Please provide a valid size with unit.")
 

In [87]:
def results(href):
    return fetch_page(href)

In [32]:
HEADINGS = []
USABILITIES = []
COLUMNS = []
USED_FOR_LEARNING = []
USED_FOR_RESEARCH = []
USED_FOR_APPLICATION = []
USED_FOR_LLM_FINETUNING = []
WELL_DOCUMENTED = []
WELL_MAINTAINED = []
CLEAN_DATA = []
ORIGINAL_DATA = []
HIGH_QUALITY_NOTEBOOKS = []
VIEWS = []
DOWNLOADS = []
ENGAGEMENTS = []
COMMENTS = []
WEIGHTS = []
MEDALS = []

In [88]:
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(tqdm(executor.map(results, HREFS), total=len(HREFS), desc='Parsing'))
    
    for result in results:
        if result is not None: 
            HEADINGS.append(result[0])
            USABILITIES.append(result[1])
            COLUMNS.append(result[2])
            USED_FOR_LEARNING.append(result[3])
            USED_FOR_RESEARCH.append(result[4])
            USED_FOR_APPLICATION.append(result[5])
            USED_FOR_LLM_FINETUNING.append(result[6])
            WELL_DOCUMENTED.append(result[7])
            WELL_MAINTAINED.append(result[8])
            CLEAN_DATA.append(result[9])
            ORIGINAL_DATA.append(result[10])
            HIGH_QUALITY_NOTEBOOKS.append(result[11])
            VIEWS.append(result[12])
            DOWNLOADS.append(result[13])
            ENGAGEMENTS.append(result[14])
            COMMENTS.append(result[15])
            WEIGHTS.append(result[16])
            MEDALS.append(result[17])

Parsing: 100%|██████████| 1028/1028 [1:09:05<00:00,  4.03s/it]


In [97]:
data = {
    'headings': HEADINGS,
    'usabilities': USABILITIES,
    'used_for_learning': USED_FOR_LEARNING,
    'used_for_research': USED_FOR_RESEARCH,
    'used_for_application': USED_FOR_APPLICATION,
    'used_for_llm_fine_tuning': USED_FOR_LLM_FINETUNING,
    'well_documented': WELL_DOCUMENTED,
    'well_maintained': WELL_MAINTAINED,
    'clean_data': CLEAN_DATA,
    'original': ORIGINAL_DATA,
    'high_quality_notebooks': HIGH_QUALITY_NOTEBOOKS,
    'views': VIEWS,
    'downloads': DOWNLOADS,
    'engagements': ENGAGEMENTS,
    'comments': COMMENTS,
    'weight': WEIGHTS,
    'medal': MEDALS
}

In [98]:
df = pd.DataFrame(data)
df.to_csv('data.csv', index=False, encoding='utf-8')
df = pd.read_csv('data.csv')

In [101]:
df

Unnamed: 0,headings,usabilities,used_for_learning,used_for_research,used_for_application,used_for_llm_fine_tuning,well_documented,well_maintained,clean_data,original,high_quality_notebooks,views,downloads,engagements,comments,weight,medal
0,Top 50 Luxury Hotels Worldwide 2024,10.00,0,0,0,0,0,0,0,0,0,5036,1213,0.24087,0,14.69,
1,World's Best Restaurants,10.00,5,0,0,0,1,0,1,0,0,7018,1740,0.24793,0,63.19,Bronze
2,Customer purchase behavior - Electronic Sales ...,10.00,21,0,0,0,6,0,0,1,0,16300,4430,0.27158,0,2488.32,Bronze
3,Melbourne Housing Snapshot,7.06,247,16,9,7,56,20,40,8,12,485000,160000,0.32983,33,2140.16,Gold
4,Canadian Cheese Directory,10.00,1,0,1,0,0,0,0,0,0,5255,938,0.17850,1,147.11,Bronze
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1294,Microsoft Stock Data,10.00,6,6,1,0,2,3,2,2,1,21400,3828,0.17858,0,627.62,Bronze
1295,Human vs. LLM Text Corpus,10.00,6,3,3,0,2,3,3,2,4,8057,1196,0.14844,6,4037017.60,Gold
1296,Anual Salary reports survey,8.24,3,1,0,0,0,0,0,0,0,12500,2611,0.20960,0,3491.84,Bronze
1297,VideoGames Sales,6.76,7,0,0,0,3,2,0,0,1,20300,3725,0.18312,0,470.62,
