# Table Downloader
__authors__: ufukbombar@gmail.com

__date__: 2024-11-18

__goal__: This notebook downloads the current list of table names from ClickHouse and saves it to the data folder (`../data/tables/`).

In [13]:
# Required packages
%pip install pandas requests python-dotenv httpx aiofiles dataclasses tqdm pyarrow

# External imports
import json
import os
from dataclasses import dataclass, fields
from datetime import datetime

import pandas as pd
from tqdm.asyncio import tqdm

# Internal imports
from config import config
import common
import file as ff

Note: you may need to restart the kernel to use updated packages.


In [14]:
# Set the response format for ClickHouse.
# The download code in this notebook parses the reply as JSON and reads each
# row of its "data" list positionally (record[0]).
config.response_format="JSONCompact"

@dataclass
class TableNames:
    """A single table-listing row: the table's name plus a timestamp.

    Instances are collected into a list and handed to ``pd.DataFrame``, so the
    field names below become the column names of the resulting frame.
    """

    # Name of the table as returned by the listing query.
    table_name: str
    # Timestamp stored alongside the name. In this notebook it is filled with
    # the time the listing was downloaded, not the table's own creation time.
    creation_date: datetime

async def download_all_table_names() -> pd.DataFrame:
    """Download the list of all table names and return it as a DataFrame.

    Runs ``show tables`` through ``common.run_query``, spools the streamed
    response chunks into a temporary file, parses that file as JSON and takes
    the first element of every row in its ``"data"`` list as the table name.

    Returns:
        A DataFrame with one row per table and the columns declared by
        ``TableNames`` (``table_name``, ``creation_date``). ``creation_date``
        holds the local wall-clock time at which the download finished, so it
        is identical for every row. The columns are present even when the
        query returns no tables.

    Raises:
        json.JSONDecodeError: If the downloaded payload is not valid JSON.
        KeyError: If the payload has no ``"data"`` entry.
    """
    query_string = "show tables"

    with ff.TemporaryFile() as temp:
        # A single, byte-scaled progress bar. The previous version nested two
        # bars that wrote to the same output and garbled each other, and it
        # advanced by `len(chunk) // config.chunk_size`, which counted every
        # partial chunk as zero.
        with tqdm(
            desc="Running the query result",
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
        ) as pb:
            # NOTE(review): assumes common.run_query returns an async iterable
            # of bytes-like chunks -- confirm against common.run_query.
            async for chunk in common.run_query(config, query_string):
                temp.write(chunk)
                pb.update(len(chunk))

        # Taken once, after the download, and shared by all rows.
        current_time = datetime.now()

        # Close the writer before the file is re-opened by name below.
        # NOTE(review): presumably this flushes pending writes while keeping
        # the file on disk until the `with` block exits -- confirm against
        # ff.TemporaryFile.
        temp.close()

        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's default locale encoding.
        with open(temp.filename, "r", encoding="utf-8") as tmpr:
            json_object = json.load(tmpr)

    records = [
        TableNames(table_name=record[0], creation_date=current_time)
        for record in json_object["data"]
    ]

    # Passing the column names explicitly keeps the schema stable when
    # `records` is empty (a bare `pd.DataFrame([])` has no columns at all).
    return pd.DataFrame(records, columns=[f.name for f in fields(TableNames)])

# Top-level `await` relies on IPython's support for awaiting directly in a cell.
df = await download_all_table_names()
# Bare last expression: renders the (truncated) frame as the cell output.
df

9it [00:01,  5.29it/s]lt: 8MB [00:01,  6.07MB/s]
Running the query result: 8MB [00:01,  4.69MB/s]


Unnamed: 0,table_name,creation_date
0,cleaned_links__007046a9_518e_46cb_8e70_e598b8b...,2024-11-18 20:10:38.865040
1,cleaned_links__007046a9_518e_46cb_8e70_e598b8b...,2024-11-18 20:10:38.865040
2,cleaned_links__007046a9_518e_46cb_8e70_e598b8b...,2024-11-18 20:10:38.865040
3,cleaned_links__007046a9_518e_46cb_8e70_e598b8b...,2024-11-18 20:10:38.865040
4,cleaned_links__007046a9_518e_46cb_8e70_e598b8b...,2024-11-18 20:10:38.865040
...,...,...
100550,results__ffd2d86c_05c1_4607_a5b1_4ab04b77ee41_...,2024-11-18 20:10:38.865040
100551,results__ffd9d901_2127_4048_8f9a_805e3fa4930c_...,2024-11-18 20:10:38.865040
100552,results__ffebc346_5c58_4d73_8866_1746dedadc50_...,2024-11-18 20:10:38.865040
100553,results__ffec2848_65c7_44d1_8f15_8bd08e2d41f0_...,2024-11-18 20:10:38.865040


In [15]:
# Persist the downloaded table list under ../data/tables/ with a "tables-"
# file-name prefix. Presumably written in Feather format (judging by the
# helper's name); the exact file naming is decided by
# ff.save_dataframe_feather, which is defined outside this notebook.
ff.save_dataframe_feather(df, "../data/tables/", prefix="tables-")
print("Done")

Done
