In [2]:
import requests
import json
from typing import Union

from fourdimensions.appapi.utils.auth import enc_data
from fourdimensions.appapi.const import DEFAULT_HEADER

class Detail:
    """Accessor for the ``/apiv2/item/detail`` endpoint on ``api-hl.bcy.net``."""

    @staticmethod
    def get(item_id: Union[str, int], sess: Union[requests.Session, None] = None) -> dict:
        """Fetch the detail record of a single item.

        Args:
            item_id: Identifier of the item, given as ``str`` or ``int``.
            sess: Session used to send the request. When omitted, a temporary
                session carrying ``DEFAULT_HEADER`` is created for this call
                and closed again before returning.

        Returns:
            The decoded JSON body of the response.

        Raises:
            AssertionError: If ``item_id`` is neither ``str`` nor ``int``.
            requests.HTTPError: If the response has an error status
                (raised by ``raise_for_status``).
        """
        assert isinstance(item_id, (str, int))
        url = "https://api-hl.bcy.net/apiv2/item/detail"

        # The parameters are serialised as compact JSON (no spaces after the
        # separators) and passed through the project helper ``enc_data``; the
        # result is posted as the single form field ``data``.
        # NOTE(review): presumably enc_data encrypts/encodes the payload the
        # way the app API expects -- confirm against its implementation.
        params = {"item_id": item_id}
        real_params = {"data": enc_data(json.dumps(params, separators=(",", ":")))}

        # The signature advertises ``sess=None`` as the default, so that case
        # must work instead of failing with AttributeError on ``None.post``.
        owns_session = sess is None
        if owns_session:
            sess = requests.Session()
            sess.headers.update(DEFAULT_HEADER)

        try:
            r = sess.post(url, data=real_params)
            r.raise_for_status()
            return r.json()
        finally:
            # Only close what was created here; a caller-supplied session
            # stays open for reuse.
            if owns_session:
                sess.close()

In [4]:
if __name__ == "__main__":
    # Demo run: fetch one item through a session that carries the default
    # headers, then store the decoded response for inspection.
    sess = requests.Session()
    sess.headers.update(DEFAULT_HEADER)

    detail = Detail.get(item_id=7215077620059216956, sess=sess)

    # ensure_ascii=False keeps non-ASCII text readable in the dumped file.
    with open("detail-demo.json", mode="w", encoding="utf-8") as f:
        json.dump(detail, f, indent=4, ensure_ascii=False)

In [18]:
import pandas as pd
from typing import Dict

def parse_json_to_df(json_dict: Dict) -> pd.DataFrame:
    """Flatten one metadata record into a single-row DataFrame.

    The record must contain the nested ``user`` and ``post`` dicts plus the
    top-level keys listed below; the resulting columns keep the same names
    and order as before.

    Args:
        json_dict: Decoded metadata record. Required keys:
            ``user`` (``id``, ``name``), ``post`` (``id``, ``tags``, ``date``,
            ``parody``, ``content``, ``likes``, ``shares``, ``replies``) and
            the top-level ``category``, ``subcategory``, ``num``, ``id``,
            ``width``, ``height``, ``filename``, ``extension``, ``filter``,
            ``orig_path``.

    Returns:
        A DataFrame with exactly one row. ``tags`` is the tag list joined
        with ``"###"``. The image columns (``type``, ``mid``, ``w``, ``h``,
        ``original_path``, ``visible_level``, ``format``) come from the first
        entry of ``post['image_list']`` and are ``None`` when that list is
        empty or absent.

    Raises:
        KeyError: If a required key is missing (including a key missing from
            an existing first image entry).
    """
    user = json_dict['user']
    post = json_dict['post']

    # A post without images used to abort with IndexError; keep the row and
    # leave the image columns empty instead.
    image_list = post.get('image_list') or []
    first_image = image_list[0] if image_list else None
    # NOTE(review): this always reads the FIRST image of the post, while the
    # top-level `num`/`id`/`width`/`height` look like they describe one
    # specific image -- confirm this is intended for multi-image posts.

    data = {
        'user_id': user['id'],
        'name': user['name'],
        # Not exported (was commented out in the original): user['utags'].
        'post_id': post['id'],
        'tags': "###".join(post['tags']),
        'date': post['date'],
        'parody': post['parody'],
        'content': post['content'],
        'likes': post['likes'],
        'shares': post['shares'],
        'replies': post['replies'],
    }

    for key in ('type', 'mid', 'w', 'h', 'original_path', 'visible_level', 'format'):
        data[key] = first_image[key] if first_image is not None else None

    # Not exported (was commented out in the original): the `collection`
    # fields title, collection_id, prev_post/next_post item_id and user uid.

    for key in ('category', 'subcategory', 'num', 'id', 'width', 'height',
                'filename', 'extension', 'filter', 'orig_path'):
        data[key] = json_dict[key]

    return pd.DataFrame([data])

# Usage:
# df = parse_json_to_df(json_dict)
# df.to_parquet('output.parquet')


In [10]:
import json
# Read one metadata JSON file as raw text first, so the undecoded content can
# be inspected before it is parsed.
with open(
    r"/home/studio-lab-user/dev/gdld/gallery-dl/1037581441053848###木甘牌甘木_/7215077620059216956###228660350.jpg.json",
    encoding="utf-8",
) as f:
    data = f.read()

# `data` is still the file's text at this point.
print(type(data))

# Decode the text into a dict; `json_dict` is reused by a later cell.
json_dict = json.loads(data)
print(json_dict)

<class 'str'>
{'user': {'id': 1037581441053848, 'name': '木甘牌甘木_', 'avatar': 'https://img-bcy-qn.pstatp.com/8a6c7b1cfcf041ec9a5fa2000d14b3c1', 'utags': ['绘师', '娃妈', '猫控']}, 'post': {'id': 7215077620059216956, 'tags': ['稿件展示', '孩厨', 'OC', '孩厨交流中心', '不是我画的', '约稿', '时之庭的回响', '自家OC', '绘画', '海伦娜HELENA'], 'date': '2023-03-27 04:26:06', 'parody': '', 'content': '旅途中，海伦娜会接受一些冒险协会的赏金委托——通常都是采集一些稀有罕见的材料，或者去寻找［传说］的痕迹。<br>  这一次，她正在寻找一种传说中的，能够让人回想起最美好记忆的奇迹之花……', 'likes': 113, 'shares': 0, 'replies': 8, 'image_list': [{'path': 'https://p3-bcy-sign.bcyimg.com/banciyuan/fd00f46128e74ca19cd1ec5d1308d9d1~tplv-banciyuan-w650.image?x-expires=1702771281&x-signature=up4kg1nUesFET19nNMsF0WzWzVo%3D', 'type': 'image', 'mid': 228660350, 'w': 1440, 'h': 900, 'original_path': 'https://p3-bcy-sign.bcyimg.com/banciyuan/fd00f46128e74ca19cd1ec5d1308d9d1~noop.image?x-expires=1702771280&x-signature=zElBj8O2O9UtyHMqQmOvwcMSjQg%3D', 'visible_level': '', 'ratio': 0, 'origin': 'https://p3-bcy-sign.bcyimg.com/banciyuan/fd00f

In [11]:
# Flatten the record held in `json_dict` into a one-row DataFrame; as the last
# expression of the cell, the frame is shown as the cell's output.
parse_json_to_df(json_dict)

Unnamed: 0,user_id,name,utags,post_id,tags,date,parody,content,likes,shares,...,category,subcategory,num,id,width,height,filename,extension,filter,orig_path
0,1037581441053848,木甘牌甘木_,绘师###娃妈###猫控,7215077620059216956,稿件展示###孩厨###OC###孩厨交流中心###不是我画的###约稿###时之庭的回响#...,2023-03-27 04:26:06,,旅途中，海伦娜会接受一些冒险协会的赏金委托——通常都是采集一些稀有罕见的材料，或者去寻找［传...,113,0,...,bcy,post,1,228660350,1440,900,fd00f46128e74ca19cd1ec5d1308d9d1,jpg,noop,https://p3-bcy-sign.bcyimg.com/banciyuan/fd00f...


In [23]:
import os
import json
import tarfile
import shutil
import pandas as pd
from typing import List
from glob import glob

class JSONProcessor:
    """Turn a directory of per-file metadata JSONs into a parquet file and a tar archive.

    For one directory, :meth:`process` performs these steps in order:

    1. parse every ``*.json`` file except ``info.json`` with
       ``parse_json_to_df`` and concatenate the rows,
    2. write the combined frame to ``<directory>.parquet``,
    3. copy ``info.json`` to ``<user_infos>/<directory name>_info.json``,
    4. rename each ``<name>.jpg.json`` whose ``<name>.jpg`` exists to
       ``<name>.json``,
    5. archive the directory to ``<directory>.tar`` and delete the directory.
    """

    def __init__(self, user_infos: str):
        """Remember the target directory for copied ``info.json`` files.

        Args:
            user_infos: Directory that receives the copies; created if it
                does not exist yet.
        """
        self.user_infos = user_infos
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(user_infos, exist_ok=True)

    def process(self, directory: str) -> None:
        """Run the full pipeline on ``directory``.

        Args:
            directory: Path of the directory to process.

        Raises:
            FileNotFoundError: If ``directory`` does not exist.
        """
        if not os.path.exists(directory):
            raise FileNotFoundError(f"The directory {directory} does not exist.")

        # Drop trailing separators. With "path/dir/" the sibling outputs
        # f"{directory}.parquet" and f"{directory}.tar" would become
        # "path/dir/.parquet" and "path/dir/.tar", i.e. land INSIDE the
        # directory that is deleted at the end, and basename() would be "".
        directory = os.path.normpath(directory)

        df = self._load_and_combine_json(directory)
        if df.empty:
            print(f"No JSON data found in {directory}.")
            return

        self._save_as_parquet(df, directory)
        self._copy_info_json(directory)
        self._rename_associated_jsons(directory)
        self._compress_and_delete_directory(directory)

    @staticmethod
    def _visible_entries(directory: str) -> List[str]:
        """Return the sorted full paths of the non-hidden entries of ``directory``.

        The directory is listed directly instead of globbed so that names
        containing ``[``, ``]``, ``*`` or ``?`` are taken literally. Entries
        starting with ``.`` are skipped, as a ``*`` glob pattern would. A
        path that is not a directory yields an empty list.
        """
        if not os.path.isdir(directory):
            return []
        return [
            os.path.join(directory, name)
            for name in sorted(os.listdir(directory))
            if not name.startswith(".")
        ]

    def _load_and_combine_json(self, directory: str) -> pd.DataFrame:
        """Parse all metadata JSON files of ``directory`` into one DataFrame.

        ``info.json`` is skipped; the match is on the exact file name, so a
        directory whose path merely contains that text no longer excludes
        every file.

        Returns:
            One row per parsed file, or an empty DataFrame if none was found.
        """
        data_frames = []
        for file in self._visible_entries(directory):
            name = os.path.basename(file)
            if not name.endswith(".json") or name == "info.json":
                continue
            # The metadata contains non-ASCII text; do not depend on the
            # platform's default encoding.
            with open(file, 'r', encoding="utf-8") as f:
                data = json.load(f)
            data_frames.append(parse_json_to_df(data))

        if not data_frames:
            return pd.DataFrame()  # Empty DataFrame

        return pd.concat(data_frames, ignore_index=True)

    def _save_as_parquet(self, df: pd.DataFrame, directory: str) -> None:
        """Write ``df`` to ``<directory>.parquet``, next to the directory."""
        parquet_path = f"{directory}.parquet"
        df.to_parquet(parquet_path)

    def _copy_info_json(self, directory: str) -> None:
        """Copy ``<directory>/info.json`` to ``<user_infos>/<directory name>_info.json``.

        Prints a message and does nothing when the file is absent.
        """
        src = os.path.join(directory, "info.json")
        if not os.path.isfile(src):
            print(f"info.json does not exist in {directory}.")
            return

        directory_name = os.path.basename(directory)
        dest = os.path.join(self.user_infos, f"{directory_name}_info.json")
        shutil.copy2(src, dest)

    def _rename_associated_jsons(self, directory: str) -> None:
        """Rename ``<name>.jpg.json`` to ``<name>.json`` for every ``<name>.jpg`` present.

        Failures are reported with ``print`` and do not stop the loop.
        """
        for file in self._visible_entries(directory):
            json_file = f"{file}.json"
            if not (os.path.basename(file).endswith(".jpg") and os.path.isfile(json_file)):
                continue
            # Swap only the final extension. str.replace would rewrite every
            # ".jpg" in the path, including ones inside directory names.
            target = os.path.splitext(file)[0] + ".json"
            try:
                os.rename(json_file, target)
            except Exception as e:
                print(f"Failed to rename {json_file}: {e}")

    def _compress_and_delete_directory(self, directory: str) -> None:
        """Archive ``directory`` to ``<directory>.tar`` and then delete it.

        The directory is only removed after the archive was written without
        an exception; both failure cases are reported with ``print``.
        """
        try:
            with tarfile.open(f"{directory}.tar", "w") as tar:
                tar.add(directory, arcname=os.path.basename(directory))
        except Exception as e:
            print(f"Failed to compress {directory}: {e}")
            return

        try:
            shutil.rmtree(directory)
        except Exception as e:
            print(f"Failed to delete {directory}: {e}")


In [24]:
# Copies of each processed directory's info.json are collected in ./user_infos
# (the directory is created by the constructor if it is missing).
processor = JSONProcessor(user_infos="./user_infos")
# WARNING: destructive. process() writes "<directory>.parquet", copies
# info.json, archives the directory to "<directory>.tar" and then DELETES the
# directory, so re-running this cell afterwards raises FileNotFoundError.
processor.process("/home/studio-lab-user/dev/gdld/gallery-dl/1037581441053848###木甘牌甘木_2")