In [301]:
import zipfile
import json
import os
from pathlib import Path 
import re
import io
import ast

In [302]:
def infer_placeholder(value):
    """Map a parsed JSON value to a placeholder describing its type.

    Strings become "string", booleans "boolean", ints and floats "number",
    and any list becomes ["array"]. Dictionaries keep their keys while each
    value is replaced recursively. None is returned unchanged, and a value
    of any other type yields "unknown".
    """
    if value is None:
        return None
    # bool has to be checked ahead of the numeric types: bool subclasses int.
    if isinstance(value, bool):
        return "boolean"
    if isinstance(value, str):
        return "string"
    if isinstance(value, (int, float)):
        return "number"
    if isinstance(value, list):
        return ["array"]
    if isinstance(value, dict):
        return {key: infer_placeholder(item) for key, item in value.items()}
    return "unknown"



In [303]:
def simplify_json_structure(data):
    """Reduce parsed JSON to a skeleton of keys and type placeholders.

    Dictionaries keep their keys and have every value simplified
    recursively. A non-empty list has each element simplified; when all
    simplified elements are equal the list collapses to a single
    representative, otherwise the full simplified list is kept. An empty
    list becomes ["array"]. Any other value is handed to infer_placeholder.
    """
    if isinstance(data, list):
        # An empty list gives no information about its elements.
        if not data:
            return ["array"]

        reduced = [simplify_json_structure(element) for element in data]
        head = reduced[0]
        # Identical entries are noise: keep one representative only.
        return [head] if all(entry == head for entry in reduced) else reduced

    if isinstance(data, dict):
        return {key: simplify_json_structure(child) for key, child in data.items()}

    return infer_placeholder(data)


In [304]:
def extract_json_from_js(js_content):
    """
    Extract the JSON payload from a JavaScript data file.

    The file is expected to look like ``window.YT_DATA = {...};``. Everything up
    to and including the first ``=`` is dropped and the remainder is parsed as
    JSON, after removing a trailing ``;`` (``json.loads`` rejects it as extra
    data). If that does not parse -- for example a plain JSON file whose first
    ``=`` sits inside a string value -- the whole content is tried instead.

    Parameters:
        js_content (str): Decoded text of the file.

    Returns:
        dict: when the payload is an object, or a list holding exactly one
            object (that object is returned unwrapped).
        list: when the payload is a list of objects (an empty list included).
        None: when nothing could be parsed (a short notice is printed) or the
            payload is a scalar.

    Raises:
        ValueError: if the payload is a list containing non-dictionary items.
    """
    _, assignment, after_equals = js_content.partition('=')
    # First choice is the text after the assignment; the untouched content is
    # the fallback for files that are plain JSON.
    candidates = [after_equals, js_content] if assignment else [js_content]

    parsed = False
    data = None
    for candidate in candidates:
        try:
            data = json.loads(candidate.strip().rstrip(';'))
        except json.JSONDecodeError:
            continue
        parsed = True
        break

    if not parsed:
        # Print only a short preview: the file may be large and may contain
        # personal data from the donation.
        print('Not loaded:', candidates[0][:200])
        return None

    if isinstance(data, dict):
        return data  # already a dictionary

    if isinstance(data, list):
        if len(data) == 1 and isinstance(data[0], dict):
            return data[0]  # single dictionary in a list
        if all(isinstance(item, dict) for item in data):
            return data  # list of dictionaries (or an empty list)
        raise ValueError("List contains non-dictionary items")

    # Scalars (string, number, boolean, null) carry no structure to report.
    return None

In [306]:
def save_structure(output_structure, zip_path, output_dir=None):
    """Write the structure summary of one archive to a JSON file.

    The file is named ``X_structure_<zip stem>.json`` and is written with an
    indent of 2.

    Parameters:
        output_structure: JSON-serialisable structure to save.
        zip_path: Path of the source archive; only its stem is used, to name
            the output file.
        output_dir: Folder to write into. When omitted it defaults to
            ``f"{main_path}Input_test"``, which relies on the notebook-level
            ``main_path`` being defined and ending with a path separator.
    """
    if output_dir is None:
        # main_path is a notebook-level global, looked up at call time.
        output_dir = f"{main_path}Input_test"
    target_dir = Path(output_dir)
    # Create the folder on first use instead of failing with FileNotFoundError.
    target_dir.mkdir(parents=True, exist_ok=True)

    zip_name = Path(zip_path).stem
    json_object = json.dumps(output_structure, indent=2)

    target_file = target_dir / f"X_structure_{zip_name}.json"
    with open(target_file, "w", encoding="utf-8") as outfile:
        outfile.write(json_object)




In [308]:

def structure_from_zip(zip_path):
    """Summarise the JSON structure of every ``data/*.js`` file in a zip archive.

    Only entries whose first path component is ``data`` and whose name ends in
    ``.js`` are processed; all other entries are skipped without being read.
    Each processed file is decoded (UTF-8, falling back to latin1), parsed with
    extract_json_from_js and reduced with simplify_json_structure.

    The result maps each processed file name to one of:
      * its simplified structure (``None`` when the file could not be parsed),
      * ``"No data"`` when the payload is an empty list,
      * ``"Failed to read file"`` when the entry could not be read,
      * ``"Unsupported JSON structure: ..."`` when extract_json_from_js
        rejects the payload with a ValueError.

    The mapping is saved through save_structure and also returned.

    Parameters:
        zip_path: Path of the zip archive to inspect.

    Returns:
        str: The mapping serialised as JSON (indent 2, non-ASCII kept as-is).
    """
    output_structure = {}

    with zipfile.ZipFile(zip_path, 'r') as z:
        for file_info in z.infolist():
            filename = file_info.filename
            # Split the path into parts
            path_parts = filename.split('/')

            # Process only files that live under the top-level 'data' folder
            if file_info.is_dir() or len(path_parts) < 2 or path_parts[0] != 'data':
                continue

            # Only .js files are summarised. Filtering before reading avoids
            # decompressing and decoding entries that would be skipped anyway.
            if not filename.endswith('.js'):
                continue

            try:
                raw_bytes = z.read(file_info)
            except Exception:
                output_structure[filename] = "Failed to read file"
                continue

            try:
                content_str = raw_bytes.decode("utf-8")
            except UnicodeDecodeError:
                # latin1 maps every byte to a character, so this cannot fail.
                content_str = raw_bytes.decode("latin1")

            try:
                content = extract_json_from_js(content_str)
            except ValueError as exc:
                # One unsupported file must not abort the whole archive.
                output_structure[filename] = f"Unsupported JSON structure: {exc}"
                continue

            placeholder_content = simplify_json_structure(content)

            if placeholder_content == ["array"]:
                output_structure[filename] = "No data"
            else:
                output_structure[filename] = placeholder_content

    save_structure(output_structure, zip_path)
    return json.dumps(output_structure, indent=2, ensure_ascii=False)
    

In [309]:
# Root folder of the processed Twitter donation structures. The trailing slash
# matters: save_structure builds its output location by appending directly to it.
main_path = "/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/structure_donations/Processed_structure_donations/Twitter/"
# Folder whose files are handed to structure_from_zip.
input_directory = Path(main_path) / "Raw"

In [310]:
# Build and save the structure summary for every zip archive in the input folder.
# Sorting makes the processing order the same on every run.
for file in sorted(input_directory.iterdir()):
    if not file.is_file():
        continue
    # A stray non-archive file would otherwise raise BadZipFile and stop the
    # whole batch.
    if not zipfile.is_zipfile(file):
        print("Skipping non-zip file:", file.name)
        continue
    structure_from_zip(file)