In [28]:
import ast
import json
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd

In [29]:
# Root folder of the processed structure donations.
# The original location stays the default; set WHAT_IF_MAIN_PATH to run the
# notebook on another machine without editing this cell.
# os.path.join(..., "") guarantees the trailing separator that the
# f"{main_path}Youtube/..." paths built later in the notebook rely on.
main_path = os.path.join(
    os.environ.get(
        "WHAT_IF_MAIN_PATH",
        "/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/json_structure_donations/processed_structure_donations/",
    ),
    "",
)

In [30]:
# Names that mark the type of a value in the structure file.
# When one of these shows up in json_2 it is a type marker, not a nested key.
data_types = ['string', 'array', 'number', 'boolean']

def process_json_2(row):
    """Move a data-type marker from ``json_2`` into ``data_type``.

    Intended for ``DataFrame.apply(process_json_2, axis=1)``.

    Parameters
    ----------
    row : pandas.Series (or dict)
        One record; must contain the key ``'json_2'``.

    Returns
    -------
    A copy of ``row`` that always has a ``'data_type'`` entry:

    * if ``json_2`` is one of ``data_types``, ``data_type`` receives that
      name and ``json_2`` is replaced by NaN;
    * otherwise ``json_2`` is kept and ``data_type`` is NaN, unless the row
      already carried a ``data_type`` value, which is left as it is.

    Always emitting ``data_type`` keeps the column present even when no row
    of a frame holds a type marker.
    """
    # Work on a copy: DataFrame.apply does not support functions that
    # mutate the object they are handed.
    row = row.copy()
    value = row['json_2']

    # Only plain strings can be type markers. The isinstance guard also keeps
    # pd.NA / array-like values from raising inside the `in` comparison.
    if isinstance(value, str) and value in data_types:
        row['data_type'] = value
        row['json_2'] = np.nan
    elif 'data_type' not in row:
        row['data_type'] = np.nan
    return row

In [31]:

def _first_if_list(value):
    """Return the first element of a list (None if it is empty); pass other values through."""
    if isinstance(value, list):
        return value[0] if value else None
    return value


def _keys_of(value):
    """Return the keys of a dict as a list; a value that is not a dict has no keys."""
    return list(value.keys()) if isinstance(value, dict) else []


def _child_structure(value, key):
    """Look up ``key`` in the dict ``value`` and unwrap a list result to its first element."""
    child = value.get(key, None) if isinstance(value, dict) else None
    return _first_if_list(child)


def structure_donations(data, show_steps=True):
    """Flatten one JSON structure file into a long table and write it to CSV.

    The top-level keys of the JSON file are treated as '/'-separated document
    paths. Each path becomes one or more rows describing its level-1 keys
    (``json_1``), their level-2 keys (``json_2``) and, where a value is a type
    marker, its ``data_type``.

    The result is written to
    ``{main_path}Youtube/Output/Output_<input file stem>.csv``; the output
    folder is created if it does not exist.

    Parameters
    ----------
    data : str or pathlib.Path
        Path of the JSON structure file.
    show_steps : bool, default True
        Print and display the table before and after ``json_2`` is exploded.
        ``display`` is the function supplied by the notebook session.

    Returns
    -------
    pandas.DataFrame
        The table that was written, with columns ``variable, value, path_1,
        path_2, path_3, path_4, json_name, json_1, json_2, data_type``.
    """
    json_path = Path(data)
    file_name = json_path.stem

    # Read as UTF-8 explicitly instead of relying on the platform's locale encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        structure = json.load(f)

    # One column per top-level key; nested content stays untouched in the cells.
    df = pd.json_normalize(structure, max_level=0)

    # Drop the first path segment (per the original note it holds user-specific information).
    df.columns = df.columns.str.replace(r'^[^/]+/', '', regex=True)

    # From wide to long: one row per document path.
    df = pd.melt(df, value_vars=df.columns.tolist())

    # Split the document path into up to four parts; path_4 keeps any remainder.
    path_parts = df['variable'].str.split('/', n=3)
    df['path_1'] = path_parts.str[0]
    df['path_2'] = path_parts.str[1]
    df['path_3'] = path_parts.str[2]
    df['path_4'] = path_parts.str[3]

    # The last path segment is the JSON file name.
    df['json_name'] = df['variable'].str.rsplit('/', n=1).str[-1]

    # The file name already lives in json_name, so blank it out of the path parts.
    mark = ".json"

    def _blank_if_json_name(part):
        return np.nan if isinstance(part, str) and mark in part else part

    for column in ('path_2', 'path_3', 'path_4'):
        df[column] = df[column].apply(_blank_if_json_name)

    # A list value is represented by its first element; an empty list becomes None.
    df['value'] = df['value'].map(_first_if_list)

    # Level-1 keys, one row per key. Values that are not dicts contribute no
    # keys and end up as a single row with a missing json_1.
    df['json_1'] = df['value'].map(_keys_of)
    df = df.explode('json_1')

    # Content stored under each level-1 key (lists reduced to their first element).
    # Built positionally because the index holds duplicates after the explode.
    df['json_2'] = [
        _child_structure(value, key)
        for value, key in zip(df['value'], df['json_1'])
    ]

    if show_steps:
        print('Before explode json_2')
        display(df)

    # Exploding a dict cell yields one row per key (see the recorded output for
    # 'subtitles'); scalar cells such as 'string' are kept as they are.
    df = df.explode('json_2')

    if show_steps:
        print('after explode json_2')
        display(df)

    # Type markers are not keys: move them from json_2 into data_type.
    df = df.apply(process_json_2, axis=1)

    # reindex (rather than .loc) so a missing column, e.g. data_type when no
    # type marker was found, is added as NaN instead of raising KeyError.
    output_columns = ['variable', 'value', 'path_1', 'path_2',
                      'path_3', 'path_4', 'json_name', 'json_1', 'json_2', 'data_type']
    df = df.reindex(columns=output_columns).reset_index(drop=True)

    output_file = Path(f"{main_path}Youtube/Output/Output_{file_name}.csv")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_file, index=False)

    return df




In [32]:
# Folder containing the JSON structure files to process
input_directory = Path('{}Youtube/Input'.format(main_path))
print(input_directory)

/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/json_structure_donations/processed_structure_donations/Youtube/Input


In [33]:

# Process every JSON structure file in the input folder.
# - sorted(): iterdir() yields entries in arbitrary order; sorting makes runs reproducible.
# - suffix check: anything that is not a .json file (hidden files, other
#   exports, sub-folders) is skipped instead of failing inside json.load.
for file in sorted(input_directory.iterdir()):
    if file.is_file() and file.suffix.lower() == ".json":
        print(file)
        structure_donations(file)

/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/json_structure_donations/processed_structure_donations/Youtube/Input/json_structure_youtube.json
Before explode json_2


Unnamed: 0,variable,value,path_1,path_2,path_3,path_4,json_name,json_1,json_2
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,header,string
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,title,string
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,titleUrl,string
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,subtitles,"{'name': 'string', 'url': 'string'}"
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,time,string
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,products,string
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,activityControls,string


after explode json_2


Unnamed: 0,variable,value,path_1,path_2,path_3,path_4,json_name,json_1,json_2
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,header,string
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,title,string
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,titleUrl,string
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,subtitles,name
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,subtitles,url
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,time,string
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,products,string
0,YouTube and YouTube Music/history/watch-histor...,"{'header': 'string', 'title': 'string', 'title...",YouTube and YouTube Music,history,,,watch-history.json,activityControls,string


In [34]:
# Folder holding the per-file CSVs written by structure_donations
output_path = f"{main_path}Youtube/Output"

# All CSV files in that folder, sorted so the merge order does not depend on
# the order in which the filesystem happens to list them
csv_files = sorted(Path(output_path).glob("*.csv"))
if not csv_files:
    # Fail with a clear message instead of an IndexError on dfs[0] below
    raise FileNotFoundError(f"No CSV files found in {output_path}")

# Load all CSVs into a list of DataFrames
dfs = [pd.read_csv(file) for file in csv_files]


common_columns = ['variable', 'value', 'path_1', 'path_2',
        'path_3', 'path_4', 'json_name', 'json_1', 'json_2', 'data_type']

# Outer-merge on every shared column: rows that are identical across files
# are matched to each other, rows present in only one file are kept.
merged_df = dfs[0]  # Start with the first DataFrame
for df in dfs[1:]:  # Merge with the rest
    merged_df = merged_df.merge(df, on=common_columns, how="outer")

# Among rows whose path_1 is 'messages', keep only the first row for each path_2 value
df_filtered = merged_df[merged_df["path_1"] == "messages"].drop_duplicates(subset="path_2")


# Append all rows whose path_1 is not 'messages', unchanged
df_final = pd.concat([df_filtered, merged_df[merged_df["path_1"] != "messages"]], ignore_index=True)


# Save the final merged DataFrame (create the target folder on a fresh checkout)
final_path = Path(f"{main_path}Youtube/Final/Merged_structures_YT.csv")
final_path.parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(final_path, index=False)