In [21]:
import pandas as pd
import json
import re
import numpy as np
import ast


In [22]:
# Load the JSON structure file into `data`.
# The encoding is passed explicitly: without it, open() decodes with the
# platform's locale encoding, which can mis-decode non-ASCII text in the file.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs on a machine where exactly this path exists.
with open('/home/rvissche/Nextcloud/What-If/data/json_structure_donations/Input/json_structure_2_RB_Facebook.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [23]:
# Turn the loaded JSON into a DataFrame. With max_level=0 only the top-level
# keys become columns; nested values are left un-flattened inside the cells.
df = pd.json_normalize(data=data, max_level=0)


In [24]:
# Remove user-specific information from the column names by stripping the
# leading path segment (everything up to and including the first '/').
leading_segment = r'^[^/]+/'
df.columns = df.columns.str.replace(leading_segment, '', regex=True)

In [25]:
# Select the column labels to melt: the first 2775 columns.
# NOTE(review): 2775 is presumably the column count of this particular input
# file - confirm. Any column beyond that position is dropped by the melt below.
cols = df.columns[:2775]

# Reshape from wide to long: each selected column becomes rows holding the
# column label in 'variable' and the cell content in 'value'.
df = df.melt(value_vars=cols)

In [26]:
# Split the document path in 'variable' into at most four '/'-separated parts,
# one column per part; the fourth part keeps whatever remains of the path.
# Parts that do not exist for a shorter path become NaN.
segments = df['variable'].str.split('/', n=3)
for position, path_column in enumerate(['path_1', 'path_2', 'path_3', 'path_4']):
    df[path_column] = segments.str[position]

# The JSON name is the last '/'-separated component of the path
df['json_name'] = df['variable'].str.rsplit('/', n=1).str[-1]


In [27]:
# The JSON name already lives in json_name, so any path part that contains
# the JSON file marker is replaced with NaN; every other value is kept as is.
mark = ".json"

for path_column in ('path_2', 'path_3', 'path_4'):
    df[path_column] = df[path_column].apply(
        lambda segment: np.nan if isinstance(segment, str) and mark in segment else segment
    )



In [28]:
# Unlist the value column (where the JSON info is stored): a cell holding a
# list is replaced by the list's first element, anything else is left alone.
df['value'] = df['value'].apply(
    lambda cell: cell[0] if isinstance(cell, list) else cell
)

In [29]:
# Collect the level 1 keys of every JSON object as a list in the new column
# json_1 (each entry of 'value' must offer .keys()).
df['json_1'] = df['value'].apply(lambda json_object: list(json_object.keys()))

# Give every level 1 key its own row; the index labels are repeated for rows
# that stem from the same list.
df = df.explode('json_1')

In [30]:
# For the level 1 key stored in json_1, look up what the JSON object in 'value'
# holds under that key and store it, wrapped in a one-element list, in json_2.
# A key that is not present yields [None].
def wrap_level_1_value(record):
    """Return the entry of record['value'] under record['json_1'], wrapped in a list."""
    json_object = record['value']
    return [json_object.get(record['json_1'], None)]

df['json_2'] = df.apply(wrap_level_1_value, axis=1)


In [31]:
# Unlist json_2 and store the entries in individual rows.
# Two unwrapping passes are applied: each pass replaces a list by its first
# element, so a list nested one level inside another list is unwrapped as well.
# Empty lists are left untouched instead of being indexed (indexing them raised
# IndexError); explode() below turns an empty list into NaN.
def first_item_if_list(cell):
    """Return the first element of a non-empty list; return anything else unchanged."""
    if isinstance(cell, list) and cell:
        return cell[0]
    return cell

for unwrap_pass in range(2):
    df['json_2'] = df['json_2'].apply(first_item_if_list)

# Spread any remaining list-like entries over individual rows
df = df.explode('json_2')


In [32]:
# Names that denote a data type rather than a JSON key
data_types = ['string', 'array', 'number', 'boolean']

def process_json_2(row):
    """Move a data-type name found in json_2 into the data_type field.

    Data types are not keys, so when ``row['json_2']`` is one of the names in
    ``data_types`` it is stored under ``'data_type'`` and ``'json_2'`` is set
    to NaN. Otherwise ``'json_2'`` is kept as it is.

    Parameters
    ----------
    row : pandas.Series
        One record; must contain the label ``'json_2'``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` that always carries a ``'data_type'`` label (NaN when
        no data type was found and none was present before).
    """
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    result = row.copy()
    entry = result['json_2']

    if isinstance(entry, str) and entry in data_types:
        result['data_type'] = entry
        result['json_2'] = np.nan
    elif 'data_type' not in result.index:
        # Always provide the label so the resulting frame has a data_type
        # column even when no row contains a data type.
        result['data_type'] = np.nan

    return result

# Run process_json_2 on every row of the frame
df = df.apply(process_json_2, axis='columns')

In [33]:
# Put the columns into their final order (a missing column raises KeyError)
column_order = [
    'variable', 'value',
    'path_1', 'path_2', 'path_3', 'path_4',
    'json_name', 'json_1', 'json_2', 'data_type',
]
df = df[column_order]

In [34]:
# Replace the index (which repeats labels after the explode steps) with a
# fresh 0..n-1 range, discarding the old labels.
df = df.reset_index(drop=True)
