In [155]:
import pandas as pd
import json
import re
import numpy as np
import ast
from pathlib import Path  

In [156]:
main_path = "/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/structure_donations/Processed_structure_donations/"

In [157]:
# Define the data types
data_types = ['string', 'array', 'number', 'boolean', 'object', 'str', 'int', 'float', 'bool', 'dict', 'list']
columns = ['col_path_1','col_path_2','col_path_3','col_path_4', 'col_path_5']


def process_row_path(row, columns, data_types):
    row['data_type'] = ''
    for column in columns:
        if row[column] in data_types:
            row['data_type'] = row[column]
            row[column] = np.nan
        else:
            row[column]
    return row


In [158]:
def file_paths(df):
    # Create different colums for each part of the document path
    df['path_1'] = df['variable'].str.split('/', n=1).str[0]
    df['path_2'] = df['variable'].str.split('/', n=3).str[1]
    df['path_3'] = df['variable'].str.split('/', n=3).str[2]
    df['path_4'] = df['variable'].str.split('/', n=3).str[3]

    # Create a column with the JSON name
    df['json_name'] = df['variable'].str.rsplit('/', n=1).str[-1]


    # As the JSON name is stored in the json_name column, fill other parts of the path with Na if the name of the JSON is present
    mark = ".json"

    df[['path_2', 'path_3', 'path_4']] =  df[['path_2', 'path_3', 'path_4']].map(lambda x: np.nan if isinstance(x, str) and mark in x else x)



    # Unlist the value column (where the JSON info is stored)
    for index, row in df.iterrows():
        if isinstance(row['value'], list):
            df.at[index, 'value'] = row['value'][0]
    
    return df

In [159]:
def string_to_dict(s):
    if ',' not in s and ':' not in s:
        return s
    result = {}
    items = s.split(',')
    for item in items:
        if ':' not in item:
            continue  
        key, value = item.split(':', 1)  # use maxsplit=1 to avoid unpacking issues
        try:
            key = eval(key.strip())
            value = eval(value.strip())
        except Exception as e:
            continue
        result[key] = value
    return result

    

In [160]:
def column_paths(df, row_var, get_var, unlist_var, explode_var):
        
        # Convert to dict if it's a string
        df[row_var] = df[row_var].apply(
            lambda x: string_to_dict(x) if isinstance(x, str) else x)
        
        
        # Extract nested values using keys
        df[unlist_var] = df.apply(lambda row: row[row_var].get(row[get_var], None) if isinstance (row[row_var], dict) else None, axis=1)


        # Assign list if the data in the original structure is a list
        def detect_list(x):
            if isinstance(x, list):
                return 'LIST'
            elif pd.isna(x):
                return 'MISSING'
            elif x == 'No data':
                    return 'MISSING'
            else:
                return 'NO LIST'

        df[f'{explode_var}_LIST'] = df[unlist_var].apply(detect_list)

        df[unlist_var] = df[unlist_var].apply(lambda x: x[0] if isinstance(x, list) else x)
        
        df[explode_var] = df[unlist_var]
        df = df.explode(explode_var)

    
        print(df.columns.tolist())
        return df

In [161]:
def row_column_paths(df):
    
    # Initialize the new columns
    df['row_path'] = '' 
    df['col_path_1'] = ''  
    df['col_path_2'] = ''  
    df['col_path_3'] = ''
    df['col_path_4'] = ''
    df['col_path_5'] = '' 
    
    # Extract the level 1 keys 
    for index, row in df.iterrows():
        df.at[index, 'row_path'] = list(row['value'].keys())

    df['row_path'] = df['row_path'].astype('str').apply(eval)

    # Take the level 1 keys stored in a list and store them in individual rows
    df = df.explode('row_path')


    row_var = ['value', 'col_path_1_values', 'col_path_2_values', 'col_path_3_values', 'col_path_4_values']
    get_var = ['row_path', 'col_path_1', 'col_path_2', 'col_path_3', 'col_path_4']
    unlist_var = ['col_path_1_values', 'col_path_2_values', 'col_path_3_values', 'col_path_4_values', 'col_path_5_values']
    explode_var = ['col_path_1', 'col_path_2', 'col_path_3', 'col_path_4', 'col_path_5']

    for r, g, u, e in zip(row_var, get_var, unlist_var, explode_var):
        df = column_paths(df, r, g, u, e)
        row_var = unlist_var

    return df


In [162]:
def clean_and_store(df, file_name):
 
    df = df.loc[:, ['variable', 'value', 'path_1', 'path_2',
                  'path_3', 'path_4', 'json_name', 'row_path', 
                  'col_path_1', 'col_path_1_LIST',
                  'col_path_2', 'col_path_2_LIST',
                  'col_path_3', 'col_path_3_LIST',
                  'col_path_4', 'col_path_4_LIST',
                  'col_path_5', 'col_path_5_LIST',
                  'data_type', 
                  'col_path_1_values','col_path_2_values',
                  'col_path_3_values', 'col_path_4_values', 
                  'col_path_5_values']]
   
    df.reset_index(drop=True, inplace=True)
   
    df = df.fillna('Missing')
    #df = df.astype(str)
    


    df.to_csv(f"{main_path}Facebook/Output/Output_" + file_name + '.csv', index=False)

    return df

In [163]:
def structure_donations(data):

    data = Path(data)  
    #file_name = data.name  # Extracts "data.json"
    file_name = Path(data).stem 

    # Load JSON file
    with open(data, 'r') as f:
        data = json.load(f)

    # Flatten JSON (handling nested structures)
    df = pd.json_normalize(data, max_level=0)

    # Delete user specific informations
    df.columns = df.columns.str.replace(r'^[^/]+/', '', regex=True)

    # Extract column names
    cols = df.columns.tolist()


    # From wide to long df
    df = pd.melt(df, value_vars= cols)

    df = file_paths(df)
    
    df = row_column_paths(df)
    
    df = df.apply(lambda row: process_row_path(row, columns, data_types), axis=1)
    
    df = clean_and_store(df, file_name)
    
    
    return df


In [164]:
input_directory = Path(f'{main_path}Facebook/Input_test')  
print(input_directory)

/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/structure_donations/Processed_structure_donations/Facebook/Input_test


In [165]:

for file in input_directory.iterdir():  
    if file.is_file():  
        print(file)
        structure_donations(file)
        

/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/structure_donations/Processed_structure_donations/Facebook/Input_test/FB_structure_facebook_takeout_ro.json
['variable', 'value', 'path_1', 'path_2', 'path_3', 'path_4', 'json_name', 'row_path', 'col_path_1', 'col_path_2', 'col_path_3', 'col_path_4', 'col_path_5', 'col_path_1_values', 'col_path_1_LIST']
['variable', 'value', 'path_1', 'path_2', 'path_3', 'path_4', 'json_name', 'row_path', 'col_path_1', 'col_path_2', 'col_path_3', 'col_path_4', 'col_path_5', 'col_path_1_values', 'col_path_1_LIST', 'col_path_2_values', 'col_path_2_LIST']
['variable', 'value', 'path_1', 'path_2', 'path_3', 'path_4', 'json_name', 'row_path', 'col_path_1', 'col_path_2', 'col_path_3', 'col_path_4', 'col_path_5', 'col_path_1_values', 'col_path_1_LIST', 'col_path_2_values', 'col_path_2_LIST', 'col_path_3_values', 'col_path_3_LIST']
['variable', 'value', 'path_1', 'path_2', 'path_3', 'path_4', 'json_name', 'row_path', 'col_path_1', 'co

In [166]:
# Path to the folder containing CSV files
output_path = f"{main_path}Facebook/Output"

# Get a list of all CSV files in the folder
csv_files = list(Path(output_path).glob("*.csv"))

# Load all CSVs into a list of DataFrames
dfs = [pd.read_csv(file) for file in csv_files]


# Concatenate all dataframes
merged_df = pd.concat(dfs, axis=0, ignore_index=True)
display(merged_df)

# browser_cookies.json has cookie names in col_path_1, we should only keep 1 to not unnecessaily duplicate rows
pattern = r'\*{4,}'
mask = merged_df['col_path_1'].astype(str).str.contains(pattern, na=False)
# Select the first row where mask is True
first_idx = merged_df[mask].index.min()
merged_df = merged_df[(~mask) | (merged_df.index == first_idx)]


# Drop rows that are completely identical across all columns except the 'variable' column as this includes profile names duplicating the same structure
merged_df = merged_df.drop_duplicates(subset=merged_df.columns.difference(['variable']))
    

# Message user names cuase duplication
filtered_df = merged_df[merged_df["path_1"] == "messages"].drop_duplicates(subset=merged_df.columns.difference(['variable', 'path_3']))
merged_df = merged_df[merged_df["path_1"] != "messages"]
merged_df = pd.concat([merged_df, filtered_df], ignore_index=True)



# Append rows where col1 does not contain 'messages'
#df_final = pd.concat([df_filtered, merged_df[merged_df["path_1"] != "messages"]], ignore_index=True)
df_final = merged_df
df_final = df_final.replace('Missing', np.nan)


print(df_final.isna().sum())

# Save the final merged DataFrame
df_final.to_csv(f"{main_path}Facebook/Final/Merged_structures_FB.csv", index=False)

Unnamed: 0,variable,value,path_1,path_2,path_3,path_4,json_name,row_path,col_path_1,col_path_2,...,col_path_1_values,col_path_2_values,col_path_3_values,col_path_4_values,col_path_5_values,col_path_1_LIST,col_path_2_LIST,col_path_3_LIST,col_path_4_LIST,col_path_5_LIST
0,apps_and_websites_off_of_facebook/your_activit...,"{'timestamp': 'number', 'media': 'array', 'lab...",apps_and_websites_off_of_facebook,Missing,Missing,Missing,your_activity_off_meta_technologies_settings.json,timestamp,Missing,Missing,...,number,Missing,Missing,Missing,Missing,,,,,
1,apps_and_websites_off_of_facebook/your_activit...,{'off_facebook_activity_v2': [{'name': 'string...,apps_and_websites_off_of_facebook,Missing,Missing,Missing,your_activity_off_meta_technologies.json,off_facebook_activity_v2,name,Missing,...,"{'name': 'string', 'events': [{'id': 'number',...",string,Missing,Missing,Missing,,,,,
2,apps_and_websites_off_of_facebook/your_activit...,{'off_facebook_activity_v2': [{'name': 'string...,apps_and_websites_off_of_facebook,Missing,Missing,Missing,your_activity_off_meta_technologies.json,off_facebook_activity_v2,events,id,...,"{'name': 'string', 'events': [{'id': 'number',...","{'id': 'number', 'type': 'string', 'timestamp'...",number,Missing,Missing,,,,,
3,apps_and_websites_off_of_facebook/your_activit...,{'off_facebook_activity_v2': [{'name': 'string...,apps_and_websites_off_of_facebook,Missing,Missing,Missing,your_activity_off_meta_technologies.json,off_facebook_activity_v2,events,type,...,"{'name': 'string', 'events': [{'id': 'number',...","{'id': 'number', 'type': 'string', 'timestamp'...",string,Missing,Missing,,,,,
4,apps_and_websites_off_of_facebook/your_activit...,{'off_facebook_activity_v2': [{'name': 'string...,apps_and_websites_off_of_facebook,Missing,Missing,Missing,your_activity_off_meta_technologies.json,off_facebook_activity_v2,events,timestamp,...,"{'name': 'string', 'events': [{'id': 'number',...","{'id': 'number', 'type': 'string', 'timestamp'...",number,Missing,Missing,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8052,preferences/preferences/language_and_locale.json,"{'language_and_locale_v2': [{'name': 'string',...",preferences,preferences,Missing,Missing,language_and_locale.json,language_and_locale_v2,name,Missing,...,"{'name': 'string', 'description': 'string', 'c...",string,Missing,Missing,Missing,LIST,NO LIST,MISSING,MISSING,MISSING
8053,preferences/preferences/language_and_locale.json,"{'language_and_locale_v2': [{'name': 'string',...",preferences,preferences,Missing,Missing,language_and_locale.json,language_and_locale_v2,description,Missing,...,"{'name': 'string', 'description': 'string', 'c...",string,Missing,Missing,Missing,LIST,NO LIST,MISSING,MISSING,MISSING
8054,preferences/preferences/language_and_locale.json,"{'language_and_locale_v2': [{'name': 'string',...",preferences,preferences,Missing,Missing,language_and_locale.json,language_and_locale_v2,children,name,...,"{'name': 'string', 'description': 'string', 'c...","{'name': 'string', 'description': 'string', 'e...",string,Missing,Missing,LIST,LIST,NO LIST,MISSING,MISSING
8055,preferences/preferences/language_and_locale.json,"{'language_and_locale_v2': [{'name': 'string',...",preferences,preferences,Missing,Missing,language_and_locale.json,language_and_locale_v2,children,description,...,"{'name': 'string', 'description': 'string', 'c...","{'name': 'string', 'description': 'string', 'e...",string,Missing,Missing,LIST,LIST,NO LIST,MISSING,MISSING


variable                0
value                   0
path_1                  0
path_2               2090
path_3               3183
path_4               3508
json_name               0
row_path                0
col_path_1            990
col_path_2           2451
col_path_3           2971
col_path_4           3370
col_path_5           3492
data_type             102
col_path_1_values      64
col_path_2_values     984
col_path_3_values    2429
col_path_4_values    2971
col_path_5_values    3370
col_path_1_LIST      2062
col_path_2_LIST      2062
col_path_3_LIST      2062
col_path_4_LIST      2062
col_path_5_LIST      2062
dtype: int64


  df_final = df_final.replace('Missing', np.nan)
