In [45]:
import pandas as pd
import json
import re
import numpy as np
import ast
from pathlib import Path  

In [46]:
# Root folder of the processed structure donations.
# The value must keep its trailing "/" because other cells build paths by plain
# string interpolation (e.g. f"{main_path}Instagram/Output").
# NOTE(review): hard-coded, user-specific absolute path; adjust it before
# running the notebook on another machine.
main_path = "/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/structure_donations/Processed_structure_donations/"


In [47]:
# Labels that are treated as a data type (rather than a key name) when they
# show up in one of the col_path_* columns; see process_row_path.
data_types = [
    'string', 'array', 'number', 'boolean', 'object',
    'str', 'int', 'float', 'bool', 'dict', 'list',
]

# The five nested key-level columns, col_path_1 ... col_path_5.
columns = [f'col_path_{level}' for level in range(1, 6)]


def process_row_path(row, columns, data_types):
    """Move a data-type label out of the path columns into ``row['data_type']``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must contain every name listed in ``columns``. It is modified in place.
    columns : iterable of str
        Names of the entries to inspect, in order.
    data_types : container
        Values that count as a data-type label.

    Returns
    -------
    The same ``row`` object. ``row['data_type']`` starts as ``''`` and ends up
    holding the last matching label found; every entry that held a label is
    replaced by NaN.
    """
    row['data_type'] = ''
    for name in columns:
        cell = row[name]
        if cell not in data_types:
            # Ordinary key name: leave the entry untouched.
            continue
        row['data_type'] = cell
        row[name] = np.nan
    return row


In [48]:
def file_paths(df):
    """Split the ``variable`` file path into components and unwrap list values.

    Adds the columns ``path_1`` .. ``path_4`` and ``json_name`` to ``df`` (in
    place) and returns the same DataFrame.

    * ``path_1`` .. ``path_4`` are the parts of ``variable`` split on ``/`` with
      at most three splits, so ``path_4`` holds the whole remainder of the path.
    * ``json_name`` is the text after the last ``/``.
    * Any of ``path_2`` .. ``path_4`` that contains ``.json`` is replaced by NaN,
      because the file name already lives in ``json_name``.
      NOTE(review): for paths deeper than four components the remainder kept in
      ``path_4`` still contains the file name, so it is blanked as a whole.
    * Cells of ``value`` that are lists are replaced by their first element.
    """
    mark = ".json"

    def _blank_if_json(part):
        # Only strings can contain the marker; NaN and other values pass through.
        if isinstance(part, str) and mark in part:
            return np.nan
        return part

    variable = df['variable']
    parts = variable.str.split('/', n=3)

    # One column per path component.
    df['path_1'] = parts.str[0]
    df['path_2'] = parts.str[1].apply(_blank_if_json)
    df['path_3'] = parts.str[2].apply(_blank_if_json)
    df['path_4'] = parts.str[3].apply(_blank_if_json)

    # Name of the JSON file itself.
    df['json_name'] = variable.str.rsplit('/', n=1).str[-1]

    # Unwrap list cells in the value column (where the JSON info is stored).
    for label, cell in list(df['value'].items()):
        if isinstance(cell, list):
            df.at[label, 'value'] = cell[0]

    return df

In [49]:
def string_to_dict(s):
    """Best-effort conversion of a ``key: value`` style string into a dict.

    Parameters
    ----------
    s : str
        Text to convert.

    Returns
    -------
    str or dict
        * ``s`` unchanged when it contains neither ``,`` nor ``:``.
        * The parsed dict when the whole string is a Python dict literal
          (e.g. ``"{'a': 'string'}"``).
        * Otherwise a dict built from the comma-separated ``key: value`` items
          whose key and value are both valid Python literals; items that
          cannot be parsed are skipped, so the result may be empty.

    Notes
    -----
    Parsing uses ``ast.literal_eval`` rather than ``eval``: the strings come
    from donated data, and ``literal_eval`` only accepts literals and never
    executes code.
    """
    if ',' not in s and ':' not in s:
        return s

    # Errors literal_eval (or an unhashable key) can raise for malformed input.
    parse_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    # A complete dict literal cannot be parsed item by item (the braces end up
    # inside the first key and the last value), so try the whole string first.
    try:
        whole = ast.literal_eval(s.strip())
    except parse_errors:
        whole = None
    if isinstance(whole, dict):
        return whole

    result = {}
    for item in s.split(','):
        if ':' not in item:
            continue
        # maxsplit=1 keeps any further ':' inside the value part.
        key_text, value_text = item.split(':', 1)
        try:
            key = ast.literal_eval(key_text.strip())
            value = ast.literal_eval(value_text.strip())
            result[key] = value
        except parse_errors:
            # Deliberately best-effort: skip items that are not plain literals.
            continue
    return result

    

In [50]:
def row_column_paths(df, verbose=True):
    """Walk up to five levels into the nested structure stored in ``value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``value`` column. Cells that are dicts are traversed;
        any other cell simply yields missing values at every level.
    verbose : bool, default True
        When true, print the value-type counts of each level before and after
        the string-to-dict conversion (the original debugging output).

    Returns
    -------
    pandas.DataFrame
        A new frame with these added columns:

        * ``row_path``: the first top-level key of ``value`` (NaN if there is
          none).
        * ``col_path_N`` (N = 1..5): the keys found at nesting level N, one row
          per key (the frame is exploded at every level).
        * ``col_path_N_values``: the object the level-N keys were taken from,
          i.e. the value looked up under the previous level's key.

    Notes
    -----
    The ``row_path`` and ``col_path_N`` columns are created on the passed-in
    frame itself (as in the original implementation); all later work happens
    on a copy.

    NOTE(review): only the *first* top-level key of each ``value`` dict is
    kept. If every top-level key should get its own row, ``row_path`` has to
    hold the full key list and be exploded; confirm the intended behaviour.
    """
    # Create the output columns up front so the column order stays stable.
    df['row_path'] = ''
    for level in range(1, 6):
        df[f'col_path_{level}'] = ''

    # First level-1 key of every value dict; NaN for empty or non-dict cells.
    df['row_path'] = [
        next(iter(value), np.nan) if isinstance(value, dict) else np.nan
        for value in df['value']
    ]

    # Work on a copy from here on so the caller's frame gets no further columns.
    df = df.copy()

    def _first_if_list(item):
        # Lists are represented by their first element; an empty list has none.
        if isinstance(item, list):
            return item[0] if item else None
        return item

    def column_paths(df, row_var, get_var, unlist_var, explode_var):
        if verbose:
            print("before", df[row_var].apply(type).value_counts())

        # Convert to dict if it's a string
        df[row_var] = df[row_var].apply(
            lambda x: string_to_dict(x) if isinstance(x, str) else x)

        if verbose:
            print("After", df[row_var].apply(type).value_counts())

        # Look up the nested value under the key found at the previous level.
        looked_up = [
            _first_if_list(container.get(key)) if isinstance(container, dict) else None
            for container, key in zip(df[row_var], df[get_var])
        ]
        # dtype=object keeps dict/list cells intact; reusing df.index keeps the
        # assignment positional even though the index has duplicates after
        # earlier explodes.
        df[unlist_var] = pd.Series(looked_up, index=df.index, dtype=object)

        # Exploding a dict cell yields one row per key; scalars and None pass
        # through unchanged.
        df[explode_var] = df[unlist_var]
        return df.explode(explode_var)

    row_var = ['value', 'col_path_1_values', 'col_path_2_values', 'col_path_3_values', 'col_path_4_values']
    get_var = ['row_path', 'col_path_1', 'col_path_2', 'col_path_3', 'col_path_4']
    unlist_var = ['col_path_1_values', 'col_path_2_values', 'col_path_3_values', 'col_path_4_values', 'col_path_5_values']
    explode_var = ['col_path_1', 'col_path_2', 'col_path_3', 'col_path_4', 'col_path_5']

    for r, g, u, e in zip(row_var, get_var, unlist_var, explode_var):
        df = column_paths(df, r, g, u, e)

    return df


In [51]:
def clean_and_store(df, file_name):
    """Order the columns, mark missing values and write the result to CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in the list below.
    file_name : str
        Used as the suffix of the output file: ``Output_<file_name>.csv``.

    Returns
    -------
    pandas.DataFrame
        The reordered frame with a fresh 0..n-1 index and every missing value
        replaced by the string ``'Missing'``. The same frame is written (without
        the index) below the module-level ``main_path`` in ``Instagram/Output/``.
    """
    ordered_columns = [
        'variable', 'value', 'path_1', 'path_2',
        'path_3', 'path_4', 'json_name', 'row_path',
        'col_path_1', 'col_path_2', 'col_path_3', 'col_path_4', 'col_path_5',
        'data_type',
        'col_path_1_values', 'col_path_2_values', 'col_path_3_values',
        'col_path_4_values', 'col_path_5_values',
    ]

    cleaned = (
        df.loc[:, ordered_columns]
        .reset_index(drop=True)
        .fillna('Missing')
    )

    destination = f"{main_path}Instagram/Output/Output_" + file_name + '.csv'
    cleaned.to_csv(destination, index=False)

    return cleaned

In [52]:
def structure_donations(data):
    """Turn one structure JSON file into a long-format table and store it as CSV.

    Parameters
    ----------
    data : str or pathlib.Path
        Path of the JSON file to process. The file name without its extension
        is used to name the output CSV.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``clean_and_store``.

    Steps: load the JSON, flatten its top level into one column per entry,
    strip the first path component from every column name, melt to long
    format, then run ``file_paths``, ``row_column_paths``, ``process_row_path``
    (per row, with the module-level ``columns`` and ``data_types``) and
    ``clean_and_store``. The wide frame is shown twice through ``display``
    (provided by IPython/Jupyter), before and after the column renaming.
    """
    source_path = Path(data)
    file_name = source_path.stem

    # Read as UTF-8 explicitly: JSON text is UTF-8, whereas open() without an
    # encoding falls back to the platform default.
    with open(source_path, 'r', encoding='utf-8') as f:
        structure = json.load(f)

    # Flatten only the top level; nested structures stay as dict/list cells.
    df = pd.json_normalize(structure, max_level=0)
    display(df)

    # Delete user-specific information: drop everything up to and including
    # the first "/" of each column name.
    df.columns = df.columns.str.replace(r'^[^/]+/', '', regex=True)
    display(df)

    # From wide to long: one row per original column.
    cols = df.columns.tolist()
    df = pd.melt(df, value_vars=cols)

    df = file_paths(df)

    df = row_column_paths(df)

    df = df.apply(lambda row: process_row_path(row, columns, data_types), axis=1)

    df = clean_and_store(df, file_name)

    return df


In [53]:
# Folder that holds the structure files to process (below main_path).
input_directory = Path(main_path + 'Instagram/Input_test')
print(input_directory)

/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/structure_donations/Processed_structure_donations/Instagram/Input_test


In [54]:

# Run the pipeline on every JSON file in the input folder.
# sorted() makes the processing order reproducible (iterdir() order is arbitrary).
for file in sorted(input_directory.iterdir()):
    # Skip sub-folders and anything that is not a JSON file (e.g. hidden or
    # temporary files), which json.load could not parse.
    if file.is_file() and file.suffix.lower() == '.json':
        print(file)
        structure_donations(file)
        

/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/structure_donations/Processed_structure_donations/Instagram/Input_test/instagram-geodag91-2024-11-20-poQTe0ag-20250214T091036Z-001_structure.json


Unnamed: 0,instagram-geodag91-2024-11-20-poQTe0ag/your_instagram_activity/saved/saved_posts.json,instagram-geodag91-2024-11-20-poQTe0ag/your_instagram_activity/likes/liked_posts.json,instagram-geodag91-2024-11-20-poQTe0ag/your_instagram_activity/subscriptions/show_exclusive_story_promo_setting.json,instagram-geodag91-2024-11-20-poQTe0ag/ads_information/ads_and_topics/posts_viewed.json,instagram-geodag91-2024-11-20-poQTe0ag/your_instagram_activity/messages/secret_conversations.json,instagram-geodag91-2024-11-20-poQTe0ag/security_and_login_information/login_and_profile_creation/profile_privacy_changes.json,instagram-geodag91-2024-11-20-poQTe0ag/personal_information/personal_information/personal_information.json,instagram-geodag91-2024-11-20-poQTe0ag/preferences/media_settings/consents.json,instagram-geodag91-2024-11-20-poQTe0ag/security_and_login_information/login_and_profile_creation/login_activity.json,instagram-geodag91-2024-11-20-poQTe0ag/security_and_login_information/login_and_profile_creation/instagram_signup_details.json,...,instagram-geodag91-2024-11-20-poQTe0ag/logged_information/recent_searches/profile_searches.json,instagram-geodag91-2024-11-20-poQTe0ag/security_and_login_information/login_and_profile_creation/last_known_location.json,instagram-geodag91-2024-11-20-poQTe0ag/ads_information/ads_and_topics/videos_watched.json,instagram-geodag91-2024-11-20-poQTe0ag/personal_information/personal_information/profile_changes.json,instagram-geodag91-2024-11-20-poQTe0ag/your_instagram_activity/content/profile_photos.json,instagram-geodag91-2024-11-20-poQTe0ag/personal_information/personal_information/professional_information.json,instagram-geodag91-2024-11-20-poQTe0ag/your_instagram_activity/monetization/eligibility.json,instagram-geodag91-2024-11-20-poQTe0ag/your_instagram_activity/subscriptions/your_muted_story_teaser_creators.json,instagram-geodag91-2024-11-20-poQTe0ag/connections/followers_and_following/followers_1.json,instagram-geodag91-2024-11-20-poQTe0ag/co
nnections/followers_and_following/following.json
0,"{'saved_saved_media': [{'title': 'string', 'st...","{'likes_media_likes': [{'title': 'string', 'st...",{'subscriptions_show_story_teaser_setting': [{...,{'impressions_history_posts_seen': [{'string_m...,{'ig_secret_conversations': {'armadillo_device...,{'account_history_account_privacy_history': [{...,"{'profile_user': [{'title': 'string', 'media_m...","{'timestamp': 'number', 'media': ['array'], 'l...",{'account_history_login_history': [{'title': '...,{'account_history_registration_info': [{'title...,...,"{'searches_user': [{'title': 'string', 'media_...",{'account_history_imprecise_last_known_locatio...,{'impressions_history_videos_watched': [{'stri...,{'profile_profile_change': [{'title': 'string'...,"{'ig_profile_picture': [{'uri': 'string', 'cre...","{'profile_business': [{'title': 'string', 'med...",{'monetization_eligibility': [{'title': 'strin...,{'subscriptions_muted_story_teaser_creators': ...,"{'title': 'string', 'media_list_data': ['array...",{'relationships_following': [{'title': 'string...


Unnamed: 0,your_instagram_activity/saved/saved_posts.json,your_instagram_activity/likes/liked_posts.json,your_instagram_activity/subscriptions/show_exclusive_story_promo_setting.json,ads_information/ads_and_topics/posts_viewed.json,your_instagram_activity/messages/secret_conversations.json,security_and_login_information/login_and_profile_creation/profile_privacy_changes.json,personal_information/personal_information/personal_information.json,preferences/media_settings/consents.json,security_and_login_information/login_and_profile_creation/login_activity.json,security_and_login_information/login_and_profile_creation/instagram_signup_details.json,...,logged_information/recent_searches/profile_searches.json,security_and_login_information/login_and_profile_creation/last_known_location.json,ads_information/ads_and_topics/videos_watched.json,personal_information/personal_information/profile_changes.json,your_instagram_activity/content/profile_photos.json,personal_information/personal_information/professional_information.json,your_instagram_activity/monetization/eligibility.json,your_instagram_activity/subscriptions/your_muted_story_teaser_creators.json,connections/followers_and_following/followers_1.json,connections/followers_and_following/following.json
0,"{'saved_saved_media': [{'title': 'string', 'st...","{'likes_media_likes': [{'title': 'string', 'st...",{'subscriptions_show_story_teaser_setting': [{...,{'impressions_history_posts_seen': [{'string_m...,{'ig_secret_conversations': {'armadillo_device...,{'account_history_account_privacy_history': [{...,"{'profile_user': [{'title': 'string', 'media_m...","{'timestamp': 'number', 'media': ['array'], 'l...",{'account_history_login_history': [{'title': '...,{'account_history_registration_info': [{'title...,...,"{'searches_user': [{'title': 'string', 'media_...",{'account_history_imprecise_last_known_locatio...,{'impressions_history_videos_watched': [{'stri...,{'profile_profile_change': [{'title': 'string'...,"{'ig_profile_picture': [{'uri': 'string', 'cre...","{'profile_business': [{'title': 'string', 'med...",{'monetization_eligibility': [{'title': 'strin...,{'subscriptions_muted_story_teaser_creators': ...,"{'title': 'string', 'media_list_data': ['array...",{'relationships_following': [{'title': 'string...


before value
<class 'dict'>    23
Name: count, dtype: int64
After value
<class 'dict'>    23
Name: count, dtype: int64
before col_path_1_values
<class 'dict'>    59
<class 'str'>      2
Name: count, dtype: int64
After col_path_1_values
<class 'dict'>    59
<class 'str'>      2
Name: count, dtype: int64
before col_path_2_values
<class 'dict'>        80
<class 'str'>         23
<class 'NoneType'>     2
Name: count, dtype: int64
After col_path_2_values
<class 'dict'>        80
<class 'str'>         23
<class 'NoneType'>     2
Name: count, dtype: int64
before col_path_3_values
<class 'dict'>        151
<class 'NoneType'>     39
<class 'str'>          13
Name: count, dtype: int64
After col_path_3_values
<class 'dict'>        151
<class 'NoneType'>     39
<class 'str'>          13
Name: count, dtype: int64
before col_path_4_values
<class 'str'>         149
<class 'NoneType'>     52
<class 'dict'>          2
Name: count, dtype: int64
After col_path_4_values
<class 'str'>         149
<class 'N

In [55]:
# Path to the folder containing the per-donation CSV files
output_path = f"{main_path}Instagram/Output"

# All CSV files in the folder. Sorted, because glob() order depends on the
# filesystem and the drop_duplicates step below keeps the first occurrence.
csv_files = sorted(Path(output_path).glob("*.csv"))
if not csv_files:
    # Fail with a clear message instead of an IndexError on dfs[0] below.
    raise FileNotFoundError(f"No CSV files found in {output_path}")

# Load all CSVs into a list of DataFrames
dfs = [pd.read_csv(file) for file in csv_files]


common_columns = ['variable', 'value', 'path_1', 'path_2',
        'path_3', 'path_4', 'json_name', 'row_path', 'col_path_1','col_path_2','col_path_3', 'col_path_4', 'col_path_5','data_type', 'col_path_1_values','col_path_2_values','col_path_3_values', 'col_path_4_values', 'col_path_5_values']


# Outer-merge on every column: rows that are identical across files collapse
# into one, all other rows are kept.
merged_df = dfs[0]  # Start with the first DataFrame
for df in dfs[1:]:  # Merge with the rest
    merged_df = merged_df.merge(df, on=common_columns, how="outer")


# Rows whose path_1 is exactly 'messages': keep only the first row for each
# distinct path_2 value.
# NOTE(review): structure_donations strips only the first path component, so
# for names such as 'your_instagram_activity/messages/...' path_1 is
# 'your_instagram_activity'. Confirm that 'messages' ever appears in path_1,
# otherwise this filter selects nothing.
df_filtered = merged_df[merged_df["path_1"] == "messages"].drop_duplicates(subset="path_2")


# Append all rows whose path_1 is not 'messages'
df_final = pd.concat([df_filtered, merged_df[merged_df["path_1"] != "messages"]], ignore_index=True)
# 'Missing' is the placeholder written by clean_and_store; turn it back into NaN
df_final = df_final.replace('Missing', np.nan)


# Number of missing values per column
print(df_final.isna().sum())

# Save the final merged DataFrame
df_final.to_csv(f"{main_path}Instagram/Final/Merged_structures_IG.csv", index=False)

variable               0
value                  0
path_1                 0
path_2                 0
path_3               203
path_4               203
json_name              0
row_path               0
col_path_1             2
col_path_2            39
col_path_3            52
col_path_4           201
col_path_5           202
data_type             15
col_path_1_values      0
col_path_2_values      2
col_path_3_values     39
col_path_4_values     52
col_path_5_values    201
dtype: int64


  df_final = df_final.replace('Missing', np.nan)
