# Parsing Instagram Data Structures into a Schema_df

 The purpose of his jupyter notebook is to parse the collected data structures in earlier iterations of utilising the data donation tool into a schema_df which can be used to inform future iterations of the data donation tool.

In [1]:
import pandas as pd
import json
import numpy as np
from pathlib import Path  
from itertools import cycle

In [2]:
main_path = "/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/structure_donations/Processed_structure_donations/"

## Creating functions


### columns_creator()

In [3]:
max_col_path = 6
def columns_creator(max_col_path):
    col_path = [f"col_path_{i}" for i in range(1, max_col_path+1)]
    col_var = ['value']+[f"col_path_{i}_values" for i in range(1, max_col_path)]
    get_var = [f"col_path_{i}" for i in range(1, max_col_path+1)]
    unlist_var = [f"col_path_{i}_values" for i in range(1, max_col_path+1)]
    list_var = [f"col_path_{i}" for i in range(2, max_col_path+1)]
    col_path_list = [f"col_path_{i}_LIST" for i in range(2, max_col_path+1)]

    
    return col_path, col_var, get_var, unlist_var, list_var, col_path_list

col_path, col_var, get_var, unlist_var,list_var, col_path_list = columns_creator(max_col_path)



### process_col_path()
The 'process_col_path()' function checks whether the value in a row of for one of the column paths is actually a datatype and stores this value in a column data_type and replaces the original value with NA. The data types are the lowest level values in the JSON files

In [4]:
# Define the data types
data_types = ['string', 'array', 'number', 'boolean', 'object', 'str', 'int', 'float', 'bool', 'dict', 'list']

#Define the column names
columns = col_path



# Define the function 'process_col_path()'
def process_col_path(row, columns, data_types):

    """
    row: Rows in the dataframe
    columns: List of column names of column path columns 
    data_types: List of the values that are data types
    """

    row['data_type'] = ''
    for column in columns:

        #If the value stored in the column is found in the list 'data_types', 
        if row[column] in data_types:
            # this value is placed in the column 'data_type'
            row['data_type'] = row[column]
            # and the original value is replaced with NA
            row[column] = np.nan

        #If the value is not found in the 'data_types' list, the original value is returned
        else:
            row[column]
    return row


### file_paths()
The 'file_paths()' function splits up the paths to where the JSON file is stored in the folder and provides the name of the json file. 

In [5]:
def file_paths(df):
    
    df['value'] = df['value'].apply(lambda x: x[0] if isinstance(x, list) else x)
    
    return df

### string_to_dict()
As the JSON files are loaded as strings, they need to be converted to dictionaries to extract the values and be cleaned. 

In [6]:
def string_to_dict(s):
    # Check if the items needed for splitting are present. If not present it does not need to be splitted and the orginal value is returned
    if ',' not in s and ':' not in s:
        return s
    
    # Create an empty dictionary
    result = {}

    # Split the items by comma (split into key-value pairs)
    items = s.split(',')

    # For each item in the original dictionary
    for item in items:
        # Check if it contains a key-value pair, if not continue
        if ':' not in item:
            continue  

        # Split the key-value pair into a variable 'key' and a variable 'value'
        key, value = item.split(':', 1)  # use maxsplit=1 to avoid unpacking issue

        # Try to strip any white spaces from the keys and values 
        try:
            key = eval(key.strip())
            value = eval(value.strip())
        # If not possible, continue
        except Exception as e:
            continue
        
        # Save the converted and cleaned dictionary
        result[key] = value
    return result
    

### detect_list()

Due to lists in unexpected places in the JSON files and lengthy json paths, we need to identify the positions of lists to later select the correct get() function

In [7]:
# Assign list if the data in the original structure is a list
    
def detect_list(x):
    # If the data type of the value is list, 'LIST' is assigned in the '_LIST' columns (see'column_paths())
    if isinstance(x, list):
        return 'LIST'
    # If the value is missing, 'MISSING' is assigned
    elif pd.isna(x):
        return 'MISSING'
    # If the value is 'No data' (there is an empty place holder) 'MISSING' is assigned
    elif x == 'No data':
            return 'MISSING'
    # if the value is not missing and is not a list, 'NO LIST' is assigned
    else:
        return 'NO LIST'

### columns_paths()
Through this function, we map the json paths to each value by putting the keys in separate columns: colpath_{1,2,3,4,5} (see 'row_column_paths()')

In [8]:
def column_paths(df, col_var, get_var, unlist_var, list_var):

    # Convert to dict if it's a string
    df[col_var] = df[col_var].apply(
        lambda x: string_to_dict(x) if isinstance(x, str) else x)
    
        # Extract nested values using keys
    """
    1. Selects the key obtained in the previous iteration stored in 'get_var'
    2. The key put into a get() function which extracts values of a dictionary based on the keys
    3. The get() function extracts the values from the (nested) dictionary stored in 'col_var'
    4. If the get() function fails, None is returned
    5. The steps above are only performed if the value stored in 'col_var' is a dictionary, otherwise None is returned
    6. All steps above are executed for each row in the df
    """

    df[get_var] = df.apply(
    lambda row: list(row[col_var].keys()) if isinstance(row[col_var], dict) else row[col_var], axis=1)

    
    df = df.explode(get_var)

    
    # Extract nested values using keys
    df[unlist_var] = df.apply(lambda row: row[col_var].get(row[get_var], None) if isinstance (row[col_var], dict) else None, axis=1)

    # For each each column_path, check if the value contains a list
    df[f'{list_var}_LIST'] = df[unlist_var].apply(detect_list)
    

    df[unlist_var] = df[unlist_var].apply(
        lambda x: [x] if not isinstance(x, list) else x)

    df = df.explode(unlist_var)
    
     # Unlist data to avoid double nested lists 
    df[unlist_var] = df[unlist_var].apply(lambda x: x[0] if isinstance(x, list) else x)



    return df

In [9]:
print(unlist_var)
print(list_var)
print(col_var)
print(col_path)

['col_path_1_values', 'col_path_2_values', 'col_path_3_values', 'col_path_4_values', 'col_path_5_values', 'col_path_6_values']
['col_path_2', 'col_path_3', 'col_path_4', 'col_path_5', 'col_path_6']
['value', 'col_path_1_values', 'col_path_2_values', 'col_path_3_values', 'col_path_4_values', 'col_path_5_values']
['col_path_1', 'col_path_2', 'col_path_3', 'col_path_4', 'col_path_5', 'col_path_6']


### row_column_paths()
This function prepares the row paths and consequently executes the column_paths function to obtain the paths of keys (stored in individual columns) to reach the lowest value which is where the actual data is stored


In [10]:
def row_column_paths(df, col_path, col_var, get_var, unlist_var, list_var):
    # Initialize the new columns
    df['row_path'] = df['variable'] 
    df[col_path] = np.nan

    # Initiate the list of columns needed to inform the function column_paths()
    col_var = col_var
    get_var = get_var 
    unlist_var = unlist_var
    list_var =  list_var

    # Execute the colums_path() function
    # zip is necessary due to the large number of variables needed in the column_paths function
    for r, g, u, l in zip(col_var, get_var, unlist_var, list_var):
        df = column_paths(df, r, g, u, l)
        
    return df

### clean_and_store()
This function orders the columns in the DataFrame, resets the index, fills the NA and saves the DataFrame as CSV

In [11]:
def clean_and_store(df, file_name):

    """
    df: The dataframe that will be cleaned and stored
    file_name: The filename of data structure that is being processed
    """
    
    # Reorder the columns in the df
    df = df[['variable', 'value', 'row_path', 
                    'col_path_1',  
                     'col_path_2', 'col_path_2_LIST',
                     'col_path_3',  'col_path_3_LIST',
                     'col_path_4', 'col_path_4_LIST',
                     'col_path_5', 'col_path_5_LIST',
                     'col_path_6', 'col_path_6_LIST',
                     'data_type']]
    
    
    # Reset the index
    df.reset_index(drop=True, inplace=True)

    # fill na values with 'Missing'
    df = df.fillna('Missing')


    # Save the DataFrame 
    df.to_csv(f"{main_path}TikTok/Output/Output_" + file_name + '.csv', index=False)

    return df

### extract_path()

In [12]:
def extract_path(df, max_col_path) -> tuple[str, ...]:
    path = []
    for col in ["row_path"] + [f"col_path_{i}" for i in range(1, max_col_path)]:
        val = df[col]
        if pd.notna(val) and str(val) != "Missing":
            path.append(str(val).strip())
    return tuple(path)

### structure_donations()
The structure_donations() function executes all functions above and results in a saved DataFrame for each data structure.

In [13]:
def structure_donations(data):

    """
    data: The unprocessed data structure JSON that will be processed
    """

    # Store the path to the data structure
    data = Path(data)  
    
    # Save teh file name of the data structure
    file_name = Path(data).stem 

    # Load JSON file (data structures)
    with open(data, 'r') as f:
        data = json.load(f)
    
    # Strip top-level key
    data = list(data.values())[0]


    # Flatten JSON (handling nested structures)
    df = pd.json_normalize(data, max_level=0)

    # Extract column names
    cols = df.columns.tolist()


    # From wide to long df
    df = pd.melt(df, value_vars= cols)

    # Execute the 'file_paths()' function and store the result in df
    df = file_paths(df)

    # Execute the 'row_column_paths()' function and store the result in df
    df = row_column_paths(df, col_path, col_var, get_var, unlist_var, list_var)
    # Execute the 'process_col_path()' function and store the result in df
    df = df.apply(lambda row: process_col_path(row, columns, data_types), axis=1)

    # Execute the 'clean_and_store()' function and store the result in df
    df = clean_and_store(df, file_name)
    
    
    return df


## Execute 'structure_donations()': Transform data structures from JSON format to tabular format

In [14]:
input_directory = Path(f'{main_path}TikTok/Input_test')  
print(input_directory)

/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/structure_donations/Processed_structure_donations/TikTok/Input_test


In [15]:
# Execute the 'structure_donations()' function for each file (data structure) in the input directory
for file in input_directory.iterdir():  
    if file.is_file():  
        structure_donations(file)
        

## Merge all data structures into one schema_df

In [16]:
# Path to the folder containing CSV files
output_path = f"{main_path}TikTok/Output"

# Get a list of all CSV files in the folder
csv_files = list(Path(output_path).glob("*.csv"))

# Load all CSVs into a list of DataFrames
dfs = [pd.read_csv(file) for file in csv_files]

# Concatenate all DataFrames
merged_df = pd.concat(dfs, axis=0, ignore_index=True)

col_subset = merged_df.columns.tolist()
col_subset.remove('value')

# Drop rows that are completely identical across all columns except value
merged_df = merged_df.drop_duplicates(subset= col_subset)


df_index = merged_df[(merged_df['row_path'] == 'Direct Message')].index
merged_df.drop(df_index, inplace= True)

# Replace 'Missing' with NaN
merged_df = merged_df.replace('Missing', np.nan)




  merged_df = merged_df.replace('Missing', np.nan)


In [17]:
col_subset = merged_df.columns.tolist()
col_subset.remove('value')

# Drop rows that are completely identical across all columns
merged_df = merged_df.drop_duplicates(subset= col_subset)

merged_df['path'] = merged_df.apply(lambda x: extract_path(x, max_col_path), axis = 1)



### Clean up and reduce number of columns

In [None]:
def list_path(row):
    path = row['path'] 
    for i in range(len(path)):
        if row[f"col_path_{i+1}_LIST"] == "LIST":
            var_type = 'list'
            list_path = json.dumps(path[:-1])
            subfield_path = path[-1]
            column_name = subfield_path
            subfield_path = json.dumps(subfield_path)
        elif row[f"col_path_{i+1}_LIST"] == "NO LIST":
            var_type = 'static'
            list_path = np.nan
            subfield_path = json.dumps(path)
            column_name = path[-1]
        elif row[f"col_path_{i+1}_LIST"] == "MISSING" :
            var_type = 'skip'
            list_path = np.nan
            subfield_path = json.dumps(path)
            column_name = path[-1]
        return list_path, subfield_path, column_name, var_type


merged_df[['list_path', 'subfield_path', 'column_name', 'var_type']] = merged_df.apply(lambda x: pd.Series(list_path(x)), axis = 1)



### Id creation

In [19]:
id_df = merged_df[['variable', 'path']]
id_df = id_df.drop_duplicates()
id_df['id'] = ''

for n in range(1,max_col_path):
    duplicates = id_df[id_df.duplicated(subset='id', keep=False)]

    for i, row in duplicates.iterrows():
        path = list(row['path'])
        path_zero = path[0]
        path_rest = path[-n:]  # Varies each loop to attempt uniqueness

        id_list = [path_zero] + path_rest
        new_id = ':'.join(id_list)

        id_df.at[i, 'id'] = new_id  # Properly update the DataFrame

merged_df = pd.merge(merged_df, id_df, on = ['variable', 'path'], how = 'left')

col = merged_df.pop('id') 
merged_df.insert(0, 'id', col) 
#merged_df['id']= merged_df['id'].fillna(merged_df['variable'])


In [20]:
merged_df = merged_df.drop_duplicates()
dup = merged_df[merged_df.duplicated('id', keep= False)]
dup['duplicate_flag'] = 'Yes'
dup = dup[['id', 'duplicate_flag']]
merged_df = pd.merge(merged_df, dup, on = 'id', how = 'left')
merged_df['duplicate_flag'] = merged_df['duplicate_flag'].fillna(value = 'No')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup['duplicate_flag'] = 'Yes'


In [21]:
merged_df['json_name'] = ''

keep_columns = ['id', 'column_name',  'variable',
                 'path', 'list_path', 'subfield_path', 
                 'var_type', 'data_type','row_path'] + col_path + ['json_name', 'duplicate_flag']

print(keep_columns)

merged_df = merged_df[keep_columns ]


['id', 'column_name', 'variable', 'path', 'list_path', 'subfield_path', 'var_type', 'data_type', 'row_path', 'col_path_1', 'col_path_2', 'col_path_3', 'col_path_4', 'col_path_5', 'col_path_6', 'json_name', 'duplicate_flag']


In [22]:
# Save the final merged DataFrame
merged_df.to_csv(f"{main_path}TikTok/Final/Merged_structures_TT.csv", index=False)

In [23]:
merged_df.shape

(227, 17)