# Parsing X/Twitter Data Structures into a Schema_df

In [177]:
import pandas as pd
import json
import numpy as np
from pathlib import Path  

In [178]:
main_path = "/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/structure_donations/Processed_structure_donations/"


## Creating functions
### process_col_path()
The 'process_col_path()' function checks whether the value in a row of for one of the column paths is actually a datatype and stores this value in a column data_type and replaces the original value with NA. The data types are the lowest level values in the JSON files.

In [179]:
# Define the data types
data_types = ['string', 'array', 'number', 'boolean', 'object', 'str', 'int', 'float', 'bool', 'dict', 'list']

#Define the column names
columns = ['col_path_1','col_path_2','col_path_3','col_path_4', 'col_path_5', 'col_path_6', 'col_path_7', 'col_path_8']



# Define the function 'process_col_path()'
def process_col_path(row, columns, data_types):

    """
    row: Rows in the dataframe
    columns: List of column names of column path columns 
    data_types: List of the values that are data types
    """

    row['data_type'] = ''
    for column in columns:

        #If the value stored in the column is found in the list 'data_types', 
        if row[column] in data_types:
            # this value is placed in the column 'data_type'
            row['data_type'] = row[column]
            # and the original value is replaced with NA
            row[column] = np.nan

        #If the value is not found in the 'data_types' list, the original value is returned
        else:
            row[column]
    return row


### file_paths()
The 'file_paths()' function splits up the paths to where the JSON file is stored in the folder and provides the name of the json file. 
- 'path_{1,2,3,4}': Column including the names of the {first, second, third, fourth} level folder where the JSON file is stored
- 'json_name': Column including the name of the JSON file

If the JSON name appears in the 'path_{1,2,3,4}' column, this name is replaced with NA and stored in the column 'json_name'

In [180]:
def file_paths(df):

    # Create different columns for each part of the document path
    df['path_1'] = df['variable'].str.split('/', n=1).str[0]
    df['path_2'] = df['variable'].str.split('/', n=3).str[1]
    df['path_3'] = df['variable'].str.split('/', n=3).str[2]
    df['path_4'] = df['variable'].str.split('/', n=3).str[3]

    # Create a column with the JSON name
    df['json_name'] = df['variable'].str.rsplit('/', n=1).str[-1]


    ## As the JSON name is stored in the json_name column, fill other parts of the path with Na if the name of the JSON is present
    # If the value is not the name of the value, return the original value
    mark = ".json"

    df[['path_2', 'path_3', 'path_4']] =  df[['path_2', 'path_3', 'path_4']].map(lambda x: np.nan if isinstance(x, str) and mark in x else x)

    # Unlist the value column (where the JSON info is stored) if it contains a list, otherwise return the original value 
    #df['value'] = df['value'].apply(lambda x: x[0] if isinstance(x, list) else x)

    
    return df

### string_to_dict()
As the JSON files are loaded as strings, they need to be converted to dictionaries to extract the values and be cleaned. 

In [181]:
def string_to_dict(s):
    # Check if the items needed for splitting are present. If not present it does not need to be splitted and the orginal value is returned
    if ',' not in s and ':' not in s:
        return s
    
    # Create an empty dictionary
    result = {}

    # Split the items by comma (split into key-value pairs)
    items = s.split(',')

    # For each item in the original dictionary
    for item in items:
        # Check if it contains a key-value pair, if not continue
        if ':' not in item:
            continue  

        # Split the key-value pair into a variable 'key' and a variable 'value'
        key, value = item.split(':', 1)  # use maxsplit=1 to avoid unpacking issue

        # Try to strip any white spaces from the keys and values 
        try:
            key = eval(key.strip())
            value = eval(value.strip())
        # If not possible, continue
        except Exception as e:
            continue
        
        # Save the converted and cleaned dictionary
        result[key] = value
    return result

    

### detect_list()

Due to lists in unexpected places in the JSON files and lengthy json paths, we need to identify the positions of lists to later select the correct get() function

In [182]:
# Assign list if the data in the original structure is a list
    
def detect_list(x):
    # If the data type of the value is list, 'LIST' is assigned in the '_LIST' columns (see'column_paths())
    if isinstance(x, list):
        return 'LIST'
    # If the value is missing, 'MISSING' is assigned
    elif pd.isna(x):
        return 'MISSING'
    # If the value is 'No data' (there is an empty place holder) 'MISSING' is assigned
    elif x == 'No data':
            return 'MISSING'
    # if the value is not missing and is not a list, 'NO LIST' is assigned
    else:
        return 'NO LIST'

In [183]:
def column_paths(df, col_var, get_var, unlist_var, explode_var):
        
        # Convert to dictionary if it's a string
        df[col_var] = df[col_var].apply(
            lambda x: string_to_dict(x) if isinstance(x, str) else x)
        
        #df = df.explode(col_var)

        df[get_var] = df.apply(
        lambda row: list(row[col_var].keys()) if isinstance(row[col_var], dict) else row[col_var], axis=1)
       
       
        


        df = df.explode(get_var)

        
        # Extract nested values using keys
        """
        1. Selects the key obtained in the previous iteration stored in 'get_var'
        2. The key put into a get() function which extracts values of a dictionary based on the keys
        3. The get() function extracts the values from the (nested) dictionary stored in 'col_var'
        4. If the get() function fails, None is returned
        5. The steps above are only performed if the value stored in 'col_var' is a dictionary, otherwise None is returned
        6. All steps above are executed for each row in the df
        """
        df[unlist_var] = df.apply(lambda row: row[col_var].get(row[get_var], None) if isinstance (row[col_var], dict) else None, axis=1)

        # For each each column_path, check if the value contains a list
        df[f'{explode_var}_LIST'] = df[unlist_var].apply(detect_list)

        df[unlist_var] = df[unlist_var].apply(
        lambda x: [x] if not isinstance(x, list) else x)

        df = df.explode(unlist_var)

        # Unlist data to avoid double nested lists 
        df[unlist_var] = df[unlist_var].apply(lambda x: x[0] if isinstance(x, list) else x)

        
        df[explode_var] = df[unlist_var]
        

    
        #print(df.columns.tolist())
        return df


### row_column_paths()
This function prepares the row paths and consequently executes the column_paths function to obtain the paths of keys (stored in individual columns) to reach the lowest value which is where the actual data is stored


In [184]:

def row_column_paths(df):
    # Initialize the new columns
    df['row_path'] = '' 
    df['col_path_1'] = ''  
    df['col_path_2'] = ''  
    df['col_path_3'] = ''
    df['col_path_4'] = ''
    df['col_path_6'] = '' 
    df['col_path_7'] = ''
    df['col_path_8'] = ''

    
    df['value'] = df['value'].apply(
        lambda x: [x] if not isinstance(x, list) else x)

    
    
    df = df.explode('value')

   
    
    df['value'] = df['value'].apply(
            lambda x: string_to_dict(x) if isinstance(x, str) else x)
    
    

    def extract_row_path(val):
        if isinstance(val, dict):
            return list(val.keys())
        elif isinstance(val, list):
            return val
        else:
            return ['no data']

    df['row_path'] = df['value'].apply(extract_row_path)

    #df['row_path'] = df['row_path'].astype('str').apply(eval).str[0]
  
    # Take the level 1 keys stored in a list and store them in individual rows
    df = df.explode('row_path')

    # Initiate the list of columns needed to inform the function column_paths()
    col_var = ['value', 'col_path_1_values', 'col_path_2_values', 'col_path_3_values', 'col_path_4_values', 'col_path_5_values', 'col_path_6_values', 'col_path_7_values']
    get_var = ['row_path', 'col_path_1', 'col_path_2', 'col_path_3', 'col_path_4',  'col_path_5',  'col_path_6',  'col_path_7']
    unlist_var = ['col_path_1_values', 'col_path_2_values', 'col_path_3_values', 'col_path_4_values', 'col_path_5_values', 'col_path_6_values', 'col_path_7_values', 'col_path_8_values']
    explode_var = ['col_path_1', 'col_path_2', 'col_path_3', 'col_path_4', 'col_path_5', 'col_path_6', 'col_path_7', 'col_path_8']

    for r, g, u, e in zip(col_var, get_var, unlist_var, explode_var):
        df = column_paths(df, r, g, u, e)
        col_var = unlist_var

    return df



### clean_and_store()
This function orders the columns in the DataFrame, resets the index, fills the NA and saves the DataFrame as CSV

In [185]:
def clean_and_store(df, file_name):

    """
    df: The dataframe that will be cleaned and stored
    file_name: The filename of data structure that is being processed
    """
   
    # Reorder the columns in the df
    df = df.loc[:, ['variable', 'value', 'path_1', 'path_2','path_3', 'path_4',
                     'json_name', 'row_path',
                     'col_path_1', 'col_path_1_LIST',
                     'col_path_2', 'col_path_2_LIST',
                     'col_path_3', 'col_path_3_LIST',
                     'col_path_4', 'col_path_4_LIST',
                     'col_path_5', 'col_path_5_LIST',
                     'col_path_6', 'col_path_6_LIST',
                     'col_path_7', 'col_path_7_LIST',
                     'col_path_8', 'col_path_8_LIST',
                     'data_type']]
    
    """
    # Reorder the columns in the df
    df = df.loc[:, ['variable', 'value', 'path_1', 'path_2','path_3', 'path_4',
                     'json_name', 'row_path',
                     'col_path_1', 'col_path_1_LIST',
                     'col_path_2', 'col_path_2_LIST',
                     'col_path_3', 'col_path_3_LIST',
                     'col_path_4', 'col_path_4_LIST',
                     'col_path_5', 'col_path_5_LIST',
                     'col_path_6', 'col_path_6_LIST',
                     'col_path_7', 'col_path_7_LIST',
                     'col_path_8', 'col_path_8_LIST',
                     'data_type', 'col_path_1_values',
                     'col_path_2_values','col_path_3_values',
                     'col_path_4_values', 'col_path_5_values',
                     'col_path_6_values', 'col_path_7_values',
                     'col_path_8_values']]
    """
    
    # Reset the index
    df.reset_index(drop=True, inplace=True)

    # fill na values with 'Missing'
    df = df.fillna('Missing')

    col_subset = df.columns.tolist()
    col_subset.remove('value')

    #   Drop rows that are completely identical across all columns
    df = df.drop_duplicates(subset= col_subset)
   
     # Save the DataFrame 
    df.to_csv(f"{main_path}Twitter/Output/Output_" + file_name + '.csv', index=False)

    return df

In [186]:
def id_creation(df):
    print(df.where(df.map(lambda x: x == np.nan)))

### structure_donations()
The structure_donations() function executes all functions above and results in a saved DataFrame for each data structure.


In [187]:
def structure_donations(data):

    """
    data: The unprocessed data structure JSON that will be processed
    """

    # Store the path to the data structure
    data = Path(data)  
    
    # Save teh file name of the data structure
    file_name = Path(data).stem 

    # Load JSON file (data structures)
    with open(data, 'r') as f:
        data = json.load(f)


    
    rows = [{'variable': k, 'value': v} for k, v in data.items()]
    df = pd.DataFrame(rows)

    
    # Execute the 'file_paths()' function and store the result in df
    df = file_paths(df)
    
    # Execute the 'row_column_paths()' function and store the result in df
    df = row_column_paths(df)
    
    # Execute the 'process_col_path()' function and store the result in df
    df = df.apply(lambda row: process_col_path(row, columns, data_types), axis=1)

    # Execute the 'clean_and_store()' function and store the result in df
    df = clean_and_store(df, file_name)
    
    
    
    return df


### id_creation()

In [188]:

def id_creation(df):
    
    df = df[['json_name', 'row_path',
                      'col_path_1',
                      'col_path_2',
                      'col_path_3',
                      'col_path_4',
                      'col_path_5', 
                      'col_path_6', 
                      'col_path_7', 
                      'col_path_8']]


    df_index = df[(df['row_path'] == 'No data')].index
    df.drop(df_index, inplace= True)

    df['index'] = df.index
    col = df.pop('index') 
    df.insert(0, 'index', col)  

    df['name'] = df['json_name'].str.replace(".js", "")
    col = df.pop('name') 
    df.insert(2, 'name', col) 

    last = df.apply(pd.Series.last_valid_index, axis=1)
    second = df.shift(-1, axis =1).apply(pd.Series.last_valid_index, axis=1)
    third =  df.shift(-2, axis =1).apply(pd.Series.last_valid_index, axis=1)
    fourth =  df.shift(-3, axis =1).apply(pd.Series.last_valid_index, axis=1)


    df['last'] = last
    df['second'] = second
    df['third'] =  third
    df['fourth'] =  fourth

    df['id'] = df.apply(lambda x: f"{x['name']}:{x[f"{x['last']}"]}", axis = 1)

    
    duplicate = list(df[df.duplicated(subset = 'id')]['id'])
    print('dup0', len(duplicate))
   

    df['id'] = df.apply(lambda x: f"{x['name']}:{x[f"{x['second']}"]}:{x[f"{x['last']}"]}" if x['id'] in duplicate else x['id'] , axis = 1)


    df['id'] = df.apply(lambda x: f"{x['name']}:{x[f"{x['third']}"]}:{x[f"{x['second']}"]}:{x[f"{x['last']}"]}" 
                        if x['id']
                         in list(df[df.duplicated(subset = 'id')]['id'])
                           else x['id'], axis = 1)
    
    
    df['id'] = df.apply(lambda x: f"{x['name']}:{x[f"{x['fourth']}"]}:{x[f"{x['third']}"]}:{x[f"{x['second']}"]}:{x[f"{x['last']}"]}" 
                        if x['id']
                         in list(df[df.duplicated(subset = 'id')]['id']) 
                         else x['id'] , axis = 1)
   
    df = df.drop(['last', 'second', 'third', 'fourth', 'index', 'name'], axis = 1)

    col = df.pop('id') 
    df.insert(0, 'id', col)  
            
        
    return df


## Execute 'structure_donations()': Transform data structures from JSON format to tabular format

In [189]:
# Specify the input directory
input_directory = Path(f'{main_path}Twitter/Input_test')  
print(input_directory)

/home/rvissche/Nextcloud/What-If/what-if-data-donation/what-if-data-donation/structure_donations/Processed_structure_donations/Twitter/Input_test


In [190]:
# Execute the 'structure_donations()' function for each file (data structure) in the input directory
for file in input_directory.iterdir():  
    if file.is_file():  
        structure_donations(file)
        

## Merge all data structures into one schema_df

In [191]:
# Path to the folder containing CSV files
output_path = f"{main_path}Twitter/Output"

# Get a list of all CSV files in the folder
csv_files = list(Path(output_path).glob("*.csv"))

# Load all CSVs into a list of DataFrames
dfs = [pd.read_csv(file) for file in csv_files]

# Concatenate all dataframes
merged_df = pd.concat(dfs, axis=0, ignore_index=True)



col_subset = merged_df.columns.tolist()
col_subset.remove('value')

# Drop rows that are completely identical across all columns
merged_df = merged_df.drop_duplicates(subset= col_subset)


# Filter where col1 contains 'messages', then drop duplicates based on col2
df_filtered = merged_df[merged_df["path_1"] == "messages"].drop_duplicates(subset="path_2")


# Append rows where col1 does not contain 'messages'
df_final = pd.concat([df_filtered, merged_df[merged_df["path_1"] != "messages"]], ignore_index=True)
df_final = df_final.replace('Missing', np.nan)

df_id = id_creation(df_final)

merge_cols = ['json_name', 'row_path', 'col_path_1', 'col_path_2', 'col_path_3',
       'col_path_4', 'col_path_5', 'col_path_6', 'col_path_7', 'col_path_8']

df_final = pd.merge(df_final, df_id, on = merge_cols, how = 'left')

df_final['name'] = df_final['json_name'].str.replace(".js", "")
df_final['id']= df_final['id'].fillna(df_final['name'])
df_final = df_final.drop('name', axis = 1)

col = df_final.pop('id') 
df_final.insert(0, 'id', col) 

# Save the final merged DataFrame
df_final.to_csv(f"{main_path}Twitter/Final/Merged_structures_X.csv", index=False)

  df_final = df_final.replace('Missing', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df_index, inplace= True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['index'] = df.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['name'] = df['json_name'].str.replace(".js", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer

dup0 318


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['id'] = df.apply(lambda x: f"{x['name']}:{x[f"{x['fourth']}"]}:{x[f"{x['third']}"]}:{x[f"{x['second']}"]}:{x[f"{x['last']}"]}"


In [192]:
df_id.columns

Index(['id', 'json_name', 'row_path', 'col_path_1', 'col_path_2', 'col_path_3',
       'col_path_4', 'col_path_5', 'col_path_6', 'col_path_7', 'col_path_8'],
      dtype='object')