# Probing of the English markers

In [1]:
import os

In [2]:
from utils import load_files_from_folder
from utils import separate_target_and_non_target
from utils import balance_and_create_layered_data
from utils import run_probing_on_all_layers

In [3]:
# Marker names iterated over when the per-marker probing data is built.
markers = [
    'Accusative',
    'Causal_final',
    'Dative',
    'Genitive',
    'Plural',
    'Sublative',
    'Translative',
    'VerbConjugation',
]

In [4]:
import os
import pandas as pd

def load_and_label_data(data_folder):
    """
    Load every ``*representations.csv`` below ``data_folder`` and label it by marker.

    The marker name of a file is the name of the directory that contains it,
    cut at the first occurrence of ``"_representations"``. Every loaded frame
    receives one integer column per marker found anywhere under
    ``data_folder``: 1 in the column of the file's own marker, 0 in all others.
    Frames that belong to the same marker are concatenated with a fresh index.

    Directories and files are visited in sorted order and marker columns are
    added in sorted order, so the resulting column and row order does not
    depend on filesystem ordering or on string-hash randomization.

    Args:
        data_folder (str): Root directory that is searched recursively.

    Returns:
        dict: Maps each marker name to its labeled ``pd.DataFrame``.
    """
    # Single pass over the tree: remember which files belong to which marker.
    files_by_marker = {}
    for root, dirs, files in os.walk(data_folder):
        dirs.sort()  # in-place sort makes os.walk descend in a stable order
        for file in sorted(files):
            if file.endswith("representations.csv"):
                marker_name = os.path.basename(root).split("_representations")[0]
                files_by_marker.setdefault(marker_name, []).append(os.path.join(root, file))

    # Sorted list instead of a set so the label columns always come out in the same order.
    all_marker_names = sorted(files_by_marker)

    labeled_dfs = {}
    for marker_name, file_paths in files_by_marker.items():
        frames = []
        for file_path in file_paths:
            df = pd.read_csv(file_path)
            for marker in all_marker_names:
                df[marker] = 1 if marker == marker_name else 0
            frames.append(df)

        # Concatenate once per marker rather than once per file.
        if len(frames) == 1:
            labeled_dfs[marker_name] = frames[0]
        else:
            labeled_dfs[marker_name] = pd.concat(frames, ignore_index=True)

    return labeled_dfs


In [5]:
# Load and label every representation file found below the English data folder.
labeled_dfs = load_and_label_data('Full_data_english/')

In [6]:
import pandas as pd
import ast
import re

def preprocess_representation(representation_str, expected_layers=12, hidden_size=768):
    """
    Parse the string form of a list of NumPy arrays into per-layer lists of floats.

    The input is expected to look like
    ``"[array([...], dtype=float32), array([...], dtype=float32), ...]"``.
    Each ``array(...)`` chunk is treated as one layer. A layer is kept only if
    it yields at least ``hidden_size`` values (only the first ``hidden_size``
    are retained); shorter chunks are dropped silently. If fewer than
    ``expected_layers`` layers survive, independent zero-filled layers are
    appended until there are ``expected_layers``. Inputs with more valid
    layers than ``expected_layers`` are NOT truncated.

    Args:
        representation_str (str): String representation of a list of NumPy arrays.
        expected_layers (int): Minimum number of layers in the result (default 12).
        hidden_size (int): Number of floats required per layer (default 768).

    Returns:
        list: At least ``expected_layers`` lists of ``hidden_size`` floats each.

    Raises:
        ValueError: If a comma-separated token cannot be converted to ``float``.
    """
    # NOTE: lstrip/rstrip treat their argument as a *set of characters*, not a
    # literal prefix/suffix, so this also consumes the inner '[' of the first
    # array. Any remaining brackets are removed per layer below.
    stripped_str = representation_str.lstrip('[array(').rstrip('])')

    # Each layer is separated by the end of one array(...) and the start of the next.
    layer_strs = stripped_str.split('), array([')

    processed_layers = []
    for layer_str in layer_strs:
        clean_layer_str = layer_str.split('\ndtype=float32')[0].strip()
        clean_layer_str = clean_layer_str.replace(' ', '').replace('[', '').replace(']', '')
        # Once spaces are gone, a wrapped "\n      dtype=float32" collapses to
        # "\ndtype=float32", so it has to be removed again at this point.
        clean_layer_str = clean_layer_str.replace('\ndtype=float32', '')
        clean_layer_str = clean_layer_str.replace('dtype=float32', '')

        # Empty tokens come from trailing commas left behind by the dtype removal.
        float_values = [float(val) for val in clean_layer_str.split(",") if val][:hidden_size]

        # Layers that are too short are discarded rather than padded.
        if len(float_values) == hidden_size:
            processed_layers.append(float_values)

    # Pad with fresh lists; `[[0.0] * n] * k` would repeat one shared list k times,
    # so a later in-place change to one padded layer would show up in all of them.
    missing_layers = expected_layers - len(processed_layers)
    if missing_layers > 0:
        processed_layers.extend([0.0] * hidden_size for _ in range(missing_layers))

    return processed_layers

# NOTE: preprocess_representation has no error handling; malformed input raises instead of returning 12 layers of 768 zeroes.

def transform_to_hungarian_structure(df):
    """
    Reshape a labeled DataFrame into the column layout used by the Hungarian dataset.

    The ``Causal_final`` column is renamed to ``CausalFinal``, any marker
    column that is absent is created and filled with 0, the raw hidden
    representation strings are parsed with ``preprocess_representation``, and
    the columns are finally selected in a fixed order.

    Args:
        df (pd.DataFrame): The input DataFrame to transform (e.g., English dataset).

    Returns:
        pd.DataFrame: A frame holding only ``Word``, ``Lemma``, ``Sentence``, the
        hidden representations and the eight marker columns, in that order.
    """
    representation_column = 'Hidden Representations (All Layers)'
    marker_columns = [
        'Accusative', 'Genitive', 'Dative', 'Sublative',
        'CausalFinal', 'Translative', 'Plural', 'VerbConjugation',
    ]

    # rename() returns a new frame, so the caller's DataFrame is left untouched.
    result = df.rename(columns={'Causal_final': 'CausalFinal'})

    # Markers that never occurred in this frame get an all-zero label column.
    for name in marker_columns:
        if name not in result.columns:
            result[name] = 0

    # Turn each stored string into a list of per-layer float lists.
    parsed = result[representation_column].apply(preprocess_representation)
    result[representation_column] = parsed

    # Fixed output layout: text fields, representations, then marker labels.
    final_order = ['Word', 'Lemma', 'Sentence', representation_column] + marker_columns
    return result[final_order]


In [7]:
# Bring every per-marker frame into the common column layout, keyed by marker name.
representations = {
    marker_name: transform_to_hungarian_structure(marker_df)
    for marker_name, marker_df in labeled_dfs.items()
}

In [8]:
# For each marker: split the data into target / non-target examples, then
# write the layered train and eval data below 'Layered_data_english'.
for target_marker in markers:
    target_reps, non_target_reps = separate_target_and_non_target(
        representations, target_marker
    )
    balance_and_create_layered_data(
        target_reps,
        non_target_reps,
        target_marker,
        layered_data_folder='Layered_data_english',
        target_threshold=200,
        eval_split=0.15,
    )

Layer 1 representations saved for train and eval datasets in 'Layered_data_english/Accusative/1'.
Layer 2 representations saved for train and eval datasets in 'Layered_data_english/Accusative/2'.
Layer 3 representations saved for train and eval datasets in 'Layered_data_english/Accusative/3'.
Layer 4 representations saved for train and eval datasets in 'Layered_data_english/Accusative/4'.
Layer 5 representations saved for train and eval datasets in 'Layered_data_english/Accusative/5'.
Layer 6 representations saved for train and eval datasets in 'Layered_data_english/Accusative/6'.
Layer 7 representations saved for train and eval datasets in 'Layered_data_english/Accusative/7'.
Layer 8 representations saved for train and eval datasets in 'Layered_data_english/Accusative/8'.
Layer 9 representations saved for train and eval datasets in 'Layered_data_english/Accusative/9'.
Layer 10 representations saved for train and eval datasets in 'Layered_data_english/Accusative/10'.
Layer 11 represent

In [9]:
# Run the probing experiments on the layered data and store the results per marker.
run_probing_on_all_layers(
    layered_data_folder='Layered_data_english',
    output_folder='Output_comparison_english',
)

Processing marker: Accusative


  cumulative_results_df = pd.concat([cumulative_results_df, pd.DataFrame([{


Cumulative results for all layers of Accusative saved to Output_comparison_english/Accusative/Accusative_results.csv
Processing marker: Plural


  cumulative_results_df = pd.concat([cumulative_results_df, pd.DataFrame([{


Cumulative results for all layers of Plural saved to Output_comparison_english/Plural/Plural_results.csv
Processing marker: Translative


  cumulative_results_df = pd.concat([cumulative_results_df, pd.DataFrame([{


Cumulative results for all layers of Translative saved to Output_comparison_english/Translative/Translative_results.csv
Processing marker: Dative


  cumulative_results_df = pd.concat([cumulative_results_df, pd.DataFrame([{


Cumulative results for all layers of Dative saved to Output_comparison_english/Dative/Dative_results.csv
Processing marker: Causal_final


  cumulative_results_df = pd.concat([cumulative_results_df, pd.DataFrame([{
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cumulative results for all layers of Causal_final saved to Output_comparison_english/Causal_final/Causal_final_results.csv
Processing marker: Sublative


  _warn_prf(average, modifier, msg_start, len(result))
  cumulative_results_df = pd.concat([cumulative_results_df, pd.DataFrame([{


Cumulative results for all layers of Sublative saved to Output_comparison_english/Sublative/Sublative_results.csv
Processing marker: VerbConjugation


  cumulative_results_df = pd.concat([cumulative_results_df, pd.DataFrame([{


Cumulative results for all layers of VerbConjugation saved to Output_comparison_english/VerbConjugation/VerbConjugation_results.csv
Processing marker: Genitive


  cumulative_results_df = pd.concat([cumulative_results_df, pd.DataFrame([{


Cumulative results for all layers of Genitive saved to Output_comparison_english/Genitive/Genitive_results.csv
