In [None]:
import pandas as pd
import numpy as np
import os
from consts import ISCO_MAPPING

def calculate_lisas(df):
    """
    Compute the Linear Integrated Speed-Accuracy Score (LISAS) for a trial table.

    Rows with a missing 'response_time' or 'accuracy' value are discarded first.
    The score is then

        mean_correct_RT + PE * (SD_correct_RT / SD_PE)

    where the RT statistics use only trials with accuracy == 1, PE is the
    proportion of errors over all remaining trials and SD_PE = sqrt(p * (1 - p))
    with p the proportion of correct trials.

    Returns NaN when no correct trial remains, and the plain mean correct RT
    when the penalty term is undefined (SD_PE is zero or SD_correct_RT is NaN).
    """
    valid = df.dropna(subset=['response_time', 'accuracy'])

    # Response times of the correctly answered trials only
    correct_rt = valid.loc[valid['accuracy'] == 1, 'response_time']
    if correct_rt.empty:
        return np.nan

    mean_rt = correct_rt.mean()
    sd_rt = correct_rt.std()

    # Error proportion and its binomial standard deviation
    prop_correct = valid['accuracy'].mean()
    error_rate = 1 - prop_correct
    sd_error = np.sqrt(prop_correct * (1 - prop_correct))

    penalty_defined = sd_error != 0 and not np.isnan(sd_rt)
    if not penalty_defined:
        return mean_rt
    return mean_rt + (error_rate * (sd_rt / sd_error))


def map_isco_score(occ):
    """
    Translate a profession value into its ISCO score (humane=0, realistic=100).

    The value is stringified, trimmed and lower-cased before matching.
    'unemployed' and missing-value markers ('nan', 'none') give NaN, any text
    containing 'self-employed' gives 65, and everything else is looked up in
    ISCO_MAPPING by its first two characters (NaN when there is no entry).
    """
    label = str(occ).strip().lower()

    # Statuses that carry no occupational score
    if label in ('unemployed', 'nan', 'none'):
        return np.nan

    # Self-employment gets a fixed score regardless of the rest of the text
    if 'self-employed' in label:
        return 65

    prefix = label[:2]
    return ISCO_MAPPING.get(prefix, np.nan)

def merge_parental_bias(final_df, main_dataset_path, stem_vars, verbal_vars):
    """
    Merge parental occupation scores and cognitive variables from the main dataset.

    Args:
        final_df: DataFrame with a 'participant_id' column to enrich.
        main_dataset_path: Path to the main dataset CSV. It must contain
            'participant_id'; 'mother_occupation' / 'father_occupation' are
            scored with map_isco_score when present.
        stem_vars: Names of the STEM-pole variables to pull in.
        verbal_vars: Names of the verbal-pole variables to pull in.

    Returns:
        final_df left-joined on 'participant_id' with every requested variable
        found in the main dataset, plus 'WASI_FSIQ' and 'parental_bias' (the
        row-wise mean of the available mother/father scores). Columns are added
        in the order stem_vars, verbal_vars, 'WASI_FSIQ', 'parental_bias'.
        If the dataset file does not exist, final_df is returned unchanged and
        a warning is emitted.
    """
    import warnings  # local import keeps this block self-contained

    if not os.path.exists(main_dataset_path):
        # Best-effort: keep going, but say why downstream columns will be absent.
        warnings.warn(
            f"Main dataset not found at {main_dataset_path!r}; "
            "returning the input frame without parental or cognitive columns.",
            stacklevel=2,
        )
        return final_df

    parents_df = pd.read_csv(main_dataset_path)

    # Calculate parental scores; an absent occupation column counts as missing data
    for parent in ('mother', 'father'):
        occupation_col = f'{parent}_occupation'
        if occupation_col in parents_df.columns:
            parents_df[f'{parent}_score'] = parents_df[occupation_col].apply(map_isco_score)
        else:
            warnings.warn(
                f"Column {occupation_col!r} is missing from {main_dataset_path!r}; "
                f"'{parent}_score' is set to NaN.",
                stacklevel=2,
            )
            parents_df[f'{parent}_score'] = np.nan
    parents_df['parental_bias'] = parents_df[['mother_score', 'father_score']].mean(axis=1)

    # dict.fromkeys de-duplicates while preserving order, so the merged column
    # layout is identical on every run (a set would reorder it arbitrarily).
    requested = dict.fromkeys(
        list(stem_vars) + list(verbal_vars) + ['WASI_FSIQ', 'parental_bias']
    )
    # Skip the join key and anything final_df already owns (e.g. 'lisas'):
    # merging a shared name would produce '_x'/'_y' suffixed columns instead.
    available_cols = [
        col for col in requested
        if col != 'participant_id'
        and col in parents_df.columns
        and col not in final_df.columns
    ]

    # Left join keeps every participant of final_df, matched or not
    return pd.merge(
        final_df,
        parents_df[['participant_id'] + available_cols],
        on='participant_id',
        how='left',
    )

def _collect_lisas_scores(root_dir):
    """Walk root_dir and return one {'participant_id', 'lisas'} record per scorable .tsv file."""
    import warnings  # local import keeps this block self-contained

    records = []
    for folder, _, files in os.walk(root_dir):
        for file in files:
            if not file.endswith('.tsv'):
                continue
            # Extract subject ID from filename (e.g., sub-01_task.tsv -> sub-01);
            # dropping the extension first also handles names without '_'.
            sub_id = os.path.splitext(file)[0].split('_')[0]
            path = os.path.join(folder, file)
            df_trial = pd.read_csv(path, sep='\t')

            try:
                score = calculate_lisas(df_trial)
            except Exception as exc:
                # Best-effort batch: one unusable file must not abort the run,
                # but the skip is reported instead of being silently swallowed.
                warnings.warn(f"Skipping {path}: could not compute LISAS ({exc!r})")
                continue
            records.append({'participant_id': sub_id, 'lisas': score})
    return records


def _zscore(series, invert=False):
    """Standardize a series to z-scores (sample SD); flip the sign when invert is True."""
    z = (series - series.mean()) / series.std()
    return -z if invert else z


def run_integrated_analysis(root_dir=None, main_dataset_path='main_dataset.csv',
                            output_path='analysis_results_full.csv'):
    """
    Run the full pipeline: LISAS per participant, merge with the main dataset,
    z-score every variable, build STEM / verbal indices and the child cognitive
    bias, report its correlation with parental bias and save the table.

    Args:
        root_dir: Folder searched recursively for trial '.tsv' files.
            Defaults to the 'trial_data' folder under the current directory.
        main_dataset_path: CSV handed to merge_parental_bias.
        output_path: Destination CSV for the final table.

    Returns:
        The final DataFrame, with 'participant_id' as the leftmost column.

    Raises:
        ValueError: If no '.tsv' trial file could be scored under root_dir.
    """
    # --- A. Load LISAS data (Dynamic process) ---
    if root_dir is None:
        # os.path.join yields '.\\trial_data' on Windows and './trial_data' elsewhere
        root_dir = os.path.join('.', 'trial_data')

    raw_list = _collect_lisas_scores(root_dir)
    if not raw_list:
        raise ValueError(f"No usable .tsv trial files found under {root_dir!r}")

    # Group by participant_id without converting it to index (prevents 'subject' renaming issues)
    df_lisas = pd.DataFrame(raw_list).groupby('participant_id', as_index=False)['lisas'].mean()

    # --- B. Define Poles (STEM vs Verbal variables) ---
    stem_vars = [
        'lisas',
        'AWMA-S_VisuoSpatialSTM_StS',
        'AWMA-S_VisuoSpatialWM_StS',
        'CMAT_BasicCalc_Comp_Quotient',
        'KeyMath_Numeration_ScS',
        'KeyMath_Measurement_ScS',
        'KeyMath_ProblemSolving_ScS',
        'WASI_PIQ',
    ]

    verbal_vars = [
        'AWMA-S_VerbalSTM_StS',
        'AWMA-S_VerbalWM_StS',
        'CTOPP_PhonAwareness_Comp',
        'CTOPP_RapidNaming_Comp',
        'TOWRE_Total_StS',
        'WASI_VIQ',
    ]

    # --- C. Load and Merge Main Dataset ---
    final_df = merge_parental_bias(df_lisas, main_dataset_path, stem_vars, verbal_vars)

    # --- D. Calculate Z-scores for all relevant variables ---
    for var in stem_vars + verbal_vars + ['WASI_FSIQ', 'parental_bias']:
        if var in final_df.columns:
            # Invert lisas so higher score means better performance (matching other metrics)
            final_df[f'{var}_z'] = _zscore(final_df[var], invert='lisas' in var.lower())

    # --- E. Calculate Indices and Cognitive Bias ---
    def present_z_columns(variables):
        # Only variables that were actually merged have a '_z' column
        return [f'{v}_z' for v in variables if f'{v}_z' in final_df.columns]

    final_df['STEM_Index'] = final_df[present_z_columns(stem_vars)].mean(axis=1)
    final_df['Verbal_Index'] = final_df[present_z_columns(verbal_vars)].mean(axis=1)
    final_df['Child_Cognitive_Bias'] = final_df['STEM_Index'] - final_df['Verbal_Index']

    # --- F. Final Formatting and Output ---
    # Ensure participant_id is the leftmost column
    cols = ['participant_id'] + [c for c in final_df.columns if c != 'participant_id']
    final_df = final_df[cols]

    if 'parental_bias' in final_df.columns:
        correlation = final_df['Child_Cognitive_Bias'].corr(final_df['parental_bias'])
        print(f"Success! The correlation is: {correlation:.3f}")
    else:
        # Happens when the main dataset could not be merged; still save what we have
        print("Warning: 'parental_bias' is unavailable, so no correlation was computed.")
    print("\n--- Top 10 rows of final dataframe ---")
    print(final_df.head(10))

    # Save to CSV without the pandas internal index
    final_df.to_csv(output_path, index=False)
    return final_df

# Run the full analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_integrated_analysis()


Success! The correlation is: 0.191

--- Top 10 rows of final dataframe ---
  participant_id     lisas  KeyMath_Numeration_ScS  \
0        sub-002  2.778662                      14   
1        sub-003  2.177854                      10   
2        sub-004  1.879682                      11   
3        sub-005  2.884184                      14   
4        sub-006  2.232828                      12   
5        sub-007  2.370299                      17   
6        sub-008  1.549718                      16   
7        sub-009  2.709374                      14   
8        sub-010  2.622618                      16   
9        sub-011  2.472147                      18   

   CMAT_BasicCalc_Comp_Quotient  AWMA-S_VisuoSpatialSTM_StS  \
0                           113                         NaN   
1                           120                         NaN   
2                            95                        98.0   
3                           135                         NaN   
4              