# Pre-Model Analysis for SABE Dataset

This notebook focuses on preparing the data for modeling, with an emphasis on analyzing factors that influence subjective memory evaluation and creating a coherence measure between subjective and objective memory measures.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn.decomposition import PCA

# Set plot style.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and deprecated the old names (later releases no longer ship them), so use
# whichever spelling this installation actually provides.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
else:
    # Neither style sheet is available: fall back to seaborn's own whitegrid theme
    sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Set pandas options for better display
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [None]:
# Load the prepared dataset, trying the preferred file first and then a list
# of alternative exports. If none of them exists, build a small synthetic
# frame so the rest of the notebook can still be demonstrated.
try:
    df = pd.read_csv('sabe_df_prepared.csv')
    print(f"Dataset loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns")
except FileNotFoundError:
    print("Dataset file not found. Trying alternative filenames...")
    # Try alternative filenames
    for filename in ['cleaned_sabe_col_complete.csv', 'sabe_imputed_by_categoria_cognitiva.csv', 'cleaned_sabe_col_expanded.csv']:
        try:
            df = pd.read_csv(filename)
        except FileNotFoundError:
            continue
        print(f"Loaded {filename} with {df.shape[0]} rows and {df.shape[1]} columns")
        break
    else:
        # for/else: this branch runs only when the loop finished without a
        # successful read, i.e. no candidate file was found.
        print("No suitable dataset found. Please ensure the SABE dataset is available.")
        # Create a small example dataset for demonstration (seeded so that
        # re-running the notebook reproduces the same synthetic values)
        rng = np.random.default_rng(42)
        df = pd.DataFrame({
            'minimental': rng.normal(25, 5, 100),
            'memoria_subjetiva': rng.normal(3, 1, 100),
            'categoria_cognitiva': rng.choice(['normal', 'mild impairment', 'moderate'], 100),
            'edad': rng.normal(70, 8, 100),
            'sexo': rng.choice([0, 1], 100)
        })
        print("Created a sample dataset for demonstration")

## 1. Isolate Numerical Variables (Non-Binary)

In [None]:
# Function to identify binary variables
def is_binary(series):
    """Report whether a column holds nothing but 0/1 (or True/False) values.

    Missing values are ignored. A column with no non-missing values is not
    considered binary; a column holding a single one of the two values is.
    """
    observed = series.dropna()
    if observed.empty:
        return False

    # In Python 0 == False and 1 == True (and they hash alike), so the single
    # set {0, 1} covers both encodings; any subset of it has at most two members.
    distinct_values = set(observed.unique())
    return distinct_values <= {0, 1}

# Split the numeric columns into 0/1 indicators and multi-valued variables
numeric_frame = df.select_dtypes(include=['int64', 'Int64', 'float64'])
numeric_cols = numeric_frame.columns.tolist()
binary_cols = [name for name in numeric_cols if is_binary(numeric_frame[name])]
binary_lookup = set(binary_cols)
nonbinary_numeric = [name for name in numeric_cols if name not in binary_lookup]

# Report how many columns fell into each group
print(f"Total numeric columns: {len(numeric_cols)}")
print(f"Binary numeric columns: {len(binary_cols)}")
print(f"Non-binary numeric columns: {len(nonbinary_numeric)}")

# Show the first 20 non-binary numeric variables as a sample
print("\nSample of non-binary numeric variables:")
print(nonbinary_numeric[:20])

In [None]:
# Inspect the distribution and direction of the two core measures
key_vars = ['minimental', 'memoria_subjetiva']
key_vars_present = [name for name in key_vars if name in df.columns]

if not key_vars_present:
    print("Key variables (minimental, memoria_subjetiva) not found in the dataset.")
else:
    n_present = len(key_vars_present)
    # squeeze=False always returns a 2-D grid of axes, so the single-variable
    # case needs no special reshaping.
    fig, axs = plt.subplots(n_present, 2, figsize=(15, 5 * n_present), squeeze=False)

    # One row per variable: histogram on the left, boxplot on the right
    for (hist_ax, box_ax), var in zip(axs, key_vars_present):
        observed = df[var].dropna()

        sns.histplot(observed, kde=True, ax=hist_ax)
        hist_ax.set_title(f'Distribution of {var}')
        hist_ax.set_xlabel(var)

        sns.boxplot(x=observed, ax=box_ax)
        box_ax.set_title(f'Boxplot of {var}')
        box_ax.set_xlabel(var)

        # Descriptive statistics for the same variable
        print(f"\nStatistics for {var}:")
        print(df[var].describe())
        print(f"Skewness: {df[var].skew()}")
        print(f"Kurtosis: {df[var].kurtosis()}")
        print(f"Missing values: {df[var].isnull().sum()} ({df[var].isnull().mean():.2%})")

    plt.tight_layout()
    plt.show()

    # The joint view only makes sense when both measures are available
    if all(name in df.columns for name in key_vars):
        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=df, x='minimental', y='memoria_subjetiva')
        plt.title('Relationship between Minimental and Subjective Memory')
        plt.xlabel('Minimental (objective cognitive assessment)')
        plt.ylabel('Subjective Memory Score')
        plt.grid(True, alpha=0.3)
        plt.show()

        # Pairwise Pearson correlation between the two measures
        pair_corr = df[['minimental', 'memoria_subjetiva']].corr()
        correlation = pair_corr.loc['minimental', 'memoria_subjetiva']
        print(f"\nCorrelation between minimental and memoria_subjetiva: {correlation:.4f}")

        # Spell out how the sign should be read given each scale's direction
        print("\nDirectionality interpretation:")
        print("- Minimental: Higher values indicate better cognitive function")
        print("- Memoria Subjetiva: Higher values indicate worse subjective memory perception")
        if correlation < 0:
            print("- The negative correlation suggests that people with better cognitive function (higher minimental) tend to report better subjective memory (lower memoria_subjetiva scores)")
        else:
            print("- The positive correlation suggests that better cognitive function might not align with subjective memory perception")

In [None]:
# Summarise the non-binary numeric variables (at most the first 20, to keep
# the table and heatmap readable)
if nonbinary_numeric:
    display_vars = nonbinary_numeric[:20]
    subset = df[display_vars]
    missing_flags = subset.isnull()

    # describe() transposed gives one row per variable; append shape and
    # missingness measures as extra columns
    summary = subset.describe().T.assign(
        skewness=subset.skew(),
        kurtosis=subset.kurtosis(),
        missing=missing_flags.sum(),
        missing_percent=missing_flags.mean() * 100,
    )

    print("Summary statistics for non-binary numeric variables:")
    display(summary.round(2))

    # Heatmap of pairwise correlations; the upper triangle (including the
    # diagonal) is masked because it only repeats the lower one
    correlation_matrix = subset.corr()
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, mask=mask, annot=False, cmap='coolwarm', vmin=-1, vmax=1, center=0)
    plt.title('Correlation Matrix for Non-Binary Numeric Variables')
    plt.tight_layout()
    plt.show()

## 2. Create Coherence Variable (Minimental - Subjective Memory)

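First, we need to decide how to put the two measures on a common scale: standardization (z-scores: subtract the mean and divide by the standard deviation) or normalization (min-max scaling to the 0–1 range). We'll compute both and select the more appropriate one based on the data characteristics.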

In [None]:
# Build the coherence variable from the objective (minimental) and subjective
# (memoria_subjetiva) measures.
required_cols = ['minimental', 'memoria_subjetiva']
has_required = all(name in df.columns for name in required_cols)
# Keep only the rows where both measures are observed
mm_ms_df = df[required_cols].dropna() if has_required else None

if not has_required:
    print("One or both required variables (minimental, memoria_subjetiva) are missing from the dataset.")
elif len(mm_ms_df) < 3:
    # scipy's Shapiro-Wilk test requires at least three observations
    print(f"Only {len(mm_ms_df)} complete cases for minimental and memoria_subjetiva; "
          "skipping coherence variable creation.")
else:
    print(f"Complete cases for both variables: {len(mm_ms_df)} out of {len(df)} total rows")

    # Test normality of the variables.
    # NOTE: with large samples Shapiro-Wilk flags even trivial departures from
    # normality, so it is reported here but not used for the final decision.
    for var in required_cols:
        stat, p = stats.shapiro(mm_ms_df[var])
        print(f"\nShapiro-Wilk test for {var}:")
        print(f"Statistic: {stat:.4f}, p-value: {p:.4f}")
        if p < 0.05:
            print(f"The {var} variable is not normally distributed (reject H0)")
        else:
            print(f"The {var} variable follows a normal distribution (fail to reject H0)")

    # Check skewness and kurtosis (treated as approximately normal if both lie
    # between -2 and 2) and remember the verdict for each variable
    shape_acceptable = {}
    for var in required_cols:
        skewness = mm_ms_df[var].skew()
        kurtosis = mm_ms_df[var].kurtosis()
        shape_acceptable[var] = abs(skewness) < 2 and abs(kurtosis) < 2
        print(f"\n{var} skewness: {skewness:.4f}, kurtosis: {kurtosis:.4f}")
        if shape_acceptable[var]:
            print(f"The {var} variable has acceptable skewness and kurtosis for normal approximation")
        else:
            print(f"The {var} variable has high skewness or kurtosis, suggesting non-normality")

    # Visualize original variables
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))
    sns.histplot(mm_ms_df['minimental'], kde=True, ax=axs[0])
    axs[0].set_title('Distribution of Minimental')
    sns.histplot(mm_ms_df['memoria_subjetiva'], kde=True, ax=axs[1])
    axs[1].set_title('Distribution of Memoria Subjetiva')
    plt.tight_layout()
    plt.show()

    # The two measures live on different scales, so both rescalings are
    # computed and compared before one is chosen.

    # Standardization (Z-scores)
    scaler = StandardScaler()
    standardized = pd.DataFrame(
        scaler.fit_transform(mm_ms_df),
        columns=['minimental_std', 'memoria_subjetiva_std'],
        index=mm_ms_df.index
    )

    # Normalization (Min-Max scaling to 0-1)
    normalizer = MinMaxScaler()
    normalized = pd.DataFrame(
        normalizer.fit_transform(mm_ms_df),
        columns=['minimental_norm', 'memoria_subjetiva_norm'],
        index=mm_ms_df.index
    )

    # Combine with original data
    transformed_df = pd.concat([mm_ms_df, standardized, normalized], axis=1)

    # Visualize the transformed variables
    fig, axs = plt.subplots(2, 2, figsize=(14, 10))

    # Standardized
    sns.histplot(transformed_df['minimental_std'], kde=True, ax=axs[0, 0])
    axs[0, 0].set_title('Standardized Minimental')
    sns.histplot(transformed_df['memoria_subjetiva_std'], kde=True, ax=axs[0, 1])
    axs[0, 1].set_title('Standardized Memoria Subjetiva')

    # Normalized
    sns.histplot(transformed_df['minimental_norm'], kde=True, ax=axs[1, 0])
    axs[1, 0].set_title('Normalized Minimental')
    sns.histplot(transformed_df['memoria_subjetiva_norm'], kde=True, ax=axs[1, 1])
    axs[1, 1].set_title('Normalized Memoria Subjetiva')

    plt.tight_layout()
    plt.show()

    # Calculate coherence using both methods.
    # This notebook treats memoria_subjetiva as "higher = worse" while
    # minimental is "higher = better". The subjective score is therefore
    # reversed first so both terms point the same way; the difference is then
    # ~0 when the two assessments agree and positive when objective
    # performance exceeds the subjective rating.
    # NOTE(review): confirm the coding direction of memoria_subjetiva against
    # the SABE codebook; if higher actually means better, drop the reversal.

    # Using standardized values (reversing a z-score is a sign flip)
    reversed_ms_std = -transformed_df['memoria_subjetiva_std']
    transformed_df['coherence_std'] = transformed_df['minimental_std'] - reversed_ms_std

    # Using normalized values (reversing a 0-1 score is 1 - value)
    reversed_ms_norm = 1 - transformed_df['memoria_subjetiva_norm']
    transformed_df['coherence_norm'] = transformed_df['minimental_norm'] - reversed_ms_norm

    # Visualize the coherence variables
    fig, axs = plt.subplots(1, 2, figsize=(14, 6))
    sns.histplot(transformed_df['coherence_std'], kde=True, ax=axs[0])
    axs[0].set_title('Coherence (using Standardized Values)')
    sns.histplot(transformed_df['coherence_norm'], kde=True, ax=axs[1])
    axs[1].set_title('Coherence (using Normalized Values)')
    for ax in axs:
        ax.axvline(0, color='red', linestyle='--', alpha=0.7)
        ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Select the final coherence variable from the checks above instead of
    # assuming normality: standardized if both variables have acceptable
    # skewness/kurtosis, normalized otherwise.
    is_normal = all(shape_acceptable.values())

    if is_normal:
        coherence_col = 'coherence_std'
        scaled_components = standardized
        print("\nUsing standardized values for coherence calculation based on normality tests")
    else:
        coherence_col = 'coherence_norm'
        scaled_components = normalized
        print("\nUsing normalized values for coherence calculation due to non-normality")

    # Add the chosen scaled components and the coherence variable to the main
    # dataframe. Assignment aligns on the index, so rows that were not complete
    # cases receive NaN.
    for scaled_name in scaled_components.columns:
        df[scaled_name] = scaled_components[scaled_name]
    df['coherencia'] = transformed_df[coherence_col]

    # Describe the new coherence variable
    print("\nDescription of the coherencia variable:")
    print(df['coherencia'].describe())

    # Interpret coherence variable
    print("\nCoherencia interpretation:")
    print("- Values close to 0: Good alignment between objective and subjective memory assessment")
    print("- Positive values: Objective cognitive function (Minimental) is better than subjective perception")
    print("- Negative values: Subjective perception of memory is better than objective performance")

## 3. Correlation Analysis

In [None]:
# Function to perform correlation analysis for a target variable
def analyze_correlations(df, target_var, top_n=15):
    """Rank the numeric columns of `df` by their correlation with `target_var`.

    Prints the `top_n` strongest correlations and draws a horizontal bar chart
    of their Pearson coefficients (green for positive, red otherwise).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse. Rows where `target_var` is missing are ignored.
    target_var : str
        Name of the column to correlate every other numeric column against.
    top_n : int, default 15
        Number of variables to print and plot.

    Returns
    -------
    pandas.DataFrame or None
        One row per numeric column (target excluded) with 'pearson' and
        'spearman' columns, sorted by absolute Pearson correlation in
        descending order. None when `target_var` is absent or not numeric.
    """
    if target_var not in df.columns:
        print(f"Variable {target_var} not found in the dataset.")
        return None

    # Pearson/Spearman coefficients are undefined for text or categorical
    # targets (corrwith would raise), so report and skip instead of crashing.
    if not pd.api.types.is_numeric_dtype(df[target_var]):
        print(f"Variable {target_var} is not numeric; skipping correlation analysis.")
        return None

    # Drop rows where the target variable is missing
    valid_data = df.dropna(subset=[target_var])
    print(f"Analyzing correlations for {target_var} using {len(valid_data)} valid rows")

    # Correlate every numeric column except the target itself
    numeric_cols = valid_data.select_dtypes(include=['int64', 'Int64', 'float64']).columns
    candidates = valid_data[numeric_cols.drop(target_var, errors='ignore')]
    target_values = valid_data[target_var]

    correlations = pd.DataFrame({
        'pearson': candidates.corrwith(target_values, method='pearson'),
        # Spearman is rank-based and therefore more robust to outliers
        'spearman': candidates.corrwith(target_values, method='spearman'),
    })

    # Sort by absolute Pearson correlation (helper column removed afterwards)
    correlations['abs_pearson'] = correlations['pearson'].abs()
    correlations = correlations.sort_values('abs_pearson', ascending=False).drop('abs_pearson', axis=1)

    # Display top correlated variables
    top_rows = correlations.head(top_n)
    print(f"\nTop {top_n} variables correlated with {target_var}:")
    print(top_rows.round(3))

    # Bar chart of the top Pearson coefficients, coloured by sign
    bar_colors = ['green' if value > 0 else 'red' for value in top_rows['pearson']]
    plt.figure(figsize=(12, 8))
    plt.barh(top_rows.index.tolist(), top_rows['pearson'], color=bar_colors)
    plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    plt.title(f'Top {top_n} Variables Correlated with {target_var} (Pearson)')
    plt.xlabel('Correlation Coefficient')
    plt.tight_layout()
    plt.show()

    return correlations

# Run the correlation analysis for each of the four outcomes of interest
target_variables = ['memoria_subjetiva', 'categoria_cognitiva', 'minimental', 'coherencia']

for target in target_variables:
    # Outcomes that were never loaded or created are reported and skipped
    if target not in df.columns:
        print(f"Variable {target} not found in the dataset.")
        continue

    print(f"\n{'='*80}\nAnalyzing correlations for {target}\n{'='*80}")
    correlations = analyze_correlations(df, target)
    print("\n")

## 4. Statistical Methods for Research Question

Research Question: What variables most influence personal evaluation of memory (memoria_subjetiva) and coherence between objective and subjective measures?

In [None]:
# Define a function to suggest and demonstrate statistical methods
def _select_demo_predictors(target_data, target):
    """Choose predictors for the demonstration regression of `target`.

    Starts from the (up to) ten numeric columns most correlated with the
    target, then screens them for multicollinearity with VIF. Returns a list
    of column names, which is empty when no usable candidate exists.
    """
    numeric_cols = target_data.select_dtypes(include=['int64', 'Int64', 'float64']).columns
    correlations = target_data[numeric_cols].corrwith(target_data[target], method='pearson')

    # Never predict the target from itself or from columns derived for the
    # coherence measure.
    excluded = [target, 'coherencia', 'minimental_std', 'memoria_subjetiva_std',
                'minimental_norm', 'memoria_subjetiva_norm']
    if target == 'coherencia':
        # coherencia is computed from these two columns, so using them as
        # predictors would make the regression trivially (near-)perfect.
        excluded += ['minimental', 'memoria_subjetiva']
    # dropna: constant or empty columns have an undefined (NaN) correlation
    correlations = correlations.drop(excluded, errors='ignore').dropna()

    # Get top correlated variables
    top_corr_vars = correlations.abs().sort_values(ascending=False).head(10).index.tolist()
    if not top_corr_vars:
        return []

    # VIF fits regressions internally, so use complete rows only
    vif_rows = target_data[top_corr_vars].dropna()
    if len(vif_rows) <= len(top_corr_vars) + 1:
        print("Too few complete rows to compute VIF; keeping the top correlated variables.")
        return top_corr_vars

    X = sm.add_constant(vif_rows, has_constant='add')  # intercept column named 'const'
    X_values = X.to_numpy(dtype=float)
    vif_data = pd.DataFrame({
        "Variable": X.columns,
        "VIF": [variance_inflation_factor(X_values, i) for i in range(X.shape[1])],
    })

    print("VIF Analysis (values > 5 indicate potential multicollinearity):")
    print(vif_data.sort_values("VIF", ascending=False))

    # The intercept's VIF says nothing about collinearity among predictors, so
    # leave it out of the decision.
    high_vif_threshold = 5
    predictor_vif = vif_data[vif_data["Variable"] != 'const']
    if not (predictor_vif["VIF"].max() > high_vif_threshold):
        return top_corr_vars

    print("\nMulticollinearity detected! Selecting a subset of variables...")
    low_vif_vars = predictor_vif.loc[predictor_vif["VIF"] < high_vif_threshold, "Variable"].tolist()
    if len(low_vif_vars) >= 3:
        return low_vif_vars

    # If too few variables remain, fall back to mutual-information ranking
    # (which also captures non-linear relationships)
    print("Too few variables with low VIF. Using feature selection...")
    X_no_const = target_data[top_corr_vars]
    y = target_data[target]

    # Impute any remaining NaN values with medians for feature selection
    X_imputed = X_no_const.fillna(X_no_const.median())

    # Select up to 5 features; k may not exceed the number of candidates
    selector = SelectKBest(mutual_info_regression, k=min(5, len(top_corr_vars)))
    selector.fit(X_imputed, y)
    return [top_corr_vars[i] for i in selector.get_support(indices=True)]


def _fit_demo_regression(target_data, target, predictors):
    """Fit an OLS model of `target` on `predictors` and print a short report."""
    if not predictors:
        print("No usable numeric predictors found for regression analysis.")
        return

    # Drop rows with any NaN in the predictors or target
    model_data = target_data[predictors + [target]].dropna().astype(float)
    print(f"\nSample size for regression model: {len(model_data)} observations")

    if len(model_data) <= len(predictors) + 5:  # Ensure sufficient observations
        print("Insufficient data for regression analysis.")
        return

    # Fit the OLS regression model
    X = sm.add_constant(model_data[predictors], has_constant='add')
    y = model_data[target]
    model = sm.OLS(y, X).fit()

    # Print model summary
    print("\nRegression Model Summary:")
    print(model.summary().tables[1])  # Coefficients table
    print(f"\nR-squared: {model.rsquared:.4f}")
    print(f"Adjusted R-squared: {model.rsquared_adj:.4f}")
    print(f"F-statistic: {model.fvalue:.4f} (p-value: {model.f_pvalue:.4f})")

    # Significant predictors (the intercept is excluded by name, not position)
    slopes = model.params.drop('const', errors='ignore')
    significant_predictors = [
        (var, slopes[var], model.pvalues[var])
        for var in slopes.index
        if model.pvalues[var] < 0.05
    ]

    if significant_predictors:
        print("\nSignificant predictors:")
        for var, coef, p_value in sorted(significant_predictors, key=lambda x: x[2]):
            print(f"  - {var}: coefficient = {coef:.4f} (p-value: {p_value:.4f})")
    else:
        print("\nNo significant predictors found at alpha=0.05.")


def suggest_statistical_methods(df, target_variables):
    """
    Suggest and demonstrate statistical methods for analyzing factors
    influencing subjective memory evaluation and coherence.

    Prints a catalogue of candidate methods, then for every entry of
    `target_variables` that is a column of `df` runs a demonstration multiple
    regression (predictor screening via correlation and VIF, then OLS), and
    finally prints a recommended analysis plan.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing the target variables and candidate predictors.
    target_variables : list of str
        Column names to treat as regression outcomes; absent names are ignored.

    Returns
    -------
    None
        All results are printed.
    """
    print("STATISTICAL METHODS FOR ANALYZING FACTORS INFLUENCING MEMORIA SUBJETIVA AND COHERENCE")
    print("="*100)

    # Check if target variables exist
    target_vars_present = [var for var in target_variables if var in df.columns]

    if not target_vars_present:
        print("None of the target variables are present in the dataset.")
        return

    # Suggestions based on the research question
    suggestions = [
        {
            "method": "Multiple Linear Regression",
            "description": "Identifies the combined influence of multiple predictors on memoria_subjetiva or coherence",
            "advantages": [
                "Quantifies the unique contribution of each variable",
                "Provides coefficient estimates with confidence intervals",
                "Can include both continuous and categorical predictors",
                "Allows for control of confounding variables"
            ],
            "limitations": [
                "Assumes linear relationships",
                "Sensitive to multicollinearity",
                "Requires normally distributed residuals",
                "May not capture complex interactions without explicit specification"
            ],
            "implementation": "Using statsmodels or sklearn for regression analysis"
        },
        {
            "method": "Feature Selection (LASSO/Ridge Regression)",
            "description": "Identifies the most important predictors while handling multicollinearity",
            "advantages": [
                "Handles high-dimensional data well",
                "Reduces overfitting by penalizing complex models",
                "LASSO can perform automatic feature selection",
                "Ridge helps when predictors are highly correlated"
            ],
            "limitations": [
                "Requires tuning of regularization parameter",
                "May still struggle with extremely correlated predictors",
                "Interpretation is less straightforward than standard regression"
            ],
            "implementation": "Using sklearn's Lasso, Ridge, or ElasticNet"
        },
        {
            "method": "Hierarchical Regression",
            "description": "Adds predictors in blocks based on theoretical importance",
            "advantages": [
                "Allows testing the incremental contribution of variable groups",
                "Can reflect theoretical priorities in variable ordering",
                "Shows R² change at each step",
                "Useful for comparing nested models"
            ],
            "limitations": [
                "Results depend on the order of variable entry",
                "Requires strong theoretical justification for block structure",
                "Same assumptions as multiple regression"
            ],
            "implementation": "Using statsmodels to build models incrementally"
        },
        {
            "method": "Mediation and Moderation Analysis",
            "description": "Examines indirect effects and interactions between predictors",
            "advantages": [
                "Reveals how variables influence each other to affect memoria_subjetiva",
                "Identifies conditional relationships (when effects depend on other factors)",
                "Provides a more nuanced understanding of complex relationships"
            ],
            "limitations": [
                "Requires strong theoretical basis for proposed relationships",
                "Can be computationally intensive for complex models",
                "Interpretation becomes complex with multiple mediators/moderators"
            ],
            "implementation": "Using statsmodels or specialized packages like 'mediation'"
        },
        {
            "method": "Structural Equation Modeling (SEM)",
            "description": "Models complex networks of relationships including latent variables",
            "advantages": [
                "Can model complex directional relationships simultaneously",
                "Incorporates measurement error",
                "Can include latent variables (not directly measured)",
                "Provides overall model fit statistics"
            ],
            "limitations": [
                "Complex to implement and interpret",
                "Requires larger sample sizes",
                "Sensitive to model specification",
                "May not converge with complex models"
            ],
            "implementation": "Using lavaan package in R or semopy in Python"
        },
        {
            "method": "Mixed-Effects Models",
            "description": "Accounts for clustered data (e.g., by region or assessment center)",
            "advantages": [
                "Handles hierarchical/nested data structures",
                "Separates fixed and random effects",
                "Controls for non-independence of observations",
                "Can model individual differences"
            ],
            "limitations": [
                "More complex to specify and interpret",
                "Computationally intensive",
                "Requires decisions about random effect structure"
            ],
            # scikit-learn ships no mixed-effects estimator, so only statsmodels is named
            "implementation": "Using statsmodels' MixedLM"
        }
    ]

    # Display suggestions
    for i, suggestion in enumerate(suggestions, 1):
        print(f"\n{i}. {suggestion['method']}")
        print(f"   Description: {suggestion['description']}")
        print("   Advantages:")
        for adv in suggestion['advantages']:
            print(f"    - {adv}")
        print("   Limitations:")
        for lim in suggestion['limitations']:
            print(f"    - {lim}")
        print(f"   Implementation: {suggestion['implementation']}")

    # Demonstrate a basic implementation of regression analysis if target variables are present
    for target in target_vars_present:
        print(f"\n\nDEMONSTRATION: Basic Multiple Regression for {target}\n{'='*70}")

        # OLS needs a numeric outcome
        if not pd.api.types.is_numeric_dtype(df[target]):
            print(f"Variable {target} is not numeric; skipping regression demonstration.")
            continue

        # Prepare the data
        target_data = df.dropna(subset=[target])

        predictors = _select_demo_predictors(target_data, target)
        print(f"\nSelected predictors for {target}: {predictors}")

        _fit_demo_regression(target_data, target, predictors)

    # Recommended approach for the research question
    print("\n\nRECOMMENDED APPROACH FOR YOUR RESEARCH QUESTION\n" + "="*50)
    print("""
Based on your research question about factors influencing memoria_subjetiva and coherence, 
I recommend a multi-step analytical approach:

1. Hierarchical Regression Analysis:
   - Start with demographic variables (age, sex, education)
   - Add health variables (physical health, chronic conditions)
   - Add cognitive variables (objective cognitive measures)
   - Add psychosocial variables (depression, social engagement)
   - This approach allows you to see the incremental contribution of each variable group

2. Mediation Analysis:
   - Test if the relationship between objective cognition and subjective memory 
     is mediated by variables like depression, health conditions, or education
   - This helps understand the mechanisms through which objective cognition influences subjective perception

3. Moderation Analysis:
   - Test if the relationship between objective and subjective measures varies by 
     demographic factors (e.g., is the relationship stronger or weaker in different age groups or education levels?)
   - This helps identify for whom the objective-subjective relationship is strongest or weakest

For the coherence variable specifically:
- Use regression to identify factors that predict greater or lesser coherence
- This helps understand what leads some people to have better alignment between 
  objective and subjective measures than others

These approaches will provide a comprehensive understanding of the factors influencing 
subjective memory evaluation and its alignment with objective cognitive measures.
    """)

# Print the method overview and run the demo regressions for both outcomes
suggest_statistical_methods(
    df=df,
    target_variables=['memoria_subjetiva', 'coherencia'],
)

## Summary and Next Steps

In [None]:
# Summarize the key findings from this notebook and save the enriched dataset
print("\nSUMMARY OF KEY FINDINGS\n" + "="*30)

# Check what we have in the dataframe
if 'minimental' in df.columns and 'memoria_subjetiva' in df.columns:
    print("1. Numeric Variables Analysis:")
    print(f"   - Identified {len(nonbinary_numeric)} non-binary numeric variables for analysis")

    print("\n2. Coherencia Variable Creation:")
    if 'coherencia' in df.columns:
        mean_coherencia = df['coherencia'].mean()
        print(f"   - Created 'coherencia' variable to measure alignment between objective and subjective memory")
        print(f"   - Mean coherencia value: {mean_coherencia:.4f}")
        if pd.isna(mean_coherencia):
            print("   - Mean coherence could not be computed (no non-missing values)")
        elif np.isclose(mean_coherencia, 0.0, atol=1e-6):
            # A difference (or sum) of z-scores averages to zero by
            # construction, so its sign is only floating-point noise and must
            # not be read as a direction.
            print("   - Mean coherence is approximately zero, so no overall direction can be inferred from it")
        else:
            direction = "positive" if mean_coherencia > 0 else "negative"
            print(f"   - Overall {direction} coherence suggests {'better objective than subjective assessment' if direction == 'positive' else 'better subjective perception than objective performance'} on average")

    print("\n3. Correlation Analysis:")
    print("   - Identified key variables correlated with subjective memory and coherence")
    print("   - Correlation patterns suggest different factors influence subjective vs. objective memory")

    print("\n4. Statistical Methods:")
    print("   - Multiple approaches suggested for analyzing factors influencing memoria_subjetiva and coherence")
    print("   - Hierarchical regression, mediation/moderation analysis, and structural equation modeling are recommended approaches")

    print("\nNEXT STEPS:")
    print("1. Implement hierarchical regression models for memoria_subjetiva and coherencia")
    print("2. Conduct mediation analyses to understand pathways of influence")
    print("3. Test for moderation effects by demographic variables")
    print("4. Create visualizations of the key relationships for presentation")
    print("5. Save the enhanced dataset including the 'coherencia' variable for further analysis")

    # Save the enhanced dataframe with the coherencia variable
    df.to_csv('sabe_with_coherencia.csv', index=False)
    print("\nEnhanced dataset saved as 'sabe_with_coherencia.csv'")
else:
    print("Unable to perform complete analysis due to missing key variables.")