In [None]:
%pip install pandas
%pip install ast
%pip install logging
%pip install dask
%pip install gc

In [None]:
import pandas as pd
import ast
import logging
import gc
from itertools import combinations
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Configure the root logger so that INFO-level (and more severe) messages are emitted
logging.basicConfig(level=logging.INFO)

def _babble_stats(babbles):
    """
    Parse a single 'Babbles' cell and summarise it
    :param babbles: The raw cell value, expected to be the text of a list of numbers
    :return: A tuple (length, mean, sum); (0, 0, 0) when the cell cannot be
             parsed, is not a list, is an empty list or holds non-numeric items
    """
    try:
        babble_list = ast.literal_eval(babbles)
    except (ValueError, SyntaxError, TypeError):
        # Missing values (None/NaN) and malformed text end up here
        return 0, 0, 0

    # An empty list would otherwise divide by zero when computing the mean
    if not isinstance(babble_list, list) or not babble_list:
        return 0, 0, 0

    try:
        total = sum(babble_list)
        return len(babble_list), total / len(babble_list), total
    except TypeError:
        # The list holds items that cannot be summed (e.g. strings)
        return 0, 0, 0


def clean_and_prepare_data(chunk):
    """
    Clean and prepare one chunk of the dataset
    Column names get their spaces replaced by underscores (the passed-in frame
    is modified in place for this step and for the added columns), summary
    statistics are derived from the 'Babbles' column when it is present, and
    'Bout_no.' is renamed to 'Bout_number'.
    :param chunk: The dataset
    :return: The dataset that is clean and prepared
    """
    logging.info('Starting to retrieve data to clean and prepare the entire dataset')

    # Replace spaces with underscores in column names
    chunk.columns = chunk.columns.str.replace(' ', '_')

    # Extract statistics from 'Babbles' column (one Python-level parse per row)
    if 'Babbles' in chunk.columns:
        stat_columns = ['Babble_Length', 'Babble_Mean', 'Babble_Sum']
        babble_stats = chunk['Babbles'].apply(_babble_stats)
        # Naming the columns explicitly keeps the frame three columns wide
        # even when the chunk has no rows
        chunk[stat_columns] = pd.DataFrame(
            babble_stats.tolist(), index=chunk.index, columns=stat_columns
        )

    # Rename columns
    chunk = chunk.rename(columns={'Bout_no.': 'Bout_number'})

    logging.info('Finished cleaning and preparing the entire dataset\n')
    return chunk


def get_header_combinations(csv_file, exclude_headers=None):
    """
    Read the header row of a csv file and build every combination of the headers that are kept
    Headers are normalised the same way clean_and_prepare_data normalises
    column names (spaces become underscores, 'Bout_no.' becomes 'Bout_number')
    so that the combinations match the columns of the prepared chunks. The
    entries of exclude_headers are normalised identically, so they may be
    given in either their raw or their normalised spelling.
    :param csv_file: The path to the csv_file
    :param exclude_headers: A list of headers to remove if needed (defaults to none)
    :return: A list of combinations to perform ANOVA Testing
    """
    renames = {'Bout_no.': 'Bout_number'}

    def normalize(header):
        underscored = str(header).replace(' ', '_')
        return renames.get(underscored, underscored)

    logging.info('Starting to extract headers that will be used and some to exclude')
    # nrows=0 reads only the header row
    df = pd.read_csv(csv_file, nrows=0)
    headers = [normalize(header) for header in df.columns]

    excluded = {normalize(header) for header in (exclude_headers or ())}
    filtered_headers = [header for header in headers if header not in excluded]
    logging.info('Finished extracting headers')

    # Precompute all header combinations.
    # NOTE: this materialises 2**n - 1 tuples for n kept headers, so the
    # exclude list should be used to keep n small.
    all_combinations = [
        comb for r in range(1, len(filtered_headers) + 1)
        for comb in combinations(filtered_headers, r)
    ]
    logging.info('Finished finding all combinations for ANOVA Testing\n')
    return all_combinations


def _formula_term(name):
    """
    Spell a column name so that it can be used inside a model formula
    :param name: The column name
    :return: The name itself when it is a valid Python identifier, otherwise
             the name wrapped in patsy's Q(...) quoting helper
    """
    name = str(name)
    if name.isidentifier():
        return name
    # Names such as 'Bout_no.' or '2nd_call' are not parseable as bare terms
    return f'Q({name!r})'


def run_anova(chunk, combinations, response_col='Babble_Length'):
    """
    Run ANOVA on a given chunk of data for each header combination.
    Each successful result table is appended, without a header row, to
    'partial_anova_results.csv' with the combination added as the last column.
    Combinations that reference columns missing from the chunk, or whose
    columns contain missing values, are skipped.
    :param chunk: Data chunk
    :param combinations: Header combinations to test
    :param response_col: The response column for the ANOVA
    :return: None
    """
    response_term = _formula_term(response_col)

    for combo in combinations:
        try:
            subset = chunk[list(combo) + [response_col]]
        except KeyError:
            # Skip combinations with missing columns
            continue

        # Skip the combination if any selected cell is missing
        if subset.isnull().values.any():
            logging.info(f"Skipping combination {combo} due to missing data.")
            continue

        # Construct the formula for the ANOVA: full factorial over the factors
        formula = f'{response_term} ~ ' + ' * '.join(_formula_term(factor) for factor in combo)

        try:
            # Run ANOVA
            model = ols(formula, data=subset).fit()
            anova_result = anova_lm(model)

            # Add combination as an extra column
            anova_result['Combination'] = str(combo)

            # Append results directly to a file in append mode
            anova_result.to_csv('partial_anova_results.csv', mode='a', header=False)
        except Exception as e:
            # Best effort: one failing model must not abort the remaining
            # combinations, but the failure is reported as a warning
            logging.warning(f"Error running ANOVA for combination {combo}: {e}")


def filter_significant_results(file='partial_anova_results.csv', output_file='filtered_file.csv'):
    """
    Filters and saves significant results from ANOVA tests (PR(>F) <= 0.05).
    The input is expected in the layout run_anova appends: no header row and
    seven fields per line (term, df, sum_sq, mean_sq, F, PR(>F), combination).
    The output file is written with a header row.
    :param file: Path to the CSV file containing ANOVA results
    :param output_file: Path to save the filtered results
    :return: None
    """
    logging.info('Starting to filter rows where PR(>F) is less than or equal to 0.05')

    # The results file has no header row, so the column names are supplied
    # here; otherwise the first result row would be consumed as the header.
    column_names = ['Term', 'df', 'sum_sq', 'mean_sq', 'F', 'PR(>F)', 'Combination']
    df = pd.read_csv(file, header=None, names=column_names)

    # Coerce to numbers so that blank cells (Residual rows) and any stray
    # header line become NaN and are dropped by the filter below
    p_values = pd.to_numeric(df['PR(>F)'], errors='coerce')

    # Filter rows where PR(>F) is less than or equal to 0.05
    df_filtered = df[p_values.notna() & (p_values <= 0.05)]

    # Save the filtered results to a new CSV file
    df_filtered.to_csv(output_file, index=False)
    logging.info(f"Significant ANOVA results saved to '{output_file}'")


if __name__ == "__main__":
    import os

    csv_file = "CMBabble_Master_combined.csv"
    chunksize = 50000
    results_file = 'partial_anova_results.csv'
    # Headers are compared after their spaces have been replaced by
    # underscores, so the excluded names are spelled in that form
    exclude_headers = ["Babbles", "Bout_ID", "Notes", "Raven_work", "Date_on_vocalization_2"]
    header_combinations = get_header_combinations(csv_file, exclude_headers)

    # run_anova appends to the results file, so rows left over from an
    # earlier run must be removed before starting
    if os.path.exists(results_file):
        os.remove(results_file)

    # Process the CSV file in chunks
    chunk_iter = pd.read_csv(csv_file, chunksize=chunksize)

    for chunk in chunk_iter:
        # Clean and prepare the chunk
        chunk = clean_and_prepare_data(chunk)
        # Run ANOVA on the chunk
        run_anova(chunk, header_combinations)
        # Collect garbage to free up memory
        gc.collect()

    # Filter and save significant results (nothing to filter if no model succeeded)
    if os.path.exists(results_file):
        filter_significant_results(file=results_file, output_file='filtered_file.csv')
    else:
        logging.warning(f"No ANOVA results were written to '{results_file}'; skipping filtering")

