In [None]:
%pip install pandas
%pip install ast
%pip install logging
%pip install dask
%pip install gc

In [None]:
import pandas as pd
import ast
from itertools import combinations
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import gc

# Statistics reported for a 'Babbles' cell that is missing or cannot be parsed.
_EMPTY_BABBLE_STATS = (0, 0, 0)


def _babble_stats(raw):
    """Return ``(count, mean, total)`` for one raw 'Babbles' cell.

    The cell is expected to hold the text of a Python list literal of
    numbers, e.g. ``"[1.5, 2, 3]"``.  Anything else (missing value, text that
    is not a literal, a literal that is not a list, or a list whose items
    cannot be summed) yields ``(0, 0, 0)``.
    """
    try:
        values = ast.literal_eval(raw)  # Convert string to list
        if not isinstance(values, list):
            return _EMPTY_BABBLE_STATS
        # sum() raises TypeError for non-numeric items such as strings or
        # nested lists; treat those cells as unparsable instead of letting
        # one bad cell abort the whole chunk.
        total = sum(values)
    except (ValueError, SyntaxError, TypeError):
        return _EMPTY_BABBLE_STATS

    count = len(values)
    mean = total / count if count else 0
    return count, mean, total


# Function to clean and prepare a single chunk of data
def clean_and_prepare_data(chunk, *, verbose=True):
    """Return a cleaned copy of one chunk of the babble data.

    Steps, in order:

    1. Convert the columns 'Hatch date', 'Fledge date' and
       'Date on vocalization' (when present) to datetime; values that cannot
       be parsed become ``NaT``.
    2. Replace spaces with underscores in every column name.
    3. If a 'Babbles' column exists, add 'Babble_Length' (number of items),
       'Babble_Mean' and 'Babble_Sum' derived from its list literal.
    4. Rename the abbreviated 'No.'/'no.' columns to spelled-out names.

    Args:
        chunk: DataFrame holding one chunk of the CSV.  It is not modified.
        verbose: When true (the default), print every converted date column.

    Returns:
        A new DataFrame with the transformations applied.
    """
    # Work on a copy so the caller's frame keeps its original columns/dtypes.
    chunk = chunk.copy()

    # Convert date columns to datetime
    for col in ('Hatch date', 'Fledge date', 'Date on vocalization'):
        if col not in chunk.columns:
            continue
        chunk[col] = pd.to_datetime(chunk[col], errors='coerce')
        if verbose:
            print(f"Converted column '{col}' to datetime:")
            print(chunk[col])
            print("\n")  # Add a blank line for readability

    # Replace spaces with underscores in the column names
    chunk.columns = chunk.columns.str.replace(' ', '_')

    # Extract statistics from 'Babbles' column (parsed once per row)
    if 'Babbles' in chunk.columns:
        stats = [_babble_stats(raw) for raw in chunk['Babbles']]
        chunk['Babble_Length'] = [count for count, _, _ in stats]
        chunk['Babble_Mean'] = [mean for _, mean, _ in stats]
        chunk['Babble_Sum'] = [total for _, _, total in stats]

    # Rename columns
    return chunk.rename(columns={
        'Bout_no.': 'Bout_number',
        'No._eggs_hatched_from_nest': 'Number_eggs_hatched_from_nest',
        'No._birds_fledged_from_nest': 'Number_birds_fledged_from_nest',
    })


# Function to get header combinations
def get_header_combinations(csv_file, exclude_headers=None, max_size=None):
    """Return every combination of the CSV's column names.

    Only the header row of the file is read.  Spaces in the header names are
    replaced with underscores before anything else happens.

    Args:
        csv_file: Path or buffer accepted by ``pandas.read_csv``.
        exclude_headers: Column names to leave out.  Entries are normalised
            the same way as the headers (spaces become underscores), so both
            ``"Raven work"`` and ``"Raven_work"`` exclude that column.
        max_size: Largest combination size to generate.  ``None`` (the
            default) generates every size up to the number of remaining
            headers.  Note that the full set holds ``2**n - 1`` tuples for
            ``n`` headers, so an explicit bound is advisable for wide files.

    Returns:
        A list of tuples, ordered by size and then by header order.
    """
    headers = pd.read_csv(csv_file, nrows=0).columns  # Only reads headers

    # Normalise the exclusion list like the headers; otherwise an entry that
    # still contains a space could never match a normalised header.
    excluded = {name.replace(' ', '_') for name in (exclude_headers or ())}

    candidates = [header.replace(' ', '_') for header in headers]
    candidates = [header for header in candidates if header not in excluded]

    largest = len(candidates)
    if max_size is not None:
        largest = min(max_size, largest)

    return [
        combo
        for size in range(1, largest + 1)
        for combo in combinations(candidates, size)
    ]


# Function to run ANOVA test for each combination
def run_anova(chunk, combinations, response_col='Babble_Length',
              output_file='partial_anova_results.csv'):
    """Fit a full-factorial OLS model per combination and append its ANOVA table.

    For every combination the formula ``response ~ f1 * f2 * ...`` is fitted
    on ``chunk`` and the resulting ANOVA table, tagged with a 'Combination'
    column, is appended to ``output_file``.

    Args:
        chunk: DataFrame containing the factor columns and ``response_col``.
        combinations: Iterable of tuples of column names to use as factors.
            (Inside this function the name shadows ``itertools.combinations``.)
        response_col: Name of the response column.
        output_file: CSV file the tables are appended to.  A header row is
            written only when the file does not exist yet or is empty, so the
            file can later be read by column name (e.g. 'PR(>F)').  A
            header-less file left behind by an earlier run is appended to
            as-is and stays header-less.

    Combinations that reference a column missing from ``chunk`` are skipped
    silently; combinations whose columns contain missing values are skipped
    with a message.  A failure while fitting or writing one combination is
    reported and does not stop the remaining ones.
    """
    import os  # local import: the file's import cell does not provide it

    write_header = (
        not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    )

    for combo in combinations:
        try:
            subset = chunk[list(combo) + [response_col]]
        except KeyError:
            # Skip combinations with missing columns
            continue

        # Check if there is enough data in the columns
        if subset.isnull().values.any():
            print(f"Skipping combination {combo} due to missing data.")
            continue

        # Construct the formula for the ANOVA; the response is the last column
        *factors, response = subset.columns
        formula = f'{response} ~ ' + ' * '.join(factors)

        try:
            # Run ANOVA
            model = ols(formula, data=subset).fit()
            anova_result = anova_lm(model)

            # Add combination as an extra column
            anova_result['Combination'] = str(combo)

            # Append results directly to the file; the index holds the model
            # term names, hence the explicit label for that column.
            anova_result.to_csv(
                output_file, mode='a', header=write_header, index_label='Term'
            )
            write_header = False
        except Exception as e:
            # Best effort: one failing model must not abort the whole sweep.
            print(f"Error running ANOVA for combination {combo}: {e}")


# Column layout of a header-less results file: the model term (the ANOVA
# table's index), the table's own columns, and the appended 'Combination' tag.
# NOTE(review): the middle names assume statsmodels' default anova_lm table
# (df, sum_sq, mean_sq, F, PR(>F)) - confirm against the installed version.
_ANOVA_RESULT_COLUMNS = [
    'Term', 'df', 'sum_sq', 'mean_sq', 'F', 'PR(>F)', 'Combination',
]


# Function to filter significant results
def filter_significant_results(file='partial_anova_results.csv',
                               output_file='filtered_file.csv', alpha=0.05):
    """Write the rows of an ANOVA results file whose p-value is significant.

    Args:
        file: CSV of appended ANOVA tables.  A file with a header row
            containing 'PR(>F)' is read as-is.  A header-less file (which is
            what appending with ``header=False`` produces) is read using the
            layout in ``_ANOVA_RESULT_COLUMNS``.  An empty file is treated
            as containing no results.
        output_file: Destination CSV for the filtered rows.
        alpha: Significance threshold; rows with ``PR(>F) <= alpha`` are kept.

    Raises:
        FileNotFoundError: If ``file`` does not exist.
        ValueError: If a header-less file does not have the expected number
            of columns.
    """
    # Load the results into a DataFrame
    try:
        df = pd.read_csv(file)
    except pd.errors.EmptyDataError:
        df = pd.DataFrame(columns=_ANOVA_RESULT_COLUMNS)

    if 'PR(>F)' not in df.columns:
        # No header row: the first read mistook the first data row for the
        # header, so read again with explicit column names.
        if df.shape[1] != len(_ANOVA_RESULT_COLUMNS):
            raise ValueError(
                f"'{file}' has no 'PR(>F)' header and {df.shape[1]} columns; "
                f"expected {len(_ANOVA_RESULT_COLUMNS)} columns."
            )
        df = pd.read_csv(file, header=None, names=_ANOVA_RESULT_COLUMNS)

    # Keep rows where PR(>F) is at most alpha.  Missing or non-numeric
    # p-values (e.g. the Residual row) become NaN and compare as False.
    p_values = pd.to_numeric(df['PR(>F)'], errors='coerce')
    df_filtered = df[p_values <= alpha]

    # Save the filtered results to a new CSV file
    df_filtered.to_csv(output_file, index=False)
    print(f"\nSignificant ANOVA results saved to '{output_file}'")


# Main execution block
csv_file = "../CMBabble_Master_clean.csv"  # Replace with your file path
chunksize = 50000  # Adjust chunk size as needed

# Prepare header combinations (excluding specified columns).
# Names are listed in normalised form (underscores instead of spaces) because
# the headers they are compared against have already been normalised.
exclude_headers = ["Babbles", "Bout_ID", "Notes", "Raven_work", "Date_on_vocalization_2", ""]
header_combinations = get_header_combinations(csv_file, exclude_headers)

# Process the CSV file in chunks
chunk_iter = pd.read_csv(csv_file, chunksize=chunksize)

for chunk in chunk_iter:
    # Clean and prepare the chunk
    chunk = clean_and_prepare_data(chunk)
    # Run ANOVA on the chunk
    # NOTE(review): the ANOVA step is currently disabled, so this loop only
    # exercises the cleaning step - confirm whether it should be re-enabled.
    # run_anova(chunk, header_combinations)
    # Collect garbage to free up memory
    gc.collect()

# Filter and save significant results.  With the ANOVA step disabled there may
# be no results file yet; report that instead of failing with a traceback.
try:
    filter_significant_results(file='partial_anova_results.csv', output_file='filtered_file.csv')
except FileNotFoundError:
    print("No ANOVA results file 'partial_anova_results.csv' found; nothing to filter.")
