In [None]:
%pip install pandas
%pip install matplotlib
%pip install numpy
%pip install ast
%pip install logging
%pip install seaborn
%pip install argparse
%pip install statsmodels


In [1]:
import pandas as pd
import ast
import statsmodels.api as sm
import argparse
import logging

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [7]:
# Load the CSV file
data = pd.read_csv('../CMBabble_Master_clean.csv')

#  Map categorical columns to numerical values
data['Sex'] = data['Sex'].map({'M': 0, 'F': 1})
data['Treatment'] = data['Treatment'].map({'CONTROL': 0, 'CORT': 1, 'OIL': 2})

# Convert strings to lists and filter rows with babble length greater than spcified
data['Babbles'] = data['Babbles'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
# data = data[data['Babbles'].apply(lambda x: len(x) >= 50)]

# Convert date columns to datetime
date_columns = ['Hatch date', 'Fledge date', 'Date on vocalization']
for col in date_columns:
    data[col] = pd.to_datetime(data[col], errors='coerce')

# Extract statistics from 'Babbles' column
def process_babbles(babble_list):
    return {
        'babble_count': len(babble_list),
        'babble_mean': sum(babble_list) / len(babble_list) if babble_list else 0,
        'babble_sum': sum(babble_list),
    }

# Apply statistics extraction
babbles_stats = data['Babbles'].apply(process_babbles)
data['Babble Length'] = babbles_stats.apply(lambda x: x['babble_count'])
data['Babble Mean'] = babbles_stats.apply(lambda x: x['babble_mean'])
data['Babble Sum'] = babbles_stats.apply(lambda x: x['babble_sum'])

data.describe()

Unnamed: 0,Nestling ID,Sex,Treatment,No. eggs hatched from nest,No. birds fledged from nest,Hatchno,Hatseq,Hatch date,Fledge date,Fledge age,Date on vocalization,Age on bout day,Days prior to fledging,Bout no.,Babble Length,Babble Mean,Babble Sum
count,3288.0,3288.0,3288.0,3288.0,3288.0,3288.0,3288.0,3288,3288,3288.0,3288,3288.0,3288.0,3288.0,3288.0,3288.0,3288.0
mean,8688.781022,0.39781,0.483881,6.473844,5.538017,3.443735,2.225061,1970-01-01 00:00:00.000043513,1970-01-01 00:00:00.000043545,31.919404,1970-01-01 00:00:00.000043540,27.463504,4.4559,8.490572,37.927616,3.284636,113.36618
min,8407.0,0.0,0.0,4.0,4.0,1.0,1.0,1970-01-01 00:00:00.000043351,1970-01-01 00:00:00.000043380,28.0,1970-01-01 00:00:00.000043371,20.0,0.0,0.0,1.0,1.0,2.0
25%,8559.75,0.0,0.0,6.0,4.0,1.0,1.0,1970-01-01 00:00:00.000043374,1970-01-01 00:00:00.000043407,30.0,1970-01-01 00:00:00.000043401,25.0,2.0,2.0,5.0,2.8,16.0
50%,8628.0,0.0,0.0,7.0,6.0,3.0,2.0,1970-01-01 00:00:00.000043390,1970-01-01 00:00:00.000043420,32.0,1970-01-01 00:00:00.000043418,28.0,4.0,6.0,12.0,3.375,37.0
75%,8805.0,1.0,1.0,7.0,6.0,5.0,3.0,1970-01-01 00:00:00.000043663,1970-01-01 00:00:00.000043693,33.0,1970-01-01 00:00:00.000043689,30.0,7.0,12.0,32.0,3.823754,104.0
max,8999.0,1.0,2.0,8.0,7.0,7.0,4.0,1970-01-01 00:00:00.000043672,1970-01-01 00:00:00.000043707,37.0,1970-01-01 00:00:00.000043706,37.0,13.0,54.0,1568.0,5.5,6208.0
std,159.14851,0.48952,0.705739,1.006482,1.120548,2.128771,1.009154,,,1.854097,,3.645186,2.954662,8.876835,91.602275,0.765283,261.804872


### **Analysis of Variance Linear Model**


ANOVA_LM (Analysis of Variance Linear Model) is a statistical method used in regression analysis to:

Decompose variance in a dependent variable

Assess the significance of predictors in a linear regression model

Test whether different groups or factors have statistically significant effects on the outcome

-   **Most Important for Hypothesis Testing**: **p-value** (PR(>F)) --- it directly determines if you can reject the null hypothesis.
    -   **If p-value < 0.05**: There is sufficient evidence to **reject the null hypothesis**, suggesting that at least one group mean differs from the others (i.e., the factor has a significant effect).
    -   **If p-value > 0.05**: There is insufficient evidence to reject the null hypothesis, meaning that **there is no significant difference** between the group means, and the factor does not have a significant effect.

-   **Supporting Information**: F-statistic helps you understand the size of the effect, and the sum of squares and degrees of freedom help to compute the F-statistic but are not as directly relevant for hypothesis testing.

In [None]:
# This is as two way ANOVA ( only comparing TWO variables )
data.columns = data.columns.str.replace(' ', '_')

# 'Babble_Length' need to be the last colum in column_names

column_names = data[["Sex", "Age_on_bout_day", "Babble_Length"]]  
# column_names = data[["Age_on_bout_day", "Nestling_ID", "Babble_Length"]]  
# column_names = data[["Age_on_bout_day", "Fledge_age", "Babble_Length"]]  

print("Columns selected:", column_names.columns.tolist())

factors = column_names.columns[:-1]
response = column_names.columns[-1]

# Construct the formula for the ANOVA (using interaction if needed)
formula = f'{response} ~ ' + ' * '.join(factors)
# formula = f'{response} ~ ' + (factors[0])

print(formula) 

model = ols(formula, data=column_names).fit()
anova_result = anova_lm(model)
print(anova_result)


# What Comparisons Test I Have Ran

# Babble_Length ~ Age on bout day * Nestling ID
# Babble_Length ~ Sex * Treatment




# Babble_Length ~ Sex * Age
# Babble_Length ~ Age * Treatment
# Babble_Length ~ Sex * Age * Treatment


In [None]:
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import argparse
import logging
import ast

# Initialize logging
logging.basicConfig(level=logging.INFO)

# Function to preprocess the data
def preprocess_data(data):
    logging.info('Preprocessing data...')

    # Map 'Sex' and 'Treatment' columns to numeric values
    data['Sex'] = data['Sex'].map({'M': 0, 'F': 1})
    data['Treatment'] = data['Treatment'].map({'CONTROL': 0, 'CORT': 1, 'OIL': 2})

    # Convert strings in 'Babbles' column to lists
    data['Babbles'] = data['Babbles'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

    # Convert date columns to datetime
    date_columns = ['Hatch date', 'Fledge date', 'Date on vocalization']
    for col in date_columns:
        data[col] = pd.to_datetime(data[col], errors='coerce')

    # Extract statistics from 'Babbles' column
    def process_babbles(babble_list):
        return {
            'babble_count': len(babble_list),
            'babble_mean': sum(babble_list) / len(babble_list) if babble_list else 0,
            'babble_sum': sum(babble_list),
        }

    babbles_stats = data['Babbles'].apply(process_babbles)
    data['Babble Length'] = babbles_stats.apply(lambda x: x['babble_count'])
    data['Babble Mean'] = babbles_stats.apply(lambda x: x['babble_mean'])
    data['Babble Sum'] = babbles_stats.apply(lambda x: x['babble_sum'])

    logging.info('Data preprocessing completed.')
    return data

# Function to compute ANOVA
def compute_anova(data, factors, response):
    # Ensure the response is the last column in the formula
    formula = f"{response} ~ " + " * ".join(factors)
    logging.info(f"ANOVA Formula: {formula}")
    
    # Fit the model
    model = ols(formula, data=data).fit()
    
    # Compute ANOVA
    anova_result = anova_lm(model)
    logging.info("\nANOVA Results:")
    logging.info(anova_result)
    
    # Check if p-value is significant
    significant = anova_result["PR(>F)"].min() < 0.05
    logging.info(f"\nSignificant P-Value Found: {'Yes' if significant else 'No'}")
    return anova_result

def main():
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Perform ANOVA on user-selected columns.")
    parser.add_argument("-i", "--input",   type=str, default="../CMBabble_Master_clean.csv", help="Path to the CSV file")
    parser.add_argument("-f", "--factors", type=lambda s: [item.strip() for item in s.split(',')], default="Sex, Age_on_bout_day", help="List of independent variables separated by commas.")
    # parser.add_argument("-f", "--factors", default="Sex", help="List of independent variables separated by commas.")
    args = parser.parse_args()

    # Hardcoded dependent variable (response) - Ensure column names are standardized Anova can read spaces, replace with '_'
    response = "Babble_Length"

    # Load the dataset or use the default
    logging.info('Reading and preparing data for analysis')
    if args.input:
        data = pd.read_csv(args.input)
    else:
        logging.warning("No data file provided, default dataset will be used.")

    # Preprocess the data
    data = preprocess_data(data)

    selected_columns = data[["Sex", "Babble Length"]]  
    
    # print("Columns selected:", column_names.columns.tolist())

    try:
        # selected_data = data[selected_columns]
        selected_data = data[["Sex", "Babble Length"]]  

    except KeyError as e:
        print(f"Error: {e}")
        print("Ensure the selected columns exist in the DataFrame or CSV file.")
        return

    print(f"Columns selected for ANOVA: {selected_columns}")

    # Ensure column names are standardized
    data.columns = data.columns.str.replace(' ', '_')
    
    # Compute ANOVA
    compute_anova(selected_data, args.factors, response)

if __name__ == "__main__":
    main()


INFO:root:Reading and preparing data for analysis


INFO:root:Preprocessing data...
INFO:root:Data preprocessing completed.
INFO:root:ANOVA Formula: Babble_Length ~ /Users/MA57489/Library/Jupyter/runtime/kernel-v3b28d1cba9b89188f2eedcf61dd3e74eed5c242a4.json


Columns selected for ANOVA:       Sex  Babble Length
0       1              5
1       1              3
2       1              8
3       1             18
4       1             23
...   ...            ...
3283    1             62
3284    1              6
3285    1              3
3286    1             51
3287    1              5

[3288 rows x 2 columns]


PatsyError: expected a noun, not '/'
    Babble_Length ~ /Users/MA57489/Library/Jupyter/runtime/kernel-v3b28d1cba9b89188f2eedcf61dd3e74eed5c242a4.json
                    ^

# Variables to Examine or Test from ANOVA Testing Table

### Single Variables 

- **Age on bout day:** The p-value is (1.75e-17), so reject the null hypothesis. This indicates that Age on bout day significantly affects the dependent variable.

- **Nestling ID:** The p-value is (1.463254e-04), so reject the null hypothesis. This indicates that Nestling ID significantly affects the dependent variable.

### Two Variables 


___

# Variables to Exclude

- **Age on bout day:Nestling_ID:** The p-value is 0.341, which is greater than 0.05, so fail to reject the null hypothesis. This indicates that there is no significant interaction between Age_on_bout_day and Nestling_ID on the dependent variable.

- **Sex:Treatment:** The p-value is 0.120709, which is greater than 0.05, so fail to reject the null hypothesis. This indicates that there is no significant interaction between Sex and Treatment on the dependent variable.



