In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl
import os

In [2]:
pwd

'c:\\Users\\ding\\Documents\\github\\CeA'

In [5]:
data_dir = r'C:/Users/ding/Documents/github/CeA/data'  
file_list = os.listdir(data_dir)


In [7]:
import os
import pandas as pd

# Function to restructure data and calculate Z-scores
def restructure_data_with_zscore(data):
    """
    Restructure the data to create a merged table where each column corresponds to a neuron 
    or an experimental variable and each row corresponds to a time point. Also calculates Z-scores.
    """
    data_group = data.groupby('Cell')
    template_df = None
    for k in data_group.groups.keys():
        if template_df is None:
            template_df = data_group.get_group(k)
            template_df = template_df.drop(columns=['F', 'Cell'])
        cell_id = int(''.join(filter(str.isdigit, k)))
        template_df[cell_id] = data_group.get_group(k)['F'].values
        
    
    # Calculate Z-scores for each neuron column
    neuron_columns = [col for col in template_df.columns if isinstance(col, int)]
    for col in neuron_columns:
        mean = template_df[col].mean()
        std = template_df[col].std()
        template_df[col] = (template_df[col] - mean) / std  # Replace with Z-scores
    return template_df


# Function to batch-load and process files
def process_files_with_zscore(data_dir, file_list):
    """
    Batch load and process files to add mouse number, condition, and calculate Z-scores.
    """
    processed_data = []

    for file_name in file_list:
        file_path = os.path.join(data_dir, file_name)
        
        # Extract mouse number and condition from file name
        parts = file_name.split('_')
        mouse_number = parts[0]
        condition = parts[1]
        #print(parts[1])
        
        # Load the file
        data = pd.read_csv(file_path)
        
        # Restructure the data with Z-scores
        restructured_data = restructure_data_with_zscore(data)
        
        # Add mouse number and condition as columns
        restructured_data['Mouse'] = mouse_number
        restructured_data['Condition'] = condition
        
        # Append to the list
        processed_data.append(restructured_data)
    
    # Concatenate all processed data into a single DataFrame
    merged_data = pd.concat(processed_data, ignore_index=True)
    return merged_data

# Define directory and file list

# Process files and create the merged DataFrame with Z-scores
merged_data_with_zscore = process_files_with_zscore(data_dir, file_list)

# Save the merged data with Z-scores to a new CSV file (optional)
output_path = os.path.join('C:/Users/ding/Documents/github/CeA', 'material_merged_data_with_zscore.csv')
merged_data_with_zscore.to_csv(output_path, index=False)

print("Data restructuring with Z-scores completed!")


EmptyDataError: No columns to parse from file

In [8]:
import pandas as pd

def assign_event_index_per_group(df, event_column='bitepoint', group_columns=['Mouse', 'Condition']):
    """
    Assigns an incremental event index starting from 1 for each group of mouse and condition.
    
    Parameters:
        df (pd.DataFrame): The dataset containing time series data.
        event_column (str): The column indicating events (e.g., 1 for event occurrence).
        group_columns (list): Columns to group by (e.g., Mouse, Condition).
        
    Returns:
        pd.DataFrame: DataFrame with an added 'Event_Index' column.
    """
    # Initialize the 'Event_Index' column as NaN
    df['Event_Index'] = None

    # Group by Mouse and Condition
    for _, group_data in df.groupby(group_columns):
        # Find the indices where the event occurs (event_column == 1)
        event_indices = group_data[group_data[event_column] == 1].index
        #print(event_indices)
        # Assign event index starting from 1 for each event occurrence in the group
        df.loc[event_indices, 'Event_Index'] = range(1, len(event_indices) + 1)
        
        # Forward fill Event_Index to ensure all rows within the same group have the same Event_Index
        #df['Event_Index'] = df['Event_Index'].ffill()

    return df

# Example usage:
# Assume 'merged_data_with_zscore' is the restructured data with Z-scores calculated.
# The 'bitepoint' column is used to mark events.
# The dataframe should contain 'Mouse' and 'Condition' columns, as well as an event marker column (e.g., 'bitepoint').

# Sample Data (make sure to replace this with your actual data):
# merged_data_with_zscore = pd.DataFrame({
#     'Mouse': ['Mouse1', 'Mouse1', 'Mouse2', 'Mouse2'],
#     'Condition': ['Condition1', 'Condition1', 'Condition2', 'Condition2'],
#     'bitepoint': [0, 1, 0, 1],
#     'Other_Column': [0.5, 0.7, 0.6, 0.8]
# })

# Apply the event index assignment
merged_data_with_zscore = assign_event_index_per_group(merged_data_with_zscore, event_column='bitepoint', group_columns=['Mouse', 'Condition'])

# Check the output


NameError: name 'merged_data_with_zscore' is not defined

In [9]:
import pandas as pd


def extract_event_windows(data, event_column='bitepoint', pre_event_time=30, post_event_time=30):
    """
    Extracts pre-event and post-event data around specified events from the dataset.
    
    Parameters:
        data (pd.DataFrame): The restructured dataset with time series data.
        event_column (str): The column indicating events (e.g., 1 for event occurrence).
        pre_event_time (int): Number of time points before the event to include.
        post_event_time (int): Number of time points after the event to include.
    
    Returns:
        pd.DataFrame: A DataFrame containing the extracted data with additional metadata.
    """
    # Ensure the event column exists
    if event_column not in data.columns:
        raise ValueError(f"The specified event column '{event_column}' does not exist in the data.")
    
    # Find all event indices
    event_indices = data[data[event_column] == 1].index

    # Initialize a list to store results
    event_data = []

    for event_index in event_indices:
        # Define the window around the event
        start_time = event_index - pre_event_time
        end_time = event_index + post_event_time

        # Ensure the window is within bounds
        if start_time < 0 or end_time >= len(data):
            continue

        # Extract data for the time window
        window_data = data.iloc[start_time:end_time + 1]  # Use .iloc to slice by row index

        # Add metadata for the event
        window_data = window_data.copy()
        window_data['Time_Relative_to_Event'] = range(-pre_event_time, post_event_time + 1)
        # Append to the list
        event_data.append(window_data)

    # Combine all event windows into a single DataFrame
    event_windows_df = pd.concat(event_data, ignore_index=True)
    return event_windows_df

# Example usage:


# Step 2: Extract event windows for each event (e.g., lick events)
pre_event_time = 30
post_event_time = 30

event_windows = extract_event_windows(
    data=merged_data_with_zscore,
    event_column='bitepoint',  # Adjust as per your event column
    pre_event_time=pre_event_time,
    post_event_time=post_event_time
)

# Save the extracted data to a CSV file (optional)
event_windows.to_csv('material_event_windows_data.csv', index=False)

print("Pre-event and post-event data extraction completed!")


NameError: name 'merged_data_with_zscore' is not defined

In [10]:
event_windows['active_neuron']=0
results = []
for name, group_data in event_windows.groupby(["Mouse","Condition"]):
    # Find the indices where the event occurs (event_column == 1)
    #event_indices = group_data[group_data[event_column] == 1].index
    #print(event_indices)
    # Assign event index starting from 1 for each event occurrence in the group
    #df.loc[event_indices, 'Event_Index'] = range(1, len(event_indices) + 1)
    filtered_df = group_data[(group_data['Time_Relative_to_Event'] >= 0) & (group_data['Time_Relative_to_Event'] <= 30)]
    numeric_name_float_columns = filtered_df.loc[:, filtered_df.columns[filtered_df.columns.str.isnumeric().isna()]]
    not_all_nan_columns = numeric_name_float_columns.columns[~numeric_name_float_columns.isna().all()]
    #no_nan_columns = numeric_name_float_columns.columns[numeric_name_float_columns.notna().all()]
    selected_df=numeric_name_float_columns.loc[:,not_all_nan_columns]

    chunk_size = 31

    # Initialize a list to store the results

# Iterate over the DataFrame in chunks of 31 rows
    for start in range(0, len(selected_df), chunk_size):
        end = start + chunk_size
        chunk = selected_df.iloc[start:end]  # Get the current chunk
        #max_values = chunk.max()  # Find the max value in the chunk
        mean_values = chunk.mean() 
        #num_columns_gt_0_5 = (max_values > 1).sum()  # Count columns > 0.5
        num_columns_gt_0_5 = (mean_values > 0.5).sum()
        total_columns = len(selected_df.columns)  # Total number of columns
        results.append((num_columns_gt_0_5, total_columns,name))  # Store the result


results_df = pd.DataFrame(results, columns=['active_number', 'total_number', 'group'])
    # Forward fill Event_Index to ensure all rows within the same group have the same Event_Index
    #df['Event_Index'] = df['Event_Index'].ffill()




NameError: name 'event_windows' is not defined

In [53]:
results_df.to_csv('C:/Users/ding/Documents/github/CeA/material_active_neuron_number_0.3.csv', index=False)

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
#file_path = 'path_to_your_file.csv'  # Update with your file path if needed
data = results_df

# Calculate the percentage of active neurons
data['active_percentage'] = (data['active_number'] / data['total_number']) * 100

# Assign a unique event index per mouse and condition
data['event_index'] = data.groupby('group').cumcount() + 1

import pandas as pd

# Example to ensure the 'group' column is a string
data['group'] = data['group'].astype(str)

# Use regular expression to extract 'mouse' and 'condition'
data[['mouse', 'condition']] = data['group'].str.extract(r"\('(\d+)',\s*'(\w+)'\)")

# Convert 'mouse' to integer if necessary
data['mouse'] = data['mouse'].astype(int)  # Converts 'mouse' to integers

# Preview the result
print(data)


NameError: name 'results_df' is not defined

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Group by 'mouse' and 'condition' to calculate average percentage for each mouse within a condition
mouse_condition_summary = (
    data.groupby(['mouse', 'condition'])
    .agg(mean_active=('active_percentage', 'mean'))
    .reset_index()
)

# Sort conditions by the mean active percentage from biggest to smallest
condition_order = mouse_condition_summary.groupby('condition')['mean_active'].mean().sort_values(ascending=False).index

# Plot box plot for each condition with the conditions sorted by the mean active percentage
plt.figure(figsize=(8, 6))
sns.boxplot(
    x='condition',
    y='mean_active',
    data=mouse_condition_summary,
    order=condition_order,  # Sort conditions based on the mean active percentage
    palette='muted'
)

# Add individual data points on top of the box plot using stripplot
sns.stripplot(
    x='condition',
    y='mean_active',
    data=mouse_condition_summary,
    order=condition_order,
    color='black',  # Color for individual points
    jitter=True,    # Adds random noise to the x-position to avoid overlap
    alpha=0.5,      # Transparency for better visibility of overlapping points
    size=6          # Size of the points
)

# Customize the plot
plt.xlabel("Condition")
plt.ylabel("Percentage of Active Neurons (%)")
plt.title("Distribution of Active Neurons by Condition")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Show the plot
plt.show()


NameError: name 'data' is not defined