# Pandas Basics Part 2 - Advanced Data Analysis

Advanced pandas techniques for scientific data analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import warnings
# Suppress every warning category so the notebook output stays uncluttered.
warnings.filterwarnings('ignore')

# Display configuration: show all columns, three decimal places.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.precision', 3),
):
    pd.set_option(option_name, option_value)

# Fix the NumPy RNG seed so the generated sample data is reproducible.
np.random.seed(42)

## Advanced Indexing and MultiIndex

In [None]:
# Level values for the three-level experimental index
experiments = ['Exp_A', 'Exp_B', 'Exp_C']
conditions = ['Control', 'Treatment_1', 'Treatment_2']
timepoints = ['T0', 'T1', 'T2', 'T4', 'T8']

# Cartesian product of the levels: one row per
# (experiment, condition, timepoint) combination.
index_levels = [experiments, conditions, timepoints]
index_names = ['experiment', 'condition', 'timepoint']
index = pd.MultiIndex.from_product(index_levels, names=index_names)

# One synthetic measurement per index row. The columns are drawn in the
# order cell_count, viability, protein_conc, glucose so the random stream
# is consumed in a fixed sequence.
n_samples = len(index)
data = {}
data['cell_count'] = np.random.normal(100000, 20000, n_samples)
data['viability'] = np.random.uniform(0.8, 0.99, n_samples)
data['protein_conc'] = np.random.normal(2.5, 0.5, n_samples)
data['glucose'] = np.random.normal(15, 3, n_samples)

df_multi = pd.DataFrame(data, index=index)

print("MultiIndex DataFrame:")
print(df_multi.head(10))
print(f"\nShape: {df_multi.shape}")

# Advanced indexing operations
# Every selection below works on df_multi, whose index levels are
# (experiment, condition, timepoint).
print("\n1. Select all data for Exp_A:")
# A single label given to .loc is matched against the outermost level
exp_a_data = df_multi.loc['Exp_A']
print(exp_a_data.head())

print("\n2. Select Treatment_1 across all experiments:")
# IndexSlice[:, 'Treatment_1', :] keeps every experiment and timepoint and
# fixes only the middle (condition) level; the trailing ':' selects all columns
treatment_1_data = df_multi.loc[pd.IndexSlice[:, 'Treatment_1', :], :]
print(treatment_1_data.head())

print("\n3. Cross-sectional data at T4:")
# Same pattern, this time fixing the innermost (timepoint) level
t4_data = df_multi.loc[pd.IndexSlice[:, :, 'T4'], :]
print(t4_data.head())

# Pivot and unstack operations
print("\n4. Unstacking timepoints:")
# Moves the 'timepoint' level from the row index into the columns
unstacked = df_multi['cell_count'].unstack('timepoint')
print(unstacked.head())

print("\n5. Pivot table with multiple aggregations:")
# aggfunc is given per value column: mean for cell_count,
# mean and std for viability
pivot_table = df_multi.pivot_table(
    values=['cell_count', 'viability'],
    index='experiment',
    columns='condition',
    aggfunc={'cell_count': 'mean', 'viability': ['mean', 'std']}
)
print(pivot_table)

## Advanced GroupBy Operations

In [None]:
# Reset index for groupby operations
# (turns the experiment/condition/timepoint index levels into ordinary columns)
df_flat = df_multi.reset_index()

# Multiple groupby operations
print("1. Multiple aggregations per column:")
# Dict form of agg: each key is a column, each value one or more aggregations
agg_result = df_flat.groupby(['experiment', 'condition']).agg({
    'cell_count': ['count', 'mean', 'std', 'min', 'max'],
    'viability': ['mean', 'std'],
    'protein_conc': lambda x: x.quantile(0.75),  # Custom function
    'glucose': 'mean'
})
print(agg_result.head())

# Named aggregations (pandas 0.25+)
# Keyword name = output column; value = (input column, aggregation)
print("\n2. Named aggregations:")
named_agg = df_flat.groupby(['experiment', 'condition']).agg(
    avg_cell_count=('cell_count', 'mean'),
    cell_count_cv=('cell_count', lambda x: x.std() / x.mean()),  # Coefficient of variation
    viability_min=('viability', 'min'),
    protein_range=('protein_conc', lambda x: x.max() - x.min())
)
print(named_agg.head())

# Transform operations
# transform returns a result with the same length as the input, so it can be
# assigned straight back as a new column of df_flat
print("\n3. Transform operations (z-score normalization):")
df_flat['cell_count_zscore'] = df_flat.groupby('condition')['cell_count'].transform(
    lambda x: (x - x.mean()) / x.std()
)
df_flat['viability_pct_of_max'] = df_flat.groupby('experiment')['viability'].transform(
    lambda x: x / x.max() * 100
)
print(df_flat[['experiment', 'condition', 'cell_count', 'cell_count_zscore', 
              'viability', 'viability_pct_of_max']].head())

# Rolling and expanding operations within groups
print("\n4. Rolling operations within groups:")
# timepoint labels are strings, so this sort is lexicographic; that matches
# chronological order for the single-digit labels T0..T8 used here
df_sorted = df_flat.sort_values(['experiment', 'condition', 'timepoint'])
# groupby(...).rolling(...) prepends the two group keys to the result index;
# reset_index(level=[0, 1], drop=True) removes them again so the values align
# with df_sorted's own row index on assignment
df_sorted['cell_count_rolling_mean'] = df_sorted.groupby(['experiment', 'condition'])['cell_count'].rolling(
    window=3, min_periods=1
).mean().reset_index(level=[0, 1], drop=True)

print(df_sorted[['experiment', 'condition', 'timepoint', 
                'cell_count', 'cell_count_rolling_mean']].head(10))

# Custom aggregation functions
def growth_rate(series):
    """Calculate growth rate as (final - initial) / initial.

    The series is ordered by its index first, so "initial" and "final" are
    the values at the smallest and largest index label.

    Parameters
    ----------
    series : pandas.Series
        Numeric observations whose index defines their order.

    Returns
    -------
    float
        The relative change, or ``np.nan`` when there are fewer than two
        observations or when the initial value is zero (ratio undefined).
    """
    if len(series) < 2:
        return np.nan
    sorted_series = series.sort_index()
    initial = sorted_series.iloc[0]
    if initial == 0:
        # Dividing by a zero baseline would yield inf/NaN plus a runtime
        # warning; report NaN explicitly, as coefficient_of_variation does
        # for a zero mean.
        return np.nan
    return (sorted_series.iloc[-1] - initial) / initial

def coefficient_of_variation(series):
    """Return std / mean of *series*; NaN when the mean is exactly zero."""
    if series.mean() != 0:
        return series.std() / series.mean()
    return np.nan

# Apply the two helper functions defined above alongside built-in aggregations.
# growth_rate orders each group by its index; df_flat was produced by
# reset_index(), so that order is the original row order of each group.
print("\n5. Custom aggregation functions:")
custom_agg = df_flat.groupby(['experiment', 'condition']).agg({
    'cell_count': [growth_rate, coefficient_of_variation],
    'viability': [growth_rate, 'mean'],
    'protein_conc': ['mean', coefficient_of_variation]
})
print(custom_agg)

## Advanced Time Series Analysis

In [None]:
# Two years of daily timestamps starting 2023-01-01
start_date = datetime(2023, 1, 1)
dates = pd.date_range(start=start_date, periods=365*2, freq='D')
n_days = len(dates)

# Synthetic temperature = linear trend + annual sine cycle + Gaussian noise.
# Random draws happen in the order noise -> missing positions -> humidity ->
# pressure, which fixes the generated values for seed 42.
np.random.seed(42)
day_number = np.arange(n_days)
trend = np.linspace(20, 25, n_days)  # gradual increase from 20 to 25
seasonal = 3 * np.sin(2 * np.pi * day_number / 365.25)  # annual cycle
noise = np.random.normal(0, 0.5, n_days)
temperature = trend + seasonal + noise

# Blank out 20 distinct, randomly chosen days
missing_indices = np.random.choice(n_days, size=20, replace=False)
temperature[missing_indices] = np.nan

# Remaining sensor channels
humidity_values = np.random.normal(65, 10, n_days)
pressure_values = np.random.normal(1013, 5, n_days)

# Assemble the frame, then promote the date column to the index
ts_df = pd.DataFrame({
    'date': dates,
    'temperature': temperature,
    'humidity': humidity_values,
    'pressure': pressure_values,
})
ts_df = ts_df.set_index('date')

print("Time series data:")
print(ts_df.head())
print(f"\nMissing values: {ts_df.isnull().sum()}")

# Advanced resampling operations
# resample() groups the DatetimeIndex into calendar bins; agg() then applies
# the listed statistics per column within each bin.
print("\n1. Multiple resampling frequencies:")
weekly_stats = ts_df.resample('W').agg({
    'temperature': ['mean', 'std', 'min', 'max'],
    'humidity': 'mean',
    'pressure': 'mean'
})
print(weekly_stats.head())

# NOTE(review): the 'M' alias is reported as deprecated in recent pandas
# releases in favour of 'ME' - confirm against the pandas version in use.
monthly_stats = ts_df.resample('M').agg({
    'temperature': ['mean', 'std'],
    'humidity': ['mean', 'std'],
    'pressure': ['mean', 'std']
})
print("\nMonthly statistics:")
print(monthly_stats.head())

# Handle missing values with interpolation
print("\n2. Interpolation methods:")
ts_df['temp_linear'] = ts_df['temperature'].interpolate(method='linear')
ts_df['temp_cubic'] = ts_df['temperature'].interpolate(method='cubic')
ts_df['temp_time'] = ts_df['temperature'].interpolate(method='time')

# Show interpolation comparison for the gaps and their immediate neighbours.
# The rows where 'temperature' is NaN are kept on purpose: they are the only
# rows where the interpolated columns differ from the raw reading, so dropping
# them would leave nothing to compare.
is_missing = ts_df['temperature'].isnull()
near_gap = (
    is_missing.shift(1, fill_value=False)      # day after a gap
    | is_missing                               # the gap itself
    | is_missing.shift(-1, fill_value=False)   # day before a gap
)
# fill_value=False keeps the mask strictly boolean; a plain shift() would
# introduce NaN at the series edge and turn the mask into object dtype.
subset_with_missing = ts_df.loc[near_gap]
print(subset_with_missing[['temperature', 'temp_linear', 'temp_cubic', 'temp_time']].head())

# Rolling window operations
print("\n3. Rolling window analysis:")
# center=True places each window symmetrically around its row, so the first
# and last rows have incomplete windows and come out as NaN
ts_df['temp_7day_mean'] = ts_df['temperature'].rolling(window=7, center=True).mean()
ts_df['temp_30day_std'] = ts_df['temperature'].rolling(window=30, center=True).std()
# No center= here: the correlation uses a trailing 30-row window
ts_df['temp_rolling_corr'] = ts_df['temperature'].rolling(window=30).corr(ts_df['pressure'])

# Rows 30-39: far enough in that the 30-row windows are populated
print(ts_df[['temperature', 'temp_7day_mean', 'temp_30day_std', 'temp_rolling_corr']].head(40).tail(10))

# Lag and lead operations
print("\n4. Lag and lead analysis:")
# Positive shift = value from earlier rows (lag); negative = later rows (lead)
ts_df['temp_lag1'] = ts_df['temperature'].shift(1)
ts_df['temp_lag7'] = ts_df['temperature'].shift(7)
ts_df['temp_lead1'] = ts_df['temperature'].shift(-1)
ts_df['temp_diff'] = ts_df['temperature'].diff()
# NOTE(review): 'temperature' still contains NaN at this point; how
# pct_change() treats those gaps depends on its fill_method default - confirm
# for the pandas version in use.
ts_df['temp_pct_change'] = ts_df['temperature'].pct_change()

print(ts_df[['temperature', 'temp_lag1', 'temp_diff', 'temp_pct_change']].head(10))

# Seasonal decomposition simulation
print("\n5. Manual seasonal analysis:")
# Calendar features derived from the DatetimeIndex
ts_df['month'] = ts_df.index.month
ts_df['day_of_year'] = ts_df.index.dayofyear
ts_df['quarter'] = ts_df.index.quarter

# Calculate seasonal patterns
# (mean temperature per calendar month, pooled over both years of data)
monthly_pattern = ts_df.groupby('month')['temperature'].mean()
print("Monthly temperature pattern:")
print(monthly_pattern)

# Anomaly detection using rolling statistics
# temp_rolling_std repeats the expression used for temp_30day_std earlier
# (same window and center=True); it is recomputed here under a new name.
ts_df['temp_rolling_mean'] = ts_df['temperature'].rolling(window=30, center=True).mean()
ts_df['temp_rolling_std'] = ts_df['temperature'].rolling(window=30, center=True).std()
# z-score of each reading relative to its local 30-row window
ts_df['temp_zscore'] = (ts_df['temperature'] - ts_df['temp_rolling_mean']) / ts_df['temp_rolling_std']
# Comparisons against NaN are False, so rows with an undefined z-score are
# never flagged
ts_df['is_anomaly'] = np.abs(ts_df['temp_zscore']) > 2  # 2 standard deviations

anomalies = ts_df[ts_df['is_anomaly']]
print(f"\nFound {len(anomalies)} temperature anomalies")
if len(anomalies) > 0:
    print(anomalies[['temperature', 'temp_rolling_mean', 'temp_zscore']].head())

## Advanced Data Transformation and Feature Engineering

In [None]:
# Create complex experimental dataset
# Reseeding makes this cell reproducible on its own; the columns are drawn in
# the order written below, so reordering them would change the generated values.
np.random.seed(42)
# Rebinds n_samples (previously len(index) in the MultiIndex cell)
n_samples = 1000

experimental_data = pd.DataFrame({
    'sample_id': [f'S{i:04d}' for i in range(n_samples)],  # S0000 ... S0999
    'treatment': np.random.choice(['Control', 'Drug_A', 'Drug_B', 'Combination'], n_samples),
    'dose': np.random.choice([0, 0.1, 1.0, 10.0, 100.0], n_samples),
    'cell_line': np.random.choice(['HeLa', 'MCF7', 'A549', 'HEK293'], n_samples),
    'passage_number': np.random.randint(5, 25, n_samples),  # integers 5..24 (upper bound exclusive)
    'initial_density': np.random.normal(50000, 10000, n_samples),
    'final_density': np.random.normal(200000, 50000, n_samples),
    'viability': np.random.beta(8, 2, n_samples),  # Skewed towards higher values
    'metabolic_activity': np.random.gamma(2, 2, n_samples),
    'protein_content': np.random.lognormal(1, 0.5, n_samples),
    'incubation_time': np.random.choice([24, 48, 72], n_samples)
})

print("Original dataset:")
print(experimental_data.head())
print(f"\nDataset shape: {experimental_data.shape}")
print(f"\nData types:\n{experimental_data.dtypes}")

# Feature engineering
# All features below are element-wise (row by row) column arithmetic.
print("\n1. Creating derived features:")

# Growth rate calculation
# Relative change: (final - initial) / initial
experimental_data['growth_rate'] = (
    experimental_data['final_density'] - experimental_data['initial_density']
) / experimental_data['initial_density']

# Fold change
# final / initial, i.e. growth_rate + 1
experimental_data['fold_change'] = (
    experimental_data['final_density'] / experimental_data['initial_density']
)

# Dose response (log transformation)
# log10(0) is undefined, so zero doses are replaced by 0.001 (log10 = -3) first
experimental_data['log_dose'] = np.log10(
    experimental_data['dose'].replace(0, 0.001)  # Handle zero dose
)

# Efficiency metrics
experimental_data['growth_efficiency'] = (
    experimental_data['growth_rate'] * experimental_data['viability']
)

# Categorical encoding
print("\n2. Categorical encoding:")

# One-hot encoding
# One indicator column per distinct value, named '<prefix>_<value>'
treatment_dummies = pd.get_dummies(experimental_data['treatment'], prefix='treatment')
cell_line_dummies = pd.get_dummies(experimental_data['cell_line'], prefix='cell_line')

# Ordinal encoding for dose
# Maps the five dose levels to ranks 0..4 in increasing dose order
dose_mapping = {0: 0, 0.1: 1, 1.0: 2, 10.0: 3, 100.0: 4}
experimental_data['dose_ordinal'] = experimental_data['dose'].map(dose_mapping)

# Combine original data with encoded features
# axis=1 concatenates column-wise; the result is a separate frame, so
# experimental_data itself does not gain the indicator columns
experimental_encoded = pd.concat([
    experimental_data, 
    treatment_dummies, 
    cell_line_dummies
], axis=1)

print(f"Dataset with encoded features: {experimental_encoded.shape}")
print(experimental_encoded[['treatment', 'treatment_Control', 'treatment_Drug_A']].head())

# Binning and discretization
print("\n3. Binning continuous variables:")

# Equal-width binning
# bins=3 splits the observed range of passage_number into three intervals of
# equal width; the labels are assigned from lowest to highest interval
experimental_data['passage_bin'] = pd.cut(
    experimental_data['passage_number'], 
    bins=3, 
    labels=['Low', 'Medium', 'High']
)

# Quantile-based binning
# q=4 cuts at the quartiles, so each label covers roughly a quarter of the rows
experimental_data['density_quartile'] = pd.qcut(
    experimental_data['initial_density'], 
    q=4, 
    labels=['Q1', 'Q2', 'Q3', 'Q4']
)

# Custom binning based on domain knowledge
def viability_category(viability):
    """Classify a viability value as 'Poor' (< 0.7), 'Good' (< 0.85) or 'Excellent'."""
    # Thresholds are checked in ascending order; the first one the value
    # falls below decides the label.
    for upper_bound, label in ((0.7, 'Poor'), (0.85, 'Good')):
        if viability < upper_bound:
            return label
    return 'Excellent'

# Apply the threshold-based classifier defined above to every row
experimental_data['viability_category'] = experimental_data['viability'].apply(viability_category)

print(experimental_data[['passage_number', 'passage_bin', 
                        'initial_density', 'density_quartile',
                        'viability', 'viability_category']].head())

# Advanced transformations
print("\n4. Advanced transformations:")

# scipy.stats is imported here, but no Box-Cox transform is applied in this
# cell; the transformations below are log1p, square root, group-wise z-score
# and min-max scaling.
from scipy import stats

# Log transformation for skewed data
# log1p(x) = log(1 + x)
experimental_data['protein_content_log'] = np.log1p(experimental_data['protein_content'])

# Square root transformation
experimental_data['metabolic_activity_sqrt'] = np.sqrt(experimental_data['metabolic_activity'])

# Z-score normalization within groups
# Each viability value is standardised against the mean/std of its own treatment
experimental_data['viability_zscore_by_treatment'] = experimental_data.groupby('treatment')['viability'].transform(
    lambda x: (x - x.mean()) / x.std()
)

# Min-max scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# The scaler takes a 2-D input (hence the double brackets) and returns a 2-D
# array; flatten() turns it back into one value per row
experimental_data['growth_rate_scaled'] = scaler.fit_transform(
    experimental_data[['growth_rate']]
).flatten()

print(experimental_data[['protein_content', 'protein_content_log',
                        'metabolic_activity', 'metabolic_activity_sqrt',
                        'growth_rate', 'growth_rate_scaled']].head())

# Interaction features
print("\n5. Interaction features:")

# Polynomial features
experimental_data['dose_squared'] = experimental_data['dose'] ** 2
experimental_data['dose_viability_interaction'] = (
    experimental_data['dose'] * experimental_data['viability']
)

# Cross-feature ratios
# protein_content divided by final_density, scaled by 1e6
experimental_data['protein_per_cell'] = (
    experimental_data['protein_content'] / experimental_data['final_density'] * 1e6
)

# Time-based features
# incubation_time takes the values 24, 48 or 72, so this is growth per hour
experimental_data['growth_rate_per_hour'] = (
    experimental_data['growth_rate'] / experimental_data['incubation_time']
)

print(experimental_data[['dose', 'dose_squared', 'dose_viability_interaction',
                        'protein_per_cell', 'growth_rate_per_hour']].head())

# Feature selection based on correlation
print("\n6. Feature correlation analysis:")
numeric_cols = experimental_data.select_dtypes(include=[np.number]).columns
correlation_matrix = experimental_data[numeric_cols].corr()

# Collect every pair above the threshold. Only the upper triangle (j > i) is
# scanned, so each pair appears once and the diagonal is skipped.
corr_labels = correlation_matrix.columns
n_features = len(corr_labels)
high_corr_pairs = [
    (corr_labels[i], corr_labels[j], correlation_matrix.iloc[i, j])
    for i in range(n_features)
    for j in range(i + 1, n_features)
    if abs(correlation_matrix.iloc[i, j]) > 0.8
]

print("Highly correlated feature pairs (|r| > 0.8):")
for pair in high_corr_pairs:
    print(f"{pair[0]} <-> {pair[1]}: {pair[2]:.3f}")

print(f"\nFinal dataset shape: {experimental_data.shape}")
print(f"Number of numeric features: {len(numeric_cols)}")

## Performance Optimization and Memory Management

In [None]:
import time
import psutil
import os

def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / 1024 / 1024

# Create large dataset for performance testing
np.random.seed(42)
n_large = 100000

print("Creating large dataset for performance testing...")
# Baseline RSS, used for the deltas printed after creation and at the end of the cell
start_memory = get_memory_usage()

large_df = pd.DataFrame({
    'id': range(n_large),
    'category': np.random.choice(['A', 'B', 'C', 'D'], n_large),
    'value1': np.random.normal(0, 1, n_large),
    'value2': np.random.normal(100, 15, n_large),
    'value3': np.random.exponential(2, n_large),
    # One np.random.choice call per row; the embedded row number i makes
    # every string in this column distinct
    'text_data': [f'sample_{i}_{np.random.choice(["x", "y", "z"])}' for i in range(n_large)]
})

after_creation_memory = get_memory_usage()
print(f"Memory usage after creation: {after_creation_memory - start_memory:.1f} MB")

# Data type optimization
print("\n1. Data type optimization:")
print("Original data types and memory usage:")
print(large_df.dtypes)
# deep=True also counts the Python string objects held by object columns
original_bytes = large_df.memory_usage(deep=True).sum()
print(f"Memory usage: {original_bytes / 1024**2:.1f} MB")

# Optimize data types
large_df_optimized = large_df.copy()

# Convert string columns to categorical only when their values actually
# repeat. A categorical stores each distinct value once plus an integer code
# per row, so a column of (nearly) unique strings - such as text_data, whose
# values embed the row number - would get larger, not smaller.
for col in ['category', 'text_data']:
    if large_df_optimized[col].nunique() <= 0.5 * len(large_df_optimized):
        large_df_optimized[col] = large_df_optimized[col].astype('category')

# Downcast numeric types
# (to_numeric picks the smallest integer/float dtype that can hold the values)
large_df_optimized['id'] = pd.to_numeric(large_df_optimized['id'], downcast='integer')
large_df_optimized['value2'] = pd.to_numeric(large_df_optimized['value2'], downcast='float')

print("\nOptimized data types and memory usage:")
print(large_df_optimized.dtypes)
optimized_bytes = large_df_optimized.memory_usage(deep=True).sum()
print(f"Memory usage: {optimized_bytes / 1024**2:.1f} MB")

# Relative reduction in percent of the original footprint
memory_savings = (original_bytes - optimized_bytes) / original_bytes * 100
print(f"Memory savings: {memory_savings:.1f}%")

# Performance comparison: vectorized vs iterative operations
print("\n2. Performance comparison - vectorized vs iterative:")

# Create test data
test_data = pd.DataFrame({
    'x': np.random.random(50000),
    'y': np.random.random(50000)
})

# time.perf_counter() is used for all measurements: it is monotonic and has
# the highest available resolution, whereas time.time() can report an elapsed
# time of exactly 0.0 for sub-millisecond operations on some platforms.

# Iterative approach (slow)
start_time = time.perf_counter()
result_iterative = []
for _, row in test_data.head(1000).iterrows():  # Only test on subset for speed
    result_iterative.append(np.sqrt(row['x']**2 + row['y']**2))
iterative_time = time.perf_counter() - start_time

# Vectorized approach (fast)
start_time = time.perf_counter()
result_vectorized = np.sqrt(test_data['x']**2 + test_data['y']**2)
vectorized_time = time.perf_counter() - start_time

print(f"Iterative approach (1000 rows): {iterative_time:.4f} seconds")
print(f"Vectorized approach (50000 rows): {vectorized_time:.4f} seconds")
# The factor 50 normalises for row count: the loop handled 1000 rows, the
# vectorized expression 50000.
if vectorized_time > 0:
    print(f"Speedup factor: {iterative_time / vectorized_time * 50:.0f}x")
else:
    # Guard against a zero reading so the cell cannot fail with ZeroDivisionError
    print("Speedup factor: n/a (vectorized run was below timer resolution)")

# Efficient querying
# Three ways to express the same filter (value1 > 0), each timed once.
# The three results are only produced for timing; they are not compared here.
print("\n3. Efficient querying techniques:")

# Method 1: Boolean indexing
start_time = time.time()
result1 = large_df_optimized[large_df_optimized['value1'] > 0]
boolean_time = time.time() - start_time

# Method 2: Query method
# (the filter is passed as a string expression)
start_time = time.time()
result2 = large_df_optimized.query('value1 > 0')
query_time = time.time() - start_time

# Method 3: Using loc
start_time = time.time()
result3 = large_df_optimized.loc[large_df_optimized['value1'] > 0]
loc_time = time.time() - start_time

print(f"Boolean indexing: {boolean_time:.4f} seconds")
print(f"Query method: {query_time:.4f} seconds")
print(f"Loc method: {loc_time:.4f} seconds")

# Chunked processing for very large datasets
print("\n4. Chunked processing:")

def process_chunk(chunk):
    """Return the mean of 'value1' for each 'category' present in *chunk*."""
    per_category = chunk.groupby('category')
    return per_category['value1'].mean()

# Simulate chunked processing
chunk_size = 10000
results = []        # per-chunk category means (from process_chunk)
chunk_counts = []   # per-chunk number of value1 observations per category

start_time = time.perf_counter()
for i in range(0, len(large_df_optimized), chunk_size):
    chunk = large_df_optimized.iloc[i:i+chunk_size]
    chunk_result = process_chunk(chunk)
    results.append(chunk_result)
    chunk_counts.append(chunk.groupby('category')['value1'].count())

# Combine results
# A plain average of the chunk means is only correct when every category has
# the same number of rows in every chunk. Weighting each chunk mean by its
# row count reproduces the overall mean exactly:
#     sum(mean_k * n_k) / sum(n_k)
# keys= gives each chunk its own column label so the two frames align
# column by column.
chunk_ids = range(len(results))
means_by_chunk = pd.concat(results, axis=1, keys=chunk_ids)
counts_by_chunk = pd.concat(chunk_counts, axis=1, keys=chunk_ids)
combined_result = (
    (means_by_chunk * counts_by_chunk).sum(axis=1) / counts_by_chunk.sum(axis=1)
)
chunked_time = time.perf_counter() - start_time

# Compare with processing all at once
start_time = time.perf_counter()
direct_result = large_df_optimized.groupby('category')['value1'].mean()
direct_time = time.perf_counter() - start_time

print(f"Chunked processing: {chunked_time:.4f} seconds")
print(f"Direct processing: {direct_time:.4f} seconds")
print(f"Results are similar: {np.allclose(combined_result.values, direct_result.values)}")

# Efficient aggregations
print("\n5. Efficient aggregation techniques:")

# time.perf_counter() is used because both runs are short; time.time() can
# report 0.0 elapsed on platforms with a coarse clock.

# Method 1: Multiple separate aggregations
# (the groupby is rebuilt for each statistic)
start_time = time.perf_counter()
mean_val = large_df_optimized.groupby('category')['value1'].mean()
std_val = large_df_optimized.groupby('category')['value1'].std()
count_val = large_df_optimized.groupby('category')['value1'].count()
separate_time = time.perf_counter() - start_time

# Method 2: Single aggregation call
# (one groupby, all three statistics requested together)
start_time = time.perf_counter()
combined_agg = large_df_optimized.groupby('category')['value1'].agg(['mean', 'std', 'count'])
combined_time = time.perf_counter() - start_time

print(f"Separate aggregations: {separate_time:.4f} seconds")
print(f"Combined aggregation: {combined_time:.4f} seconds")
if combined_time > 0:
    print(f"Speedup: {separate_time / combined_time:.1f}x")
else:
    # Guard against a zero reading so the cell cannot fail with ZeroDivisionError
    print("Speedup: n/a (combined run was below timer resolution)")

# Using eval for complex expressions
print("\n6. Using eval for complex expressions:")

# Standard method
start_time = time.time()
result_standard = large_df_optimized['value1'] * large_df_optimized['value2'] + large_df_optimized['value3']
standard_time = time.time() - start_time

# Eval method
# Same arithmetic, written as a string expression over column names
start_time = time.time()
result_eval = large_df_optimized.eval('value1 * value2 + value3')
eval_time = time.time() - start_time

print(f"Standard method: {standard_time:.4f} seconds")
print(f"Eval method: {eval_time:.4f} seconds")
# NOTE(review): Series.equals() demands exact element-wise equality; whether
# eval() yields bit-identical floats to the standard expression is not
# established here - a tolerance check such as np.allclose may be more robust.
print(f"Results are equal: {result_standard.equals(result_eval)}")

# RSS now versus the baseline taken before large_df was created
final_memory = get_memory_usage()
print(f"\nFinal memory usage: {final_memory:.1f} MB")
print(f"Total memory increase: {final_memory - start_memory:.1f} MB")

## Advanced Plotting and Visualization Integration

In [None]:
# Create sample dataset for visualization
# 300 rows: 3 experiments x 100 rows each. Every column below is built to
# length 300 via repeat/tile.
np.random.seed(42)
viz_data = pd.DataFrame({
    'experiment': np.repeat(['Exp1', 'Exp2', 'Exp3'], 100),
    'treatment': np.tile(['Control', 'Treatment'] * 50, 3),        # alternates every row
    'concentration': np.tile([0, 0.1, 1, 10, 100] * 20, 3),        # cycles every 5 rows
    # Gaussian baseline plus an offset that rises with the concentration level
    'response': np.random.normal(50, 15, 300) + np.tile([0, 10, 25, 35, 40] * 20, 3),
    'time_point': np.tile(range(1, 101), 3),                       # 1..100 within each experiment
    'replicate': np.tile(range(1, 4), 100)                         # cycles 1, 2, 3
})

# Add some noise and trends
# (+0.1 per time point, plus extra Gaussian noise)
viz_data['response'] += viz_data['time_point'] * 0.1 + np.random.normal(0, 5, len(viz_data))
viz_data['response'] = np.maximum(viz_data['response'], 0)  # Ensure non-negative

print("Visualization dataset:")
print(viz_data.head())
print(f"Shape: {viz_data.shape}")

# 1. Advanced pandas plotting
print("\n1. Pandas built-in plotting:")

# Subplot layout with pandas
# 2x2 grid; each pandas plotting call is directed to a panel through ax=
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Time series by experiment
# (mean response per time point, one line per experiment)
for exp in viz_data['experiment'].unique():
    exp_data = viz_data[viz_data['experiment'] == exp]
    exp_summary = exp_data.groupby('time_point')['response'].mean()
    exp_summary.plot(ax=axes[0, 0], label=exp, alpha=0.7)
axes[0, 0].set_title('Time Series by Experiment')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Box plot by treatment
viz_data.boxplot(column='response', by='treatment', ax=axes[0, 1])
axes[0, 1].set_title('Response by Treatment')
axes[0, 1].set_xlabel('Treatment')

# Histogram with overlay
# (both groups share one panel; alpha keeps the overlap visible)
viz_data[viz_data['treatment'] == 'Control']['response'].hist(
    ax=axes[1, 0], alpha=0.5, label='Control', bins=20
)
viz_data[viz_data['treatment'] == 'Treatment']['response'].hist(
    ax=axes[1, 0], alpha=0.5, label='Treatment', bins=20
)
axes[1, 0].set_title('Response Distribution')
axes[1, 0].legend()

# Scatter plot with correlation
viz_data.plot.scatter(x='concentration', y='response', ax=axes[1, 1], alpha=0.6)
axes[1, 1].set_title('Concentration vs Response')
# The data contains concentration == 0, which a plain log axis cannot show.
# A symlog axis (linear below linthresh, logarithmic above) keeps those
# points visible, matching the dose-response plots later in this cell.
axes[1, 1].set_xscale('symlog', linthresh=0.1)

plt.tight_layout()
plt.show()

# 2. Pivot table visualization
print("\n2. Pivot table heatmap:")

# Create pivot table for heatmap
# Rows = concentration levels, columns = experiments, cells = mean response
pivot_data = viz_data.pivot_table(
    values='response',
    index='concentration',
    columns='experiment',
    aggfunc='mean'
)

# Plot heatmap
plt.figure(figsize=(10, 6))
plt.imshow(pivot_data.values, cmap='viridis', aspect='auto')
plt.colorbar(label='Mean Response')
# imshow positions cells at integer coordinates, so ticks are placed at
# 0..n-1 and labelled with the pivot table's column / index values
plt.xticks(range(len(pivot_data.columns)), pivot_data.columns)
plt.yticks(range(len(pivot_data.index)), pivot_data.index)
plt.xlabel('Experiment')
plt.ylabel('Concentration')
plt.title('Response Heatmap: Concentration vs Experiment')

# Add text annotations
# (x = column position j, y = row position i)
for i in range(len(pivot_data.index)):
    for j in range(len(pivot_data.columns)):
        plt.text(j, i, f'{pivot_data.iloc[i, j]:.1f}', 
                ha='center', va='center', color='white', fontweight='bold')

plt.tight_layout()
plt.show()

# 3. Rolling correlation visualization
print("\n3. Rolling correlation analysis:")

# Create time series data
# One row per time point, one column per experiment (mean response)
ts_viz = viz_data.pivot_table(
    values='response',
    index='time_point',
    columns='experiment',
    aggfunc='mean'
)

# Calculate rolling correlation
# Trailing 20-row window, so the first 19 values are NaN
rolling_corr = ts_viz['Exp1'].rolling(window=20).corr(ts_viz['Exp2'])

# Two stacked panels sharing the time-point axis
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

# Plot time series
ts_viz.plot(ax=ax1, alpha=0.7)
ax1.set_title('Time Series Data')
ax1.set_ylabel('Response')
ax1.grid(True, alpha=0.3)
ax1.legend()

# Plot rolling correlation
rolling_corr.plot(ax=ax2, color='red', linewidth=2)
# Reference line at zero correlation
ax2.axhline(y=0, color='black', linestyle='--', alpha=0.5)
ax2.set_title('Rolling Correlation: Exp1 vs Exp2 (20-point window)')
ax2.set_ylabel('Correlation')
ax2.set_xlabel('Time Point')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 4. Advanced groupby visualization
print("\n4. Grouped analysis visualization:")

# Multi-level grouping and visualization
grouped_stats = viz_data.groupby(['experiment', 'treatment', 'concentration']).agg({
    'response': ['mean', 'std', 'count']
}).round(2)

# Flatten column names
# ('response', 'mean') -> 'response_mean', etc.
grouped_stats.columns = ['_'.join(col).strip() for col in grouped_stats.columns]
grouped_stats = grouped_stats.reset_index()

# Create dose-response curves
# One panel per experiment, one error-bar series per treatment
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for i, exp in enumerate(['Exp1', 'Exp2', 'Exp3']):
    exp_data = grouped_stats[grouped_stats['experiment'] == exp]
    
    for treatment in ['Control', 'Treatment']:
        treat_data = exp_data[exp_data['treatment'] == treatment]
        
        # Error bars show +/- one standard deviation of the group
        axes[i].errorbar(
            treat_data['concentration'],
            treat_data['response_mean'],
            yerr=treat_data['response_std'],
            label=treatment,
            marker='o',
            capsize=5,
            capthick=2
        )
    
    axes[i].set_xscale('symlog', linthresh=0.1)  # Handle zero concentration
    axes[i].set_xlabel('Concentration')
    axes[i].set_ylabel('Response')
    axes[i].set_title(f'{exp} Dose-Response')
    axes[i].legend()
    axes[i].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 5. Statistical visualization
print("\n5. Statistical comparison visualization:")

# Confidence intervals
from scipy import stats

def calculate_ci(data, confidence=0.95):
    """Return (lower, upper) bounds of a t-based confidence interval for the mean of *data*."""
    sample_size = len(data)
    centre = np.mean(data)
    # Half-width = standard error of the mean times the two-sided
    # t critical value with sample_size - 1 degrees of freedom.
    half_width = stats.sem(data) * stats.t.ppf((1 + confidence) / 2, sample_size - 1)
    return centre - half_width, centre + half_width

# Calculate statistics by group
# One record per (experiment, treatment) pair, in order of first appearance
treatment_stats = []
for exp in viz_data['experiment'].unique():
    for treatment in viz_data['treatment'].unique():
        subset = viz_data[(viz_data['experiment'] == exp) & 
                         (viz_data['treatment'] == treatment)]['response']
        
        mean_val = subset.mean()
        ci_low, ci_high = calculate_ci(subset)
        
        treatment_stats.append({
            'experiment': exp,
            'treatment': treatment,
            'mean': mean_val,
            'ci_low': ci_low,
            'ci_high': ci_high,
            # Half-width of the interval; used as the error-bar length below
            'error': ci_high - mean_val
        })

stats_df = pd.DataFrame(treatment_stats)

# Create grouped bar plot with confidence intervals
fig, ax = plt.subplots(figsize=(12, 6))

# Note: `experiments` (and `data` in the loop below) rebind names that were
# first assigned in the MultiIndex cell at the top of the notebook.
experiments = stats_df['experiment'].unique()
treatments = stats_df['treatment'].unique()
x = np.arange(len(experiments))  # one slot per experiment
width = 0.35                     # bar width; also the offset between treatments

for i, treatment in enumerate(treatments):
    data = stats_df[stats_df['treatment'] == treatment]
    # Shift each treatment's bars by i*width so they sit side by side
    ax.bar(x + i*width, data['mean'], width, 
           yerr=data['error'], capsize=5,
           label=treatment, alpha=0.7)

ax.set_xlabel('Experiment')
ax.set_ylabel('Response (95% CI)')
ax.set_title('Treatment Effect Comparison with Confidence Intervals')
# Centre each tick between the two bars of its group
ax.set_xticks(x + width / 2)
ax.set_xticklabels(experiments)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("\nStatistical summary:")
print(stats_df.round(2))