# SciTeX Pandas Utilities Tutorial

This notebook demonstrates the pandas utilities in SciTeX (`stx.pd`), which extend pandas with helpers for DataFrame conversion, reordering, reshaping, and handling of statistical results.

## 1. Setup and Imports

In [None]:
import scitex as stx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the whole session to keep the tutorial output compact
warnings.filterwarnings('ignore')

# Set display options: no column limit, no fixed width, cell text capped at 50 chars
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 50),
):
    pd.set_option(option_name, option_value)

## 2. Universal DataFrame Conversion with force_df

### 2.1 Converting Various Data Types

In [None]:
# Convert different data types to DataFrame.
# Each example feeds one kind of input to stx.pd.force_df and prints the result.
print("=== Converting various data types to DataFrame ===")

# Seed the global NumPy RNG so the random 2D example below prints the same
# values on every Restart & Run All (same convention as the later cells).
np.random.seed(42)

# Scalar value
scalar = 42
df_scalar = stx.pd.force_df(scalar)
print("\nScalar to DataFrame:")
print(df_scalar)

# List
list_data = [1, 2, 3, 4, 5]
df_list = stx.pd.force_df(list_data)
print("\nList to DataFrame:")
print(df_list)

# NumPy array
array_1d = np.array([10, 20, 30, 40])
df_array_1d = stx.pd.force_df(array_1d)
print("\n1D Array to DataFrame:")
print(df_array_1d)

# 2D NumPy array (3 rows x 4 columns of standard-normal draws)
array_2d = np.random.randn(3, 4)
df_array_2d = stx.pd.force_df(array_2d)
print("\n2D Array to DataFrame:")
print(df_array_2d)

# Dictionary with unequal lengths (value lists of length 3, 2 and 4)
dict_unequal = {
    'A': [1, 2, 3],
    'B': [4, 5],
    'C': [6, 7, 8, 9]
}
df_dict = stx.pd.force_df(dict_unequal)
print("\nDictionary with unequal lengths to DataFrame:")
print(df_dict)

# Series with a string index
series = pd.Series([100, 200, 300], index=['x', 'y', 'z'])
df_series = stx.pd.force_df(series)
print("\nSeries to DataFrame:")
print(df_series)

### 2.2 Custom Filler Values

In [None]:
# Three experiments with different numbers of readings; the `filler`
# argument of force_df controls what goes into the missing slots.
data_unequal = dict(
    experiment_1=[23.5, 24.1, 23.8],
    experiment_2=[22.9, 23.5, 24.0, 23.7, 23.9],
    experiment_3=[24.2, 23.6],
)

# Default filler (NaN)
df_nan = stx.pd.force_df(data_unequal)
print("With NaN filler:", df_nan, sep="\n")

# Custom filler (0)
df_zero = stx.pd.force_df(data_unequal, filler=0)
print("\nWith 0 filler:", df_zero, sep="\n")

# Custom filler (mean value): pool every reading into one flat list first
all_values = sum(data_unequal.values(), [])
mean_value = np.mean(all_values)
df_mean = stx.pd.force_df(data_unequal, filler=mean_value)
print(f"\nWith mean filler ({mean_value:.2f}):", df_mean, sep="\n")

## 3. Finding P-value Columns

In [None]:
# Build a small table of statistical results whose p-value columns use
# three different naming styles ('p_value', 'pval_adjusted', 'p-val-bonferroni').
stats_data = {
    'feature': ['height', 'weight', 'age', 'blood_pressure'],
    'mean_control': [170.5, 68.2, 35.4, 120.5],
    'mean_treatment': [172.1, 66.8, 35.6, 118.2],
    'p_value': [0.023, 0.156, 0.832, 0.041],
    'pval_adjusted': [0.092, 0.312, 0.832, 0.123],
    'significance': ['*', 'ns', 'ns', '*'],
    'p-val-bonferroni': [0.092, 0.624, 1.000, 0.164],
}

df_stats = pd.DataFrame(stats_data)
print("Statistical results DataFrame:", df_stats, sep="\n")

# multiple=True: ask for every p-value column
pval_cols = stx.pd.find_pval(df_stats, multiple=True)
print(f"\nP-value columns found: {pval_cols}")

# Show the values stored under each detected column
print("\nP-values:")
for col in pval_cols:
    values = df_stats[col].tolist()
    print(f"{col}: {values}")

# multiple=False: ask for the first p-value column only
first_pval = stx.pd.find_pval(df_stats, multiple=False)
print(f"\nFirst p-value column: {first_pval}")

## 4. Column and Row Movement

### 4.1 Moving Columns

In [None]:
# Sample frame with five integer columns, A through E
df = pd.DataFrame(dict(
    A=[1, 2, 3],
    B=[4, 5, 6],
    C=[7, 8, 9],
    D=[10, 11, 12],
    E=[13, 14, 15],
))

print("Original DataFrame:", df, sep="\n")
print(f"Columns: {df.columns.tolist()}")

# Move column 'D' to position 1
df_moved = stx.pd.mv(df, 'D', 1)
print("\nAfter moving 'D' to position 1:", df_moved, sep="\n")
print(f"Columns: {df_moved.columns.tolist()}")

# Move column 'C' to first
df_first = stx.pd.mv_to_first(df, 'C')
print("\nAfter moving 'C' to first:", df_first, sep="\n")
print(f"Columns: {df_first.columns.tolist()}")

# Move column 'B' to last
df_last = stx.pd.mv_to_last(df, 'B')
print("\nAfter moving 'B' to last:", df_last, sep="\n")
print(f"Columns: {df_last.columns.tolist()}")

### 4.2 Moving Rows

In [None]:
# Frame whose rows carry string labels ('row1' .. 'row5') instead of positions
df_rows = pd.DataFrame(
    {
        'value': list(range(10, 60, 10)),
        'category': list('ABCDE'),
    },
    index=[f'row{n}' for n in range(1, 6)],
)

print("Original DataFrame:", df_rows, sep="\n")

# Move row3 to first position (axis=0 targets the row labels)
df_row_first = stx.pd.mv_to_first(df_rows, 'row3', axis=0)
print("\nAfter moving 'row3' to first:", df_row_first, sep="\n")

# Move row2 to last position
df_row_last = stx.pd.mv_to_last(df_rows, 'row2', axis=0)
print("\nAfter moving 'row2' to last:", df_row_last, sep="\n")

## 5. Melting Specific Columns

In [None]:
# Wide layout: one row per subject, one column per time point
wide_data = pd.DataFrame(dict(
    subject_id=['S001', 'S002', 'S003'],
    age=[25, 30, 35],
    gender=['M', 'F', 'M'],
    time_1=[120, 115, 125],
    time_2=[118, 112, 123],
    time_3=[115, 110, 120],
))

print("Wide format data:", wide_data, sep="\n")

# Melt only the time columns
time_cols = [f'time_{k}' for k in (1, 2, 3)]
melted = stx.pd.melt_cols(wide_data, time_cols)
print("\nAfter melting time columns:", melted, sep="\n")

# Melt again, this time naming the identifier columns explicitly
melted_ids = stx.pd.melt_cols(
    wide_data,
    time_cols,
    id_columns=['subject_id', 'age'],
)
print("\nWith specific ID columns:", melted_ids, sep="\n")

## 6. Matrix to Long Format Conversion

In [None]:
# Create correlation matrix
# Seed the global NumPy RNG so the random data (and its correlations) repeat on every run
np.random.seed(42)
n_vars = 5
data = np.random.randn(100, n_vars)  # 100 rows x n_vars columns of standard-normal draws
df_data = pd.DataFrame(data, columns=[f'var_{i+1}' for i in range(n_vars)])
corr_matrix = df_data.corr()  # square matrix indexed by var_1..var_5 on both axes

print("Correlation matrix:")
print(corr_matrix.round(3))

# Convert to x, y, z format
# NOTE(review): the plotting code below assumes to_xyz returns columns named
# 'x', 'y' and 'z' — confirm against the scitex API.
xyz_data = stx.pd.to_xyz(corr_matrix)
print("\nLong format (x, y, z):")
print(xyz_data.head(10))

# Visualize using the long format: two panels side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Heatmap from matrix (left panel); center=0 puts the colormap midpoint at zero correlation
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', 
            center=0, ax=ax1, square=True)
ax1.set_title('Correlation Matrix (Wide Format)')

# Scatter plot from long format (right panel): one marker per (x, y) pair,
# coloured by 'z'; vmin/vmax pin the colour scale to [-1, 1]
scatter = ax2.scatter(xyz_data['x'], xyz_data['y'], 
                     c=xyz_data['z'], cmap='coolwarm', 
                     s=200, vmin=-1, vmax=1)
ax2.set_xlabel('Variable 1')
ax2.set_ylabel('Variable 2')
ax2.set_title('Correlation Values (Long Format)')
# Label tick positions 0..n-1 with the matrix's variable names.
# NOTE(review): presumably 'x'/'y' hold the variable names and appear in the
# same order as corr_matrix's columns/index — verify, otherwise labels mismatch.
ax2.set_xticks(range(len(corr_matrix.columns)))
ax2.set_xticklabels(corr_matrix.columns, rotation=45)
ax2.set_yticks(range(len(corr_matrix.index)))
ax2.set_yticklabels(corr_matrix.index)
ax2.invert_yaxis()  # flip the y axis so the first variable is at the top
plt.colorbar(scatter, ax=ax2, label='Correlation')

plt.tight_layout()
plt.show()

## 7. Advanced DataFrame Operations

### 7.1 Safe Numeric Conversion

In [None]:
# Four columns with different conversion prospects: clean integer strings (A),
# float strings with one bad entry (B), real integers (C), percent strings (D)
mixed_data = pd.DataFrame(dict(
    A=['1', '2', '3', '4'],
    B=['1.5', '2.5', 'invalid', '4.5'],
    C=[1, 2, 3, 4],
    D=['10%', '20%', '30%', '40%'],
))

print("Original DataFrame:", mixed_data, sep="\n")
print("\nData types:", mixed_data.dtypes, sep="\n")

# Convert to numeric
numeric_df = stx.pd.to_numeric(mixed_data)
print("\nAfter numeric conversion:", numeric_df, sep="\n")
print("\nData types after conversion:", numeric_df.dtypes, sep="\n")

### 7.2 Column Merging

In [None]:
# Person records whose name and location are each split over two columns
df_merge = pd.DataFrame(dict(
    first_name=['John', 'Jane', 'Bob'],
    last_name=['Doe', 'Smith', 'Johnson'],
    age=[30, 25, 35],
    city=['New York', 'London', 'Paris'],
    country=['USA', 'UK', 'France'],
))

print("Original DataFrame:", df_merge, sep="\n")

# Merge name columns into 'full_name', joined by a space
name_parts = ['first_name', 'last_name']
df_merged_name = stx.pd.merge_columns(df_merge, name_parts, 'full_name', separator=' ')
print("\nAfter merging name columns:", df_merged_name, sep="\n")

# Merge location columns into 'location', starting from the previous result
location_parts = ['city', 'country']
df_merged_all = stx.pd.merge_columns(df_merged_name, location_parts, 'location', separator=', ')
print("\nAfter merging location columns:", df_merged_all, sep="\n")

### 7.3 Advanced Slicing

In [None]:
# Daily time series: 100 days from 2024-01-01, a random-walk value and a
# random category per day, indexed by date
dates = pd.date_range('2024-01-01', periods=100, freq='D')
ts_data = pd.DataFrame({
    'date': dates,
    'value': np.cumsum(np.random.randn(100)),
    'category': np.random.choice(['A', 'B', 'C'], 100),
}).set_index('date')

print("Time series data:", ts_data.head(), sep="\n")

# Slice by date range
start_date, end_date = '2024-02-01', '2024-02-15'
sliced = stx.pd.slice(ts_data, start=start_date, end=end_date)
print(f"\nSliced data ({start_date} to {end_date}):", sliced, sep="\n")

# Slice by row numbers, on a copy with the default integer index restored
sliced_rows = stx.pd.slice(ts_data.reset_index(), start=10, end=20)
print("\nSliced by row numbers (10-20):", sliced_rows, sep="\n")

## 8. Handling SettingWithCopyWarning

In [None]:
# Create DataFrame that typically causes warning
df_original = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50]
})

# Without warning suppression.
# The setup cell silences every warning globally, and a warning is not an
# exception, so a try/except cannot reveal it. Re-enable warnings locally and
# record them so the effect of the assignment is actually visible.
# NOTE(review): newer pandas versions (Copy-on-Write) may emit no warning here,
# in which case the count printed below is 0 — confirm for your pandas version.
print("Without warning suppression:")
df_subset = df_original[df_original['A'] > 2]  # boolean-mask selection
with warnings.catch_warnings(record=True) as caught_warnings:
    warnings.simplefilter('always')
    df_subset['C'] = df_subset['A'] * df_subset['B']
print(f"Operation completed with {len(caught_warnings)} warning(s)")
for caught in caught_warnings:
    # Show only the first line of each (potentially long) warning message
    first_line = str(caught.message).strip().split('\n')[0]
    print(f"  {caught.category.__name__}: {first_line}")

# With warning suppression.
# Rebuild the subset so this demonstration does not depend on the one above.
print("\nWith warning suppression:")
df_subset = df_original[df_original['A'] > 2]
with stx.pd.ignore_SettingWithCopyWarning():
    df_subset['C'] = df_subset['A'] * df_subset['B']
    print("Operation completed silently")

print("\nResult:")
print(df_subset)

## 9. Practical Examples

### 9.1 Statistical Analysis Pipeline

In [None]:
from scipy import stats

# Generate experimental data (seeded so the numbers repeat on every run)
np.random.seed(42)
n_samples = 50

# Control and treatment groups
control = np.random.normal(100, 15, n_samples)
treatment = np.random.normal(110, 15, n_samples)

# Create results dictionary with unequal lengths (some missing data)
results = {
    'control': control[:45],  # Some missing data
    'treatment': treatment,
    'placebo': np.random.normal(102, 15, 40)  # Different size
}

# Convert to DataFrame with force_df
df_results = stx.pd.force_df(results)
print("Experimental results:")
print(df_results.head())
print(f"\nShape: {df_results.shape}")

# Compare every unordered pair of groups with an independent two-sample t-test
comparisons = []
groups = list(results.keys())

# Bonferroni multiplies each p-value by the number of tests performed, which
# is the number of group pairs, k * (k - 1) / 2, not the number of groups.
n_comparisons = len(groups) * (len(groups) - 1) // 2

for i in range(len(groups)):
    for j in range(i+1, len(groups)):
        group1, group2 = groups[i], groups[j]
        # Drop the padding that force_df added to the shorter groups
        data1 = df_results[group1].dropna()
        data2 = df_results[group2].dropna()

        t_stat, p_val = stats.ttest_ind(data1, data2)

        comparisons.append({
            'comparison': f'{group1}_vs_{group2}',
            'mean_diff': data1.mean() - data2.mean(),
            't_statistic': t_stat,
            'p_value': p_val,
            # A probability cannot exceed 1, so cap the adjusted value
            'p_val_bonferroni': min(p_val * n_comparisons, 1.0)
        })

df_stats = pd.DataFrame(comparisons)
print("\nStatistical comparisons:")
print(df_stats)

# Find p-value columns
pval_cols = stx.pd.find_pval(df_stats)
print(f"\nP-value columns: {pval_cols}")

# Move p-value to first
df_stats_reordered = stx.pd.mv_to_first(df_stats, 'p_value')
print("\nReordered (p-value first):")
print(df_stats_reordered)

### 9.2 Data Reshaping for Visualization

In [None]:
# Create multi-condition experiment data:
# 10 subjects x 3 conditions, with three measures recorded per combination
subjects = [f'S{i:03d}' for i in range(1, 11)]
conditions = ['baseline', 'stress', 'recovery']
measures = ['heart_rate', 'blood_pressure', 'cortisol']

# Generate data: one record (dict) per subject/condition pair.
# Values come from the global NumPy RNG; no seed is set in this cell, so they
# depend on the RNG state left behind by earlier cells.
data_records = []
for subject in subjects:
    for condition in conditions:
        record = {'subject': subject, 'condition': condition}
        for measure in measures:
            # Baseline level of the measure, scaled by the condition, plus Gaussian noise (SD 5)
            base_value = {'heart_rate': 70, 'blood_pressure': 120, 'cortisol': 10}[measure]
            multiplier = {'baseline': 1.0, 'stress': 1.3, 'recovery': 1.1}[condition]
            value = base_value * multiplier + np.random.normal(0, 5)
            record[measure] = value
        data_records.append(record)

df_experiment = pd.DataFrame(data_records)
print("Experiment data (wide format):")
print(df_experiment.head())

# Melt measures for plotting
# NOTE(review): the filter in the loop below assumes melt_cols names its output
# columns 'variable' and 'value' — confirm against the scitex API.
df_melted = stx.pd.melt_cols(df_experiment, measures, 
                             id_columns=['subject', 'condition'])
print("\nMelted data (long format):")
print(df_melted.head(10))

# Create visualizations: one panel per measure
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for i, measure in enumerate(measures):
    # Rows of the long table that belong to this measure
    measure_data = df_melted[df_melted['variable'] == measure]
    
    # Box plot by condition
    measure_data.boxplot(column='value', by='condition', ax=axes[i])
    axes[i].set_title(measure.replace('_', ' ').title())
    axes[i].set_xlabel('Condition')
    axes[i].set_ylabel('Value')
    axes[i].get_figure().suptitle('')  # Remove automatic title

plt.tight_layout()
plt.show()

# Create pivot table for heatmap: rows are subjects; passing a list as `values`
# together with columns='condition' gives two-level columns (measure, condition).
# Each subject/condition pair occurs once above, so 'mean' leaves values unchanged.
pivot = df_experiment.pivot_table(
    values=measures,
    index='subject',
    columns='condition',
    aggfunc='mean'
)

# Convert to xyz format for custom visualization
# pivot['heart_rate'] selects the subject x condition table for that one measure
xyz_pivot = stx.pd.to_xyz(pivot['heart_rate'])
print("\nPivot data in xyz format:")
print(xyz_pivot.head())

## 10. Building a Complete Data Pipeline

In [None]:
class DataProcessor:
    """Chainable wrapper around the SciTeX pandas helpers.

    Every step except ``get_result`` returns the processor itself so calls can
    be chained. ``data`` holds the working frame; ``processed`` holds the
    reshaped frame once ``reshape_for_analysis`` has run.
    """

    def __init__(self):
        # Nothing is loaded or reshaped until the corresponding step is called.
        self.data = self.processed = None

    def load_data(self, data_source):
        """Convert ``data_source`` with ``force_df``, store it, and print its shape."""
        frame = stx.pd.force_df(data_source)
        self.data = frame
        print(f"Loaded data with shape: {self.data.shape}")
        return self

    def clean_numeric(self):
        """Replace the working frame with the output of ``to_numeric``."""
        converted = stx.pd.to_numeric(self.data)
        self.data = converted
        return self

    def find_statistics(self):
        """Detect p-value columns and move the first one to the front.

        The working frame is left untouched when no p-value column is found.
        """
        pval_cols = stx.pd.find_pval(self.data, multiple=True)

        # Nothing detected: keep the frame as it is.
        if not pval_cols:
            return self

        print(f"Found p-value columns: {pval_cols}")
        # Only the first detected column is promoted.
        leading_column = pval_cols[0]
        self.data = stx.pd.mv_to_first(self.data, leading_column)
        return self

    def reshape_for_analysis(self, value_vars, id_vars=None):
        """Melt ``value_vars`` (keeping ``id_vars``) and store the result in ``processed``."""
        reshaped = stx.pd.melt_cols(self.data, value_vars, id_vars)
        self.processed = reshaped
        return self

    def get_result(self):
        """Return the reshaped frame if one exists, otherwise the working frame."""
        if self.processed is None:
            return self.data
        return self.processed

# Example usage
# Patient records with numeric strings ('age') and one short column
# ('week_2_score' has three entries where the others have four)
complex_data = dict(
    patient_id=['P001', 'P002', 'P003', 'P004'],
    age=['45', '52', '38', '61'],
    baseline_score=[85, 92, 78, 88],
    week_1_score=[88, 95, 82, 90],
    week_2_score=[91, 98, 85],  # Missing value
    p_value_improvement=[0.032, 0.015, 0.048, 0.023],
    significant=['yes', 'yes', 'no', 'yes'],
)

# Columns to melt and columns that identify each patient
score_columns = ['baseline_score', 'week_1_score', 'week_2_score']
identifier_columns = ['patient_id', 'age']

# Process data: load -> numeric clean-up -> p-value detection -> reshape
processor = DataProcessor()
result = (
    processor
    .load_data(complex_data)
    .clean_numeric()
    .find_statistics()
    .reshape_for_analysis(score_columns, identifier_columns)
    .get_result()
)

print("\nProcessed result:", result, sep="\n")

## 11. Summary and Best Practices

### Key Takeaways

1. **Universal DataFrame Conversion**: `force_df` converts scalars, lists, NumPy arrays, Series, and dictionaries (including ones with unequal-length values) into DataFrames
2. **Statistical Analysis**: Automatic p-value detection simplifies result processing
3. **Flexible Reshaping**: Melt specific columns while preserving relationships
4. **Data Organization**: Easy column and row reordering for presentation
5. **Format Conversion**: Transform between wide and long formats effortlessly

### Best Practices

1. **Data Loading**:
   ```python
   # Always use force_df for consistent DataFrames
   df = stx.pd.force_df(any_data_source)
   ```

2. **Statistical Workflows**:
   ```python
   # Automatically organize statistical results
   pval_cols = stx.pd.find_pval(results)
   df_organized = stx.pd.mv_to_first(results, pval_cols[0])
   ```

3. **Data Reshaping**:
   ```python
   # Melt only what you need
   long_df = stx.pd.melt_cols(wide_df, ['col1', 'col2'])
   ```

4. **Visualization Prep**:
   ```python
   # Convert matrices for plotting
   xyz = stx.pd.to_xyz(correlation_matrix)
   ```

In [None]:
# Closing banner followed by the suggested next steps, one per line
print(
    "\nPandas utilities tutorial completed!",
    "\nNext steps:",
    "1. Use force_df for robust data loading",
    "2. Automate statistical report formatting",
    "3. Simplify data reshaping workflows",
    "4. Create reproducible data pipelines",
    sep="\n",
)