# Apply Functions Across Rows and Columns in Pandas
## 1. Basic apply() Function

In [1]:
import pandas as pd
import numpy as np

# Create sample dataset
data = {
    'employee_id': [101, 102, 103, 104, 105],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'base_salary': [75000, 85000, 65000, 78000, 60000],
    'bonus': [5000, 8000, 3000, 6000, 2000],
    'overtime_hours': [10, 5, 15, 8, 12],
    'overtime_rate': [50, 60, 45, 55, 40],
    'performance_score': [4.2, 4.5, 3.8, 4.0, 3.5]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print("\n" + "="*80)

Original DataFrame:
   employee_id     name  base_salary  bonus  overtime_hours  overtime_rate  \
0          101    Alice        75000   5000              10             50   
1          102      Bob        85000   8000               5             60   
2          103  Charlie        65000   3000              15             45   
3          104    David        78000   6000               8             55   
4          105      Eva        60000   2000              12             40   

   performance_score  
0                4.2  
1                4.5  
2                3.8  
3                4.0  
4                3.5  



# 2. Applying Functions to Columns
## Example 2.1: Apply to Single Column

In [2]:
# Apply function to single column
print("1. Apply function to single column:")

# Convert salary to thousands
df['salary_k'] = df['base_salary'].apply(lambda x: x / 1000)
print("Base salary in thousands:")
print(df[['name', 'base_salary', 'salary_k']])

# Format as currency
df['salary_formatted'] = df['base_salary'].apply(lambda x: f"${x:,.2f}")
print("\nFormatted salary:")
print(df[['name', 'salary_formatted']])

# Categorize salary
def categorize_salary(salary):
    if salary >= 80000:
        return 'High'
    elif salary >= 70000:
        return 'Medium'
    else:
        return 'Low'

df['salary_category'] = df['base_salary'].apply(categorize_salary)
print("\nSalary categories:")
print(df[['name', 'base_salary', 'salary_category']])

1. Apply function to single column:
Base salary in thousands:
      name  base_salary  salary_k
0    Alice        75000      75.0
1      Bob        85000      85.0
2  Charlie        65000      65.0
3    David        78000      78.0
4      Eva        60000      60.0

Formatted salary:
      name salary_formatted
0    Alice       $75,000.00
1      Bob       $85,000.00
2  Charlie       $65,000.00
3    David       $78,000.00
4      Eva       $60,000.00

Salary categories:
      name  base_salary salary_category
0    Alice        75000          Medium
1      Bob        85000            High
2  Charlie        65000             Low
3    David        78000          Medium
4      Eva        60000             Low


# # Example 2.3: Apply with External Libraries

In [3]:
import math

# Apply math functions
df['sqrt_salary'] = df['base_salary'].apply(math.sqrt)
df['log_salary'] = df['base_salary'].apply(math.log10)
df['salary_squared'] = df['base_salary'].apply(lambda x: x ** 2)

print("Mathematical transformations:")
print(df[['name', 'base_salary', 'sqrt_salary', 'log_salary', 'salary_squared']].round(2))

Mathematical transformations:
      name  base_salary  sqrt_salary  log_salary  salary_squared
0    Alice        75000       273.86        4.88      5625000000
1      Bob        85000       291.55        4.93      7225000000
2  Charlie        65000       254.95        4.81      4225000000
3    David        78000       279.28        4.89      6084000000
4      Eva        60000       244.95        4.78      3600000000


# # Example 3.1: Row-wise Calculations

In [4]:
print("Row-wise calculations:")

# Calculate total overtime pay per employee
df['overtime_pay'] = df.apply(
    lambda row: row['overtime_hours'] * row['overtime_rate'], 
    axis=1
)

# Calculate percentage of salary as bonus
df['bonus_percentage'] = df.apply(
    lambda row: (row['bonus'] / row['base_salary']) * 100,
    axis=1
)

# Create employee summary
def create_employee_summary(row):
    return f"{row['name']} earns ${row['base_salary']:,} with {row['performance_score']}/5 rating"

df['employee_summary'] = df.apply(create_employee_summary, axis=1)

print(df[['name', 'base_salary', 'overtime_pay', 'bonus_percentage', 'employee_summary']])

Row-wise calculations:
      name  base_salary  overtime_pay  bonus_percentage  \
0    Alice        75000           500          6.666667   
1      Bob        85000           300          9.411765   
2  Charlie        65000           675          4.615385   
3    David        78000           440          7.692308   
4      Eva        60000           480          3.333333   

                          employee_summary  
0    Alice earns $75,000 with 4.2/5 rating  
1      Bob earns $85,000 with 4.5/5 rating  
2  Charlie earns $65,000 with 3.8/5 rating  
3    David earns $78,000 with 4.0/5 rating  
4      Eva earns $60,000 with 3.5/5 rating  


# Example 3.2: Conditional Row Operations

In [5]:
# Complex conditional logic across multiple columns
def calculate_raise(row):
    """Calculate proposed raise based on multiple factors"""
    base_raise = 0
    
    # Performance-based
    if row['performance_score'] >= 4.5:
        base_raise += 0.10  # 10%
    elif row['performance_score'] >= 4.0:
        base_raise += 0.07  # 7%
    elif row['performance_score'] >= 3.5:
        base_raise += 0.05  # 5%
    
    # Overtime contribution
    overtime_bonus = min(row['overtime_hours'] * 0.001, 0.03)  # Max 3%
    base_raise += overtime_bonus
    
    # Cap at 15%
    return min(base_raise, 0.15)

def calculate_new_salary(row):
    """Calculate new salary after raise"""
    raise_percentage = calculate_raise(row)
    new_salary = row['base_salary'] * (1 + raise_percentage)
    return new_salary, raise_percentage * 100

# Apply the function
df[['proposed_salary', 'raise_percentage']] = df.apply(
    lambda row: pd.Series(calculate_new_salary(row)),
    axis=1
)

print("Salary raise proposals:")
print(df[['name', 'base_salary', 'performance_score', 'overtime_hours', 
          'raise_percentage', 'proposed_salary']].round(2))

Salary raise proposals:
      name  base_salary  performance_score  overtime_hours  raise_percentage  \
0    Alice        75000                4.2              10               8.0   
1      Bob        85000                4.5               5              10.5   
2  Charlie        65000                3.8              15               6.5   
3    David        78000                4.0               8               7.8   
4      Eva        60000                3.5              12               6.2   

   proposed_salary  
0          81000.0  
1          93925.0  
2          69225.0  
3          84084.0  
4          63720.0  


# Example 3.3: Row-wise Statistics

In [6]:
# Calculate statistics across multiple columns for each row
numeric_cols = ['base_salary', 'bonus', 'overtime_pay']

df['total_earnings'] = df[numeric_cols].sum(axis=1)
df['average_earning'] = df[numeric_cols].mean(axis=1)
df['max_earning_component'] = df[numeric_cols].idxmax(axis=1)
df['min_earning_component'] = df[numeric_cols].idxmin(axis=1)

print("Row-wise statistics:")
print(df[['name', 'base_salary', 'bonus', 'overtime_pay', 
          'total_earnings', 'average_earning', 
          'max_earning_component', 'min_earning_component']])

Row-wise statistics:
      name  base_salary  bonus  overtime_pay  total_earnings  average_earning  \
0    Alice        75000   5000           500           80500     26833.333333   
1      Bob        85000   8000           300           93300     31100.000000   
2  Charlie        65000   3000           675           68675     22891.666667   
3    David        78000   6000           440           84440     28146.666667   
4      Eva        60000   2000           480           62480     20826.666667   

  max_earning_component min_earning_component  
0           base_salary          overtime_pay  
1           base_salary          overtime_pay  
2           base_salary          overtime_pay  
3           base_salary          overtime_pay  
4           base_salary          overtime_pay  


# 4. Applying Functions to Entire DataFrames
## Example 4.1: DataFrame-wide Operations

In [7]:
# Create a smaller dataframe for demonstration
df_small = df[['base_salary', 'bonus', 'overtime_pay']].copy()

print("Original numeric data:")
print(df_small)

# Apply function to entire DataFrame
def format_currency(x):
    """Format all numeric values as currency"""
    if isinstance(x, (int, float)):
        return f"${x:,.2f}"
    return x

df_formatted = df_small.applymap(format_currency)
print("\nFormatted as currency:")
print(df_formatted)

# Apply mathematical transformation to all numeric columns
df_log_transformed = df_small.applymap(lambda x: np.log(x) if x > 0 else 0)
print("\nLog-transformed values:")
print(df_log_transformed.round(2))

Original numeric data:
   base_salary  bonus  overtime_pay
0        75000   5000           500
1        85000   8000           300
2        65000   3000           675
3        78000   6000           440
4        60000   2000           480

Formatted as currency:
  base_salary      bonus overtime_pay
0  $75,000.00  $5,000.00      $500.00
1  $85,000.00  $8,000.00      $300.00
2  $65,000.00  $3,000.00      $675.00
3  $78,000.00  $6,000.00      $440.00
4  $60,000.00  $2,000.00      $480.00

Log-transformed values:
   base_salary  bonus  overtime_pay
0        11.23   8.52          6.21
1        11.35   8.99          5.70
2        11.08   8.01          6.51
3        11.26   8.70          6.09
4        11.00   7.60          6.17


  df_formatted = df_small.applymap(format_currency)
  df_log_transformed = df_small.applymap(lambda x: np.log(x) if x > 0 else 0)


# Example 4.2: Column-wise Statistics

In [8]:
print("Column-wise statistics:")

# Apply different functions to each column
stats = df[['base_salary', 'bonus', 'overtime_pay']].agg(['mean', 'median', 'std', 'min', 'max'])
print("Basic statistics:")
print(stats)

# Custom aggregation functions
def range_value(series):
    return series.max() - series.min()

def coefficient_of_variation(series):
    return (series.std() / series.mean()) * 100

custom_stats = df[['base_salary', 'bonus', 'overtime_pay']].agg([range_value, coefficient_of_variation])
print("\nCustom statistics:")
print(custom_stats.round(2))

Column-wise statistics:
Basic statistics:
         base_salary        bonus  overtime_pay
mean    72600.000000  4800.000000    479.000000
median  75000.000000  5000.000000    480.000000
std     10064.790112  2387.467277    134.554822
min     60000.000000  2000.000000    300.000000
max     85000.000000  8000.000000    675.000000

Custom statistics:
                          base_salary    bonus  overtime_pay
range_value                  25000.00  6000.00        375.00
coefficient_of_variation        13.86    49.74         28.09


# 5. Vectorized Operations (Faster Alternative)
## Example 5.1: Vectorized vs Apply Performance

In [9]:
# Create large dataset for performance comparison
np.random.seed(42)
large_df = pd.DataFrame({
    'A': np.random.randn(100000),
    'B': np.random.randn(100000),
    'C': np.random.randn(100000)
})

print(f"Large DataFrame: {len(large_df):,} rows")

import time

# Method 1: Using apply (slower)
start = time.time()
result_apply = large_df.apply(lambda row: row['A'] + row['B'] * row['C'], axis=1)
time_apply = time.time() - start

# Method 2: Vectorized operation (faster)
start = time.time()
result_vectorized = large_df['A'] + large_df['B'] * large_df['C']
time_vectorized = time.time() - start

print(f"\nPerformance comparison:")
print(f"Apply method: {time_apply:.4f} seconds")
print(f"Vectorized method: {time_vectorized:.4f} seconds")
print(f"Speed improvement: {time_apply/time_vectorized:.1f}x faster")

# Always prefer vectorized operations when possible
print("\n" + "="*80)
print("VECTORIZED OPERATIONS (Recommended):")
print("="*80)

# Vectorized examples
df['total_pay_vectorized'] = df['base_salary'] + df['bonus'] + df['overtime_pay']
df['bonus_ratio_vectorized'] = df['bonus'] / df['base_salary']
df['performance_adjusted'] = df['base_salary'] * (1 + (df['performance_score'] - 3.5) * 0.1)

print(df[['name', 'base_salary', 'bonus', 'performance_score',
          'total_pay_vectorized', 'bonus_ratio_vectorized', 'performance_adjusted']].round(2))

Large DataFrame: 100,000 rows

Performance comparison:
Apply method: 0.9366 seconds
Vectorized method: 0.2582 seconds
Speed improvement: 3.6x faster

VECTORIZED OPERATIONS (Recommended):
      name  base_salary  bonus  performance_score  total_pay_vectorized  \
0    Alice        75000   5000                4.2                 80500   
1      Bob        85000   8000                4.5                 93300   
2  Charlie        65000   3000                3.8                 68675   
3    David        78000   6000                4.0                 84440   
4      Eva        60000   2000                3.5                 62480   

   bonus_ratio_vectorized  performance_adjusted  
0                    0.07               80250.0  
1                    0.09               93500.0  
2                    0.05               66950.0  
3                    0.08               81900.0  
4                    0.03               60000.0  


# 6. Real-World Business Examples
## Example 6.1: E-commerce Order Processing

In [10]:
# Create e-commerce orders data
np.random.seed(42)
n_orders = 10

orders = pd.DataFrame({
    'order_id': range(1001, 1001 + n_orders),
    'customer_id': np.random.choice(['C001', 'C002', 'C003', 'C004', 'C005'], n_orders),
    'product_price': np.random.uniform(10, 500, n_orders),
    'quantity': np.random.randint(1, 5, n_orders),
    'tax_rate': np.random.uniform(0.05, 0.15, n_orders),
    'discount_percent': np.random.uniform(0, 0.3, n_orders),
    'shipping_cost': np.random.uniform(5, 25, n_orders),
    'is_express': np.random.choice([True, False], n_orders, p=[0.3, 0.7])
})

print("E-commerce Orders:")
print(orders)

# Calculate order metrics using apply
def calculate_order_metrics(row):
    """Calculate all order metrics in one function"""
    subtotal = row['product_price'] * row['quantity']
    discount_amount = subtotal * row['discount_percent']
    taxable_amount = subtotal - discount_amount
    tax_amount = taxable_amount * row['tax_rate']
    
    # Express shipping surcharge
    shipping = row['shipping_cost']
    if row['is_express']:
        shipping *= 1.5
    
    total = taxable_amount + tax_amount + shipping
    
    return pd.Series({
        'subtotal': subtotal,
        'discount_amount': discount_amount,
        'taxable_amount': taxable_amount,
        'tax_amount': tax_amount,
        'final_shipping': shipping,
        'order_total': total
    })

# Apply the function
order_metrics = orders.apply(calculate_order_metrics, axis=1)
orders = pd.concat([orders, order_metrics], axis=1)

print("\nOrders with calculated metrics:")
print(orders[['order_id', 'customer_id', 'product_price', 'quantity', 
              'subtotal', 'discount_amount', 'order_total']].round(2))

E-commerce Orders:
   order_id customer_id  product_price  quantity  tax_rate  discount_percent  \
0      1001        C004     304.546356         4  0.086636          0.284666   
1      1002        C005     356.955563         1  0.095607          0.289690   
2      1003        C003      20.086402         1  0.128518          0.242519   
3      1004        C005     485.255828         3  0.069967          0.091384   
4      1005        C005     417.896894         3  0.101423          0.029302   
5      1006        C002     114.046164         3  0.109241          0.205270   
6      1007        C003      99.094234         2  0.054645          0.132046   
7      1008        C003      99.868210         4  0.110754          0.036611   
8      1009        C003     159.078699         4  0.067052          0.148553   
9      1010        C005     267.130651         4  0.056505          0.010317   

   shipping_cost  is_express  
0      23.186408       False  
1      10.175600       False  
2      

# Example 6.2: Student Grade Calculation

In [11]:
# Student grades data
students = pd.DataFrame({
    'student_id': ['S001', 'S002', 'S003', 'S004'],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'math': [85, 92, 78, 88],
    'science': [90, 88, 82, 91],
    'english': [88, 85, 90, 87],
    'history': [92, 78, 85, 89],
    'attendance': [95, 88, 92, 96],
    'participation': [90, 85, 88, 92]
})

print("Student Grades:")
print(students)

# Calculate final grades with weights
def calculate_final_grade(row):
    """Calculate final grade with weights and adjustments"""
    # Academic subjects (70% weight)
    academic_scores = [row['math'], row['science'], row['english'], row['history']]
    academic_average = sum(academic_scores) / len(academic_scores)
    
    # Non-academic factors (30% weight)
    non_academic_average = (row['attendance'] + row['participation']) / 2
    
    # Weighted average
    weighted_grade = (academic_average * 0.7) + (non_academic_average * 0.3)
    
    # Determine letter grade
    if weighted_grade >= 90:
        letter_grade = 'A'
    elif weighted_grade >= 80:
        letter_grade = 'B'
    elif weighted_grade >= 70:
        letter_grade = 'C'
    elif weighted_grade >= 60:
        letter_grade = 'D'
    else:
        letter_grade = 'F'
    
    # Check for honors
    is_honors = all(score >= 85 for score in academic_scores) and weighted_grade >= 90
    
    return pd.Series({
        'academic_average': academic_average,
        'weighted_grade': weighted_grade,
        'letter_grade': letter_grade,
        'is_honors': is_honors
    })

# Apply the function
grade_calculations = students.apply(calculate_final_grade, axis=1)
students = pd.concat([students, grade_calculations], axis=1)

print("\nFinal Grades with Calculations:")
print(students[['name', 'math', 'science', 'english', 'history', 
                'weighted_grade', 'letter_grade', 'is_honors']].round(2))

Student Grades:
  student_id     name  math  science  english  history  attendance  \
0       S001    Alice    85       90       88       92          95   
1       S002      Bob    92       88       85       78          88   
2       S003  Charlie    78       82       90       85          92   
3       S004    Diana    88       91       87       89          96   

   participation  
0             90  
1             85  
2             88  
3             92  

Final Grades with Calculations:
      name  math  science  english  history  weighted_grade letter_grade  \
0    Alice    85       90       88       92           89.88            B   
1      Bob    92       88       85       78           85.98            B   
2  Charlie    78       82       90       85           85.62            B   
3    Diana    88       91       87       89           90.32            A   

   is_honors  
0      False  
1      False  
2      False  
3       True  


# Example 6.3: Financial Portfolio Analysis

In [12]:
# Investment portfolio
portfolio = pd.DataFrame({
    'stock': ['AAPL', 'GOOGL', 'MSFT', 'TSLA', 'AMZN'],
    'shares': [100, 50, 75, 25, 30],
    'purchase_price': [150, 2800, 300, 700, 3300],
    'current_price': [180, 2900, 320, 650, 3400],
    'beta': [1.2, 1.1, 0.9, 2.0, 1.3],  # Market risk measure
    'dividend_yield': [0.005, 0.000, 0.008, 0.000, 0.007]
})

print("Investment Portfolio:")
print(portfolio)

# Calculate portfolio metrics
def calculate_position_metrics(row):
    """Calculate metrics for each stock position"""
    market_value = row['shares'] * row['current_price']
    cost_basis = row['shares'] * row['purchase_price']
    unrealized_gain = market_value - cost_basis
    gain_percentage = (unrealized_gain / cost_basis) * 100
    
    # Dividend income
    annual_dividend = market_value * row['dividend_yield']
    
    # Risk-adjusted metrics
    sharpe_ratio = gain_percentage / row['beta'] if row['beta'] > 0 else 0
    
    return pd.Series({
        'market_value': market_value,
        'cost_basis': cost_basis,
        'unrealized_gain': unrealized_gain,
        'gain_pct': gain_percentage,
        'annual_dividend': annual_dividend,
        'sharpe_ratio': sharpe_ratio
    })

# Apply the function
position_metrics = portfolio.apply(calculate_position_metrics, axis=1)
portfolio = pd.concat([portfolio, position_metrics], axis=1)

print("\nPortfolio with Metrics:")
print(portfolio[['stock', 'shares', 'current_price', 'market_value', 
                 'unrealized_gain', 'gain_pct', 'annual_dividend']].round(2))

# Portfolio summary
portfolio_summary = {
    'Total Market Value': portfolio['market_value'].sum(),
    'Total Cost Basis': portfolio['cost_basis'].sum(),
    'Total Unrealized Gain': portfolio['unrealized_gain'].sum(),
    'Average Gain %': portfolio['gain_pct'].mean(),
    'Total Annual Dividend': portfolio['annual_dividend'].sum(),
    'Weighted Average Beta': (portfolio['beta'] * portfolio['market_value']).sum() / portfolio['market_value'].sum()
}

print("\nPortfolio Summary:")
for key, value in portfolio_summary.items():
    if 'pct' not in key.lower() and 'beta' not in key.lower():
        print(f"{key}: ${value:,.2f}")
    else:
        print(f"{key}: {value:.2%}" if '%' in key.lower() else f"{key}: {value:.2f}")

Investment Portfolio:
   stock  shares  purchase_price  current_price  beta  dividend_yield
0   AAPL     100             150            180   1.2           0.005
1  GOOGL      50            2800           2900   1.1           0.000
2   MSFT      75             300            320   0.9           0.008
3   TSLA      25             700            650   2.0           0.000
4   AMZN      30            3300           3400   1.3           0.007

Portfolio with Metrics:
   stock  shares  current_price  market_value  unrealized_gain  gain_pct  \
0   AAPL     100            180       18000.0           3000.0     20.00   
1  GOOGL      50           2900      145000.0           5000.0      3.57   
2   MSFT      75            320       24000.0           1500.0      6.67   
3   TSLA      25            650       16250.0          -1250.0     -7.14   
4   AMZN      30           3400      102000.0           3000.0      3.03   

   annual_dividend  
0             90.0  
1              0.0  
2            

# 7. Advanced Apply Techniques
## Example 7.1: Apply with Multiple Return Values

In [13]:
# Function that returns multiple values
def analyze_salary(row):
    """Analyze salary and return multiple metrics"""
    salary = row['base_salary']
    bonus = row['bonus']
    
    metrics = {
        'total_comp': salary + bonus,
        'bonus_ratio': bonus / salary,
        'salary_quartile': 'Q1' if salary < 65000 else
                          'Q2' if salary < 75000 else
                          'Q3' if salary < 85000 else 'Q4',
        'is_top_earner': salary > 80000
    }
    
    return pd.Series(metrics)

# Apply and expand the result
salary_analysis = df.apply(analyze_salary, axis=1)
df_with_analysis = pd.concat([df, salary_analysis], axis=1)

print("Salary analysis with multiple return values:")
print(df_with_analysis[['name', 'base_salary', 'bonus', 'total_comp', 
                        'bonus_ratio', 'salary_quartile', 'is_top_earner']])

Salary analysis with multiple return values:
      name  base_salary  bonus  total_comp  bonus_ratio salary_quartile  \
0    Alice        75000   5000       80000     0.066667              Q3   
1      Bob        85000   8000       93000     0.094118              Q4   
2  Charlie        65000   3000       68000     0.046154              Q2   
3    David        78000   6000       84000     0.076923              Q3   
4      Eva        60000   2000       62000     0.033333              Q1   

   is_top_earner  
0          False  
1           True  
2          False  
3          False  
4          False  


# Example 7.2: Apply with External Data

In [19]:
## **Example 7.2: Apply with External Data (Fixed Version)**

# First, let's make sure we have all the necessary columns
print("Step 1: Creating necessary columns first...")

# Calculate overtime pay if it doesn't exist
if 'overtime_pay' not in df.columns:
    df['overtime_pay'] = df['overtime_hours'] * df['overtime_rate']

# Calculate total compensation if it doesn't exist
if 'total_compensation' not in df.columns:
    df['total_compensation'] = df['base_salary'] + df['bonus'] + df['overtime_pay']

print("Current DataFrame with all necessary columns:")
print(df[['name', 'base_salary', 'bonus', 'overtime_pay', 'total_compensation']])

# External tax brackets
tax_brackets = {
    (0, 50000): 0.10,
    (50001, 100000): 0.20,
    (100001, float('inf')): 0.30
}

def calculate_tax(row, brackets):
    """Calculate tax based on brackets"""
    income = row['total_compensation']
    tax = 0
    
    for (lower, upper), rate in brackets.items():
        if income > lower:
            taxable = min(income, upper) - lower
            tax += taxable * rate
    
    return tax

# Apply with additional arguments
print("\nStep 2: Calculating taxes...")
df['estimated_tax'] = df.apply(
    lambda row: calculate_tax(row, tax_brackets),
    axis=1
)

df['after_tax_income'] = df['total_compensation'] - df['estimated_tax']
df['effective_tax_rate'] = (df['estimated_tax'] / df['total_compensation']) * 100

print("\nTax calculations:")
print(df[['name', 'total_compensation', 'estimated_tax', 'after_tax_income', 'effective_tax_rate']].round(2))

# Show tax bracket visualization
print("\n" + "="*80)
print("TAX BRACKET VISUALIZATION")
print("="*80)

for i, row in df.iterrows():
    income = row['total_compensation']
    tax = row['estimated_tax']
    rate = row['effective_tax_rate']
    
    bracket_info = []
    for (lower, upper), tax_rate in tax_brackets.items():
        if income > lower:
            taxable_in_bracket = min(income, upper) - lower
            if taxable_in_bracket > 0:
                bracket_info.append(f"${lower:,.0f}-${upper if upper != float('inf') else '∞':,.0f} ({tax_rate*100:.0f}%): ${taxable_in_bracket:,.0f}")
    
    print(f"\n{row['name']}:")
    print(f"  Income: ${income:,.2f}")
    print(f"  Total Tax: ${tax:,.2f}")
    print(f"  Effective Tax Rate: {rate:.1f}%")
    print(f"  Tax Breakdown:")
    for info in bracket_info:
        print(f"    {info}")

Step 1: Creating necessary columns first...
Current DataFrame with all necessary columns:
      name  base_salary  bonus  overtime_pay  total_compensation
0    Alice        75000   5000           500               80500
1      Bob        85000   8000           300               93300
2  Charlie        65000   3000           675               68675
3    David        78000   6000           440               84440
4      Eva        60000   2000           480               62480

Step 2: Calculating taxes...

Tax calculations:
      name  total_compensation  estimated_tax  after_tax_income  \
0    Alice               80500        11099.8           69400.2   
1      Bob               93300        13659.8           79640.2   
2  Charlie               68675         8734.8           59940.2   
3    David               84440        11887.8           72552.2   
4      Eva               62480         7495.8           54984.2   

   effective_tax_rate  
0               13.79  
1               14.6

# 8. Performance Optimization
# Example 8.1: Using swifter for Parallel Processing

In [15]:
# Note: Install swifter first: pip install swifter

try:
    import swifter
    
    # Create large dataset
    large_data = pd.DataFrame({
        'A': np.random.randn(1000000),
        'B': np.random.randn(1000000),
        'C': np.random.randn(1000000)
    })
    
    print(f"Large dataset: {len(large_data):,} rows")
    
    import time
    
    # Regular apply
    start = time.time()
    result_regular = large_data.apply(lambda row: row['A'] * row['B'] + row['C'], axis=1)
    time_regular = time.time() - start
    
    # Swifter apply (parallel)
    start = time.time()
    result_swifter = large_data.swifter.apply(lambda row: row['A'] * row['B'] + row['C'], axis=1)
    time_swifter = time.time() - start
    
    print(f"\nPerformance comparison:")
    print(f"Regular apply: {time_regular:.2f} seconds")
    print(f"Swifter apply: {time_swifter:.2f} seconds")
    print(f"Speed improvement: {time_regular/time_swifter:.1f}x")
    
except ImportError:
    print("Swifter not installed. Install with: pip install swifter")
    print("Using numpy vectorization instead...")
    
    # Vectorized alternative
    large_data = pd.DataFrame({
        'A': np.random.randn(1000000),
        'B': np.random.randn(1000000),
        'C': np.random.randn(1000000)
    })
    
    start = time.time()
    result_vectorized = large_data['A'] * large_data['B'] + large_data['C']
    time_vectorized = time.time() - start
    
    print(f"\nVectorized operation: {time_vectorized:.4f} seconds")

Swifter not installed. Install with: pip install swifter
Using numpy vectorization instead...

Vectorized operation: 0.0100 seconds


# Example 8.2: Using NumPy Vectorization

In [16]:
# NumPy vectorized operations are much faster than apply

# Create sample data
np.random.seed(42)
n_samples = 1000000
data = pd.DataFrame({
    'x': np.random.randn(n_samples),
    'y': np.random.randn(n_samples),
    'z': np.random.randn(n_samples)
})

print(f"Dataset size: {n_samples:,} rows")

import time

# Method 1: Apply (slow)
start = time.time()
data['result_apply'] = data.apply(lambda row: np.sqrt(row['x']**2 + row['y']**2 + row['z']**2), axis=1)
time_apply = time.time() - start

# Method 2: NumPy vectorized (fast)
start = time.time()
data['result_numpy'] = np.sqrt(data['x']**2 + data['y']**2 + data['z']**2)
time_numpy = time.time() - start

print(f"\nPerformance comparison:")
print(f"Apply method: {time_apply:.2f} seconds")
print(f"NumPy vectorized: {time_numpy:.2f} seconds")
print(f"NumPy is {time_apply/time_numpy:.1f}x faster!")

Dataset size: 1,000,000 rows

Performance comparison:
Apply method: 10.38 seconds
NumPy vectorized: 0.07 seconds
NumPy is 152.2x faster!


# 9. Best Practices and Patterns

In [17]:
print("="*80)
print("APPLY FUNCTION BEST PRACTICES")
print("="*80)

practices = [
    ("Use vectorized operations when possible",
     "✓ df['result'] = df['A'] + df['B']  # Fast"),
    
    ("Use apply only for complex row/column operations",
     "✓ When operations can't be vectorized"),
    
    ("Define functions separately for readability",
     "✓ def calculate_metric(row): ..."),
    
    ("Use axis parameter correctly",
     "✓ axis=0: Apply to columns\n  axis=1: Apply to rows"),
    
    ("Handle errors gracefully",
     "✓ Use try-except in apply functions"),
    
    ("Cache results for expensive operations",
     "✓ Store intermediate results"),
    
    ("Use swifter for large datasets",
     "✓ pip install swifter"),
    
    ("Test with small data first",
     "✓ Debug with df.head().apply(...)"),
]

for practice, example in practices:
    print(f"\n{practice}:")
    print(f"  {example}")

print("\n" + "="*80)
print("WHEN TO USE APPLY VS OTHER METHODS")
print("="*80)

comparisons = [
    ("Simple arithmetic", "Vectorized (df['A'] + df['B'])", "Apply", "100x faster"),
    ("String operations", "Vectorized (df['col'].str.upper())", "Apply", "10x faster"),
    ("Complex row logic", "Apply", "Vectorized", "More readable"),
    ("Multiple return values", "Apply", "Multiple steps", "Cleaner code"),
    ("Large datasets", "NumPy vectorized", "Apply", "Much faster"),
]

print(f"{'Use Case':<25} {'Recommended':<25} {'Alternative':<20} {'Reason':<20}")
print("-" * 90)
for case, rec, alt, reason in comparisons:
    print(f"{case:<25} {rec:<25} {alt:<20} {reason:<20}")

APPLY FUNCTION BEST PRACTICES

Use vectorized operations when possible:
  ✓ df['result'] = df['A'] + df['B']  # Fast

Use apply only for complex row/column operations:
  ✓ When operations can't be vectorized

Define functions separately for readability:
  ✓ def calculate_metric(row): ...

Use axis parameter correctly:
  ✓ axis=0: Apply to columns
  axis=1: Apply to rows

Handle errors gracefully:
  ✓ Use try-except in apply functions

Cache results for expensive operations:
  ✓ Store intermediate results

Use swifter for large datasets:
  ✓ pip install swifter

Test with small data first:
  ✓ Debug with df.head().apply(...)

WHEN TO USE APPLY VS OTHER METHODS
Use Case                  Recommended               Alternative          Reason              
------------------------------------------------------------------------------------------
Simple arithmetic         Vectorized (df['A'] + df['B']) Apply                100x faster         
String operations         Vectorized (df['col'].

# 10. Quick Reference Cheat Sheet

In [18]:
print("="*80)
print("APPLY FUNCTIONS CHEAT SHEET")
print("="*80)

cheat_sheet = """
BASIC SYNTAX:
------------
df['new_col'] = df['col'].apply(func)          # Apply to column
df['new_col'] = df.apply(func, axis=1)         # Apply to rows
df.apply(func, axis=0)                         # Apply to columns
df.applymap(func)                              # Apply to all elements

COMMON PATTERNS:
---------------
# Lambda functions
df['new'] = df['col'].apply(lambda x: x * 2)

# Named functions
def process_value(x):
    return x * 2 if x > 0 else 0
df['new'] = df['col'].apply(process_value)

# Row-wise with multiple columns
def process_row(row):
    return row['A'] + row['B'] * row['C']
df['result'] = df.apply(process_row, axis=1)

# Multiple return values
def analyze(row):
    return pd.Series({'sum': row.sum(), 'mean': row.mean()})
df[['sum', 'mean']] = df.apply(analyze, axis=1)

PERFORMANCE TIPS:
----------------
1. Use vectorized operations (df['A'] + df['B']) when possible
2. Use NumPy functions (np.sqrt(df['A'])) for math operations
3. Use .str methods for string operations
4. Use swifter for parallel processing on large datasets
5. Cache intermediate results
6. Avoid apply in loops

ERROR HANDLING:
--------------
def safe_func(x):
    try:
        return process(x)
    except Exception:
        return default_value

ALTERNATIVES TO APPLY:
---------------------
# Vectorized alternatives:
df['C'] = df['A'] + df['B']                    # Instead of apply
df['upper'] = df['text'].str.upper()           # Instead of apply
df['sqrt'] = np.sqrt(df['values'])             # Instead of apply

# For simple aggregations:
df.agg(['mean', 'sum'])                        # Instead of apply
df.transform(lambda x: x - x.mean())           # Instead of apply
"""

print(cheat_sheet)

APPLY FUNCTIONS CHEAT SHEET

BASIC SYNTAX:
------------
df['new_col'] = df['col'].apply(func)          # Apply to column
df['new_col'] = df.apply(func, axis=1)         # Apply to rows
df.apply(func, axis=0)                         # Apply to columns
df.applymap(func)                              # Apply to all elements

COMMON PATTERNS:
---------------
# Lambda functions
df['new'] = df['col'].apply(lambda x: x * 2)

# Named functions
def process_value(x):
    return x * 2 if x > 0 else 0
df['new'] = df['col'].apply(process_value)

# Row-wise with multiple columns
def process_row(row):
    return row['A'] + row['B'] * row['C']
df['result'] = df.apply(process_row, axis=1)

# Multiple return values
def analyze(row):
    return pd.Series({'sum': row.sum(), 'mean': row.mean()})
df[['sum', 'mean']] = df.apply(analyze, axis=1)

PERFORMANCE TIPS:
----------------
1. Use vectorized operations (df['A'] + df['B']) when possible
2. Use NumPy functions (np.sqrt(df['A'])) for math operations
3. Use 